diff options
Diffstat (limited to 'main.py')
-rwxr-xr-x | main.py | 62 |
1 files changed, 50 insertions, 12 deletions
@@ -2,11 +2,17 @@ from optparse import OptionParser import os import re +import sys +import datetime + report = {} wordscore = {} filescore = {} filelist = list() skipped = 0 +opened = 0 +datasize = 0 +progresstext = "" def sortscore(score, reverse=False): sortedscore = sorted(score.items(), key=lambda score: score[1], reverse=reverse) @@ -22,12 +28,14 @@ def printscore(report): print i[0] + ':' + str(i[1]) def wholeword(word, string): + re.purge() matches = [] - regexU = r'([A-Z]|[^a-zA-Z]|\b)(' + word.lower() + r')([A-Z]|[^a-zA-Z]|\b)' - regexL = r'([a-z]|[^a-zA-Z]|\b)(' + word.upper() + r')([a-z]|[^a-zA-Z]|\b)' + regexU = r'([A-Z]|[^a-zA-Z]|\b)(' + re.escape(word.lower()) + r')([A-Z]|[^a-zA-Z]|\b)' + regexL = r'([a-z]|[^a-zA-Z]|\b)(' + re.escape(word.upper()) + r')([a-z]|[^a-zA-Z]|\b)' mU = re.search(regexU, string) if "groups" in dir(mU): matches.append(mU.groups()) + re.purge() mL = re.search(regexL, string) if "groups" in dir(mL): matches.append(mL.groups()) @@ -41,11 +49,13 @@ def skipfile(filename,skippedexts): return True return False -def scoretext(wordlist, text): +def scoretext(wordlist, text, maxwholewordlen = -1): score = {} for word in wordlist: - score[word] = len(wholeword(word,text)) - + if int(len(word)) > int(maxwholewordlen): + score[word] = text.lower().count(word.lower()) + else: + score[word] = len(wholeword(word,text)) return score parser = OptionParser() @@ -56,22 +66,37 @@ parser.add_option("-v", "--verbose", dest="verbose", help="print verberose infor parser.add_option("-r", "--report", dest="printreport", default="w", help="print score") parser.add_option("--show-wordlist", dest="show_wordlist", default=False, help="print list of words to detect", action="store_true") parser.add_option("-c", "--display-counts", dest="display_counts", default=False, help="Show the num ber of files processed", action="store_true") +parser.add_option("-p", "--display_progress", dest="display_progress", default=False, help="show percentage complete", action="store_true") +parser.add_option("-l", "--max-wholeword-length", dest="maxwholewordlength", type="int", default=-1, help="maximun length of a word allowed to only find matches on whole word") (options, args) = parser.parse_args() if options.wordlistfilename: - wordlist = open(options.wordlistfilename).read().lower().strip().split('\n') + wordlist = list(set(open(options.wordlistfilename).read().lower().strip().split('\n'))) if options.show_wordlist: print wordlist; exit() for a in args: + #filelist.append(a) for (path, dirs, files) in os.walk(a): + if 'CVS' in dirs: + dirs.remove('CVS') + if '.git' in dirs: + dirs.remove('.git') + if '.bzr' in dirs: + dirs.remove('.bzr') + if '.hg' in dirs: + dirs.remove('.hg') + if '.svn' in dirs: + dirs.remove('.svn') + for file in files: filelist.append(path + '/' + file) if options.suspiciousfilename: filelist += options.suspiciousfilename +start = datetime.datetime.now() for file in filelist: if skipfile(file, options.skipfileextensions): skipped += 1 @@ -81,10 +106,21 @@ for file in filelist: except: print "failed to open: " + file continue - + opened +=1 + now = datetime.datetime.now() + estimate = (((now - start) / (opened + skipped)) * len(filelist)) + if options.display_progress: + print '\r' + " " * len(progresstext) + '\r', + progresstext = str(((opened + skipped)*1.0/len(filelist))*100)[:5] + '% '+ " time left:" + str(estimate).split('.')[0] + ' ' + file + '\r' + print progresstext, + sys.stdout.flush() filecontents = f.read() - - report[file] = scoretext(wordlist, filecontents) + datasize += len(filecontents) + filenamescore = scoretext(wordlist, file, options.maxwholewordlength) + filecontentsscore = scoretext(wordlist, filecontents, options.maxwholewordlength) + report[file] = {} + for k in filecontentsscore.keys(): + report[file][k] = filenamescore[k] + filecontentsscore[k] for file in report.keys(): for word in report[file].keys(): @@ -109,9 +145,11 @@ if options.printreport: printscore(sortscore(wordscore)) if options.display_counts: - print "total files: " + str(len(filelist)) , - print "suspicious files: " + str(len(sortscore(filescore))) , - print "skipped files: " + str(skipped) + print "total files:" + str(len(filelist)) , + print "suspicious files:" + str(len(sortscore(filescore))) , + print "skipped files:" + str(skipped) , + print "searched:" + str(datasize) + 'B', + print "time:" + str(datetime.datetime.now() - start).split('.')[0] def test(): print wholeword("ear","bearth") |