summaryrefslogtreecommitdiff
path: root/main.py
diff options
context:
space:
mode:
Diffstat (limited to 'main.py')
-rwxr-xr-xmain.py62
1 files changed, 50 insertions, 12 deletions
diff --git a/main.py b/main.py
index 473946d..ae36453 100755
--- a/main.py
+++ b/main.py
@@ -2,11 +2,17 @@
from optparse import OptionParser
import os
import re
+import sys
+import datetime
+
report = {}
wordscore = {}
filescore = {}
filelist = list()
skipped = 0
+opened = 0
+datasize = 0
+progresstext = ""
def sortscore(score, reverse=False):
sortedscore = sorted(score.items(), key=lambda score: score[1], reverse=reverse)
@@ -22,12 +28,14 @@ def printscore(report):
print i[0] + ':' + str(i[1])
def wholeword(word, string):
+ re.purge()
matches = []
- regexU = r'([A-Z]|[^a-zA-Z]|\b)(' + word.lower() + r')([A-Z]|[^a-zA-Z]|\b)'
- regexL = r'([a-z]|[^a-zA-Z]|\b)(' + word.upper() + r')([a-z]|[^a-zA-Z]|\b)'
+ regexU = r'([A-Z]|[^a-zA-Z]|\b)(' + re.escape(word.lower()) + r')([A-Z]|[^a-zA-Z]|\b)'
+ regexL = r'([a-z]|[^a-zA-Z]|\b)(' + re.escape(word.upper()) + r')([a-z]|[^a-zA-Z]|\b)'
mU = re.search(regexU, string)
if "groups" in dir(mU):
matches.append(mU.groups())
+ re.purge()
mL = re.search(regexL, string)
if "groups" in dir(mL):
matches.append(mL.groups())
@@ -41,11 +49,13 @@ def skipfile(filename,skippedexts):
return True
return False
-def scoretext(wordlist, text):
+def scoretext(wordlist, text, maxwholewordlen = -1):
score = {}
for word in wordlist:
- score[word] = len(wholeword(word,text))
-
+ if int(len(word)) > int(maxwholewordlen):
+ score[word] = text.lower().count(word.lower())
+ else:
+ score[word] = len(wholeword(word,text))
return score
parser = OptionParser()
@@ -56,22 +66,37 @@ parser.add_option("-v", "--verbose", dest="verbose", help="print verberose infor
parser.add_option("-r", "--report", dest="printreport", default="w", help="print score")
parser.add_option("--show-wordlist", dest="show_wordlist", default=False, help="print list of words to detect", action="store_true")
parser.add_option("-c", "--display-counts", dest="display_counts", default=False, help="Show the num ber of files processed", action="store_true")
+parser.add_option("-p", "--display_progress", dest="display_progress", default=False, help="show percentage complete", action="store_true")
+parser.add_option("-l", "--max-wholeword-length", dest="maxwholewordlength", type="int", default=-1, help="maximun length of a word allowed to only find matches on whole word")
(options, args) = parser.parse_args()
if options.wordlistfilename:
- wordlist = open(options.wordlistfilename).read().lower().strip().split('\n')
+ wordlist = list(set(open(options.wordlistfilename).read().lower().strip().split('\n')))
if options.show_wordlist: print wordlist; exit()
for a in args:
+ #filelist.append(a)
for (path, dirs, files) in os.walk(a):
+ if 'CVS' in dirs:
+ dirs.remove('CVS')
+ if '.git' in dirs:
+ dirs.remove('.git')
+ if '.bzr' in dirs:
+ dirs.remove('.bzr')
+ if '.hg' in dirs:
+ dirs.remove('.hg')
+ if '.svn' in dirs:
+ dirs.remove('.svn')
+
for file in files:
filelist.append(path + '/' + file)
if options.suspiciousfilename:
filelist += options.suspiciousfilename
+start = datetime.datetime.now()
for file in filelist:
if skipfile(file, options.skipfileextensions):
skipped += 1
@@ -81,10 +106,21 @@ for file in filelist:
except:
print "failed to open: " + file
continue
-
+ opened +=1
+ now = datetime.datetime.now()
+ estimate = (((now - start) / (opened + skipped)) * len(filelist))
+ if options.display_progress:
+ print '\r' + " " * len(progresstext) + '\r',
+ progresstext = str(((opened + skipped)*1.0/len(filelist))*100)[:5] + '% '+ " time left:" + str(estimate).split('.')[0] + ' ' + file + '\r'
+ print progresstext,
+ sys.stdout.flush()
filecontents = f.read()
-
- report[file] = scoretext(wordlist, filecontents)
+ datasize += len(filecontents)
+ filenamescore = scoretext(wordlist, file, options.maxwholewordlength)
+ filecontentsscore = scoretext(wordlist, filecontents, options.maxwholewordlength)
+ report[file] = {}
+ for k in filecontentsscore.keys():
+ report[file][k] = filenamescore[k] + filecontentsscore[k]
for file in report.keys():
for word in report[file].keys():
@@ -109,9 +145,11 @@ if options.printreport:
printscore(sortscore(wordscore))
if options.display_counts:
- print "total files: " + str(len(filelist)) ,
- print "suspicious files: " + str(len(sortscore(filescore))) ,
- print "skipped files: " + str(skipped)
+ print "total files:" + str(len(filelist)) ,
+ print "suspicious files:" + str(len(sortscore(filescore))) ,
+ print "skipped files:" + str(skipped) ,
+ print "searched:" + str(datasize) + 'B',
+ print "time:" + str(datetime.datetime.now() - start).split('.')[0]
def test():
print wholeword("ear","bearth")