From 203ff264c859eb03a027d1b07f2670611e2240e4 Mon Sep 17 00:00:00 2001 From: Marc Jones Date: Wed, 5 Feb 2014 12:12:25 -0500 Subject: added ability to save and display summary files --- main.py | 28 ++++++++++++++++++---------- main.py~ | 28 ++++++++++++++++++---------- 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/main.py b/main.py index 029c4eb..b157ab5 100755 --- a/main.py +++ b/main.py @@ -6,6 +6,7 @@ import os import re import sys import datetime +import string report = {} wordscore = {} @@ -64,16 +65,16 @@ def wholeword(word, string): re.purge() matches = [] - try: + if word.isdigit(): int(word) regexNum = r'([^0-9]|\b)(' + word + r')([^0-9]|\b)' mN = re.search(regexNum, string) if "groups" in dir(mN): matches.append(mN.groups()) - except ValueError: - regexU = r'([A-Z]|[^a-zA-Z]|\b)(' + re.escape(word.lower()) + r')([A-Z]|[^a-zA-Z]|\b)' - regexL = r'([a-z]|[^a-zA-Z]|\b)(' + re.escape(word.upper()) + r')([a-z]|[^a-zA-Z]|\b)' + else: + regexU = r'([A-Z]|[^a-zA-Z]|\b)(' + word.lower() + r')([A-Z]|[^a-zA-Z]|\b)' + regexL = r'([a-z]|[^a-zA-Z]|\b)(' + word.upper() + r')([a-z]|[^a-zA-Z]|\b)' mU = re.search(regexU, string) if "groups" in dir(mU): matches.append(mU.groups()) @@ -94,10 +95,16 @@ def skipfile(filename,skippedexts): def scoretext(wordlist, text, maxwholewordlen = -1): score = {} for word in wordlist: - if int(len(word)) > int(maxwholewordlen): - score[word] = text.lower().count(word.lower()) + wordreg = word.replace('-', ' ') + wordreg = wordreg.replace(' ', '['+string.punctuation+' ]*') + if int(len(word)) > int(maxwholewordlen): + matches = [] + m = re.search(wordreg.lower(),text.lower()) + if "groups" in dir(m): + matches.append(m.groups()) + score[word] = len(matches) else: - score[word] = len(wholeword(word,text)) + score[word] = len(wholeword(wordreg,text)) return score usage = "%prog [options] DIRECTORY ... DIRECTORYN" @@ -203,7 +210,8 @@ for file in filelist: for k in filecontentsscore.keys(): report[file][k] = filenamescore[k] + filecontentsscore[k] - +if options.display_progress: + print '\r' + " " * len(progresstext) + '\r', if options.printreport: if options.printreport == "f": @@ -215,7 +223,7 @@ if options.printreport: if options.display_counts: print "total files:" + str(len(filelist)) , - print "suspicious files:" + str(len(sortscore(filescore))) , + print "suspicious files:" + str(len(sortscore(scorefile(report)))) , print "skipped files:" + str(skipped) , print "searched:" + str(datasize) + 'B', print "time:" + str(datetime.datetime.now() - start).split('.')[0] @@ -226,7 +234,7 @@ if options.summaryfile and len(filelist) > 0 and not options.displaysummary: while os.path.isfile(summaryfilename): counter +=1 summaryfilename = options.summaryfile + '.' + str(counter) - summaryfile = open(summaryfile, 'w+') + summaryfile = open(summaryfilename, 'w+') summaryfile.write(summary(report)) summaryfile.close() diff --git a/main.py~ b/main.py~ index 7fa6f0e..2f9fdf3 100755 --- a/main.py~ +++ b/main.py~ @@ -6,6 +6,7 @@ import os import re import sys import datetime +import string report = {} wordscore = {} @@ -64,16 +65,16 @@ def wholeword(word, string): re.purge() matches = [] - try: + if word.isdigit(): int(word) regexNum = r'([^0-9]|\b)(' + word + r')([^0-9]|\b)' mN = re.search(regexNum, string) if "groups" in dir(mN): matches.append(mN.groups()) - except ValueError: - regexU = r'([A-Z]|[^a-zA-Z]|\b)(' + re.escape(word.lower()) + r')([A-Z]|[^a-zA-Z]|\b)' - regexL = r'([a-z]|[^a-zA-Z]|\b)(' + re.escape(word.upper()) + r')([a-z]|[^a-zA-Z]|\b)' + else: + regexU = r'([A-Z]|[^a-zA-Z]|\b)(' + word.lower() + r')([A-Z]|[^a-zA-Z]|\b)' + regexL = r'([a-z]|[^a-zA-Z]|\b)(' + word.upper() + r')([a-z]|[^a-zA-Z]|\b)' mU = re.search(regexU, string) if "groups" in dir(mU): matches.append(mU.groups()) @@ -94,10 +95,16 @@ def skipfile(filename,skippedexts): def scoretext(wordlist, text, maxwholewordlen = -1): score = {} for word in wordlist: - if int(len(word)) > int(maxwholewordlen): - score[word] = text.lower().count(word.lower()) + wordreg = word.replace('-', ' ') + wordreg = wordreg.replace(' ', '['+string.punctuation+' ]*') + if int(len(word)) > int(maxwholewordlen): + matches = [] + m = re.search(wordreg.lower(),text.lower()) + if "groups" in dir(m): + matches.append(m.groups()) + score[word] = len(matches) else: - score[word] = len(wholeword(word,text)) + score[word] = len(wholeword(wordreg,text)) return score usage = "%prog [options] DIRECTORY ... DIRECTORYN" @@ -203,7 +210,8 @@ for file in filelist: for k in filecontentsscore.keys(): report[file][k] = filenamescore[k] + filecontentsscore[k] - +if options.display_progress: + print '\r' + " " * len(progresstext) + '\r', if options.printreport: if options.printreport == "f": @@ -215,7 +223,7 @@ if options.printreport: if options.display_counts: print "total files:" + str(len(filelist)) , - print "suspicious files:" + str(len(sortscore(filescore))) , + print "suspicious files:" + str(len(sortscore(scorefile(report)))) , print "skipped files:" + str(skipped) , print "searched:" + str(datasize) + 'B', print "time:" + str(datetime.datetime.now() - start).split('.')[0] @@ -246,4 +254,4 @@ def test(): print wholeword("ear","ear\n\r") print wholeword("ear","myEAR() MYear: myEAR()") -test() +#test() -- cgit v1.2.3