From 81d7583b05c546a894cdf33e0d8426af5edca1ae Mon Sep 17 00:00:00 2001 From: Marc Jones Date: Wed, 5 Feb 2014 11:07:37 -0500 Subject: only match words made only of digits that are not adjacent to other digits if wordboundaries are turn on --- git_and_bzr_results_2-5-14.txt | 3 -- main.py | 52 ++++++++++++------ main.py~ | 119 ++++++++++++++++++++++++++++------------- 3 files changed, 119 insertions(+), 55 deletions(-) diff --git a/git_and_bzr_results_2-5-14.txt b/git_and_bzr_results_2-5-14.txt index f2aadf4..b3cb11f 100644 --- a/git_and_bzr_results_2-5-14.txt +++ b/git_and_bzr_results_2-5-14.txt @@ -1,4 +1,3 @@ -mjones@hand:~/Documents/LF (export controls)/suspicious$ ./main.py ../bzr.lf/ ../git.lf -s .ppt -s .docx -s .pdf -s .xls -s .xlsx -s .gif -s .png -s .jpg -s .css -r fw -w cryptology.txt -c -p -l 4 ../bzr.lf/openprinting/foomatic/foomatic-db/db/source/printer/Brother-HL-2030.xml(1): url(1); ../bzr.lf/lsb/devel/build_env/headers/All/5.0/libxml2/libxml/xmlerror.h.defs(1): url(1); ../bzr.lf/openprinting/foomatic/foomatic-db-devel/db/source/printer/Panasonic-KX-P1180i.xml(1): url(1); @@ -17956,5 +17955,3 @@ mjones@hand:~/Documents/LF (export controls)/suspicious$ ./main.py ../bzr.lf/ .. ../bzr.lf/openprinting/foomatic/foomatic-db/db/source/PPD/Lexmark/Lexmark_C522.ppd(229): 22(4); password(11); des(1); magenta(213); ../git.lf/ltsi-kernel/patches.lttng/lttng-2.3.4.patch(324): random(59); entropy(65); 22(2); parity(4); export(100); ice(1); hash(1); bob(1); ca(1); encryption(2); rc6(1); enc(1); secure(8); crypt(2); padding(75); url(1); ../bzr.lf/ally/devel/a11yspecs-atspi/adoc/AccessibilityAPIComparisons.htm(346): 22(2); password(2); signature(3); padding(338); url(1); -total files:66547 suspicious files:17957 skipped files:535 searched:594180174B time:7:03:22 - diff --git a/main.py b/main.py index 22da098..7c4102b 100755 --- a/main.py +++ b/main.py @@ -1,4 +1,6 @@ #!/usr/bin/python +#example command + from optparse import OptionParser import os import re @@ -61,15 +63,23 @@ def summary(report): def wholeword(word, string): re.purge() matches = [] - regexU = r'([A-Z]|[^a-zA-Z]|\b)(' + re.escape(word.lower()) + r')([A-Z]|[^a-zA-Z]|\b)' - regexL = r'([a-z]|[^a-zA-Z]|\b)(' + re.escape(word.upper()) + r')([a-z]|[^a-zA-Z]|\b)' - mU = re.search(regexU, string) - if "groups" in dir(mU): - matches.append(mU.groups()) - re.purge() - mL = re.search(regexL, string) - if "groups" in dir(mL): - matches.append(mL.groups()) + + try int(word): + regexNum = r'([^0-9]|\b)(' + word + r')([^0-9]|\b)' + m = re.search(regexNum, string) + if "groups" in dir(mU): + matches.append(mU.groups()) + + except ValueError: + regexU = r'([A-Z]|[^a-zA-Z]|\b)(' + re.escape(word.lower()) + r')([A-Z]|[^a-zA-Z]|\b)' + regexL = r'([a-z]|[^a-zA-Z]|\b)(' + re.escape(word.upper()) + r')([a-z]|[^a-zA-Z]|\b)' + mU = re.search(regexU, string) + if "groups" in dir(mU): + matches.append(mU.groups()) + re.purge() + mL = re.search(regexL, string) + if "groups" in dir(mL): + matches.append(mL.groups()) return matches def skipfile(filename,skippedexts): @@ -89,12 +99,14 @@ def scoretext(wordlist, text, maxwholewordlen = -1): score[word] = len(wholeword(word,text)) return score -parser = OptionParser() +usage = "%prog [options] DIRECTORY ... DIRECTORYN" +epilog = "example: ./main.py ../git.lf/janitor -s .ppt -s .docx -s .pdf -s .xls -s .xlsx -s .gif -s .png -s .jpg -s .css -r fw -w cryptology.txt -c -p -l 3" +parser = OptionParser(usage = usage, epilog = epilog) parser.add_option("-f", "--file", dest="suspiciousfilename", help="specify file to scan", action="append") parser.add_option("-w", "--wordlist", dest="wordlistfilename", help="file containing all of the words to look for") parser.add_option("-s", "--skip", dest="skipfileextensions", help="file extensions to skip", action="append") parser.add_option("-v", "--verbose", dest="verbose", help="print verberose information", default=False, action="store_true") -parser.add_option("-r", "--report", dest="printreport", default="w", help="print score") +parser.add_option("-r", "--report", dest="printreport", default="wf", help="print score") parser.add_option("--show-wordlist", dest="show_wordlist", default=False, help="print list of words to detect", action="store_true") parser.add_option("-c", "--display-counts", dest="display_counts", default=False, help="Show the num ber of files processed", action="store_true") parser.add_option("-p", "--display_progress", dest="display_progress", default=False, help="show percentage complete", action="store_true") @@ -109,7 +121,7 @@ if options.wordlistfilename: if options.show_wordlist: print wordlist; exit() -if options.displaysummary and options.summaryfile +if options.displaysummary and options.summaryfile: report = dict() try: summaryfile = open(options.summaryfile) @@ -130,9 +142,17 @@ if options.displaysummary and options.summaryfile w = w.strip() word = w[:w.find('(')] wcount = w[w.find('(')+1:w.find(')')] - report[filename][word] = wcount - - print summary(report) + report[filename][word] = int(wcount) + + if options.printreport: + if options.printreport == "f": + printscore(sortscore(scorefile(report))) + elif options.printreport == "w": + printscore(sortscore(scorewords(report))) + elif options.printreport == "wf" or options.printreport == "fw": + print summary(report) + else: + print summary(report) exit() @@ -202,7 +222,7 @@ if options.display_counts: if options.summaryfile and len(filelist) > 0 and not options.displaysummary: summaryfilename = options.summaryfile counter = None - while os.path.isfile(summaryfilename) + while os.path.isfile(summaryfilename): counter +=1 summaryfilename = options.summaryfile + '.' + str(counter) summaryfile = open(summaryfile, 'w+') diff --git a/main.py~ b/main.py~ index 8ad0891..fe318c1 100755 --- a/main.py~ +++ b/main.py~ @@ -1,4 +1,6 @@ #!/usr/bin/python +#example command + from optparse import OptionParser import os import re @@ -27,6 +29,37 @@ def printscore(report): for i in report: print i[0] + ':' + str(i[1]) +def scorewords(report): + for file in report.keys(): + for word in report[file].keys(): + if not word in wordscore: + wordscore[word] = 0 + if not file in filescore: + filescore[file] = 0 + wordscore[word] += report[file][word] + return wordscore + +def scorefile(report): + for file in report.keys(): + for word in report[file].keys(): + if not word in wordscore: + wordscore[word] = 0 + if not file in filescore: + filescore[file] = 0 + filescore[file] += report[file][word] + return filescore + +def summary(report): + filescore = scorefile(report) + text = "" + for file in sortscore(filescore): + text += file[0] + '(' + str(file[1]) + '):' + for word in report[file[0]].keys(): + if report[file[0]][word] > 0: + text += word + '(' + str(report[file[0]][word]) + ');' + text += '\n' + return text + def wholeword(word, string): re.purge() matches = [] @@ -58,12 +91,14 @@ def scoretext(wordlist, text, maxwholewordlen = -1): score[word] = len(wholeword(word,text)) return score -parser = OptionParser() +usage = "%prog [options] DIRECTORY ... DIRECTORYN" +epilog = "example: ./main.py ../git.lf/janitor -s .ppt -s .docx -s .pdf -s .xls -s .xlsx -s .gif -s .png -s .jpg -s .css -r fw -w cryptology.txt -c -p -l 3" +parser = OptionParser(usage = usage, epilog = epilog) parser.add_option("-f", "--file", dest="suspiciousfilename", help="specify file to scan", action="append") parser.add_option("-w", "--wordlist", dest="wordlistfilename", help="file containing all of the words to look for") parser.add_option("-s", "--skip", dest="skipfileextensions", help="file extensions to skip", action="append") parser.add_option("-v", "--verbose", dest="verbose", help="print verberose information", default=False, action="store_true") -parser.add_option("-r", "--report", dest="printreport", default="w", help="print score") +parser.add_option("-r", "--report", dest="printreport", default="wf", help="print score") parser.add_option("--show-wordlist", dest="show_wordlist", default=False, help="print list of words to detect", action="store_true") parser.add_option("-c", "--display-counts", dest="display_counts", default=False, help="Show the num ber of files processed", action="store_true") parser.add_option("-p", "--display_progress", dest="display_progress", default=False, help="show percentage complete", action="store_true") @@ -78,6 +113,41 @@ if options.wordlistfilename: if options.show_wordlist: print wordlist; exit() +if options.displaysummary and options.summaryfile: + report = dict() + try: + summaryfile = open(options.summaryfile) + except: + print "no summary file: " + options.summaryfile + exit() + #sample input + #../bzr.lf/lsb/devel/build_env/headers/x86-64/4.1/glib-2.0/gio/gmenuexporter.h.defs(1): export(1); + for line in summaryfile: + #find the file name which is before the matching parathsis before the last colon on the line + filename = line[:line[:line.rfind(':')].rfind('(')] + #find the total number of words found by locating the end of the filename and taking the number in parathesis right before the : + totalfilecount = line[line[:line.rfind(':')].rfind('(')+1:line[:line.rfind(':')].rfind(')')] + #find the list of words following the :, and split them by the ;, and then drop the last item on the list which is always a \n + foundwords = line[line.rfind(':')+1:].split(';')[:-1] + report[filename] = dict() + for w in foundwords: + w = w.strip() + word = w[:w.find('(')] + wcount = w[w.find('(')+1:w.find(')')] + report[filename][word] = int(wcount) + + if options.printreport: + if options.printreport == "f": + printscore(sortscore(scorefile(report))) + elif options.printreport == "w": + printscore(sortscore(scorewords(report))) + elif options.printreport == "wf" or options.printreport == "fw": + print summary(report) + else: + print summary(report) + exit() + + for a in args: #filelist.append(a) for (path, dirs, files) in os.walk(a): @@ -98,9 +168,6 @@ for a in args: if options.suspiciousfilename: filelist += options.suspiciousfilename -if options.summaryfile: - summaryfile = open(summaryfile, 'w') - start = datetime.datetime.now() for file in filelist: if skipfile(file, options.skipfileextensions): @@ -128,36 +195,6 @@ for file in filelist: report[file][k] = filenamescore[k] + filecontentsscore[k] -def scorewords(report): - for file in report.keys(): - for word in report[file].keys(): - if not word in wordscore: - wordscore[word] = 0 - if not file in filescore: - filescore[file] = 0 - wordscore[word] += report[file][word] - return wordscore - -def scorefile(report): - for file in report.keys(): - for word in report[file].keys(): - if not word in wordscore: - wordscore[word] = 0 - if not file in filescore: - filescore[file] = 0 - filescore[file] += report[file][word] - return filescore - -def summary(report): - filescore = scorefile(report) - text = "" - for file in sortscore(filescore): - text += file[0] + '(' + str(file[1]) + '):' - for word in report[file[0]].keys(): - if report[file[0]][word] > 0: - text += word + '(' + str(report[file[0]][word]) + ');' - text += '\n' - return text if options.printreport: if options.printreport == "f": @@ -173,7 +210,17 @@ if options.display_counts: print "skipped files:" + str(skipped) , print "searched:" + str(datasize) + 'B', print "time:" + str(datetime.datetime.now() - start).split('.')[0] - + +if options.summaryfile and len(filelist) > 0 and not options.displaysummary: + summaryfilename = options.summaryfile + counter = None + while os.path.isfile(summaryfilename): + counter +=1 + summaryfilename = options.summaryfile + '.' + str(counter) + summaryfile = open(summaryfile, 'w+') + summaryfile.write(summary(report)) + summaryfile.close() + def test(): print wholeword("ear","bearth") print wholeword("ear","BearTH") -- cgit v1.2.1