From 81d7583b05c546a894cdf33e0d8426af5edca1ae Mon Sep 17 00:00:00 2001
From: Marc Jones <mjones@softwarefreedom.org>
Date: Wed, 5 Feb 2014 11:07:37 -0500
Subject: only match words made only of digits that are not adjacent to other
 digits if word boundaries are turned on

---
 git_and_bzr_results_2-5-14.txt |   3 --
 main.py                        |  52 ++++++++++++------
 main.py~                       | 119 ++++++++++++++++++++++++++++-------------
 3 files changed, 119 insertions(+), 55 deletions(-)

diff --git a/git_and_bzr_results_2-5-14.txt b/git_and_bzr_results_2-5-14.txt
index f2aadf4..b3cb11f 100644
--- a/git_and_bzr_results_2-5-14.txt
+++ b/git_and_bzr_results_2-5-14.txt
@@ -1,4 +1,3 @@
-mjones@hand:~/Documents/LF (export controls)/suspicious$ ./main.py ../bzr.lf/ ../git.lf -s .ppt -s .docx -s .pdf -s .xls -s .xlsx -s .gif -s .png -s .jpg -s .css -r fw -w cryptology.txt -c -p -l 4
 ../bzr.lf/openprinting/foomatic/foomatic-db/db/source/printer/Brother-HL-2030.xml(1): url(1);                                                         
 ../bzr.lf/lsb/devel/build_env/headers/All/5.0/libxml2/libxml/xmlerror.h.defs(1): url(1); 
 ../bzr.lf/openprinting/foomatic/foomatic-db-devel/db/source/printer/Panasonic-KX-P1180i.xml(1): url(1); 
@@ -17956,5 +17955,3 @@ mjones@hand:~/Documents/LF (export controls)/suspicious$ ./main.py ../bzr.lf/ ..
 ../bzr.lf/openprinting/foomatic/foomatic-db/db/source/PPD/Lexmark/Lexmark_C522.ppd(229): 22(4); password(11); des(1); magenta(213); 
 ../git.lf/ltsi-kernel/patches.lttng/lttng-2.3.4.patch(324): random(59); entropy(65); 22(2); parity(4); export(100); ice(1); hash(1); bob(1); ca(1); encryption(2); rc6(1); enc(1); secure(8); crypt(2); padding(75); url(1); 
 ../bzr.lf/ally/devel/a11yspecs-atspi/adoc/AccessibilityAPIComparisons.htm(346): 22(2); password(2); signature(3); padding(338); url(1); 
-total files:66547 suspicious files:17957 skipped files:535 searched:594180174B time:7:03:22
-
diff --git a/main.py b/main.py
index 22da098..7c4102b 100755
--- a/main.py
+++ b/main.py
@@ -1,4 +1,6 @@
 #!/usr/bin/python
+#example command
+
 from optparse import OptionParser
 import os
 import re
@@ -61,15 +63,23 @@ def summary(report):
 def wholeword(word, string):
 	re.purge()
 	matches = []
-	regexU = r'([A-Z]|[^a-zA-Z]|\b)(' + re.escape(word.lower()) + r')([A-Z]|[^a-zA-Z]|\b)'
-	regexL = r'([a-z]|[^a-zA-Z]|\b)(' + re.escape(word.upper()) + r')([a-z]|[^a-zA-Z]|\b)'
-	mU = re.search(regexU, string)
-	if "groups" in dir(mU):
-		matches.append(mU.groups())
-	re.purge()
-	mL = re.search(regexL, string)
-	if "groups" in dir(mL):
-		matches.append(mL.groups())
+	# all-digit words: only match when not adjacent to other digits
+	if word.isdigit():
+		regexNum = r'([^0-9]|\b)(' + re.escape(word) + r')([^0-9]|\b)'
+		m = re.search(regexNum, string)
+		if m is not None:
+			matches.append(m.groups())
+	# other words: match when bounded by a case change, a non-letter or a word edge
+	else:
+		regexU = r'([A-Z]|[^a-zA-Z]|\b)(' + re.escape(word.lower()) + r')([A-Z]|[^a-zA-Z]|\b)'
+		regexL = r'([a-z]|[^a-zA-Z]|\b)(' + re.escape(word.upper()) + r')([a-z]|[^a-zA-Z]|\b)'
+		mU = re.search(regexU, string)
+		if "groups" in dir(mU):
+			matches.append(mU.groups())
+		re.purge()
+		mL = re.search(regexL, string)
+		if "groups" in dir(mL):
+			matches.append(mL.groups())
 	return matches
 
 def skipfile(filename,skippedexts):
@@ -89,12 +99,14 @@ def scoretext(wordlist, text, maxwholewordlen = -1):
 			score[word] = len(wholeword(word,text))
 	return score
 
-parser = OptionParser()
+usage = "%prog [options] DIRECTORY ... DIRECTORYN"
+epilog = "example: ./main.py ../git.lf/janitor -s .ppt -s .docx -s .pdf -s .xls -s .xlsx -s .gif -s .png -s .jpg -s .css -r fw -w cryptology.txt -c -p -l 3"
+parser = OptionParser(usage = usage, epilog = epilog)
 parser.add_option("-f", "--file", dest="suspiciousfilename", help="specify file to scan", action="append")
 parser.add_option("-w", "--wordlist", dest="wordlistfilename", help="file containing all of the words to look for")
 parser.add_option("-s", "--skip", dest="skipfileextensions", help="file extensions to skip", action="append")
 parser.add_option("-v", "--verbose", dest="verbose", help="print verberose information", default=False, action="store_true")
-parser.add_option("-r", "--report", dest="printreport", default="w", help="print score")
+parser.add_option("-r", "--report", dest="printreport", default="wf", help="print score")
 parser.add_option("--show-wordlist", dest="show_wordlist", default=False, help="print list of words to detect", action="store_true")
 parser.add_option("-c", "--display-counts", dest="display_counts", default=False, help="Show the num ber of files processed", action="store_true")
 parser.add_option("-p", "--display_progress", dest="display_progress", default=False, help="show percentage complete", action="store_true")
@@ -109,7 +121,7 @@ if options.wordlistfilename:
 			
 if options.show_wordlist: print wordlist; exit()
 
-if options.displaysummary and options.summaryfile
+if options.displaysummary and options.summaryfile:
 	report = dict()
 	try:
 		summaryfile = open(options.summaryfile)
@@ -130,9 +142,17 @@ if options.displaysummary and options.summaryfile
 			w = w.strip()
 			word = w[:w.find('(')]
 			wcount = w[w.find('(')+1:w.find(')')]		
-			report[filename][word] = wcount
-
-	print summary(report)
+			report[filename][word] = int(wcount)
+
+	if options.printreport:
+		if options.printreport == "f":
+			printscore(sortscore(scorefile(report)))
+		elif options.printreport == "w":
+			printscore(sortscore(scorewords(report)))
+		elif options.printreport == "wf" or options.printreport == "fw":
+			print summary(report)			
+	else:
+		print summary(report)
 	exit()
 
 
@@ -202,7 +222,7 @@ if options.display_counts:
 if options.summaryfile and len(filelist) > 0 and not options.displaysummary:
 	summaryfilename = options.summaryfile	
 	counter = None
-	while os.path.isfile(summaryfilename)
+	while os.path.isfile(summaryfilename):
 		counter +=1
 		summaryfilename = options.summaryfile + '.' + str(counter)
 	summaryfile = open(summaryfile, 'w+')
diff --git a/main.py~ b/main.py~
index 8ad0891..fe318c1 100755
--- a/main.py~
+++ b/main.py~
@@ -1,4 +1,6 @@
 #!/usr/bin/python
+#example command
+
 from optparse import OptionParser
 import os
 import re
@@ -27,6 +29,37 @@ def printscore(report):
 	for i in report:
 		print i[0] + ':' + str(i[1])
 
+def scorewords(report):
+	for file in report.keys():
+		for word in report[file].keys():
+			if not word in wordscore:
+				wordscore[word] = 0
+			if not file in filescore:
+				filescore[file] = 0
+			wordscore[word] += report[file][word]
+	return wordscore
+
+def scorefile(report):
+	for file in report.keys():
+		for word in report[file].keys():
+			if not word in wordscore:
+				wordscore[word] = 0
+			if not file in filescore:
+				filescore[file] = 0
+			filescore[file] += report[file][word]
+	return filescore
+
+def summary(report):
+	filescore = scorefile(report)
+	text = ""
+	for file in sortscore(filescore):
+		text += file[0] + '(' + str(file[1]) + '):'
+		for word in report[file[0]].keys():
+			if report[file[0]][word] > 0:
+				text += word + '(' + str(report[file[0]][word]) + ');' 
+		text += '\n'
+	return text
+
 def wholeword(word, string):
 	re.purge()
 	matches = []
@@ -58,12 +91,14 @@ def scoretext(wordlist, text, maxwholewordlen = -1):
 			score[word] = len(wholeword(word,text))
 	return score
 
-parser = OptionParser()
+usage = "%prog [options] DIRECTORY ... DIRECTORYN"
+epilog = "example: ./main.py ../git.lf/janitor -s .ppt -s .docx -s .pdf -s .xls -s .xlsx -s .gif -s .png -s .jpg -s .css -r fw -w cryptology.txt -c -p -l 3"
+parser = OptionParser(usage = usage, epilog = epilog)
 parser.add_option("-f", "--file", dest="suspiciousfilename", help="specify file to scan", action="append")
 parser.add_option("-w", "--wordlist", dest="wordlistfilename", help="file containing all of the words to look for")
 parser.add_option("-s", "--skip", dest="skipfileextensions", help="file extensions to skip", action="append")
 parser.add_option("-v", "--verbose", dest="verbose", help="print verberose information", default=False, action="store_true")
-parser.add_option("-r", "--report", dest="printreport", default="w", help="print score")
+parser.add_option("-r", "--report", dest="printreport", default="wf", help="print score")
 parser.add_option("--show-wordlist", dest="show_wordlist", default=False, help="print list of words to detect", action="store_true")
 parser.add_option("-c", "--display-counts", dest="display_counts", default=False, help="Show the num ber of files processed", action="store_true")
 parser.add_option("-p", "--display_progress", dest="display_progress", default=False, help="show percentage complete", action="store_true")
@@ -78,6 +113,41 @@ if options.wordlistfilename:
 			
 if options.show_wordlist: print wordlist; exit()
 
+if options.displaysummary and options.summaryfile:
+	report = dict()
+	try:
+		summaryfile = open(options.summaryfile)
+	except:
+		print "no summary file: " + options.summaryfile
+		exit()
+	#sample input
+	#../bzr.lf/lsb/devel/build_env/headers/x86-64/4.1/glib-2.0/gio/gmenuexporter.h.defs(1): export(1);
+	for line in summaryfile:
+		#find the file name which is before the matching parathsis before the last colon on the line
+		filename = line[:line[:line.rfind(':')].rfind('(')]
+		#find the total number of words found by locating the end of the filename and taking the number in parathesis right before the :
+		totalfilecount = line[line[:line.rfind(':')].rfind('(')+1:line[:line.rfind(':')].rfind(')')]
+		#find the list of words following the :, and split them by the ;, and then drop the last item on the list which is always a \n
+		foundwords = line[line.rfind(':')+1:].split(';')[:-1]
+		report[filename] = dict()		
+		for w in foundwords:
+			w = w.strip()
+			word = w[:w.find('(')]
+			wcount = w[w.find('(')+1:w.find(')')]		
+			report[filename][word] = int(wcount)
+
+	if options.printreport:
+		if options.printreport == "f":
+			printscore(sortscore(scorefile(report)))
+		elif options.printreport == "w":
+			printscore(sortscore(scorewords(report)))
+		elif options.printreport == "wf" or options.printreport == "fw":
+			print summary(report)			
+	else:
+		print summary(report)
+	exit()
+
+
 for a in args:
 	#filelist.append(a)
 	for (path, dirs, files) in os.walk(a):
@@ -98,9 +168,6 @@ for a in args:
 if options.suspiciousfilename:
 	filelist += options.suspiciousfilename
 
-if options.summaryfile:
-	summaryfile = open(summaryfile, 'w')
-
 start = datetime.datetime.now()
 for file in filelist:
 	if skipfile(file, options.skipfileextensions):
@@ -128,36 +195,6 @@ for file in filelist:
 		report[file][k] = filenamescore[k] + filecontentsscore[k]
 
 
-def scorewords(report):
-	for file in report.keys():
-		for word in report[file].keys():
-			if not word in wordscore:
-				wordscore[word] = 0
-			if not file in filescore:
-				filescore[file] = 0
-			wordscore[word] += report[file][word]
-	return wordscore
-
-def scorefile(report):
-	for file in report.keys():
-		for word in report[file].keys():
-			if not word in wordscore:
-				wordscore[word] = 0
-			if not file in filescore:
-				filescore[file] = 0
-			filescore[file] += report[file][word]
-	return filescore
-
-def summary(report):
-	filescore = scorefile(report)
-	text = ""
-	for file in sortscore(filescore):
-		text += file[0] + '(' + str(file[1]) + '):'
-		for word in report[file[0]].keys():
-			if report[file[0]][word] > 0:
-				text += word + '(' + str(report[file[0]][word]) + ');' 
-		text += '\n'
-	return text
 
 if options.printreport:
 	if options.printreport == "f":
@@ -173,7 +210,17 @@ if options.display_counts:
 	print "skipped files:" + str(skipped) ,
 	print "searched:" + str(datasize) + 'B', 
 	print "time:" + str(datetime.datetime.now() - start).split('.')[0]
- 
+
+if options.summaryfile and len(filelist) > 0 and not options.displaysummary:
+	summaryfilename = options.summaryfile	
+	counter = None
+	while os.path.isfile(summaryfilename):
+		counter +=1
+		summaryfilename = options.summaryfile + '.' + str(counter)
+	summaryfile = open(summaryfile, 'w+')
+	summaryfile.write(summary(report))
+	summaryfile.close()
+
 def test():
 	print wholeword("ear","bearth")
 	print wholeword("ear","BearTH")
-- 
cgit v1.2.3