only match words made only of digits that are not adjacent to other digits if wordboundaries are turned on

author: Marc Jones <mjones@softwarefreedom.org> 2014-02-05 11:07:37 -0500
committer: Marc Jones <mjones@softwarefreedom.org> 2014-02-05 11:07:37 -0500
commit: 81d7583b05c546a894cdf33e0d8426af5edca1ae (patch)
tree: 16d7eec83d7ba55381d056bb0426fc2fe34adc9f /main.py~
parent: 4a33223eba7cf5cf66ccdf98abb1f0c81808cfc3 (diff)
1 files changed, 83 insertions, 36 deletions
diff --git a/main.py~ b/main.py~
index 8ad0891..fe318c1 100755
--- a/main.py~
+++ b/main.py~
@@ -1,4 +1,6 @@
 #!/usr/bin/python
+#example command
+
 from optparse import OptionParser
 import os
 import re
@@ -27,6 +29,37 @@ def printscore(report):
 	for i in report:
 		print i[0] + ':' + str(i[1])
 
+def scorewords(report):
+	for file in report.keys():
+		for word in report[file].keys():
+			if not word in wordscore:
+				wordscore[word] = 0
+			if not file in filescore:
+				filescore[file] = 0
+			wordscore[word] += report[file][word]
+	return wordscore
+
+def scorefile(report):
+	for file in report.keys():
+		for word in report[file].keys():
+			if not word in wordscore:
+				wordscore[word] = 0
+			if not file in filescore:
+				filescore[file] = 0
+			filescore[file] += report[file][word]
+	return filescore
+
+def summary(report):
+	filescore = scorefile(report)
+	text = ""
+	for file in sortscore(filescore):
+		text += file[0] + '(' + str(file[1]) + '):'
+		for word in report[file[0]].keys():
+			if report[file[0]][word] > 0:
+				text += word + '(' + str(report[file[0]][word]) + ');' 
+		text += '\n'
+	return text
+
 def wholeword(word, string):
 	re.purge()
 	matches = []
@@ -58,12 +91,14 @@ def scoretext(wordlist, text, maxwholewordlen = -1):
 			score[word] = len(wholeword(word,text))
 	return score
 
-parser = OptionParser()
+usage = "%prog [options] DIRECTORY ... DIRECTORYN"
+epilog = "example: ./main.py ../git.lf/janitor -s .ppt -s .docx -s .pdf -s .xls -s .xlsx -s .gif -s .png -s .jpg -s .css -r fw -w cryptology.txt -c -p -l 3"
+parser = OptionParser(usage = usage, epilog = epilog)
 parser.add_option("-f", "--file", dest="suspiciousfilename", help="specify file to scan", action="append")
 parser.add_option("-w", "--wordlist", dest="wordlistfilename", help="file containing all of the words to look for")
 parser.add_option("-s", "--skip", dest="skipfileextensions", help="file extensions to skip", action="append")
 parser.add_option("-v", "--verbose", dest="verbose", help="print verberose information", default=False, action="store_true")
-parser.add_option("-r", "--report", dest="printreport", default="w", help="print score")
+parser.add_option("-r", "--report", dest="printreport", default="wf", help="print score")
 parser.add_option("--show-wordlist", dest="show_wordlist", default=False, help="print list of words to detect", action="store_true")
 parser.add_option("-c", "--display-counts", dest="display_counts", default=False, help="Show the num ber of files processed", action="store_true")
 parser.add_option("-p", "--display_progress", dest="display_progress", default=False, help="show percentage complete", action="store_true")
@@ -78,6 +113,41 @@ if options.wordlistfilename:
 			
 if options.show_wordlist: print wordlist; exit()
 
+if options.displaysummary and options.summaryfile:
+	report = dict()
+	try:
+		summaryfile = open(options.summaryfile)
+	except:
+		print "no summary file: " + options.summaryfile
+		exit()
+	#sample input
+	#../bzr.lf/lsb/devel/build_env/headers/x86-64/4.1/glib-2.0/gio/gmenuexporter.h.defs(1): export(1);
+	for line in summaryfile:
+		#find the file name which is before the matching parenthesis before the last colon on the line
+		filename = line[:line[:line.rfind(':')].rfind('(')]
+		#find the total number of words found by locating the end of the filename and taking the number in parentheses right before the :
+		totalfilecount = line[line[:line.rfind(':')].rfind('(')+1:line[:line.rfind(':')].rfind(')')]
+		#find the list of words following the :, and split them by the ;, and then drop the last item on the list which is always a \n
+		foundwords = line[line.rfind(':')+1:].split(';')[:-1]
+		report[filename] = dict()		
+		for w in foundwords:
+			w = w.strip()
+			word = w[:w.find('(')]
+			wcount = w[w.find('(')+1:w.find(')')]		
+			report[filename][word] = int(wcount)
+
+	if options.printreport:
+		if options.printreport == "f":
+			printscore(sortscore(scorefile(report)))
+		elif options.printreport == "w":
+			printscore(sortscore(scorewords(report)))
+		elif options.printreport == "wf" or options.printreport == "fw":
+			print summary(report)			
+	else:
+		print summary(report)
+	exit()
+
+
 for a in args:
 	#filelist.append(a)
 	for (path, dirs, files) in os.walk(a):
@@ -98,9 +168,6 @@ for a in args:
 if options.suspiciousfilename:
 	filelist += options.suspiciousfilename
 
-if options.summaryfile:
-	summaryfile = open(summaryfile, 'w')
-
 start = datetime.datetime.now()
 for file in filelist:
 	if skipfile(file, options.skipfileextensions):
@@ -128,36 +195,6 @@ for file in filelist:
 		report[file][k] = filenamescore[k] + filecontentsscore[k]
 
 
-def scorewords(report):
-	for file in report.keys():
-		for word in report[file].keys():
-			if not word in wordscore:
-				wordscore[word] = 0
-			if not file in filescore:
-				filescore[file] = 0
-			wordscore[word] += report[file][word]
-	return wordscore
-
-def scorefile(report):
-	for file in report.keys():
-		for word in report[file].keys():
-			if not word in wordscore:
-				wordscore[word] = 0
-			if not file in filescore:
-				filescore[file] = 0
-			filescore[file] += report[file][word]
-	return filescore
-
-def summary(report):
-	filescore = scorefile(report)
-	text = ""
-	for file in sortscore(filescore):
-		text += file[0] + '(' + str(file[1]) + '):'
-		for word in report[file[0]].keys():
-			if report[file[0]][word] > 0:
-				text += word + '(' + str(report[file[0]][word]) + ');' 
-		text += '\n'
-	return text
 
 if options.printreport:
 	if options.printreport == "f":
@@ -173,7 +210,17 @@ if options.display_counts:
 	print "skipped files:" + str(skipped) ,
 	print "searched:" + str(datasize) + 'B', 
 	print "time:" + str(datetime.datetime.now() - start).split('.')[0]
- 
+
+if options.summaryfile and len(filelist) > 0 and not options.displaysummary:
+	summaryfilename = options.summaryfile	
+	counter = None
+	while os.path.isfile(summaryfilename):
+		counter +=1
+		summaryfilename = options.summaryfile + '.' + str(counter)
+	summaryfile = open(summaryfile, 'w+')
+	summaryfile.write(summary(report))
+	summaryfile.close()
+
 def test():
 	print wholeword("ear","bearth")
 	print wholeword("ear","BearTH")
author	Marc Jones <mjones@softwarefreedom.org>	2014-02-05 11:07:37 -0500
committer	Marc Jones <mjones@softwarefreedom.org>	2014-02-05 11:07:37 -0500
commit	81d7583b05c546a894cdf33e0d8426af5edca1ae (patch)
tree	16d7eec83d7ba55381d056bb0426fc2fe34adc9f /main.py~
parent	4a33223eba7cf5cf66ccdf98abb1f0c81808cfc3 (diff)