From 981a5a7cbcc943c117101535601e5cd721552c02 Mon Sep 17 00:00:00 2001
From: Marc Jones <mjones@softwarefreedom.org>
Date: Mon, 3 Feb 2014 11:05:41 -0500
Subject: initial check in of file word search

---
 main.py | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 110 insertions(+)
 create mode 100755 main.py

(limited to 'main.py')

diff --git a/main.py b/main.py
new file mode 100755
index 0000000..d5e1b37
--- /dev/null
+++ b/main.py
@@ -0,0 +1,110 @@
+#!/usr/bin/python -i
+from optparse import OptionParser
+import os
+import re
+report = {}
+wordscore = {}
+filescore = {}
+filelist = list()
+
+def sortscore(score, reverse=False):
+	sortedscore = sorted(score.items(), key=lambda score: score[1], reverse=reverse)
+	returnscore = []
+	for s in sortedscore:
+		if s[1] > 0:
+			returnscore.append(s)
+	
+	return returnscore
+
+def printscore(report):
+	for i in report:
+		print i[0] + ':' + str(i[1])
+
+def wholeword(word, string):
+	matches = []
+	regexU = r'([A-Z]|[^a-zA-Z]|\b)(' + word.lower() + r')([A-Z]|[^a-zA-Z]|\b)'
+	regexL = r'([a-z]|[^a-zA-Z]|\b)(' + word.upper() + r')([a-z]|[^a-zA-Z]|\b)'
+	mU = re.search(regexU, string)
+	if "groups" in dir(mU):
+		matches.append(mU.groups())
+	mL = re.search(regexL, string)
+	if "groups" in dir(mL):
+		matches.append(mL.groups())
+	return matches
+
+def skipfile(filename,skippedexts):
+	if not isinstance(skippedexts, list):
+		return False
+	for skip in skippedexts:
+		if filename.endswith(skip):
+			return True
+	return False
+
+def scoretext(wordlist, text):
+	score = {}
+	for word in wordlist:
+		score[word] = len(wholeword(word,text))
+	
+	return score
+
+parser = OptionParser()
+parser.add_option("-f", "--file", dest="suspiciousfilename", help="specify file to scan", action="append")
+parser.add_option("-w", "--wordlist", dest="wordlistfilename", help="file containing all of the words to look for")
+parser.add_option("-s", "--skip", dest="skipfileextensions", help="file extensions to skip", action="append")
+parser.add_option("-v", "--verbose", dest="verbose", help="print verberose information", default=False)
+parser.add_option("-r", "--report", dest="printreport", default="w", help="print score")
+
+(options, args) = parser.parse_args()
+
+if options.wordlistfilename:
+	wordlist = open(options.wordlistfilename).read().lower().split()
+
+for a in args:
+	for (path, dirs, files) in os.walk(a):
+		for file in files:
+			filelist.append(path + '/' + file)
+	
+if options.suspiciousfilename:
+	filelist += options.suspiciousfilename
+
+for file in filelist:
+	if skipfile(file, options.skipfileextensions):
+		#print "skip: " + file
+		continue
+	try:
+		f = open(file)
+	except:
+		print "failed to open: " + file
+		continue
+	
+	filecontents = f.read()
+			
+	report[file] = scoretext(wordlist, filecontents)
+
+for file in report.keys():
+	for word in report[file].keys():
+		if not word in wordscore:
+			wordscore[word] = 0
+		if not file in filescore:
+			filescore[file] = 0
+		wordscore[word] += report[file][word]
+		filescore[file] += report[file][word]
+
+if options.printreport:
+	if options.printreport == "f":
+		printscore(sortscore(filescore))
+	else:
+		printscore(sortscore(wordscore))
+
+def test():
+	print wholeword("ear","bearth")
+	print wholeword("ear","BearTH")
+	print wholeword("ear","bEARth")
+	print wholeword("ear","ear_")
+	print wholeword("ear","ear()")
+	print wholeword("ear","ear.")
+	print wholeword("ear","ear:")
+	print wholeword("ear","ear\n\r")
+	print wholeword("ear","myEAR() MYear: myEAR()")
+
+#test()
-- 
cgit v1.2.1