Adding word files

author: Marc Jones <mjones@softwarefreedom.org> 2014-02-24 16:53:52 -0500
committer: Marc Jones <mjones@softwarefreedom.org> 2014-02-24 16:53:52 -0500
commit: 625be804b15560bdb116d9cc94f280b1567f25c0 (patch)
tree: 2e0dd45943c6d5377e6415442a4e545fa5a0eb85 /suspicious
parent: 02f79fecf9fa49f3271beb0fc7e3c859e1920979 (diff)
1 files changed, 26 insertions, 5 deletions
diff --git a/suspicious b/suspicious
index 6211dcb..6302c34 100755
--- a/suspicious
+++ b/suspicious
@@ -16,7 +16,7 @@ opened = 0
 datasize = 0
 progresstext = "" 
 
-def sortscore(score, reverse=False):
+def sortscore(score, reverse=True):
 	sortedscore = sorted(score.items(), key=lambda score: score[1], reverse=reverse)
 	returnscore = []
 	for s in sortedscore:
@@ -93,12 +93,13 @@ def skipfile(filename,skippedexts):
 
 def scoretext(wordlist, text, maxwholewordlen = -1):
 	score = {}
+	ltext = text.lower()
 	for word in wordlist:
 		wordreg = word.replace('-', ' ')
-		wordreg = wordreg.replace(' ', '['+string.punctuation+' ]*')
+		wordreg = wordreg.replace(' ', '['+string.punctuation+' ]?')
 		if int(len(word)) > int(maxwholewordlen):
 			matches = [] 
-			m = re.search(wordreg.lower(),text.lower())
+			m = re.search(wordreg.lower(),ltext)
 			if "groups" in dir(m):
 				matches.append(m.groups())
 			score[word] = len(matches)			
@@ -106,6 +107,24 @@ def scoretext(wordlist, text, maxwholewordlen = -1):
 			score[word] = len(wholeword(wordreg,text))
 	return score
 
+def optimizewordlist(wordlist, maxwholewordlength):
+	"""Return wordlist without entries that contain another usable entry.
+
+	An entry is dropped when a different, non-empty entry longer than
+	maxwholewordlength occurs inside it. Order of kept entries is preserved;
+	entries no longer than maxwholewordlength never cause a removal.
+	"""
+	# Empty entries (blank lines in the word file) are substrings of every
+	# word, so they must never count as covering another entry.
+	covering = [sw for sw in wordlist if sw and len(sw) > maxwholewordlength]
+	shortwordlist = list()
+	for ww in wordlist:
+		# any() stops at the first covering entry, like an early break.
+		if not any(sw in ww and sw != ww for sw in covering):
+			shortwordlist.append(ww)
+	return shortwordlist
+
+
 usage = "%prog [options] DIRECTORY ... DIRECTORYN"
 epilog = "example: ./main.py ../git.lf/janitor -s .ppt -s .docx -s .pdf -s .xls -s .xlsx -s .gif -s .png -s .jpg -s .css -r fw -w cryptology.txt -c -p -l 3"
 parser = OptionParser(usage = usage, epilog = epilog)
@@ -121,12 +140,14 @@ parser.add_option("-l", "--max-wholeword-length", dest="maxwholewordlength", typ
 parser.add_option("-o", "--summary-file", dest="summaryfile", help="name of the file to store the summary in")
 parser.add_option("-x", "--display-summary", dest="displaysummary", default=False, help="Display a summary from the summary file", action="store_true")
 parser.add_option("-X", "--dont-display-summary", dest="dontdisplaysummary", default=False, help="Dont Display a summary after running a scan", action="store_true")
+parser.add_option("-t", "--test", dest="test", default=False, help="Run internal tests on pattern matching", action="store_true")
 
 (options, args) = parser.parse_args()
 
 if options.wordlistfilename:
 	wordlist = list(set(open(options.wordlistfilename).read().lower().strip().split('\n')))
-			
+	wordlist = optimizewordlist(wordlist, options.maxwholewordlength)
+
 if options.show_wordlist: print wordlist; exit()
 
 if options.displaysummary and options.summaryfile:
@@ -259,5 +280,5 @@ def test():
 	print wholeword("ear","ear:")
 	print wholeword("ear","ear\n\r")
 	print wholeword("ear","myEAR() MYear: myEAR()")
-
+	print wholeword("a5.[0123456789]0","a5-9")
 #test()
author	Marc Jones <mjones@softwarefreedom.org>	2014-02-24 16:53:52 -0500
committer	Marc Jones <mjones@softwarefreedom.org>	2014-02-24 16:53:52 -0500
commit	625be804b15560bdb116d9cc94f280b1567f25c0 (patch)
tree	2e0dd45943c6d5377e6415442a4e545fa5a0eb85 /suspicious
parent	02f79fecf9fa49f3271beb0fc7e3c859e1920979 (diff)