summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarc Jones <mjones@softwarefreedom.org>2015-02-11 15:35:02 -0500
committerMarc Jones <mjones@softwarefreedom.org>2015-02-11 15:35:02 -0500
commit628bcafc14e91e5950fe6971934cbdbbb1dc41c6 (patch)
treeb593375a8e8188fdad5d1853a4eba06052a5aff4
parent0dcd92bf522b88b62393f3e00b023e8900430f5c (diff)
updating suspicious
-rw-r--r--commonwords.txt3
-rwxr-xr-xsuspicious99
2 files changed, 73 insertions, 29 deletions
diff --git a/commonwords.txt b/commonwords.txt
index 6b80d77..f35d80d 100644
--- a/commonwords.txt
+++ b/commonwords.txt
@@ -8,5 +8,6 @@ url
idea
set
key
-export
drm
+export
+padding
diff --git a/suspicious b/suspicious
index df6d449..e703f1e 100755
--- a/suspicious
+++ b/suspicious
@@ -1,10 +1,30 @@
-#!/usr/bin/python
-#Author: Marc Jones <mjones@softwarefreedom.org>
-#Date: Feb 26, 2014
-#Version 0.1.2
+#!/usr/bin/python
+
+# Copyright 2014 Software Freedom Law Center (www.softwarefreedom.org)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
-#Add weight score function to remove scores from files that otherwise would not have scores
+#Author: Marc Jones <mjones@softwarefreedom.org>
+#Date: June 30, 2014
+#Version 0.2.1
+#Added weight score function to remove scores from files that otherwise would not have scores
+#Added removal of superstrings of other words from the search, to speed things up, if they are longer than wordlength
+##TODO
+#need to verify that word score counts each instance, not just 0 or 1
+#need to discount found words if they are substrings of common strings in text
+
from optparse import OptionParser
import os
import re
@@ -47,19 +67,24 @@ def scorewords(report):
def weightreport(report, commonwords):
notsuspiciousfiles = []
+ weightedout = 0
for file in report:
suspicious = False
+ filescore = 0
for word in report[file]:
- if not word in commonwords:
- if report[file][word] > 0:
- suspicious = True
-
- if not suspicious:
- notsuspiciousfiles.append(file)
+ filescore += report[file][word]
+ if filescore > 0:
+ for word in report[file]:
+ if report[file][word] > 0 and not word in commonwords:
+ suspicious = True
+ if not suspicious and filescore > 0:
+ notsuspiciousfiles.append(file)
+
for file in notsuspiciousfiles:
report.pop(file)
+ weightedout +=1
- return report
+ return report, weightedout
def scorefile(report):
for file in report.keys():
@@ -192,25 +217,33 @@ parser.add_option("-t", "--test",
default=False,
help="Run internal tests on pattern matching",
action="store_true")
+parser.add_option("--donotoptimizewordlist",
+ dest="optimizewordlist",
+ default=True,
+ help="Reduce the number of words to look for by removing words from the wordlist that contain other words on the list as substrings",
+ action="store_false")
+parser.add_option("--dontweightreport",
+ dest="dontweightreport",
+ default=False,
+ help="If the only suspicious words in a file are common words, remove the file from the report",
+ action="store_true")
(options, args) = parser.parse_args()
-if options.commonwordfilename:
- commonwords = list(set(open(options.commonwordfilename).read().lower().strip().split('\n')))
if options.wordlistfilename:
wordlist = list(set(open(options.wordlistfilename).read().lower().strip().split('\n')))
-# uncommonwordlist = wordlist
-# if options.commonwordfilename:
-# for word in commonwords:
-# if word in uncommonwordlist:
-# uncommonwordlist.remove(word)
-
-# uncommonwordlist = optimizewordlist(uncommonwordlist, options.maxwholewordlength)
-# if options.commonwordfilename:
-# wordlist = list(set(uncommonwordlist + commonwords))
-# else:
-# wordlist = uncommonwordlist
+if options.commonwordfilename:
+ commonwords = list(set(open(options.commonwordfilename).read().lower().strip().split('\n')))
+
+if options.optimizewordlist and options.wordlistfilename:
+ for word in wordlist:
+ if len(word) > options.maxwholewordlength:
+ for check_word in wordlist:
+ if check_word.find(word)> 0:
+# print word + " in " + check_word
+ wordlist.remove(check_word)
+ break
if options.show_wordlist: print wordlist; exit()
@@ -236,8 +269,9 @@ if options.displaysummary and options.summaryfile:
word = w[:w.find('(')]
wcount = w[w.find('(')+1:w.find(')')]
report[filename][word] = int(wcount)
- if options.commonwordfilename:
- report = weightreport(report, commonwords)
+
+ if options.commonwordfilename and not(options.dontweightreport):
+ report, weightedfiles = weightreport(report, commonwords)
if options.printreport:
if options.printreport == "f":
@@ -250,6 +284,8 @@ if options.displaysummary and options.summaryfile:
print summary(report)
exit()
+
+#Run a search if not displaying an existing report
if len(args) > 0:
for a in args:
for (path, dirs, files) in os.walk(a):
@@ -300,9 +336,11 @@ for file in filelist:
for k in filecontentsscore.keys():
report[file][k] = filenamescore[k] + filecontentsscore[k]
+#Clear screen of progress text now that file scoring has finished
if options.display_progress:
print '\r' + " " * len(progresstext) + '\r',
+#Save summary as a file, but if the filename exists do not overwrite, append a number
if options.summaryfile and len(filelist) > 0 and not options.displaysummary:
summaryfilename = options.summaryfile
counter = 0
@@ -318,6 +356,9 @@ if options.summaryfile and len(filelist) > 0 and not options.displaysummary:
print report
print "error saving summary as " + summaryfilename
+if options.commonwordfilename and not(options.dontweightreport):
+ report, weightedfiles = weightreport(report, commonwords)
+
if options.printreport and not options.dontdisplaysummary:
if options.printreport == "f":
printscore(sortscore(scorefile(report)))
@@ -329,6 +370,8 @@ if options.printreport and not options.dontdisplaysummary:
if options.display_counts:
print "total files:" + str(len(filelist)) ,
print "suspicious files:" + str(len(sortscore(scorefile(report)))) ,
- print "skipped files:" + str(skipped) ,
+ print "skipped files:" + str(skipped),
+ if options.commonwordfilename and not(options.dontweightreport):
+ print "removed weighted files:" + str(weightedfiles),
print "searched:" + str(datasize) + 'B',
print "time:" + str(datetime.datetime.now() - start).split('.')[0]