#!/usr/bin/python
# Copyright 2014 Software Freedom Law Center (www.softwarefreedom.org)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#Author: Marc Jones
#Date: June 30, 2014
#Version 0.2.1
#Added weight score function to remove scores from files that otherwise would not have scores
#Added: remove words that are superstrings of other words on the list from the search, to speed things up, when the shorter word is longer than the max whole-word length
##TODO
#need to verify that word score counts each instance, not just 0 or 1
#need to discount found words if they are substrings of common strings in text
from optparse import OptionParser
import os
import re
import sys
import datetime
import string
# Module-level state shared by the scoring helpers and the scan script below.
report = dict()          # per-file hit counts: {filename: {word: count}}
wordscore = dict()       # per-word totals, filled in by the scoring helpers
filescore = dict()       # per-file totals, filled in by the scoring helpers
filelist = []            # paths queued for scanning
skipped, opened = 0, 0   # files skipped by extension / files opened so far
datasize = 0             # running total of len() of the contents read
progresstext = str()     # last progress line written to the terminal
def sortscore(score, reverse=True):
    """Return the positive entries of *score* as a sorted list of pairs.

    score   -- dict mapping a name to its numeric score
    reverse -- True (default) puts the highest score first, False the lowest

    Entries whose score is not greater than zero are left out.
    """
    ranked = list(score.items())
    ranked.sort(key=lambda pair: pair[1], reverse=reverse)
    return [pair for pair in ranked if pair[1] > 0]
def printscore(report):
    """Print every entry of *report* as ``name:score``, one entry per line.

    report -- sequence of (name, score) pairs
    """
    for entry in report:
        label, value = entry[0], entry[1]
        # A parenthesised single argument prints exactly like the Python 2
        # print statement.
        print(label + ':' + str(value))
def scorewords(report):
    """Total the hit counts of every word across all files in *report*.

    report -- dict of {filename: {word: count}}

    Returns the module-level ``wordscore`` dict, {word: total count}.

    The totals are rebuilt from scratch on every call, so calling this more
    than once, or after files were removed from *report*, never double counts
    and never keeps stale words. As before, every file that has at least one
    word entry is also given a slot in the module-level ``filescore`` dict
    (0 when it has none yet).
    """
    wordscore.clear()
    for filename, counts in report.items():
        for word, count in counts.items():
            filescore.setdefault(filename, 0)
            wordscore[word] = wordscore.get(word, 0) + count
    return wordscore
def weightreport(report, commonwords):
    """Drop files from *report* whose only hits are common words.

    report      -- dict of {filename: {word: count}}; modified in place
    commonwords -- collection of words that do not make a file suspicious

    A file is removed when its counts add up to more than zero and every word
    with a positive count is listed in *commonwords*. Files with no hits at
    all are kept.

    Returns a tuple (report, number of files removed).
    """
    removable = []
    for name in report:
        counts = report[name]
        if sum(counts.values()) <= 0:
            continue
        if any(counts[word] > 0 and word not in commonwords for word in counts):
            continue
        removable.append(name)
    # Delete after the scan so the dict is not changed while it is iterated.
    for name in removable:
        del report[name]
    return report, len(removable)
def scorefile(report):
    """Total the hit counts of every file in *report*.

    report -- dict of {filename: {word: count}}

    Returns the module-level ``filescore`` dict, {filename: total count}.

    The totals are rebuilt from scratch on every call. Previously they kept
    accumulating in the global dict, so a second call doubled every score and
    files that had since been removed from *report* stayed in the result,
    which made summary() fail with a KeyError on them. As before, every word
    seen is also given a slot in the module-level ``wordscore`` dict (0 when
    it has none yet).
    """
    filescore.clear()
    for filename, counts in report.items():
        for word, count in counts.items():
            wordscore.setdefault(word, 0)
            filescore[filename] = filescore.get(filename, 0) + count
    return filescore
def summary(report):
    """Render *report* as text, one line per file with a positive score.

    report -- dict of {filename: {word: count}}

    Files are ordered by sortscore(scorefile(report)). Each line reads
    ``filename(total):word(count);word(count);`` and lists only the words
    whose count is greater than zero.
    """
    lines = []
    for ranked in sortscore(scorefile(report)):
        name, total = ranked[0], ranked[1]
        hits = report[name]
        pieces = [name + '(' + str(total) + '):']
        for word in hits.keys():
            if hits[word] > 0:
                pieces.append(word + '(' + str(hits[word]) + ');')
        pieces.append('\n')
        lines.append(''.join(pieces))
    return ''.join(lines)
def wholeword(word, string):
    """Look for *word* in *string* as a stand-alone token.

    word   -- text to look for; it is placed in the regular expression as-is,
              so any regex syntax it contains is interpreted as such
    string -- text to search

    An all-digit word must not have a digit directly before or after it. Any
    other word is tried twice: in lower case, where the neighbouring
    characters must not be lower-case letters, and in upper case, where they
    must not be upper-case letters, so a change of case counts as a boundary.

    Returns a list with the groups() tuple of the first match of each pattern
    tried: at most one entry for a digit word, at most two otherwise.
    """
    def first_match(pattern):
        # re.search yields None when nothing matches.
        found = re.search(pattern, string)
        return [found.groups()] if found is not None else []

    # No re.purge() here: emptying the regex cache on every call forced each
    # pattern to be recompiled for every file that is scanned.
    if word.isdigit():
        return first_match(r'([^0-9]|\b)(' + word + r')([^0-9]|\b)')
    lowered = r'([A-Z]|[^a-zA-Z]|\b)(' + word.lower() + r')([A-Z]|[^a-zA-Z]|\b)'
    uppered = r'([a-z]|[^a-zA-Z]|\b)(' + word.upper() + r')([a-z]|[^a-zA-Z]|\b)'
    return first_match(lowered) + first_match(uppered)
def skipfile(filename,skippedexts):
    """Tell whether *filename* ends with one of the suffixes in *skippedexts*.

    filename    -- path or name of the file
    skippedexts -- list of suffixes; anything that is not a list (for example
                   None) means nothing is skipped

    Returns True when the file should be skipped, False otherwise.
    """
    if isinstance(skippedexts, list):
        return any(filename.endswith(suffix) for suffix in skippedexts)
    return False
def scoretext(wordlist, text, maxwholewordlen = -1):
    """Score *text* against every word in *wordlist*.

    wordlist        -- iterable of words to look for
    text            -- text to search
    maxwholewordlen -- words no longer than this are only counted when they
                       stand alone (see wholeword()); longer words are matched
                       anywhere, ignoring case. The default of -1 matches
                       every word anywhere.

    A '-' or ' ' inside a word matches one optional punctuation character or
    space, so "triple-des" also finds "triple_des" and "tripledes".

    Returns a dict {word: score}. For words matched anywhere the score is 1
    when the word occurs and 0 when it does not; otherwise it is the number
    of matches wholeword() reports.
    """
    # Escape the punctuation so the character class does not depend on the
    # exact order of string.punctuation to be a valid pattern.
    separator = '[' + re.escape(string.punctuation + ' ') + ']?'
    limit = int(maxwholewordlen)
    lowered = text.lower()
    score = {}
    for word in wordlist:
        # Escape each piece so characters such as '+', '.' or '(' in a word
        # are matched literally instead of being read as regex syntax.
        pieces = word.replace('-', ' ').split(' ')
        pattern = separator.join(re.escape(piece) for piece in pieces)
        if len(word) > limit:
            found = re.search(pattern.lower(), lowered)
            score[word] = 1 if found is not None else 0
        else:
            score[word] = len(wholeword(pattern, text))
    return score
# Command line definition. Option names, destinations, defaults and actions
# are unchanged; only help texts that were misspelled or described the
# opposite of what the flag does have been corrected.
usage = "%prog [options] DIRECTORY ... DIRECTORYN"
epilog = "example: ./suspicious ../gitcheckout -s .tar -s .gz -s .bmp -s .zip -s .ppt -s .docx -s .pdf -s .xls -s .xlsx -s .gif -s .png -s .jpg -s .css -r fw -w cryptology.txt -c -p -l 3"
parser = OptionParser(usage = usage, epilog = epilog)
parser.add_option("-f", "--file",
                  dest="suspiciousfilename",
                  help="specify file to scan", action="append")
parser.add_option("-w", "--wordlist",
                  dest="wordlistfilename",
                  help="file containing all of the words to look for")
parser.add_option("-s", "--skip",
                  dest="skipfileextensions",
                  help="file extensions to skip",
                  action="append")
parser.add_option("-v", "--verbose",
                  dest="verbose",
                  help="print verbose information",
                  default=False,
                  action="store_true")
parser.add_option("-r", "--report",
                  dest="printreport",
                  default="wf",
                  help="what to print: f for file scores, w for word scores, wf or fw for the per-file summary")
parser.add_option("--show-wordlist",
                  dest="show_wordlist",
                  default=False,
                  help="print list of words to detect",
                  action="store_true")
parser.add_option("-c", "--display-counts",
                  dest="display_counts",
                  default=False,
                  help="Show the number of files processed",
                  action="store_true")
parser.add_option("-p", "--display_progress",
                  dest="display_progress",
                  default=False,
                  help="show percentage complete",
                  action="store_true")
parser.add_option("-l", "--max-wholeword-length",
                  dest="maxwholewordlength",
                  type="int",
                  default=-1,
                  help="maximum length of a word allowed to only find matches on whole word")
parser.add_option("-o", "--summary-file",
                  dest="summaryfile",
                  help="name of the file to store the summary in")
parser.add_option("-x", "--display-summary",
                  dest="displaysummary",
                  default=False,
                  help="Display a summary from the summary file",
                  action="store_true")
parser.add_option("-X", "--dont-display-summary",
                  dest="dontdisplaysummary",
                  default=False,
                  help="Dont Display a summary after running a scan",
                  action="store_true")
parser.add_option("-k", "--commonwords",
                  dest="commonwordfilename",
                  help="file containing common words that alone do not indicate a suspicious file")
parser.add_option("-t", "--test",
                  dest="test",
                  default=False,
                  help="Run internal tests on pattern matching",
                  action="store_true")
# store_false: giving this flag switches the word list optimization off.
parser.add_option("--donotoptimizewordlist",
                  dest="optimizewordlist",
                  default=True,
                  help="Do not reduce the number of words to look for by removing words from the wordlist that contain other words on the list as substrings",
                  action="store_false")
parser.add_option("--dontweightreport",
                  dest="dontweightreport",
                  default=False,
                  help="Do not remove a file from the report when the only suspicious words in it are common words",
                  action="store_true")
(options, args) = parser.parse_args()
def _readwords(path):
    """Return the distinct, lower-cased, non-blank lines of the file at *path*."""
    with open(path) as handle:
        lines = handle.read().lower().splitlines()
    # Blank lines are dropped: an empty word would match every file.
    return sorted(set(line.strip() for line in lines if line.strip()))


def _dropsuperstrings(words, minlength):
    """Return *words* without those that contain another listed word.

    Only words longer than *minlength* are matched anywhere in a text, so
    only those make a longer word that contains them redundant.
    """
    anywhere = [word for word in words if len(word) > minlength]
    return [word for word in words
            if not any(short != word and short in word for short in anywhere)]


# Start from empty lists so the names exist even when -w / -k are not given.
wordlist = []
commonwords = []
if options.wordlistfilename:
    wordlist = _readwords(options.wordlistfilename)
if options.commonwordfilename:
    commonwords = _readwords(options.commonwordfilename)
if options.optimizewordlist and options.wordlistfilename:
    # The old loop removed items from the list it was iterating, stopped
    # after one removal per word, and its find() > 0 test missed words that
    # start with the shorter word (e.g. "aes256" for "aes").
    wordlist = _dropsuperstrings(wordlist, options.maxwholewordlength)
if options.show_wordlist:
    print(wordlist)
    sys.exit()
# Display mode: rebuild the report from a saved summary file, print it, exit.
if options.displaysummary and options.summaryfile:
    report = dict()
    try:
        summaryfile = open(options.summaryfile)
    except (IOError, OSError):
        print("no summary file: " + options.summaryfile)
        sys.exit()
    # Sample input line:
    #   ../bzr.lf/lsb/devel/build_env/headers/x86-64/4.1/glib-2.0/gio/gmenuexporter.h.defs(1): export(1);
    with summaryfile:
        for line in summaryfile:
            # The file name ends at the last '(' before the last ':' on the
            # line; the total in parentheses after it is not needed, because
            # the scores are recomputed from the per-word counts.
            colon = line.rfind(':')
            paren = line[:colon].rfind('(') if colon >= 0 else -1
            if paren < 0:
                # Blank or malformed line: nothing to recover from it.
                continue
            filename = line[:paren]
            report[filename] = dict()
            # The words follow the ':' and each one ends with ';', so the
            # last item of the split is only the line ending and is dropped.
            for entry in line[colon + 1:].split(';')[:-1]:
                entry = entry.strip()
                try:
                    wcount = int(entry[entry.find('(') + 1:entry.find(')')])
                except ValueError:
                    # Not of the form word(count); skip it.
                    continue
                report[filename][entry[:entry.find('(')]] = wcount
    if options.commonwordfilename and not options.dontweightreport:
        report, weightedfiles = weightreport(report, commonwords)
    if options.printreport:
        if options.printreport == "f":
            printscore(sortscore(scorefile(report)))
        elif options.printreport == "w":
            printscore(sortscore(scorewords(report)))
        else:
            # "wf", "fw" and any other value print the per-file summary.
            print(summary(report))
    sys.exit()
#Run a search when not displaying an existing report: first collect the files
for root in args:
    for (path, dirs, files) in os.walk(root):
        # Removing an entry from dirs in place keeps os.walk out of it.
        for vcsdir in ('CVS', '.git', '.bzr', '.hg', '.svn'):
            if vcsdir in dirs:
                dirs.remove(vcsdir)
        filelist.extend(path + '/' + name for name in files)
#Files named with -f are scanned in addition to the directory contents
if options.suspiciousfilename:
    filelist.extend(options.suspiciousfilename)
# Scan every collected file and record its per-word hit counts in report.
start = datetime.datetime.now()
for file in filelist:
    if skipfile(file, options.skipfileextensions):
        skipped += 1
        continue
    try:
        f = open(file)
    except (IOError, OSError):
        print("failed to open: " + file)
        continue
    opened += 1
    if options.display_progress:
        done = opened + skipped
        elapsed = datetime.datetime.now() - start
        # Projected total run time minus the time already spent. The old
        # code showed the projected total under the label "time left".
        remaining = (elapsed / done) * len(filelist) - elapsed
        if len(file) > 60:
            # Long paths are shortened to first component + file name.
            pathparts = file.split('/')
            prog_file = pathparts[0] + "/.../" + pathparts[-1]
        else:
            prog_file = file
        # Blank out the previous progress line before writing the new one.
        sys.stdout.write('\r' + " " * len(progresstext) + '\r')
        progresstext = str((done * 1.0 / len(filelist)) * 100)[:5] + '% ' + " time left:" + str(remaining).split('.')[0] + ' ' + prog_file + '\r'
        sys.stdout.write(progresstext)
        sys.stdout.flush()
    # Close the handle even when reading fails, so descriptors do not pile
    # up over a large tree.
    try:
        filecontents = f.read()
    finally:
        f.close()
    datasize += len(filecontents)
    # Both the file name and the file contents are scored; the counts are
    # added together per word.
    filenamescore = scoretext(wordlist, file, options.maxwholewordlength)
    filecontentsscore = scoretext(wordlist, filecontents, options.maxwholewordlength)
    report[file] = {}
    for k in filecontentsscore.keys():
        report[file][k] = filenamescore[k] + filecontentsscore[k]
#Clear screen of progress text now that scoring has finished
if options.display_progress:
    sys.stdout.write('\r' + " " * len(progresstext) + '\r')
#Save the summary to a file; an existing file is never overwritten, a number is appended to the name instead
if options.summaryfile and len(filelist) > 0 and not options.displaysummary:
    summaryfilename = options.summaryfile
    counter = 0
    while os.path.isfile(summaryfilename):
        counter += 1
        summaryfilename = options.summaryfile + '.' + str(counter)
    try:
        # Any counter above zero means the requested name was taken; the old
        # "> 1" test stayed silent when the file was saved as NAME.1.
        if counter > 0:
            print("saving as " + summaryfilename + "....")
        with open(summaryfilename, 'w') as summaryfile:
            summaryfile.write(summary(report))
    except Exception:
        # Best effort: dump the raw report so the scan results are not lost.
        print(report)
        print("error saving summary as " + summaryfilename)
# Optionally drop files whose only hits are common words, then print the
# requested report and the run statistics.
if options.commonwordfilename and not options.dontweightreport:
    report, weightedfiles = weightreport(report, commonwords)
if options.printreport and not options.dontdisplaysummary:
    if options.printreport == "f":
        printscore(sortscore(scorefile(report)))
    elif options.printreport in ("wf", "fw"):
        print(summary(report))
    else:
        # Any other value prints the per-word scores.
        printscore(sortscore(scorewords(report)))
if options.display_counts:
    # All statistics go on one line, separated by single spaces.
    counts = ["total files:" + str(len(filelist))]
    counts.append("suspicious files:" + str(len(sortscore(scorefile(report)))))
    counts.append("skipped files:" + str(skipped))
    if options.commonwordfilename and not options.dontweightreport:
        counts.append("removed weighted files:" + str(weightedfiles))
    counts.append("searched:" + str(datasize) + 'B')
    counts.append("time:" + str(datetime.datetime.now() - start).split('.')[0])
    print(' '.join(counts))