#!/usr/bin/python

# Copyright 2014 Software Freedom Law Center (www.softwarefreedom.org)
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.

#Author: Marc Jones <mjones@softwarefreedom.org>
#Date: June 30, 2014
#Version 0.2.1
#Added weight score function to remove scores from files that otherwise would not have scores
#Added Remove superstrings of other words from search to speed things up if they are greater than wordlength

##TODO
#need to verify that word score counts each instance, not just 0 or 1
#need to discount found words if they are substrings of common strings in text
 
from optparse import OptionParser
import os
import re
import sys
import datetime
import string

# Module-level state shared by the scoring helpers and the scan loop.
report = dict()		# file name -> {word: number of hits}
wordscore = dict()	# word -> number of hits
filescore = dict()	# file name -> number of hits
filelist = []		# paths queued for scanning
skipped = 0		# files ignored because of their extension
opened = 0		# files successfully opened
datasize = 0		# total length of the file contents read
progresstext = ""	# last progress line printed (used to blank it out)

	
def sortscore(score, reverse=True):
	"""Return the positive entries of a score table as sorted pairs.

	score   -- dict mapping a name to its numeric score
	reverse -- True (the default) puts the highest score first

	Returns a list of (name, score) tuples; entries whose score is
	zero or negative are left out.
	"""
	ranked = sorted(score.items(), key=lambda pair: pair[1], reverse=reverse)
	return [pair for pair in ranked if pair[1] > 0]

def printscore(report):
	for i in report:
		print i[0] + ':' + str(i[1])

def scorewords(report):
	"""Total the hits of every word across all files of a report.

	report -- dict mapping a file name to a {word: hit count} dict

	Returns a new {word: total hits} dict. The totals are rebuilt from
	scratch on every call, so calling this twice (or after files were
	removed from the report) never double counts or keeps stale words.
	"""
	totals = {}
	for counts in report.values():
		for word, count in counts.items():
			totals[word] = totals.get(word, 0) + count
	return totals

def weightreport(report, commonwords):
	"""Drop files whose only hits are common words.

	report      -- dict mapping a file name to a {word: hit count} dict;
	               it is modified in place
	commonwords -- iterable of words that do not make a file suspicious
	               on their own

	A file is removed when its total hit count is positive and every
	word that was found in it is a common word. Files without any hit
	are left untouched.

	Returns a (report, removed) tuple where removed is the number of
	files taken out of the report.
	"""
	# Build the lookup once; membership is tested for every word of every file.
	common = frozenset(commonwords)
	notsuspiciousfiles = []
	for file, counts in report.items():
		if sum(counts.values()) <= 0:
			continue
		suspicious = any(count > 0 and word not in common
				for word, count in counts.items())
		if not suspicious:
			notsuspiciousfiles.append(file)

	# Remove after the scan so the report is not changed while iterating it.
	for file in notsuspiciousfiles:
		report.pop(file)

	return report, len(notsuspiciousfiles)

def scorefile(report):
	"""Total the hits of every file in a report.

	report -- dict mapping a file name to a {word: hit count} dict

	Returns a new {file name: total hits} dict containing every file
	that has at least one word entry. The totals are rebuilt from
	scratch on every call: repeated calls do not double the scores and
	files that were removed from the report no longer show up.
	"""
	totals = {}
	for file, counts in report.items():
		for count in counts.values():
			totals[file] = totals.get(file, 0) + count
	return totals

def summary(report):
	"""Render a report as text, one scored file per line.

	Each line reads "file(total):word(count);word(count);" followed by
	a newline. Files come in the order given by sortscore(), and only
	words with a positive count are listed.
	"""
	lines = []
	for entry in sortscore(scorefile(report)):
		name, total = entry[0], entry[1]
		hits = report[name]
		found = [word + '(' + str(hits[word]) + ');'
				for word in hits.keys() if hits[word] > 0]
		lines.append(name + '(' + str(total) + '):' + ''.join(found) + '\n')
	return ''.join(lines)

def wholeword(word, string):
	"""Look for word in string as a whole word.

	word   -- the word to look for; it is placed into the search pattern
	          as-is, so it may itself be a regular expression fragment
	string -- the text to search

	A word made only of digits must not touch another digit. Any other
	word is searched twice: in lower case, where it must not touch a
	lower-case letter, and in upper case, where it must not touch an
	upper-case letter. This lets the word be found inside camelCase or
	CONSTANT_style identifiers.

	Returns a list with the groups() tuple (before, word, after) of the
	first match of each pattern that matched; at most one entry for a
	number and at most two for other words.
	"""
	if word.isdigit():
		patterns = [r'([^0-9]|\b)(' + word + r')([^0-9]|\b)']
	else:
		patterns = [
			r'([A-Z]|[^a-zA-Z]|\b)(' + word.lower() + r')([A-Z]|[^a-zA-Z]|\b)',
			r'([a-z]|[^a-zA-Z]|\b)(' + word.upper() + r')([a-z]|[^a-zA-Z]|\b)',
		]

	# The patterns depend only on the word, so the compiled-pattern cache of
	# the re module is left alone and can be reused for the next text.
	matches = []
	for pattern in patterns:
		found = re.search(pattern, string)
		if found is not None:
			matches.append(found.groups())
	return matches

def skipfile(filename,skippedexts):
	"""Tell whether filename ends with one of the extensions to skip.

	filename    -- path of the file
	skippedexts -- list of suffixes; anything that is not a list (for
	               example None when no extension was given) skips nothing
	"""
	if not isinstance(skippedexts, list):
		return False
	return any(filename.endswith(suffix) for suffix in skippedexts)

def scoretext(wordlist, text, maxwholewordlen = -1):
	"""Score a text against a list of words.

	wordlist        -- words to look for; a space or hyphen inside a word
	                   also matches a single punctuation character or
	                   nothing at all ("public key" finds "public-key"
	                   and "publickey")
	text            -- the text to search
	maxwholewordlen -- words longer than this are searched as
	                   case-insensitive substrings; words of this length
	                   or shorter must match as whole words (see
	                   wholeword()). The default of -1 treats every word
	                   as a substring.

	Returns a {word: score} dict. The score is 0 or 1 for substring
	words and the number of patterns matched by wholeword() otherwise.
	"""
	separator = '[' + string.punctuation + ' ]?'
	ltext = text.lower()
	score = {}
	for word in wordlist:
		# Escape the literal parts so that characters such as "+", "." or "("
		# in a word are matched literally instead of being read as regex syntax.
		parts = word.replace('-', ' ').split(' ')
		wordreg = separator.join(re.escape(part) for part in parts)
		if int(len(word)) > int(maxwholewordlen):
			if re.search(wordreg.lower(), ltext) is not None:
				score[word] = 1
			else:
				score[word] = 0
		else:
			score[word] = len(wholeword(wordreg, text))
	return score

# Command line definition. Positional arguments are directories to scan.
usage = "%prog [options] DIRECTORY ... DIRECTORYN"
epilog = "example: ./suspicious ../gitcheckout -s .tar -s .gz -s .bmp -s .zip -s .ppt -s .docx -s .pdf -s .xls -s .xlsx -s .gif -s .png -s .jpg -s .css -r fw -w cryptology.txt -c -p -l 3"
parser = OptionParser(usage = usage, epilog = epilog)
parser.add_option("-f", "--file",
		dest="suspiciousfilename",
		help="specify file to scan",
		action="append")
parser.add_option("-w", "--wordlist",
		dest="wordlistfilename",
		help="file containing all of the words to look for")
parser.add_option("-s", "--skip",
		dest="skipfileextensions",
		help="file extensions to skip",
		action="append")
parser.add_option("-v", "--verbose",
		dest="verbose",
		help="print verbose information",
		default=False,
		action="store_true")
parser.add_option("-r", "--report",
		dest="printreport",
		default="wf",
		help="what to print: f (score per file), w (score per word), wf or fw (files with the words found in them)")
parser.add_option("--show-wordlist",
		dest="show_wordlist",
		default=False,
		help="print list of words to detect",
		action="store_true")
parser.add_option("-c", "--display-counts",
		dest="display_counts",
		default=False,
		help="Show the number of files processed",
		action="store_true")
# --display_progress is the historical spelling; --display-progress matches
# the hyphenated style of the other long options.
parser.add_option("-p", "--display_progress", "--display-progress",
		dest="display_progress",
		default=False,
		help="show percentage complete",
		action="store_true")
parser.add_option("-l", "--max-wholeword-length",
		dest="maxwholewordlength",
		type="int",
		default=-1,
		help="words up to this length are only matched as whole words, longer words are matched anywhere")
parser.add_option("-o", "--summary-file",
		dest="summaryfile",
		help="name of the file to store the summary in")
parser.add_option("-x", "--display-summary",
		dest="displaysummary",
		default=False,
		help="Display a summary from the summary file",
		action="store_true")
parser.add_option("-X", "--dont-display-summary",
		dest="dontdisplaysummary",
		default=False,
		help="Don't display a summary after running a scan",
		action="store_true")
parser.add_option("-k", "--commonwords",
		dest="commonwordfilename",
		help="file containing common words that do not indicate a suspicious file")
parser.add_option("-t", "--test",
		dest="test",
		default=False,
		help="Run internal tests on pattern matching",
		action="store_true")
# store_false: giving the flag switches the (default on) optimization off.
parser.add_option("--donotoptimizewordlist",
		dest="optimizewordlist",
		default=True,
		help="Do not reduce the number of words to look for by removing words from the wordlist that contain other words on the list as substrings",
		action="store_false")
parser.add_option("--dontweightreport",
		dest="dontweightreport",
		default=False,
		help="Keep files in the report even if the only suspicious words in them are common words",
		action="store_true")
(options, args) = parser.parse_args()

if options.wordlistfilename:
	wordlist = list(set(open(options.wordlistfilename).read().lower().strip().split('\n')))

if options.commonwordfilename:
	commonwords = list(set(open(options.commonwordfilename).read().lower().strip().split('\n')))
	
if options.optimizewordlist and options.wordlistfilename:
    for word in wordlist:
        if len(word) > options.maxwholewordlength:
            for check_word in wordlist:
                if check_word.find(word)> 0:
#                    print word + " in " + check_word
                    wordlist.remove(check_word)
                    break

if options.show_wordlist: print wordlist; exit()

if options.displaysummary and options.summaryfile:
	report = dict()
	try:
		summaryfile = open(options.summaryfile)
	except:
		print "no summary file: " + options.summaryfile
		exit()
	#sample input
	#../bzr.lf/lsb/devel/build_env/headers/x86-64/4.1/glib-2.0/gio/gmenuexporter.h.defs(1): export(1);
	for line in summaryfile:
		#find the file name which is before the matching parathsis before the last colon on the line
		filename = line[:line[:line.rfind(':')].rfind('(')]
		#find the total number of words found by locating the end of the filename and taking the number in parathesis right before the :
		totalfilecount = line[line[:line.rfind(':')].rfind('(')+1:line[:line.rfind(':')].rfind(')')]
		#find the list of words following the :, and split them by the ;, and then drop the last item on the list which is always a \n
		foundwords = line[line.rfind(':')+1:].split(';')[:-1]
		report[filename] = dict()		
		for w in foundwords:
			w = w.strip()
			word = w[:w.find('(')]
			wcount = w[w.find('(')+1:w.find(')')]		
			report[filename][word] = int(wcount)

	if options.commonwordfilename and not(options.dontweightreport):
		report, weightedfiles = weightreport(report, commonwords)

	if options.printreport:
		if options.printreport == "f":
			printscore(sortscore(scorefile(report)))
		elif options.printreport == "w":
			printscore(sortscore(scorewords(report)))
		elif options.printreport == "wf" or options.printreport == "fw":
			print summary(report)			
	else:
		print summary(report)
	
	exit()

#Run a search if not displaying an existing report: collect the files to scan
for a in args:
	for (path, dirs, files) in os.walk(a):
		#prune version control directories in place so os.walk does not descend into them
		dirs[:] = [d for d in dirs if d not in ('CVS', '.git', '.bzr', '.hg', '.svn')]
		filelist.extend(path + '/' + file for file in files)

#files named explicitly with -f are scanned as well
if options.suspiciousfilename:
	filelist.extend(options.suspiciousfilename)

start = datetime.datetime.now()
for file in filelist:
	if skipfile(file, options.skipfileextensions):
		skipped += 1
		continue
	try:
		f = open(file)
	except:
		print "failed to open: " + file
		continue
	opened +=1
	now = datetime.datetime.now()
	estimate = (((now - start) / (opened + skipped)) * len(filelist)) 
	if options.display_progress: 
		if len(file)> 60:
			prog_file = file.split('/')[0] + "/.../" + file.split('/')[-1]
		else:
			prog_file = file
		print '\r' + " " * len(progresstext) + '\r',
		progresstext = str(((opened + skipped)*1.0/len(filelist))*100)[:5] + '% '+ " time left:" + str(estimate).split('.')[0] + ' ' + prog_file + '\r'
		print progresstext,
	sys.stdout.flush()
	filecontents = f.read()
	datasize += len(filecontents)		
	filenamescore = scoretext(wordlist, file, options.maxwholewordlength)
	filecontentsscore = scoretext(wordlist, filecontents, options.maxwholewordlength)
	report[file] = {}
	for k in filecontentsscore.keys():
		report[file][k] = filenamescore[k] + filecontentsscore[k]

#Clear screen of proggress text now that finished scoring file
if options.display_progress: 
	print '\r' + " " * len(progresstext) + '\r',

#Save summary as a file, but if the filename exists do not overwrite, append a number
if options.summaryfile and len(filelist) > 0 and not options.displaysummary:
	summaryfilename = options.summaryfile	
	counter = 0
	while os.path.isfile(summaryfilename):
		counter +=1
		summaryfilename = options.summaryfile + '.' + str(counter)
	try:
		if counter > 1: print "saving as " + summaryfilename + "...."	
		summaryfile = open(summaryfilename, 'w+')
		summaryfile.write(summary(report))
		summaryfile.close()		
	except:
		print report
		print "error saving summary as " + summaryfilename

if options.commonwordfilename and not(options.dontweightreport):
	report, weightedfiles = weightreport(report, commonwords)
        
if options.printreport and not options.dontdisplaysummary:
	if options.printreport == "f":
		printscore(sortscore(scorefile(report)))
	elif options.printreport == "wf" or options.printreport == "fw":
		print summary(report)
	else:
		printscore(sortscore(scorewords(report)))

if options.display_counts:
	print "total files:" + str(len(filelist)) ,
	print "suspicious files:" + str(len(sortscore(scorefile(report)))) ,
	print "skipped files:" + str(skipped),
	if options.commonwordfilename and not(options.dontweightreport):
		print "removed weighted files:" + str(weightedfiles),
	print "searched:" + str(datasize) + 'B', 
	print "time:" + str(datetime.datetime.now() - start).split('.')[0]