#!/usr/bin/python
from optparse import OptionParser
import os
import re
import sys
import datetime

# Module-level state shared by the scanning and reporting code below.

# report[filename][word] -> hits for that word in that file (name + contents).
# wordscore[word] -> hits summed over all files.
# filescore[filename] -> hits summed over all words.
report, wordscore, filescore = {}, {}, {}

# Paths of every file queued for scanning.
filelist = []

# skipped: files ignored because of their extension.
# opened: files successfully opened.
# datasize: total length of all file contents read.
skipped = opened = datasize = 0

# Last progress line written; its length is used to blank the line.
progresstext = ""

def sortscore(score, reverse=False):
	"""Return the (name, count) pairs of `score` with a count above zero.

	The pairs are ordered by count, ascending by default and descending
	when `reverse` is true. Entries whose count is zero or negative are
	dropped.
	"""
	ranked = sorted(score.items(), key=lambda pair: pair[1], reverse=reverse)
	return [pair for pair in ranked if pair[1] > 0]

def printscore(report):
	"""Print every entry of `report` on its own line as `name:score`.

	`report` is an iterable of pairs (such as the list returned by
	sortscore); element 0 must be a string and element 1 is converted
	with str().
	"""
	for entry in report:
		# A single parenthesised argument prints the same text under the
		# Python 2 print statement and the Python 3 print function.
		print(entry[0] + ':' + str(entry[1]))

def wholeword(word, string):
	"""Look for `word` in `string` as a stand-alone, case-delimited token.

	Two case-sensitive searches are made:

	* the lower-cased word, where each side is an upper-case letter, a
	  non-letter, or a word boundary (e.g. "ear" in "BearTH" or "ear_");
	* the upper-cased word, where each side is a lower-case letter, a
	  non-letter, or a word boundary (e.g. "EAR" in "bEARth").

	Returns a list holding the groups() tuple (before, word, after) of the
	first match of each search. Only the first match of each search is
	reported, so the list has at most two entries.
	"""
	patterns = (
		r'([A-Z]|[^a-zA-Z]|\b)(' + re.escape(word.lower()) + r')([A-Z]|[^a-zA-Z]|\b)',
		r'([a-z]|[^a-zA-Z]|\b)(' + re.escape(word.upper()) + r')([a-z]|[^a-zA-Z]|\b)',
	)
	matches = []
	for pattern in patterns:
		# No re.purge() here: clearing the module's pattern cache only
		# forces the same patterns to be recompiled on the next call.
		found = re.search(pattern, string)
		if found is not None:
			matches.append(found.groups())
	return matches

def skipfile(filename,skippedexts):
	"""Tell whether `filename` ends with one of the suffixes in `skippedexts`.

	Anything other than a list for `skippedexts` (for example None when no
	skip option was given) means nothing is skipped and False is returned.
	"""
	if isinstance(skippedexts, list):
		return any(filename.endswith(suffix) for suffix in skippedexts)
	return False

def scoretext(wordlist, text, maxwholewordlen = -1):
	"""Score `text` against every word in `wordlist`.

	Words longer than `maxwholewordlen` are scored by the number of
	case-insensitive substring occurrences in `text`. Words of that length
	or shorter are scored by the number of matches wholeword() reports.
	With the default of -1 every word uses substring counting.

	Returns a dict mapping each word to its score.
	"""
	threshold = int(maxwholewordlen)
	# Lower-case the text at most once, and only if a substring count is
	# needed, instead of once per word.
	lowered = None
	score = {}
	for word in wordlist:
		if len(word) > threshold:
			if lowered is None:
				lowered = text.lower()
			score[word] = lowered.count(word.lower())
		else:
			score[word] = len(wholeword(word, text))
	return score

# Command-line interface. Positional arguments are walked as directory
# trees; individual files are added with -f/--file.
parser = OptionParser()
parser.add_option("-f", "--file", dest="suspiciousfilename", help="specify file to scan", action="append")
parser.add_option("-w", "--wordlist", dest="wordlistfilename", help="file containing all of the words to look for")
parser.add_option("-s", "--skip", dest="skipfileextensions", help="file extensions to skip", action="append")
parser.add_option("-v", "--verbose", dest="verbose", help="print verbose information", default=False, action="store_true")
parser.add_option("-r", "--report", dest="printreport", default="w", help="report to print: 'w' score per word (default), 'f' score per file, 'wf' or 'fw' score per file with its per-word breakdown")
parser.add_option("--show-wordlist", dest="show_wordlist", default=False, help="print list of words to detect", action="store_true")
parser.add_option("-c", "--display-counts", dest="display_counts", default=False, help="show the number of files processed", action="store_true")
# "--display_progress" is the original spelling and is kept so existing
# invocations keep working; the hyphenated form matches the other options.
parser.add_option("-p", "--display-progress", "--display_progress", dest="display_progress", default=False, help="show percentage complete", action="store_true")
parser.add_option("-l", "--max-wholeword-length", dest="maxwholewordlength", type="int", default=-1, help="maximum length of a word that is matched only as a whole word")

(options, args) = parser.parse_args()

# Load the words to search for: one word per line, lower-cased and
# de-duplicated. Starting from an empty list keeps the rest of the script
# from failing with a NameError when no -w option was given.
wordlist = []
if options.wordlistfilename:
	with open(options.wordlistfilename) as wordfile:
		rawwords = wordfile.read().lower()
	# splitlines() copes with "\r\n" line endings; blank lines are dropped
	# because an empty word would match at every position of every file.
	wordlist = list(set(line.strip() for line in rawwords.splitlines() if line.strip()))
else:
	sys.stderr.write("warning: no wordlist given (-w/--wordlist); nothing to search for\n")

if options.show_wordlist:
	print(wordlist)
	sys.exit()

# Queue every file found beneath each positional argument.
# NOTE(review): os.walk appears to yield nothing for a path that is not a
# directory, so single files have to be passed with -f/--file - confirm.
for root in args:
	for (path, dirs, files) in os.walk(root):
		# Taking a name out of dirs in place keeps os.walk from descending
		# into that version-control metadata directory.
		for vcsdir in ('CVS', '.git', '.bzr', '.hg', '.svn'):
			if vcsdir in dirs:
				dirs.remove(vcsdir)

		filelist.extend(path + '/' + name for name in files)

# Files named explicitly with -f/--file are scanned after the walked ones.
if options.suspiciousfilename:
	filelist.extend(options.suspiciousfilename)

# Scan every queued file: score the words against both the file name and
# the file contents and store the combined per-word counts in report.
start = datetime.datetime.now()
total = len(filelist)
for filename in filelist:
	if skipfile(filename, options.skipfileextensions):
		skipped += 1
		continue
	try:
		f = open(filename)
	except (IOError, OSError):
		# Unreadable files are reported and left out of the results.
		print("failed to open: " + filename)
		continue
	opened += 1
	if options.display_progress:
		done = opened + skipped
		elapsed = datetime.datetime.now() - start
		# Average time per handled file times the number of files still to
		# go, so the figure really is the time left rather than the
		# projected total.
		remaining = (elapsed / done) * (total - done)
		# Blank the previous progress line before writing the new one.
		sys.stdout.write('\r' + " " * len(progresstext) + '\r')
		progresstext = str((done * 1.0 / total) * 100)[:5] + '% ' + " time left:" + str(remaining).split('.')[0] + ' ' + filename + '\r'
		sys.stdout.write(progresstext)
	sys.stdout.flush()
	try:
		filecontents = f.read()
	finally:
		# Close each handle as soon as it has been read.
		f.close()
	datasize += len(filecontents)
	filenamescore = scoretext(wordlist, filename, options.maxwholewordlength)
	filecontentsscore = scoretext(wordlist, filecontents, options.maxwholewordlength)
	report[filename] = {}
	for k in filecontentsscore:
		report[filename][k] = filenamescore[k] + filecontentsscore[k]

# Wipe the last progress line so it does not bleed into the report output.
if options.display_progress and progresstext:
	sys.stdout.write('\r' + " " * len(progresstext) + '\r')
	sys.stdout.flush()

# Roll the per-file, per-word counts up into one total per word
# (wordscore) and one total per file (filescore).
for file, counts in report.items():
	for word, hits in counts.items():
		wordscore[word] = wordscore.get(word, 0) + hits
		filescore[file] = filescore.get(file, 0) + hits

# Print the requested report: "f" lists totals per file, "wf"/"fw" lists
# each file with its per-word breakdown, anything else lists totals per word.
if options.printreport:
	mode = options.printreport
	if mode == "f":
		printscore(sortscore(filescore))
	elif mode in ("wf", "fw"):
		for entry in sortscore(filescore):
			pieces = [entry[0] + '(' + str(entry[1]) + '):']
			for word, hits in report[entry[0]].items():
				if hits > 0:
					pieces.append(word + '(' + str(hits) + ');')
			# The empty last piece yields the space that precedes the
			# newline, as the original sequence of print statements did.
			pieces.append("")
			print(" ".join(pieces))
	else:
		printscore(sortscore(wordscore))

# Optional one-line summary of the run, fields separated by single spaces.
if options.display_counts:
	summary = [
		"total files:" + str(len(filelist)),
		"suspicious files:" + str(len(sortscore(filescore))),
		"skipped files:" + str(skipped),
		"searched:" + str(datasize) + 'B',
		"time:" + str(datetime.datetime.now() - start).split('.')[0],
	]
	print(" ".join(summary))
 
def test():
	"""Print what wholeword() returns for "ear" in a set of sample strings.

	Manual smoke check only: the results are printed, not asserted.
	"""
	samples = (
		"bearth",
		"BearTH",
		"bEARth",
		"ear_",
		"ear()",
		"ear.",
		"ear:",
		"ear\n\r",
		"myEAR() MYear: myEAR()",
	)
	for sample in samples:
		print(wholeword("ear", sample))

#test()