diff options
| author | Marc Jones <mjones@softwarefreedom.org> | 2014-02-05 11:07:37 -0500 | 
|---|---|---|
| committer | Marc Jones <mjones@softwarefreedom.org> | 2014-02-05 11:07:37 -0500 | 
| commit | 81d7583b05c546a894cdf33e0d8426af5edca1ae (patch) | |
| tree | 16d7eec83d7ba55381d056bb0426fc2fe34adc9f /main.py~ | |
| parent | 4a33223eba7cf5cf66ccdf98abb1f0c81808cfc3 (diff) | |
Only match words made only of digits that are not adjacent to other digits if word boundaries are turned on
Diffstat (limited to 'main.py~')
| -rwxr-xr-x | main.py~ | 119 | 
1 files changed, 83 insertions, 36 deletions
| @@ -1,4 +1,6 @@  #!/usr/bin/python +#example command +  from optparse import OptionParser  import os  import re @@ -27,6 +29,37 @@ def printscore(report):  	for i in report:  		print i[0] + ':' + str(i[1]) +def scorewords(report): +	for file in report.keys(): +		for word in report[file].keys(): +			if not word in wordscore: +				wordscore[word] = 0 +			if not file in filescore: +				filescore[file] = 0 +			wordscore[word] += report[file][word] +	return wordscore + +def scorefile(report): +	for file in report.keys(): +		for word in report[file].keys(): +			if not word in wordscore: +				wordscore[word] = 0 +			if not file in filescore: +				filescore[file] = 0 +			filescore[file] += report[file][word] +	return filescore + +def summary(report): +	filescore = scorefile(report) +	text = "" +	for file in sortscore(filescore): +		text += file[0] + '(' + str(file[1]) + '):' +		for word in report[file[0]].keys(): +			if report[file[0]][word] > 0: +				text += word + '(' + str(report[file[0]][word]) + ');'  +		text += '\n' +	return text +  def wholeword(word, string):  	re.purge()  	matches = [] @@ -58,12 +91,14 @@ def scoretext(wordlist, text, maxwholewordlen = -1):  			score[word] = len(wholeword(word,text))  	return score -parser = OptionParser() +usage = "%prog [options] DIRECTORY ... 
DIRECTORYN" +epilog = "example: ./main.py ../git.lf/janitor -s .ppt -s .docx -s .pdf -s .xls -s .xlsx -s .gif -s .png -s .jpg -s .css -r fw -w cryptology.txt -c -p -l 3" +parser = OptionParser(usage = usage, epilog = epilog)  parser.add_option("-f", "--file", dest="suspiciousfilename", help="specify file to scan", action="append")  parser.add_option("-w", "--wordlist", dest="wordlistfilename", help="file containing all of the words to look for")  parser.add_option("-s", "--skip", dest="skipfileextensions", help="file extensions to skip", action="append")  parser.add_option("-v", "--verbose", dest="verbose", help="print verberose information", default=False, action="store_true") -parser.add_option("-r", "--report", dest="printreport", default="w", help="print score") +parser.add_option("-r", "--report", dest="printreport", default="wf", help="print score")  parser.add_option("--show-wordlist", dest="show_wordlist", default=False, help="print list of words to detect", action="store_true")  parser.add_option("-c", "--display-counts", dest="display_counts", default=False, help="Show the num ber of files processed", action="store_true")  parser.add_option("-p", "--display_progress", dest="display_progress", default=False, help="show percentage complete", action="store_true") @@ -78,6 +113,41 @@ if options.wordlistfilename:  if options.show_wordlist: print wordlist; exit() +if options.displaysummary and options.summaryfile: +	report = dict() +	try: +		summaryfile = open(options.summaryfile) +	except: +		print "no summary file: " + options.summaryfile +		exit() +	#sample input +	#../bzr.lf/lsb/devel/build_env/headers/x86-64/4.1/glib-2.0/gio/gmenuexporter.h.defs(1): export(1); +	for line in summaryfile: +		#find the file name which is before the matching parathsis before the last colon on the line +		filename = line[:line[:line.rfind(':')].rfind('(')] +		#find the total number of words found by locating the end of the filename and taking the number in parathesis right 
before the : +		totalfilecount = line[line[:line.rfind(':')].rfind('(')+1:line[:line.rfind(':')].rfind(')')] +		#find the list of words following the :, and split them by the ;, and then drop the last item on the list which is always a \n +		foundwords = line[line.rfind(':')+1:].split(';')[:-1] +		report[filename] = dict()		 +		for w in foundwords: +			w = w.strip() +			word = w[:w.find('(')] +			wcount = w[w.find('(')+1:w.find(')')]		 +			report[filename][word] = int(wcount) + +	if options.printreport: +		if options.printreport == "f": +			printscore(sortscore(scorefile(report))) +		elif options.printreport == "w": +			printscore(sortscore(scorewords(report))) +		elif options.printreport == "wf" or options.printreport == "fw": +			print summary(report)			 +	else: +		print summary(report) +	exit() + +  for a in args:  	#filelist.append(a)  	for (path, dirs, files) in os.walk(a): @@ -98,9 +168,6 @@ for a in args:  if options.suspiciousfilename:  	filelist += options.suspiciousfilename -if options.summaryfile: -	summaryfile = open(summaryfile, 'w') -  start = datetime.datetime.now()  for file in filelist:  	if skipfile(file, options.skipfileextensions): @@ -128,36 +195,6 @@ for file in filelist:  		report[file][k] = filenamescore[k] + filecontentsscore[k] -def scorewords(report): -	for file in report.keys(): -		for word in report[file].keys(): -			if not word in wordscore: -				wordscore[word] = 0 -			if not file in filescore: -				filescore[file] = 0 -			wordscore[word] += report[file][word] -	return wordscore - -def scorefile(report): -	for file in report.keys(): -		for word in report[file].keys(): -			if not word in wordscore: -				wordscore[word] = 0 -			if not file in filescore: -				filescore[file] = 0 -			filescore[file] += report[file][word] -	return filescore - -def summary(report): -	filescore = scorefile(report) -	text = "" -	for file in sortscore(filescore): -		text += file[0] + '(' + str(file[1]) + '):' -		for word in report[file[0]].keys(): -			if 
report[file[0]][word] > 0: -				text += word + '(' + str(report[file[0]][word]) + ');'  -		text += '\n' -	return text  if options.printreport:  	if options.printreport == "f": @@ -173,7 +210,17 @@ if options.display_counts:  	print "skipped files:" + str(skipped) ,  	print "searched:" + str(datasize) + 'B',   	print "time:" + str(datetime.datetime.now() - start).split('.')[0] -  + +if options.summaryfile and len(filelist) > 0 and not options.displaysummary: +	summaryfilename = options.summaryfile	 +	counter = None +	while os.path.isfile(summaryfilename): +		counter +=1 +		summaryfilename = options.summaryfile + '.' + str(counter) +	summaryfile = open(summaryfile, 'w+') +	summaryfile.write(summary(report)) +	summaryfile.close() +  def test():  	print wholeword("ear","bearth")  	print wholeword("ear","BearTH") | 
