summaryrefslogtreecommitdiff
path: root/main.py~
diff options
context:
space:
mode:
authorMarc Jones <mjones@softwarefreedom.org>2014-02-05 11:07:37 -0500
committerMarc Jones <mjones@softwarefreedom.org>2014-02-05 11:07:37 -0500
commit81d7583b05c546a894cdf33e0d8426af5edca1ae (patch)
tree16d7eec83d7ba55381d056bb0426fc2fe34adc9f /main.py~
parent4a33223eba7cf5cf66ccdf98abb1f0c81808cfc3 (diff)
For words made only of digits, only match occurrences that are not adjacent to other digits when word boundaries are turned on
Diffstat (limited to 'main.py~')
-rwxr-xr-xmain.py~119
1 files changed, 83 insertions, 36 deletions
diff --git a/main.py~ b/main.py~
index 8ad0891..fe318c1 100755
--- a/main.py~
+++ b/main.py~
@@ -1,4 +1,6 @@
#!/usr/bin/python
+#example command
+
from optparse import OptionParser
import os
import re
@@ -27,6 +29,37 @@ def printscore(report):
for i in report:
print i[0] + ':' + str(i[1])
+def scorewords(report):
+ for file in report.keys():
+ for word in report[file].keys():
+ if not word in wordscore:
+ wordscore[word] = 0
+ if not file in filescore:
+ filescore[file] = 0
+ wordscore[word] += report[file][word]
+ return wordscore
+
+def scorefile(report):
+ for file in report.keys():
+ for word in report[file].keys():
+ if not word in wordscore:
+ wordscore[word] = 0
+ if not file in filescore:
+ filescore[file] = 0
+ filescore[file] += report[file][word]
+ return filescore
+
+def summary(report):
+ filescore = scorefile(report)
+ text = ""
+ for file in sortscore(filescore):
+ text += file[0] + '(' + str(file[1]) + '):'
+ for word in report[file[0]].keys():
+ if report[file[0]][word] > 0:
+ text += word + '(' + str(report[file[0]][word]) + ');'
+ text += '\n'
+ return text
+
def wholeword(word, string):
re.purge()
matches = []
@@ -58,12 +91,14 @@ def scoretext(wordlist, text, maxwholewordlen = -1):
score[word] = len(wholeword(word,text))
return score
-parser = OptionParser()
+usage = "%prog [options] DIRECTORY ... DIRECTORYN"
+epilog = "example: ./main.py ../git.lf/janitor -s .ppt -s .docx -s .pdf -s .xls -s .xlsx -s .gif -s .png -s .jpg -s .css -r fw -w cryptology.txt -c -p -l 3"
+parser = OptionParser(usage = usage, epilog = epilog)
parser.add_option("-f", "--file", dest="suspiciousfilename", help="specify file to scan", action="append")
parser.add_option("-w", "--wordlist", dest="wordlistfilename", help="file containing all of the words to look for")
parser.add_option("-s", "--skip", dest="skipfileextensions", help="file extensions to skip", action="append")
parser.add_option("-v", "--verbose", dest="verbose", help="print verberose information", default=False, action="store_true")
-parser.add_option("-r", "--report", dest="printreport", default="w", help="print score")
+parser.add_option("-r", "--report", dest="printreport", default="wf", help="print score")
parser.add_option("--show-wordlist", dest="show_wordlist", default=False, help="print list of words to detect", action="store_true")
parser.add_option("-c", "--display-counts", dest="display_counts", default=False, help="Show the num ber of files processed", action="store_true")
parser.add_option("-p", "--display_progress", dest="display_progress", default=False, help="show percentage complete", action="store_true")
@@ -78,6 +113,41 @@ if options.wordlistfilename:
if options.show_wordlist: print wordlist; exit()
+if options.displaysummary and options.summaryfile:
+ report = dict()
+ try:
+ summaryfile = open(options.summaryfile)
+ except:
+ print "no summary file: " + options.summaryfile
+ exit()
+ #sample input
+ #../bzr.lf/lsb/devel/build_env/headers/x86-64/4.1/glib-2.0/gio/gmenuexporter.h.defs(1): export(1);
+ for line in summaryfile:
+ #find the file name, which is everything before the opening parenthesis that precedes the last colon on the line
+ filename = line[:line[:line.rfind(':')].rfind('(')]
+ #find the total number of words found by locating the end of the filename and taking the number in parentheses right before the last colon
+ totalfilecount = line[line[:line.rfind(':')].rfind('(')+1:line[:line.rfind(':')].rfind(')')]
+ #find the list of words following the :, and split them by the ;, and then drop the last item on the list which is always a \n
+ foundwords = line[line.rfind(':')+1:].split(';')[:-1]
+ report[filename] = dict()
+ for w in foundwords:
+ w = w.strip()
+ word = w[:w.find('(')]
+ wcount = w[w.find('(')+1:w.find(')')]
+ report[filename][word] = int(wcount)
+
+ if options.printreport:
+ if options.printreport == "f":
+ printscore(sortscore(scorefile(report)))
+ elif options.printreport == "w":
+ printscore(sortscore(scorewords(report)))
+ elif options.printreport == "wf" or options.printreport == "fw":
+ print summary(report)
+ else:
+ print summary(report)
+ exit()
+
+
for a in args:
#filelist.append(a)
for (path, dirs, files) in os.walk(a):
@@ -98,9 +168,6 @@ for a in args:
if options.suspiciousfilename:
filelist += options.suspiciousfilename
-if options.summaryfile:
- summaryfile = open(summaryfile, 'w')
-
start = datetime.datetime.now()
for file in filelist:
if skipfile(file, options.skipfileextensions):
@@ -128,36 +195,6 @@ for file in filelist:
report[file][k] = filenamescore[k] + filecontentsscore[k]
-def scorewords(report):
- for file in report.keys():
- for word in report[file].keys():
- if not word in wordscore:
- wordscore[word] = 0
- if not file in filescore:
- filescore[file] = 0
- wordscore[word] += report[file][word]
- return wordscore
-
-def scorefile(report):
- for file in report.keys():
- for word in report[file].keys():
- if not word in wordscore:
- wordscore[word] = 0
- if not file in filescore:
- filescore[file] = 0
- filescore[file] += report[file][word]
- return filescore
-
-def summary(report):
- filescore = scorefile(report)
- text = ""
- for file in sortscore(filescore):
- text += file[0] + '(' + str(file[1]) + '):'
- for word in report[file[0]].keys():
- if report[file[0]][word] > 0:
- text += word + '(' + str(report[file[0]][word]) + ');'
- text += '\n'
- return text
if options.printreport:
if options.printreport == "f":
@@ -173,7 +210,17 @@ if options.display_counts:
print "skipped files:" + str(skipped) ,
print "searched:" + str(datasize) + 'B',
print "time:" + str(datetime.datetime.now() - start).split('.')[0]
-
+
+if options.summaryfile and len(filelist) > 0 and not options.displaysummary:
+ summaryfilename = options.summaryfile
+ counter = None
+ while os.path.isfile(summaryfilename):
+ counter +=1
+ summaryfilename = options.summaryfile + '.' + str(counter)
+ summaryfile = open(summaryfile, 'w+')
+ summaryfile.write(summary(report))
+ summaryfile.close()
+
def test():
print wholeword("ear","bearth")
print wholeword("ear","BearTH")