summary refs log tree commit diff
diff options
context:
space:
mode:
author Marc Jones <mjones@softwarefreedom.org> 2014-02-05 11:07:37 -0500
committer Marc Jones <mjones@softwarefreedom.org> 2014-02-05 11:07:37 -0500
commit 81d7583b05c546a894cdf33e0d8426af5edca1ae (patch)
tree 16d7eec83d7ba55381d056bb0426fc2fe34adc9f
parent 4a33223eba7cf5cf66ccdf98abb1f0c81808cfc3 (diff)
only match words made only of digits that are not adjacent to other digits if word boundaries are turned on
-rw-r--r-- git_and_bzr_results_2-5-14.txt 3
-rwxr-xr-x main.py 52
-rwxr-xr-x main.py~ 119
3 files changed, 119 insertions, 55 deletions
diff --git a/git_and_bzr_results_2-5-14.txt b/git_and_bzr_results_2-5-14.txt
index f2aadf4..b3cb11f 100644
--- a/git_and_bzr_results_2-5-14.txt
+++ b/git_and_bzr_results_2-5-14.txt
@@ -1,4 +1,3 @@
-mjones@hand:~/Documents/LF (export controls)/suspicious$ ./main.py ../bzr.lf/ ../git.lf -s .ppt -s .docx -s .pdf -s .xls -s .xlsx -s .gif -s .png -s .jpg -s .css -r fw -w cryptology.txt -c -p -l 4
../bzr.lf/openprinting/foomatic/foomatic-db/db/source/printer/Brother-HL-2030.xml(1): url(1);
../bzr.lf/lsb/devel/build_env/headers/All/5.0/libxml2/libxml/xmlerror.h.defs(1): url(1);
../bzr.lf/openprinting/foomatic/foomatic-db-devel/db/source/printer/Panasonic-KX-P1180i.xml(1): url(1);
@@ -17956,5 +17955,3 @@ mjones@hand:~/Documents/LF (export controls)/suspicious$ ./main.py ../bzr.lf/ ..
../bzr.lf/openprinting/foomatic/foomatic-db/db/source/PPD/Lexmark/Lexmark_C522.ppd(229): 22(4); password(11); des(1); magenta(213);
../git.lf/ltsi-kernel/patches.lttng/lttng-2.3.4.patch(324): random(59); entropy(65); 22(2); parity(4); export(100); ice(1); hash(1); bob(1); ca(1); encryption(2); rc6(1); enc(1); secure(8); crypt(2); padding(75); url(1);
../bzr.lf/ally/devel/a11yspecs-atspi/adoc/AccessibilityAPIComparisons.htm(346): 22(2); password(2); signature(3); padding(338); url(1);
-total files:66547 suspicious files:17957 skipped files:535 searched:594180174B time:7:03:22
-
diff --git a/main.py b/main.py
index 22da098..7c4102b 100755
--- a/main.py
+++ b/main.py
@@ -1,4 +1,6 @@
#!/usr/bin/python
+#example command
+
from optparse import OptionParser
import os
import re
@@ -61,15 +63,23 @@ def summary(report):
def wholeword(word, string):
re.purge()
matches = []
- regexU = r'([A-Z]|[^a-zA-Z]|\b)(' + re.escape(word.lower()) + r')([A-Z]|[^a-zA-Z]|\b)'
- regexL = r'([a-z]|[^a-zA-Z]|\b)(' + re.escape(word.upper()) + r')([a-z]|[^a-zA-Z]|\b)'
- mU = re.search(regexU, string)
- if "groups" in dir(mU):
- matches.append(mU.groups())
- re.purge()
- mL = re.search(regexL, string)
- if "groups" in dir(mL):
- matches.append(mL.groups())
+
+ try int(word):
+ regexNum = r'([^0-9]|\b)(' + word + r')([^0-9]|\b)'
+ m = re.search(regexNum, string)
+ if "groups" in dir(mU):
+ matches.append(mU.groups())
+
+ except ValueError:
+ regexU = r'([A-Z]|[^a-zA-Z]|\b)(' + re.escape(word.lower()) + r')([A-Z]|[^a-zA-Z]|\b)'
+ regexL = r'([a-z]|[^a-zA-Z]|\b)(' + re.escape(word.upper()) + r')([a-z]|[^a-zA-Z]|\b)'
+ mU = re.search(regexU, string)
+ if "groups" in dir(mU):
+ matches.append(mU.groups())
+ re.purge()
+ mL = re.search(regexL, string)
+ if "groups" in dir(mL):
+ matches.append(mL.groups())
return matches
def skipfile(filename,skippedexts):
@@ -89,12 +99,14 @@ def scoretext(wordlist, text, maxwholewordlen = -1):
score[word] = len(wholeword(word,text))
return score
-parser = OptionParser()
+usage = "%prog [options] DIRECTORY ... DIRECTORYN"
+epilog = "example: ./main.py ../git.lf/janitor -s .ppt -s .docx -s .pdf -s .xls -s .xlsx -s .gif -s .png -s .jpg -s .css -r fw -w cryptology.txt -c -p -l 3"
+parser = OptionParser(usage = usage, epilog = epilog)
parser.add_option("-f", "--file", dest="suspiciousfilename", help="specify file to scan", action="append")
parser.add_option("-w", "--wordlist", dest="wordlistfilename", help="file containing all of the words to look for")
parser.add_option("-s", "--skip", dest="skipfileextensions", help="file extensions to skip", action="append")
parser.add_option("-v", "--verbose", dest="verbose", help="print verberose information", default=False, action="store_true")
-parser.add_option("-r", "--report", dest="printreport", default="w", help="print score")
+parser.add_option("-r", "--report", dest="printreport", default="wf", help="print score")
parser.add_option("--show-wordlist", dest="show_wordlist", default=False, help="print list of words to detect", action="store_true")
parser.add_option("-c", "--display-counts", dest="display_counts", default=False, help="Show the num ber of files processed", action="store_true")
parser.add_option("-p", "--display_progress", dest="display_progress", default=False, help="show percentage complete", action="store_true")
@@ -109,7 +121,7 @@ if options.wordlistfilename:
if options.show_wordlist: print wordlist; exit()
-if options.displaysummary and options.summaryfile
+if options.displaysummary and options.summaryfile:
report = dict()
try:
summaryfile = open(options.summaryfile)
@@ -130,9 +142,17 @@ if options.displaysummary and options.summaryfile
w = w.strip()
word = w[:w.find('(')]
wcount = w[w.find('(')+1:w.find(')')]
- report[filename][word] = wcount
-
- print summary(report)
+ report[filename][word] = int(wcount)
+
+ if options.printreport:
+ if options.printreport == "f":
+ printscore(sortscore(scorefile(report)))
+ elif options.printreport == "w":
+ printscore(sortscore(scorewords(report)))
+ elif options.printreport == "wf" or options.printreport == "fw":
+ print summary(report)
+ else:
+ print summary(report)
exit()
@@ -202,7 +222,7 @@ if options.display_counts:
if options.summaryfile and len(filelist) > 0 and not options.displaysummary:
summaryfilename = options.summaryfile
counter = None
- while os.path.isfile(summaryfilename)
+ while os.path.isfile(summaryfilename):
counter +=1
summaryfilename = options.summaryfile + '.' + str(counter)
summaryfile = open(summaryfile, 'w+')
diff --git a/main.py~ b/main.py~
index 8ad0891..fe318c1 100755
--- a/main.py~
+++ b/main.py~
@@ -1,4 +1,6 @@
#!/usr/bin/python
+#example command
+
from optparse import OptionParser
import os
import re
@@ -27,6 +29,37 @@ def printscore(report):
for i in report:
print i[0] + ':' + str(i[1])
+def scorewords(report):
+ for file in report.keys():
+ for word in report[file].keys():
+ if not word in wordscore:
+ wordscore[word] = 0
+ if not file in filescore:
+ filescore[file] = 0
+ wordscore[word] += report[file][word]
+ return wordscore
+
+def scorefile(report):
+ for file in report.keys():
+ for word in report[file].keys():
+ if not word in wordscore:
+ wordscore[word] = 0
+ if not file in filescore:
+ filescore[file] = 0
+ filescore[file] += report[file][word]
+ return filescore
+
+def summary(report):
+ filescore = scorefile(report)
+ text = ""
+ for file in sortscore(filescore):
+ text += file[0] + '(' + str(file[1]) + '):'
+ for word in report[file[0]].keys():
+ if report[file[0]][word] > 0:
+ text += word + '(' + str(report[file[0]][word]) + ');'
+ text += '\n'
+ return text
+
def wholeword(word, string):
re.purge()
matches = []
@@ -58,12 +91,14 @@ def scoretext(wordlist, text, maxwholewordlen = -1):
score[word] = len(wholeword(word,text))
return score
-parser = OptionParser()
+usage = "%prog [options] DIRECTORY ... DIRECTORYN"
+epilog = "example: ./main.py ../git.lf/janitor -s .ppt -s .docx -s .pdf -s .xls -s .xlsx -s .gif -s .png -s .jpg -s .css -r fw -w cryptology.txt -c -p -l 3"
+parser = OptionParser(usage = usage, epilog = epilog)
parser.add_option("-f", "--file", dest="suspiciousfilename", help="specify file to scan", action="append")
parser.add_option("-w", "--wordlist", dest="wordlistfilename", help="file containing all of the words to look for")
parser.add_option("-s", "--skip", dest="skipfileextensions", help="file extensions to skip", action="append")
parser.add_option("-v", "--verbose", dest="verbose", help="print verberose information", default=False, action="store_true")
-parser.add_option("-r", "--report", dest="printreport", default="w", help="print score")
+parser.add_option("-r", "--report", dest="printreport", default="wf", help="print score")
parser.add_option("--show-wordlist", dest="show_wordlist", default=False, help="print list of words to detect", action="store_true")
parser.add_option("-c", "--display-counts", dest="display_counts", default=False, help="Show the num ber of files processed", action="store_true")
parser.add_option("-p", "--display_progress", dest="display_progress", default=False, help="show percentage complete", action="store_true")
@@ -78,6 +113,41 @@ if options.wordlistfilename:
if options.show_wordlist: print wordlist; exit()
+if options.displaysummary and options.summaryfile:
+ report = dict()
+ try:
+ summaryfile = open(options.summaryfile)
+ except:
+ print "no summary file: " + options.summaryfile
+ exit()
+ #sample input
+ #../bzr.lf/lsb/devel/build_env/headers/x86-64/4.1/glib-2.0/gio/gmenuexporter.h.defs(1): export(1);
+ for line in summaryfile:
+ #find the file name which is before the matching parathsis before the last colon on the line
+ filename = line[:line[:line.rfind(':')].rfind('(')]
+ #find the total number of words found by locating the end of the filename and taking the number in parathesis right before the :
+ totalfilecount = line[line[:line.rfind(':')].rfind('(')+1:line[:line.rfind(':')].rfind(')')]
+ #find the list of words following the :, and split them by the ;, and then drop the last item on the list which is always a \n
+ foundwords = line[line.rfind(':')+1:].split(';')[:-1]
+ report[filename] = dict()
+ for w in foundwords:
+ w = w.strip()
+ word = w[:w.find('(')]
+ wcount = w[w.find('(')+1:w.find(')')]
+ report[filename][word] = int(wcount)
+
+ if options.printreport:
+ if options.printreport == "f":
+ printscore(sortscore(scorefile(report)))
+ elif options.printreport == "w":
+ printscore(sortscore(scorewords(report)))
+ elif options.printreport == "wf" or options.printreport == "fw":
+ print summary(report)
+ else:
+ print summary(report)
+ exit()
+
+
for a in args:
#filelist.append(a)
for (path, dirs, files) in os.walk(a):
@@ -98,9 +168,6 @@ for a in args:
if options.suspiciousfilename:
filelist += options.suspiciousfilename
-if options.summaryfile:
- summaryfile = open(summaryfile, 'w')
-
start = datetime.datetime.now()
for file in filelist:
if skipfile(file, options.skipfileextensions):
@@ -128,36 +195,6 @@ for file in filelist:
report[file][k] = filenamescore[k] + filecontentsscore[k]
-def scorewords(report):
- for file in report.keys():
- for word in report[file].keys():
- if not word in wordscore:
- wordscore[word] = 0
- if not file in filescore:
- filescore[file] = 0
- wordscore[word] += report[file][word]
- return wordscore
-
-def scorefile(report):
- for file in report.keys():
- for word in report[file].keys():
- if not word in wordscore:
- wordscore[word] = 0
- if not file in filescore:
- filescore[file] = 0
- filescore[file] += report[file][word]
- return filescore
-
-def summary(report):
- filescore = scorefile(report)
- text = ""
- for file in sortscore(filescore):
- text += file[0] + '(' + str(file[1]) + '):'
- for word in report[file[0]].keys():
- if report[file[0]][word] > 0:
- text += word + '(' + str(report[file[0]][word]) + ');'
- text += '\n'
- return text
if options.printreport:
if options.printreport == "f":
@@ -173,7 +210,17 @@ if options.display_counts:
print "skipped files:" + str(skipped) ,
print "searched:" + str(datasize) + 'B',
print "time:" + str(datetime.datetime.now() - start).split('.')[0]
-
+
+if options.summaryfile and len(filelist) > 0 and not options.displaysummary:
+ summaryfilename = options.summaryfile
+ counter = None
+ while os.path.isfile(summaryfilename):
+ counter +=1
+ summaryfilename = options.summaryfile + '.' + str(counter)
+ summaryfile = open(summaryfile, 'w+')
+ summaryfile.write(summary(report))
+ summaryfile.close()
+
def test():
print wholeword("ear","bearth")
print wholeword("ear","BearTH")