summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Daniel Gnoutcheff <gnoutchd@softwarefreedom.org> 2016-06-29 16:24:26 -0400
committer Daniel Gnoutcheff <gnoutchd@softwarefreedom.org> 2016-06-29 16:26:46 -0400
commit d013f8534767cf906d55076797afd50df14733af (patch)
tree a7f424b53311f6e1381a34745d9ca04da8eab7b2
parent 66f706f3669af849397b408f2522db5d7d5f2ff3 (diff)
python 3.x port, improve avfs support
Explicitly use ~/.avfs/, and call `mountavfs` to ensure it's available.
-rwxr-xr-x suspicious 232
1 files changed, 125 insertions, 107 deletions
diff --git a/suspicious b/suspicious
index d4ea346..4d62ad9 100755
--- a/suspicious
+++ b/suspicious
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3
# Copyright 2014, 2016 Software Freedom Law Center (www.softwarefreedom.org)
#
@@ -25,11 +25,109 @@
#need to discount found words if they are substrings of common strings in text
from optparse import OptionParser
-import os, os.path
+import os
+import os.path
import re
import sys
import datetime
import string
+import subprocess
+
+# AVFS stuff ------------------------------------------------------------------
+
+avfs_sys_mount = os.environ['HOME'] + '/.avfs'
+avfs_started = False
+def avfs_sys_name(fpath):
+    """
+    Return the path under the AVFS mount that corresponds to fpath.
+
+    fpath: path string; it is made absolute and appended to
+    avfs_sys_mount.
+
+    On the first call, `mountavfs` is run to make sure the mount is
+    available. Raises subprocess.CalledProcessError if it exits non-zero.
+    """
+    global avfs_started
+    if not avfs_started:
+        subprocess.check_call(['mountavfs'])
+        # Remember the successful mount; without this the command would
+        # be spawned again on every single path lookup.
+        avfs_started = True
+    return avfs_sys_mount + os.path.abspath(fpath)
+
+def avfs_exists(fpath):
+    """Return True if fpath exists when looked up through the AVFS mount."""
+    sys_path = avfs_sys_name(fpath)
+    return os.path.exists(sys_path)
+
+def avfs_isdir(fpath):
+    """Return True if fpath is a directory when looked up through the AVFS mount."""
+    sys_path = avfs_sys_name(fpath)
+    return os.path.isdir(sys_path)
+
+def avfs_open(fpath, *pargs, **kwargs):
+    """
+    Open fpath through the AVFS mount.
+
+    Any further positional and keyword arguments are handed to the
+    built-in open() unchanged.
+    """
+    sys_path = avfs_sys_name(fpath)
+    return open(sys_path, *pargs, **kwargs)
+
+# AVFS has its own automatic view selection using file extensions, but it
+# includes plugins (like #patch) that will lead us into an infinite loop
+# if we try to do a directory traversal. Also, there are a few
+# extensions we want to add.
+#
+# Each entry pairs a filename extension with the AVFS view suffix to append
+# for it. avfs_guesscmd() matches entries with str.endswith(), strips the
+# matched extension and tries again on what is left of the name.
+# NOTE(review): this is a set literal, so its iteration order is unspecified.
+
+avfscmds = {
+ ('.gz', '#ugz'),
+ ('.tgz', '#ugz#utar'),
+ ('.tar.bz2', '#ubz2#utar'),
+ ('.bz2', '#ubz2'),
+ ('.bz', '#ubz2'),
+ ('.tbz2', '#ubz2#utar'),
+ ('.tbz', '#ubz2#utar'),
+ ('.Z', '#uz'),
+ ('.tpz', '#uz#utar'),
+ ('.tz', '#uz#utar'),
+ ('.taz', '#uz#utar'),
+ ('.a', '#uar'),
+ ('.deb', '#uar'),
+ ('.tar', '#utar'),
+ ('.gem', '#utar'), # Add upstream
+ ('.rar', '#urar'),
+ ('.sfx', '#urar'),
+ ('.zip', '#uzip'),
+ ('.jar', '#uzip'),
+ ('.ear', '#uzip'),
+ ('.war', '#uzip'),
+ ('.nupkg', '#uzip'), # Add upstream
+ ('.whl', '#uzip'), # Add upstream
+ ('.7z', '#u7z'),
+ ('.zoo', '#uzoo'),
+ ('.lha', '#ulha'),
+ ('.lhz', '#ulha'),
+ ('.arj', '#uarj'),
+ ('.cpio', '#ucpio'),
+ ('.rpm', '#rpm'),
+ ('.tar.xz', '#uxze#utar'),
+ ('.txz', '#uxze#utar'),
+ ('.xz', '#uxze'),
+ ('.lzma', '#uxze'),
+}
+
+def avfs_guesscmd(filename):
+    """
+    Build the AVFS view suffix for filename from its extension(s).
+
+    A trailing extension listed in avfscmds is stripped and its view
+    command collected, then the shortened name is examined again, so the
+    command for the outermost extension comes first. Returns '' when no
+    listed extension matches.
+    """
+    cmds = []
+    remaining = filename
+    while True:
+        hit = next(((ext, cmd) for ext, cmd in avfscmds
+                    if remaining.endswith(ext)), None)
+        if hit is None:
+            return ''.join(cmds)
+        ext, cmd = hit
+        cmds.append(cmd)
+        remaining = remaining[:-len(ext)]
+
+def avfs_find(rootdir, prunedirs):
+    """
+    Recursively list all files under rootdir, including files in archives
+    supported by AVFS.
+
+    rootdir: directory to scan, as a path string without the AVFS mount
+    prefix (it is resolved through avfs_sys_name()).
+    prunedirs: collection of directory names that are not descended into.
+
+    Yields path strings in the same un-prefixed form, with an AVFS view
+    suffix (as produced by avfs_guesscmd()) appended where that view
+    exists. Symbolic links to directories are skipped.
+    """
+
+    sys_rootdir = avfs_sys_name(rootdir)
+
+    for name in os.listdir(sys_rootdir):
+        path = rootdir + '/' + name
+        sys_path = sys_rootdir + '/' + name
+
+        if os.path.isdir(sys_path):
+            # os.path.isdir() follows symlinks. Do not descend through
+            # them: a link pointing at an ancestor would otherwise recurse
+            # until the OS rejects the path. (os.walk, which this
+            # replaces, did not follow directory links either.)
+            if os.path.islink(sys_path):
+                continue
+            if name not in prunedirs:
+                yield from avfs_find(path, prunedirs)
+        else:
+            cmd = avfs_guesscmd(name)
+            filtered_path = path + cmd
+            sys_filtered_path = sys_path + cmd
+
+            # Only use the archive view if AVFS can actually provide it;
+            # otherwise fall back to the plain file.
+            if cmd and os.path.exists(sys_filtered_path):
+                if os.path.isdir(sys_filtered_path):
+                    yield from avfs_find(filtered_path, prunedirs)
+                else:
+                    yield filtered_path
+            else:
+                yield path
+
+# -----------------------------------------------------------------------------
report = {}
wordscore = {}
@@ -52,7 +150,7 @@ def sortscore(score, reverse=True):
def printscore(report):
for i in report:
- print i[0] + ':' + str(i[1])
+ print(i[0] + ':' + str(i[1]))
def scorewords(report):
for file in report.keys():
@@ -66,7 +164,7 @@ def scorewords(report):
def weightreport(report, commonwords):
notsuspiciousfiles = []
- weightedout = 0
+ weightedout = 0
for file in report:
suspicious = False
filescore = 0
@@ -81,7 +179,7 @@ def weightreport(report, commonwords):
for file in notsuspiciousfiles:
report.pop(file)
- weightedout +=1
+ weightedout +=1
return report, weightedout
@@ -153,87 +251,6 @@ def scoretext(wordlist, text, maxwholewordlen = -1):
score[word] = len(wholeword(wordreg,text))
return score
-
-# AVFS stuff ------------------------------------------------------------------
-
-# AVFS has its own automatic view selection using file extensions, but it
-# includes plugins (like #patch) that will lead us into an infinite loop
-# if we try to do a directory traversal. Also, there are a few
-# extensions we want to add.
-
-avfscmds = {
- ('.gz', '#ugz'),
- ('.tgz', '#ugz#utar'),
- ('.tar.bz2', '#ubz2#utar'),
- ('.bz2', '#ubz2'),
- ('.bz', '#ubz2'),
- ('.tbz2', '#ubz2#utar'),
- ('.tbz', '#ubz2#utar'),
- ('.Z', '#uz'),
- ('.tpz', '#uz#utar'),
- ('.tz', '#uz#utar'),
- ('.taz', '#uz#utar'),
- ('.a', '#uar'),
- ('.deb', '#uar'),
- ('.tar', '#utar'),
- ('.gem', '#utar'), # Add upstream
- ('.rar', '#urar'),
- ('.sfx', '#urar'),
- ('.zip', '#uzip'),
- ('.jar', '#uzip'),
- ('.ear', '#uzip'),
- ('.war', '#uzip'),
- ('.nupkg', '#uzip'), # Add upstream
- ('.whl', '#uzip'), # Add upstream
- ('.7z', '#u7z'),
- ('.zoo', '#uzoo'),
- ('.lha', '#ulha'),
- ('.lhz', '#ulha'),
- ('.arj', '#uarj'),
- ('.cpio', '#ucpio'),
- ('.rpm', '#rpm'),
- ('.tar.xz', '#uxze#utar'),
- ('.txz', '#uxze#utar'),
- ('.xz', '#uxze'),
- ('.lzma', '#uxze'),
-}
-
-def avfs_guesscmd(filename):
- for ext, cmd in avfscmds:
- if filename.endswith(ext):
- return cmd + avfs_guesscmd(filename[:-len(ext)])
- return ''
-
-def mkfilelist(rootdir):
- """
- Produce a list of files to examine. Use AVFS paths if available.
-
- rootdir: path to directory to examine, as a string. Preferably
- somewhere inside an AVFS mount.
- """
- prunedirs = {'CVS', '.git', '.bzr', '.hg', '.svn'}
-
- for base, dirs, files in os.walk(rootdir):
- for dname in dirs:
- if dname in prunedirs:
- dirs.remove(dname)
-
- for fname in files:
- fpath = base + '/' + fname
-
- view_fname = fname + avfs_guesscmd(fname)
- view_fpath = base + '/' + view_fname
-
- if fname != view_fname and os.path.exists(view_fpath):
- if os.path.isdir(view_fpath):
- dirs.append(view_fname)
- else:
- yield view_fpath
- else:
- yield fpath
-
-# -----------------------------------------------------------------------------
-
usage = "%prog [options] DIRECTORY ... DIRECTORYN"
epilog = "example: ./suspicious ../gitcheckout -s .tar -s .gz -s .bmp -s .zip -s .ppt -s .docx -s .pdf -s .xls -s .xlsx -s .gif -s .png -s .jpg -s .css -r fw -w cryptology.txt -c -p -l 3"
parser = OptionParser(usage = usage, epilog = epilog)
@@ -322,14 +339,14 @@ if options.optimizewordlist and options.wordlistfilename:
wordlist.remove(check_word)
break
-if options.show_wordlist: print wordlist; exit()
+if options.show_wordlist: print(wordlist); exit()
if options.displaysummary and options.summaryfile:
report = dict()
try:
summaryfile = open(options.summaryfile)
except:
- print "no summary file: " + options.summaryfile
+ print("no summary file: " + options.summaryfile)
exit()
#sample input
#../bzr.lf/lsb/devel/build_env/headers/x86-64/4.1/glib-2.0/gio/gmenuexporter.h.defs(1): export(1);
@@ -356,26 +373,27 @@ if options.displaysummary and options.summaryfile:
elif options.printreport == "w":
printscore(sortscore(scorewords(report)))
elif options.printreport == "wf" or options.printreport == "fw":
- print summary(report)
+ print(summary(report))
else:
- print summary(report)
+ print(summary(report))
exit()
#Run a search if not displaying a existing report
+prunedirs = {'CVS', '.git', '.bzr', '.hg', '.svn'}
if len(args) > 0:
for a in args:
- filelist.extend(mkfilelist(a))
-
+ filelist.extend(avfs_find(a, prunedirs))
+
start = datetime.datetime.now()
for file in filelist:
if skipfile(file, options.skipfileextensions):
skipped += 1
continue
try:
- f = open(file)
+ f = avfs_open(file)
except:
- print "failed to open: " + file
+ print("failed to open: " + file)
continue
opened +=1
now = datetime.datetime.now()
@@ -385,9 +403,9 @@ for file in filelist:
prog_file = file.split('/')[0] + "/.../" + file.split('/')[-1]
else:
prog_file = file
- print '\r' + " " * len(progresstext) + '\r',
+ # end='' replaces the Python 2 trailing comma: stay on the same line so '\r' can overwrite it
+ print('\r' + " " * len(progresstext) + '\r', end='')
progresstext = str(((opened + skipped)*1.0/len(filelist))*100)[:5] + '% '+ " time left:" + str(estimate).split('.')[0] + ' ' + prog_file + '\r'
- print progresstext,
+ print(progresstext, end='')
sys.stdout.flush()
filecontents = f.read()
datasize += len(filecontents)
@@ -399,7 +417,7 @@ for file in filelist:
#Clear screen of proggress text now that finished scoring file
if options.display_progress:
- print '\r' + " " * len(progresstext) + '\r',
+ # end='' replaces the Python 2 trailing comma: blank the progress line without emitting a newline
+ print('\r' + " " * len(progresstext) + '\r', end='')
#Save summary as a file, but if the filename exists do not overwrite, append a number
if options.summaryfile and len(filelist) > 0 and not options.displaysummary:
@@ -409,13 +427,13 @@ if options.summaryfile and len(filelist) > 0 and not options.displaysummary:
counter +=1
summaryfilename = options.summaryfile + '.' + str(counter)
try:
- if counter > 1: print "saving as " + summaryfilename + "...."
+ if counter > 1: print("saving as " + summaryfilename + "....")
summaryfile = open(summaryfilename, 'w+')
summaryfile.write(summary(report))
summaryfile.close()
except:
- print report
- print "error saving summary as " + summaryfilename
+ print(report)
+ print("error saving summary as " + summaryfilename)
if options.commonwordfilename and not(options.dontweightreport):
report, weightedfiles = weightreport(report, commonwords)
@@ -424,15 +442,15 @@ if options.printreport and not options.dontdisplaysummary:
if options.printreport == "f":
printscore(sortscore(scorefile(report)))
elif options.printreport == "wf" or options.printreport == "fw":
- print summary(report)
+ print(summary(report))
else:
printscore(sortscore(scorewords(report)))
if options.display_counts:
- print "total files:" + str(len(filelist)) ,
- print "suspicious files:" + str(len(sortscore(scorefile(report)))) ,
- print "skipped files:" + str(skipped),
+ # end=' ' replaces the Python 2 trailing comma so all counters stay on one line
+ print("total files:" + str(len(filelist)), end=' ')
+ print("suspicious files:" + str(len(sortscore(scorefile(report)))), end=' ')
+ print("skipped files:" + str(skipped), end=' ')
if options.commonwordfilename and not(options.dontweightreport):
- print "removed weighted files:" + str(weightedfiles),
- print "searched:" + str(datasize) + 'B',
- print "time:" + str(datetime.datetime.now() - start).split('.')[0]
+ print("removed weighted files:" + str(weightedfiles), end=' ')
+ print("searched:" + str(datasize) + 'B', end=' ')
+ print("time:" + str(datetime.datetime.now() - start).split('.')[0])