From d013f8534767cf906d55076797afd50df14733af Mon Sep 17 00:00:00 2001 From: Daniel Gnoutcheff Date: Wed, 29 Jun 2016 16:24:26 -0400 Subject: python 3.x port, improve avfs support Explicitly use ~/.avfs/, and call `mountavfs` to ensure it's available. --- suspicious | 232 +++++++++++++++++++++++++++++++++---------------------------- 1 file changed, 125 insertions(+), 107 deletions(-) diff --git a/suspicious b/suspicious index d4ea346..4d62ad9 100755 --- a/suspicious +++ b/suspicious @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 # Copyright 2014, 2016 Software Freedom Law Center (www.softwarefreedom.org) # @@ -25,11 +25,109 @@ #need to discount found words if they are substrings of common strings in text from optparse import OptionParser -import os, os.path +import os +import os.path import re import sys import datetime import string +import subprocess + +# AVFS stuff ------------------------------------------------------------------ + +avfs_sys_mount = os.environ['HOME'] + '/.avfs' +avfs_started = False +def avfs_sys_name(fpath): + if not avfs_started: + subprocess.check_call(['mountavfs']) + return avfs_sys_mount + os.path.abspath(fpath) + +def avfs_exists(fpath): + return os.path.exists(avfs_sys_name(fpath)) + +def avfs_isdir(fpath): + return os.path.isdir(avfs_sys_name(fpath)) + +def avfs_open(fpath, *pargs, **kwargs): + return open(avfs_sys_name(fpath), *pargs, **kwargs) + +# AVFS has its own automatic view selection using file extensions, but it +# includes plugins (like #patch) that will lead us into an infinite loop +# if we try to do a directory traversal. Also, there are a few +# extensions we want to add. + +avfscmds = { + ('.gz', '#ugz'), + ('.tgz', '#ugz#utar'), + ('.tar.bz2', '#ubz2#utar'), + ('.bz2', '#ubz2'), + ('.bz', '#ubz2'), + ('.tbz2', '#ubz2#utar'), + ('.tbz', '#ubz2#utar'), + ('.Z', '#uz'), + ('.tpz', '#uz#utar'), + ('.tz', '#uz#utar'), + ('.taz', '#uz#utar'), + ('.a', '#uar'), + ('.deb', '#uar'), + ('.tar', '#utar'), + ('.gem', '#utar'), # Add upstream + ('.rar', '#urar'), + ('.sfx', '#urar'), + ('.zip', '#uzip'), + ('.jar', '#uzip'), + ('.ear', '#uzip'), + ('.war', '#uzip'), + ('.nupkg', '#uzip'), # Add upstream + ('.whl', '#uzip'), # Add upstream + ('.7z', '#u7z'), + ('.zoo', '#uzoo'), + ('.lha', '#ulha'), + ('.lhz', '#ulha'), + ('.arj', '#uarj'), + ('.cpio', '#ucpio'), + ('.rpm', '#rpm'), + ('.tar.xz', '#uxze#utar'), + ('.txz', '#uxze#utar'), + ('.xz', '#uxze'), + ('.lzma', '#uxze'), +} + +def avfs_guesscmd(filename): + for ext, cmd in avfscmds: + if filename.endswith(ext): + return cmd + avfs_guesscmd(filename[:-len(ext)]) + return '' + +def avfs_find(rootdir, prunedirs): + """ + Recursively list all files under rootdir, including files in archives + supported by AVFS. + """ + + sys_rootdir = avfs_sys_name(rootdir) + + for name in os.listdir(sys_rootdir): + path = rootdir + '/' + name + sys_path = sys_rootdir + '/' + name + + if os.path.isdir(sys_path): + if name not in prunedirs: + yield from avfs_find(path, prunedirs) + else: + cmd = avfs_guesscmd(name) + filtered_path = path + cmd + sys_filtered_path = sys_path + cmd + + if cmd and os.path.exists(sys_filtered_path): + if os.path.isdir(sys_filtered_path): + yield from avfs_find(filtered_path, prunedirs) + else: + yield filtered_path + else: + yield path + +# ----------------------------------------------------------------------------- report = {} wordscore = {} @@ -52,7 +150,7 @@ def sortscore(score, reverse=True): def printscore(report): for i in report: - print i[0] + ':' + str(i[1]) + print(i[0] + ':' + str(i[1])) def scorewords(report): for file in report.keys(): @@ -66,7 +164,7 @@ def scorewords(report): def weightreport(report, commonwords): notsuspiciousfiles = [] - weightedout = 0 + weightedout = 0 for file in report: suspicious = False filescore = 0 @@ -81,7 +179,7 @@ def weightreport(report, commonwords): for file in notsuspiciousfiles: report.pop(file) - weightedout +=1 + weightedout +=1 return report, weightedout @@ -153,87 +251,6 @@ def scoretext(wordlist, text, maxwholewordlen = -1): score[word] = len(wholeword(wordreg,text)) return score - -# AVFS stuff ------------------------------------------------------------------ - -# AVFS has its own automatic view selection using file extensions, but it -# includes plugins (like #patch) that will lead us into an infinite loop -# if we try to do a directory traversal. Also, there are a few -# extensions we want to add. - -avfscmds = { - ('.gz', '#ugz'), - ('.tgz', '#ugz#utar'), - ('.tar.bz2', '#ubz2#utar'), - ('.bz2', '#ubz2'), - ('.bz', '#ubz2'), - ('.tbz2', '#ubz2#utar'), - ('.tbz', '#ubz2#utar'), - ('.Z', '#uz'), - ('.tpz', '#uz#utar'), - ('.tz', '#uz#utar'), - ('.taz', '#uz#utar'), - ('.a', '#uar'), - ('.deb', '#uar'), - ('.tar', '#utar'), - ('.gem', '#utar'), # Add upstream - ('.rar', '#urar'), - ('.sfx', '#urar'), - ('.zip', '#uzip'), - ('.jar', '#uzip'), - ('.ear', '#uzip'), - ('.war', '#uzip'), - ('.nupkg', '#uzip'), # Add upstream - ('.whl', '#uzip'), # Add upstream - ('.7z', '#u7z'), - ('.zoo', '#uzoo'), - ('.lha', '#ulha'), - ('.lhz', '#ulha'), - ('.arj', '#uarj'), - ('.cpio', '#ucpio'), - ('.rpm', '#rpm'), - ('.tar.xz', '#uxze#utar'), - ('.txz', '#uxze#utar'), - ('.xz', '#uxze'), - ('.lzma', '#uxze'), -} - -def avfs_guesscmd(filename): - for ext, cmd in avfscmds: - if filename.endswith(ext): - return cmd + avfs_guesscmd(filename[:-len(ext)]) - return '' - -def mkfilelist(rootdir): - """ - Produce a list of files to examine. Use AVFS paths if available. - - rootdir: path to directory to examine, as a string. Preferably - somewhere inside an AVFS mount. - """ - prunedirs = {'CVS', '.git', '.bzr', '.hg', '.svn'} - - for base, dirs, files in os.walk(rootdir): - for dname in dirs: - if dname in prunedirs: - dirs.remove(dname) - - for fname in files: - fpath = base + '/' + fname - - view_fname = fname + avfs_guesscmd(fname) - view_fpath = base + '/' + view_fname - - if fname != view_fname and os.path.exists(view_fpath): - if os.path.isdir(view_fpath): - dirs.append(view_fname) - else: - yield view_fpath - else: - yield fpath - -# ----------------------------------------------------------------------------- - usage = "%prog [options] DIRECTORY ... DIRECTORYN" epilog = "example: ./suspicious ../gitcheckout -s .tar -s .gz -s .bmp -s .zip -s .ppt -s .docx -s .pdf -s .xls -s .xlsx -s .gif -s .png -s .jpg -s .css -r fw -w cryptology.txt -c -p -l 3" parser = OptionParser(usage = usage, epilog = epilog) @@ -322,14 +339,14 @@ if options.optimizewordlist and options.wordlistfilename: wordlist.remove(check_word) break -if options.show_wordlist: print wordlist; exit() +if options.show_wordlist: print(wordlist); exit() if options.displaysummary and options.summaryfile: report = dict() try: summaryfile = open(options.summaryfile) except: - print "no summary file: " + options.summaryfile + print("no summary file: " + options.summaryfile) exit() #sample input #../bzr.lf/lsb/devel/build_env/headers/x86-64/4.1/glib-2.0/gio/gmenuexporter.h.defs(1): export(1); @@ -356,26 +373,27 @@ if options.displaysummary and options.summaryfile: elif options.printreport == "w": printscore(sortscore(scorewords(report))) elif options.printreport == "wf" or options.printreport == "fw": - print summary(report) + print(summary(report)) else: - print summary(report) + print(summary(report)) exit() #Run a search if not displaying a existing report +prunedirs = {'CVS', '.git', '.bzr', '.hg', '.svn'} if len(args) > 0: for a in args: - filelist.extend(mkfilelist(a)) - + filelist.extend(avfs_find(a, prunedirs)) + start = datetime.datetime.now() for file in filelist: if skipfile(file, options.skipfileextensions): skipped += 1 continue try: - f = open(file) + f = avfs_open(file) except: - print "failed to open: " + file + print("failed to open: " + file) continue opened +=1 now = datetime.datetime.now() @@ -385,9 +403,9 @@ for file in filelist: prog_file = file.split('/')[0] + "/.../" + file.split('/')[-1] else: prog_file = file - print '\r' + " " * len(progresstext) + '\r', + print('\r' + " " * len(progresstext) + '\r') progresstext = str(((opened + skipped)*1.0/len(filelist))*100)[:5] + '% '+ " time left:" + str(estimate).split('.')[0] + ' ' + prog_file + '\r' - print progresstext, + print(progresstext) sys.stdout.flush() filecontents = f.read() datasize += len(filecontents) @@ -399,7 +417,7 @@ for file in filelist: #Clear screen of proggress text now that finished scoring file if options.display_progress: - print '\r' + " " * len(progresstext) + '\r', + print('\r' + " " * len(progresstext) + '\r') #Save summary as a file, but if the filename exists do not overwrite, append a number if options.summaryfile and len(filelist) > 0 and not options.displaysummary: @@ -409,13 +427,13 @@ if options.summaryfile and len(filelist) > 0 and not options.displaysummary: counter +=1 summaryfilename = options.summaryfile + '.' + str(counter) try: - if counter > 1: print "saving as " + summaryfilename + "...." + if counter > 1: print("saving as " + summaryfilename + "....") summaryfile = open(summaryfilename, 'w+') summaryfile.write(summary(report)) summaryfile.close() except: - print report - print "error saving summary as " + summaryfilename + print(report) + print("error saving summary as " + summaryfilename) if options.commonwordfilename and not(options.dontweightreport): report, weightedfiles = weightreport(report, commonwords) @@ -424,15 +442,15 @@ if options.printreport and not options.dontdisplaysummary: if options.printreport == "f": printscore(sortscore(scorefile(report))) elif options.printreport == "wf" or options.printreport == "fw": - print summary(report) + print(summary(report)) else: printscore(sortscore(scorewords(report))) if options.display_counts: - print "total files:" + str(len(filelist)) , - print "suspicious files:" + str(len(sortscore(scorefile(report)))) , - print "skipped files:" + str(skipped), + print("total files:" + str(len(filelist))) + print("suspicious files:" + str(len(sortscore(scorefile(report))))) + print("skipped files:" + str(skipped)) if options.commonwordfilename and not(options.dontweightreport): - print "removed weighted files:" + str(weightedfiles), - print "searched:" + str(datasize) + 'B', - print "time:" + str(datetime.datetime.now() - start).split('.')[0] + print("removed weighted files:" + str(weightedfiles)) + print("searched:" + str(datasize) + 'B') + print("time:" + str(datetime.datetime.now() - start).split('.')[0]) -- cgit v1.2.1