python 3.x port, improve avfs support

Explicitly use ~/.avfs/, and call `mountavfs` to ensure it's available.
author: Daniel Gnoutcheff <gnoutchd@softwarefreedom.org> 2016-06-29 16:24:26 -0400
committer: Daniel Gnoutcheff <gnoutchd@softwarefreedom.org> 2016-06-29 16:26:46 -0400
commit: d013f8534767cf906d55076797afd50df14733af (patch)
tree: a7f424b53311f6e1381a34745d9ca04da8eab7b2
parent: 66f706f3669af849397b408f2522db5d7d5f2ff3 (diff)
1 files changed, 125 insertions, 107 deletions
diff --git a/suspicious b/suspicious
index d4ea346..4d62ad9 100755
--- a/suspicious
+++ b/suspicious
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 
 # Copyright 2014, 2016 Software Freedom Law Center (www.softwarefreedom.org)
 #
@@ -25,11 +25,109 @@
 #need to discount found words if they are substrings of common strings in text
  
 from optparse import OptionParser
-import os, os.path
+import os
+import os.path
 import re
 import sys
 import datetime
 import string
+import subprocess
+
+# AVFS stuff ------------------------------------------------------------------
+
+avfs_sys_mount = os.environ['HOME'] + '/.avfs'	
+avfs_started = False
+def avfs_sys_name(fpath):
+	"""
+	Map fpath to the corresponding path under the AVFS mount.
+
+	Runs `mountavfs` the first time it is called so that the mount is
+	available; subprocess.CalledProcessError propagates if that fails.
+	"""
+	global avfs_started
+	if not avfs_started:
+		subprocess.check_call(['mountavfs'])
+		# Remember the mount so later calls don't spawn mountavfs again.
+		avfs_started = True
+	return avfs_sys_mount + os.path.abspath(fpath)
+
+def avfs_exists(fpath):
+	"""Return True if fpath exists when looked up through the AVFS mount."""
+	sys_name = avfs_sys_name(fpath)
+	return os.path.exists(sys_name)
+
+def avfs_isdir(fpath):
+	"""Return True if fpath is a directory when looked up through the AVFS mount."""
+	sys_name = avfs_sys_name(fpath)
+	return os.path.isdir(sys_name)
+
+def avfs_open(fpath, *pargs, **kwargs):
+	"""
+	Open fpath through the AVFS mount.
+
+	Any further positional and keyword arguments are handed to open().
+	"""
+	sys_name = avfs_sys_name(fpath)
+	return open(sys_name, *pargs, **kwargs)
+
+# AVFS has its own automatic view selection using file extensions, but it
+# includes plugins (like #patch) that will lead us into an infinite loop
+# if we try to do a directory traversal.  Also, there are a few
+# extensions we want to add.
+
+# Set of (filename extension, AVFS view command) pairs, consulted by
+# avfs_guesscmd().  Being a set, it has no defined iteration order;
+# overlapping extensions such as '.tar.bz2' and '.bz2' resolve to the
+# same command chain whichever one is matched first.
+avfscmds = {
+	('.gz', '#ugz'),
+	('.tgz', '#ugz#utar'),
+	('.tar.bz2', '#ubz2#utar'),
+	('.bz2', '#ubz2'),
+	('.bz', '#ubz2'),
+	('.tbz2', '#ubz2#utar'),
+	('.tbz', '#ubz2#utar'),
+	('.Z', '#uz'),
+	('.tpz', '#uz#utar'),
+	('.tz', '#uz#utar'),
+	('.taz', '#uz#utar'),
+	('.a', '#uar'),
+	('.deb', '#uar'),
+	('.tar', '#utar'),
+	('.gem', '#utar'),    # Add upstream
+	('.rar', '#urar'),
+	('.sfx', '#urar'),
+	('.zip', '#uzip'),
+	('.jar', '#uzip'),
+	('.ear', '#uzip'),
+	('.war', '#uzip'),
+	('.nupkg', '#uzip'),  # Add upstream
+	('.whl', '#uzip'),    # Add upstream
+	('.7z', '#u7z'),
+	('.zoo', '#uzoo'),
+	('.lha', '#ulha'),
+	('.lhz', '#ulha'),
+	('.arj', '#uarj'),
+	('.cpio', '#ucpio'),
+	('.rpm', '#rpm'),
+	('.tar.xz', '#uxze#utar'),
+	('.txz', '#uxze#utar'),
+	('.xz', '#uxze'),
+	('.lzma', '#uxze'),
+}
+
+def avfs_guesscmd(filename):
+	"""
+	Return the chain of AVFS view commands for the extensions found at
+	the end of filename (last extension's command first), or '' if the
+	name ends in no known extension.
+	"""
+	cmds = ''
+	stem = filename
+	while True:
+		for suffix, view in avfscmds:
+			if stem.endswith(suffix):
+				break
+		else:
+			# No known extension left to strip.
+			return cmds
+		cmds += view
+		stem = stem[:-len(suffix)]
+
+def avfs_find(rootdir, prunedirs):
+	"""
+	Recursively list all files under rootdir, including files in archives
+	supported by AVFS.
+
+	rootdir: directory to scan, as a string.  It is translated to its
+	location under the AVFS mount by avfs_sys_name().
+	prunedirs: collection of directory names that are not descended into.
+
+	Yields paths built from rootdir; paths that go through an archive
+	carry that archive's AVFS view command (as guessed by
+	avfs_guesscmd()) after the archive's file name.  Directories that
+	cannot be listed are reported on stderr and skipped, and symlinked
+	directories are not followed.
+	"""
+
+	sys_rootdir = avfs_sys_name(rootdir)
+
+	try:
+		names = os.listdir(sys_rootdir)
+	except OSError as err:
+		# One unreadable directory should not abort the whole scan
+		# (os.walk() likewise ignores listing errors by default).
+		sys.stderr.write("failed to list: " + rootdir + " (" + str(err) + ")\n")
+		return
+
+	for name in names:
+		path = rootdir + '/' + name
+		sys_path = sys_rootdir + '/' + name
+
+		if os.path.isdir(sys_path):
+			# Don't follow directory symlinks: a link cycle would
+			# otherwise recurse without end.
+			if name not in prunedirs and not os.path.islink(sys_path):
+				yield from avfs_find(path, prunedirs)
+		else:
+			cmd = avfs_guesscmd(name)
+			filtered_path = path + cmd
+			sys_filtered_path = sys_path + cmd
+
+			# Only use the AVFS view if AVFS can actually provide it;
+			# otherwise fall back to the plain file.
+			if cmd and os.path.exists(sys_filtered_path):
+				if os.path.isdir(sys_filtered_path):
+					yield from avfs_find(filtered_path, prunedirs)
+				else:
+					yield filtered_path
+			else:
+				yield path
+
+# -----------------------------------------------------------------------------
 
 report = {}
 wordscore = {}
@@ -52,7 +150,7 @@ def sortscore(score, reverse=True):
 
 def printscore(report):
 	for i in report:
-		print i[0] + ':' + str(i[1])
+		print(i[0] + ':' + str(i[1]))
 
 def scorewords(report):
 	for file in report.keys():
@@ -66,7 +164,7 @@ def scorewords(report):
 
 def weightreport(report, commonwords):
 	notsuspiciousfiles = []
-        weightedout = 0
+	weightedout = 0
 	for file in report:
 		suspicious = False
 		filescore = 0
@@ -81,7 +179,7 @@ def weightreport(report, commonwords):
 
 	for file in notsuspiciousfiles:
 		report.pop(file)
-                weightedout +=1
+		weightedout +=1
 
 	return report, weightedout
 
@@ -153,87 +251,6 @@ def scoretext(wordlist, text, maxwholewordlen = -1):
 			score[word] = len(wholeword(wordreg,text))
 	return score
 
-
-# AVFS stuff ------------------------------------------------------------------
-
-# AVFS has its own automatic view selection using file extensions, but it
-# includes plugins (like #patch) that will lead us into an infinite loop
-# if we try to do a directory traversal.  Also, there are a few
-# extensions we want to add.
-
-avfscmds = {
-	('.gz', '#ugz'),
-	('.tgz', '#ugz#utar'),
-	('.tar.bz2', '#ubz2#utar'),
-	('.bz2', '#ubz2'),
-	('.bz', '#ubz2'),
-	('.tbz2', '#ubz2#utar'),
-	('.tbz', '#ubz2#utar'),
-	('.Z', '#uz'),
-	('.tpz', '#uz#utar'),
-	('.tz', '#uz#utar'),
-	('.taz', '#uz#utar'),
-	('.a', '#uar'),
-	('.deb', '#uar'),
-	('.tar', '#utar'),
-	('.gem', '#utar'),    # Add upstream
-	('.rar', '#urar'),
-	('.sfx', '#urar'),
-	('.zip', '#uzip'),
-	('.jar', '#uzip'),
-	('.ear', '#uzip'),
-	('.war', '#uzip'),
-	('.nupkg', '#uzip'),  # Add upstream
-	('.whl', '#uzip'),    # Add upstream
-	('.7z', '#u7z'),
-	('.zoo', '#uzoo'),
-	('.lha', '#ulha'),
-	('.lhz', '#ulha'),
-	('.arj', '#uarj'),
-	('.cpio', '#ucpio'),
-	('.rpm', '#rpm'),
-	('.tar.xz', '#uxze#utar'),
-	('.txz', '#uxze#utar'),
-	('.xz', '#uxze'),
-	('.lzma', '#uxze'),
-}
-
-def avfs_guesscmd(filename):
-	for ext, cmd in avfscmds:
-		if filename.endswith(ext):
-			return cmd + avfs_guesscmd(filename[:-len(ext)])
-	return ''
-
-def mkfilelist(rootdir):
-	"""
-	Produce a list of files to examine.  Use AVFS paths if available.
-
-	rootdir: path to directory to examine, as a string.  Preferably
-	somewhere inside an AVFS mount.
-	"""
-	prunedirs = {'CVS', '.git', '.bzr', '.hg', '.svn'}
-
-	for base, dirs, files in os.walk(rootdir):
-		for dname in dirs:
-			if dname in prunedirs:
-				dirs.remove(dname)
-
-		for fname in files:
-			fpath = base + '/' + fname
-
-			view_fname = fname + avfs_guesscmd(fname)
-			view_fpath = base + '/' + view_fname
-
-			if fname != view_fname and os.path.exists(view_fpath):
-				if os.path.isdir(view_fpath):
-					dirs.append(view_fname)
-				else:
-					yield view_fpath
-			else:
-				yield fpath
-
-# -----------------------------------------------------------------------------
-
 usage = "%prog [options] DIRECTORY ... DIRECTORYN"
 epilog = "example: ./suspicious ../gitcheckout -s .tar -s .gz -s .bmp -s .zip -s .ppt -s .docx -s .pdf -s .xls -s .xlsx -s .gif -s .png -s .jpg -s .css -r fw -w cryptology.txt -c -p -l 3"
 parser = OptionParser(usage = usage, epilog = epilog)
@@ -322,14 +339,14 @@ if options.optimizewordlist and options.wordlistfilename:
                     wordlist.remove(check_word)
                     break
 
-if options.show_wordlist: print wordlist; exit()
+if options.show_wordlist: print(wordlist); exit()
 
 if options.displaysummary and options.summaryfile:
 	report = dict()
 	try:
 		summaryfile = open(options.summaryfile)
 	except:
-		print "no summary file: " + options.summaryfile
+		print("no summary file: " + options.summaryfile)
 		exit()
 	#sample input
 	#../bzr.lf/lsb/devel/build_env/headers/x86-64/4.1/glib-2.0/gio/gmenuexporter.h.defs(1): export(1);
@@ -356,26 +373,27 @@ if options.displaysummary and options.summaryfile:
 		elif options.printreport == "w":
 			printscore(sortscore(scorewords(report)))
 		elif options.printreport == "wf" or options.printreport == "fw":
-			print summary(report)			
+			print(summary(report))			
 	else:
-		print summary(report)
+		print(summary(report))
 	
 	exit()
 
 #Run a search if not displaying a existing report
+prunedirs = {'CVS', '.git', '.bzr', '.hg', '.svn'}
 if len(args) > 0:
 	for a in args:
-		filelist.extend(mkfilelist(a))
-	
+		filelist.extend(avfs_find(a, prunedirs))
+
 start = datetime.datetime.now()
 for file in filelist:
 	if skipfile(file, options.skipfileextensions):
 		skipped += 1
 		continue
 	try:
-		f = open(file)
+		f = avfs_open(file)
 	except:
-		print "failed to open: " + file
+		print("failed to open: " + file)
 		continue
 	opened +=1
 	now = datetime.datetime.now()
@@ -385,9 +403,9 @@ for file in filelist:
 			prog_file = file.split('/')[0] + "/.../" + file.split('/')[-1]
 		else:
 			prog_file = file
-		print '\r' + " " * len(progresstext) + '\r',
+		print('\r' + " " * len(progresstext) + '\r', end='')  # no newline: progress is redrawn in place
 		progresstext = str(((opened + skipped)*1.0/len(filelist))*100)[:5] + '% '+ " time left:" + str(estimate).split('.')[0] + ' ' + prog_file + '\r'
-		print progresstext,
+		print(progresstext, end='')  # no newline: the trailing '\r' returns to the line start
 	sys.stdout.flush()
 	filecontents = f.read()
 	datasize += len(filecontents)		
@@ -399,7 +417,7 @@ for file in filelist:
 
 #Clear screen of proggress text now that finished scoring file
 if options.display_progress: 
-	print '\r' + " " * len(progresstext) + '\r',
+	print('\r' + " " * len(progresstext) + '\r', end='')  # no newline: just blank the progress line
 
 #Save summary as a file, but if the filename exists do not overwrite, append a number
 if options.summaryfile and len(filelist) > 0 and not options.displaysummary:
@@ -409,13 +427,13 @@ if options.summaryfile and len(filelist) > 0 and not options.displaysummary:
 		counter +=1
 		summaryfilename = options.summaryfile + '.' + str(counter)
 	try:
-		if counter > 1: print "saving as " + summaryfilename + "...."	
+		if counter > 1: print("saving as " + summaryfilename + "....")
 		summaryfile = open(summaryfilename, 'w+')
 		summaryfile.write(summary(report))
 		summaryfile.close()		
 	except:
-		print report
-		print "error saving summary as " + summaryfilename
+		print(report)
+		print("error saving summary as " + summaryfilename)
 
 if options.commonwordfilename and not(options.dontweightreport):
 	report, weightedfiles = weightreport(report, commonwords)
@@ -424,15 +442,15 @@ if options.printreport and not options.dontdisplaysummary:
 	if options.printreport == "f":
 		printscore(sortscore(scorefile(report)))
 	elif options.printreport == "wf" or options.printreport == "fw":
-		print summary(report)
+		print(summary(report))
 	else:
 		printscore(sortscore(scorewords(report)))
 
 if options.display_counts:
-	print "total files:" + str(len(filelist)) ,
-	print "suspicious files:" + str(len(sortscore(scorefile(report)))) ,
-	print "skipped files:" + str(skipped),
+	print("total files:" + str(len(filelist)), end=' ')  # end=' ': counts share one line
+	print("suspicious files:" + str(len(sortscore(scorefile(report)))), end=' ')
+	print("skipped files:" + str(skipped), end=' ')
 	if options.commonwordfilename and not(options.dontweightreport):
-		print "removed weighted files:" + str(weightedfiles),
-	print "searched:" + str(datasize) + 'B', 
-	print "time:" + str(datetime.datetime.now() - start).split('.')[0]
+		print("removed weighted files:" + str(weightedfiles), end=' ')  # end=' ': counts share one line
+	print("searched:" + str(datasize) + 'B', end=' ')
+	print("time:" + str(datetime.datetime.now() - start).split('.')[0])
author	Daniel Gnoutcheff <gnoutchd@softwarefreedom.org>	2016-06-29 16:24:26 -0400
committer	Daniel Gnoutcheff <gnoutchd@softwarefreedom.org>	2016-06-29 16:26:46 -0400
commit	d013f8534767cf906d55076797afd50df14733af (patch)
tree	a7f424b53311f6e1381a34745d9ca04da8eab7b2
parent	66f706f3669af849397b408f2522db5d7d5f2ff3 (diff)