preliminary avfs support

author: Daniel Gnoutcheff <gnoutchd@softwarefreedom.org> 2016-06-29 14:58:33 -0400
committer: Daniel Gnoutcheff <gnoutchd@softwarefreedom.org> 2016-06-29 14:58:33 -0400
commit: 66f706f3669af849397b408f2522db5d7d5f2ff3 (patch)
tree: 8db4dee67311134036c6a4a591a1701fae44a7bd /suspicious
parent: 10b7b82dd70ea48ec16d1bc9d67baf49ebef4308 (diff)
1 files changed, 89 insertions, 28 deletions
diff --git a/suspicious b/suspicious
index 487a844..d4ea346 100755
--- a/suspicious
+++ b/suspicious
@@ -1,6 +1,6 @@
 #!/usr/bin/python
 
-# Copyright 2014 Software Freedom Law Center (www.softwarefreedom.org)
+# Copyright 2014, 2016 Software Freedom Law Center (www.softwarefreedom.org)
 #
 #    This program is free software: you can redistribute it and/or modify
 #    it under the terms of the GNU General Public License as published by
@@ -15,18 +15,17 @@
 #    You should have received a copy of the GNU General Public License
 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
-#Author: Marc Jones <mjones@softwarefreedom.org>
-#Date: June 30, 2014
-#Version 0.2.1
-#Added weight score function to remove scores from files that otherwise would not have scores
-#Added Remove superstrings of other words from search to speed things up if they are greater than wordlength
+#Authors: Marc Jones <mjones@softwarefreedom.org>, Daniel Gnoutcheff <gnoutchd@softwarefreedom.org>
+#Date: June 2016
+#Version 0.3.0
+#Added AVFS support
 
 ##TODO
 #need to verify that word score counts each instance, not just 0 or 1
 #need to discount found words if they are substrings of common strings in text
  
 from optparse import OptionParser
-import os
+import os, os.path
 import re
 import sys
 import datetime
@@ -154,12 +153,90 @@ def scoretext(wordlist, text, maxwholewordlen = -1):
 			score[word] = len(wholeword(wordreg,text))
 	return score
 
+
+# AVFS stuff ------------------------------------------------------------------
+
+# AVFS has its own automatic view selection using file extensions, but it
+# includes plugins (like #patch) that will lead us into an infinite loop
+# if we try to do a directory traversal.  Also, there are a few
+# extensions we want to add.
+
+avfscmds = (  # (extension, AVFS command) pairs; a tuple so match order is as listed
+	('.gz', '#ugz'),
+	('.tgz', '#ugz#utar'),
+	('.tar.bz2', '#ubz2#utar'),
+	('.bz2', '#ubz2'),
+	('.bz', '#ubz2'),
+	('.tbz2', '#ubz2#utar'),
+	('.tbz', '#ubz2#utar'),
+	('.Z', '#uz'),
+	('.tpz', '#uz#utar'),
+	('.tz', '#uz#utar'),
+	('.taz', '#uz#utar'),
+	('.a', '#uar'),
+	('.deb', '#uar'),
+	('.tar', '#utar'),
+	('.gem', '#utar'),    # Add upstream
+	('.rar', '#urar'),
+	('.sfx', '#urar'),
+	('.zip', '#uzip'),
+	('.jar', '#uzip'),
+	('.ear', '#uzip'),
+	('.war', '#uzip'),
+	('.nupkg', '#uzip'),  # Add upstream
+	('.whl', '#uzip'),    # Add upstream
+	('.7z', '#u7z'),
+	('.zoo', '#uzoo'),
+	('.lha', '#ulha'),
+	('.lhz', '#ulha'),
+	('.arj', '#uarj'),
+	('.cpio', '#ucpio'),
+	('.rpm', '#rpm'),
+	('.tar.xz', '#uxze#utar'),
+	('.txz', '#uxze#utar'),
+	('.xz', '#uxze'),
+	('.lzma', '#uxze'),
+)
+
+def avfs_guesscmd(filename):
+	"""Return the AVFS view commands for filename's (possibly stacked) extensions, or ''."""
+	match = next(((e, c) for e, c in avfscmds if filename.endswith(e)), None)
+	# Outermost extension first, then whatever the remaining stem implies.
+	return match[1] + avfs_guesscmd(filename[:-len(match[0])]) if match else ''
+
+def mkfilelist(rootdir):
+	"""
+	Yield the paths of the files to examine under rootdir, substituting
+	AVFS view paths for archives/compressed files when AVFS exposes them.
+
+	rootdir: path to directory to examine, as a string.  Preferably
+	somewhere inside an AVFS mount.
+	"""
+	prunedirs = {'CVS', '.git', '.bzr', '.hg', '.svn'}
+
+	for base, dirs, files in os.walk(rootdir):
+		# Prune in place with slice assignment: calling dirs.remove() while
+		# iterating over dirs skips the entry following each removed one.
+		dirs[:] = [dname for dname in dirs if dname not in prunedirs]
+
+		for fname in files:
+			view_fname = fname + avfs_guesscmd(fname)
+			view_fpath = base + '/' + view_fname
+
+			if fname == view_fname or not os.path.exists(view_fpath):
+				# No usable AVFS view: examine the file itself.
+				yield base + '/' + fname
+			elif os.path.isdir(view_fpath):
+				# Archive view: os.walk descends into names added to dirs.
+				dirs.append(view_fname)
+			else:
+				yield view_fpath
+
+# -----------------------------------------------------------------------------
+
 usage = "%prog [options] DIRECTORY ... DIRECTORYN"
 epilog = "example: ./suspicious ../gitcheckout -s .tar -s .gz -s .bmp -s .zip -s .ppt -s .docx -s .pdf -s .xls -s .xlsx -s .gif -s .png -s .jpg -s .css -r fw -w cryptology.txt -c -p -l 3"
 parser = OptionParser(usage = usage, epilog = epilog)
-parser.add_option("-f", "--file", 
-		dest="suspiciousfilename", 
-		help="specify file to scan", action="append")
 parser.add_option("-w", "--wordlist", 
 		dest="wordlistfilename", 
 		help="file containing all of the words to look for")
@@ -285,27 +362,11 @@ if options.displaysummary and options.summaryfile:
 	
 	exit()
 
-#Run a serarch if not displaying a existing report
+#Run a search if not displaying an existing report
 if len(args) > 0:
 	for a in args:
-		for (path, dirs, files) in os.walk(a):
-			if 'CVS' in dirs:
-				dirs.remove('CVS')
-			if '.git' in dirs:
-				dirs.remove('.git')
-			if '.bzr' in dirs:
-				dirs.remove('.bzr')
-			if '.hg' in dirs:
-				dirs.remove('.hg')
-			if '.svn' in dirs:
-				dirs.remove('.svn')
-	
-			for file in files:
-				filelist.append(path + '/' + file)
+		filelist.extend(mkfilelist(a))
 	
-if options.suspiciousfilename:
-	filelist += options.suspiciousfilename
-
 start = datetime.datetime.now()
 for file in filelist:
 	if skipfile(file, options.skipfileextensions):
author	Daniel Gnoutcheff <gnoutchd@softwarefreedom.org>	2016-06-29 14:58:33 -0400
committer	Daniel Gnoutcheff <gnoutchd@softwarefreedom.org>	2016-06-29 14:58:33 -0400
commit	66f706f3669af849397b408f2522db5d7d5f2ff3 (patch)
tree	8db4dee67311134036c6a4a591a1701fae44a7bd /suspicious
parent	10b7b82dd70ea48ec16d1bc9d67baf49ebef4308 (diff)