From 7fc88c0c901b68b6e223d6f1383bc1221eae8488 Mon Sep 17 00:00:00 2001
From: Daniel Gnoutcheff <gnoutchd@softwarefreedom.org>
Date: Mon, 25 Jul 2016 18:24:16 -0400
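Subject: Don't break on strange filenames

Handle file paths as bytes in avfs.py and suspicious: arguments, the
pruned directory names, the skipped extensions and the avfs extension
table are converted with os.fsencode() or written as bytes literals.
Paths are decoded with errors='replace' only where text is needed
(messages, progress output, report printing and filename scoring).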
---
 avfs.py    | 78 +++++++++++++++++++++++++++++++-------------------------------
 suspicious | 40 +++++++++++++++++++-------------
 2 files changed, 63 insertions(+), 55 deletions(-)

diff --git a/avfs.py b/avfs.py
index 5e877ef..5ab7bea 100644
--- a/avfs.py
+++ b/avfs.py
@@ -9,7 +9,7 @@ import os.path
 import subprocess
 import stat
 
-avfs_sys_mount = os.environ['HOME'] + '/.avfs'	
+avfs_sys_mount = os.fsencode(os.environ['HOME'] + '/.avfs')
 avfs_started = False
 
 def sys_name(fpath):
@@ -35,47 +35,47 @@ def open(fpath, *pargs, **kwargs):
 # extensions we want to add.
 
 avfscmds = {
-	('.gz', '#ugz'),
-	('.tgz', '#ugz#utar'),
-	('.tar.bz2', '#ubz2#utar'),
-	('.bz2', '#ubz2'),
-	('.bz', '#ubz2'),
-	('.tbz2', '#ubz2#utar'),
-	('.tbz', '#ubz2#utar'),
-	('.Z', '#uz'),
-	('.tpz', '#uz#utar'),
-	('.tz', '#uz#utar'),
-	('.taz', '#uz#utar'),
-	('.a', '#uar'),
-	('.deb', '#uar'),
-	('.tar', '#utar'),
-	('.gem', '#utar'),    # Add upstream
-	('.rar', '#urar'),
-	('.sfx', '#urar'),
-	('.zip', '#uzip'),
-	('.jar', '#uzip'),
-	('.ear', '#uzip'),
-	('.war', '#uzip'),
-	('.nupkg', '#uzip'),  # Add upstream
-	('.whl', '#uzip'),    # Add upstream
-	('.7z', '#u7z'),
-	('.zoo', '#uzoo'),
-	('.lha', '#ulha'),
-	('.lhz', '#ulha'),
-	('.arj', '#uarj'),
-	('.cpio', '#ucpio'),
-	('.rpm', '#rpm'),
-	('.tar.xz', '#uxze#utar'),
-	('.txz', '#uxze#utar'),
-	('.xz', '#uxze'),
-	('.lzma', '#uxze'),
+	(b'.gz', b'#ugz'),
+	(b'.tgz', b'#ugz#utar'),
+	(b'.tar.bz2', b'#ubz2#utar'),
+	(b'.bz2', b'#ubz2'),
+	(b'.bz', b'#ubz2'),
+	(b'.tbz2', b'#ubz2#utar'),
+	(b'.tbz', b'#ubz2#utar'),
+	(b'.Z', b'#uz'),
+	(b'.tpz', b'#uz#utar'),
+	(b'.tz', b'#uz#utar'),
+	(b'.taz', b'#uz#utar'),
+	(b'.a', b'#uar'),
+	(b'.deb', b'#uar'),
+	(b'.tar', b'#utar'),
+	(b'.gem', b'#utar'),    # Add upstream
+	(b'.rar', b'#urar'),
+	(b'.sfx', b'#urar'),
+	(b'.zip', b'#uzip'),
+	(b'.jar', b'#uzip'),
+	(b'.ear', b'#uzip'),
+	(b'.war', b'#uzip'),
+	(b'.nupkg', b'#uzip'),  # Add upstream
+	(b'.whl', b'#uzip'),    # Add upstream
+	(b'.7z', b'#u7z'),
+	(b'.zoo', b'#uzoo'),
+	(b'.lha', b'#ulha'),
+	(b'.lhz', b'#ulha'),
+	(b'.arj', b'#uarj'),
+	(b'.cpio', b'#ucpio'),
+	(b'.rpm', b'#rpm'),
+	(b'.tar.xz', b'#uxze#utar'),
+	(b'.txz', b'#uxze#utar'),
+	(b'.xz', b'#uxze'),
+	(b'.lzma', b'#uxze'),
 }
 
 def guesscmd(filename):
 	for ext, cmd in avfscmds:
 		if filename.endswith(ext):
 			return cmd + guesscmd(filename[:-len(ext)])
-	return ''
+	return b''
 
 def get_lstat_mode(filename):
 	"""
@@ -115,11 +115,11 @@ def find(path, excludes):
 
 	if stat.S_ISDIR(mode):
 		for entry in os.listdir(sys_path):
-			yield from find(path + '/' + entry, excludes)
+			yield from find(path + b'/' + entry, excludes)
 	elif stat.S_ISREG(mode):
 		yield path
 
 if __name__ == "__main__":
 	import sys
-	for f in find(sys.argv[1], {'.git'}):
-		print(f)
+	for f in find(os.fsencode(sys.argv[1]), {b'.git'}):
+		print(f.decode('utf-8', 'replace'))
diff --git a/suspicious b/suspicious
index 178ae4b..72dd9b0 100755
--- a/suspicious
+++ b/suspicious
@@ -43,6 +43,11 @@ opened = 0
 datasize = 0
 progresstext = "" 
 
+
+
+fsencoding = sys.getfilesystemencoding()
+def fsdecode_display(bytestring):
+	return str(bytestring, encoding=fsencoding, errors='replace')
 	
 def sortscore(score, reverse=True):
 	sortedscore = sorted(score.items(), key=lambda score: score[1], reverse=reverse)
@@ -54,8 +59,8 @@ def sortscore(score, reverse=True):
 	return returnscore
 
 def printscore(report):
-	for i in report:
-		print(i[0] + ':' + str(i[1]))
+	for filename, filescore in report:
+		print(fsdecode_display(filename) + ':' + str(filescore))
 
 def scorewords(report):
 	for file in report.keys():
@@ -101,11 +106,11 @@ def scorefile(report):
 def summary(report):
 	filescore = scorefile(report)
 	text = ""
-	for file in sortscore(filescore):
-		text += file[0] + '(' + str(file[1]) + '):'
-		for word in report[file[0]].keys():
-			if report[file[0]][word] > 0:
-				text += word + '(' + str(report[file[0]][word]) + ');' 
+	for filename, filescore in sortscore(filescore):
+		text += fsdecode_display(filename) + '(' + str(filescore) + '):'
+		for word, wordfreq in report[filename].items():
+			if wordfreq > 0:
+				text += word + '(' + str(wordfreq) + ');' 
 		text += '\n'
 	return text
 
@@ -285,20 +290,23 @@ if options.displaysummary and options.summaryfile:
 	exit()
 
 #Run a search if not displaying a existing report
-prunedirs = {'CVS', '.git', '.bzr', '.hg', '.svn'}
+prunedirs = {b'CVS', b'.git', b'.bzr', b'.hg', b'.svn'}
 if len(args) > 0:
 	for a in args:
-		filelist.extend(avfs.find(a, prunedirs))
+		filelist.extend(avfs.find(os.fsencode(a), prunedirs))
+
+skipfileexts_bytes = [os.fsencode(e) for e in options.skipfileextensions]
 
 start = datetime.datetime.now()
 for file in filelist:
-	if skipfile(file, options.skipfileextensions):
+	file_displayname = fsdecode_display(file)
+	if skipfile(file, skipfileexts_bytes):
 		skipped += 1
 		continue
 	try:
 		f = avfs.open(file)
 	except:
-		print("failed to open: " + file)
+		print("failed to open: " + file_displayname)
 		continue
 	opened +=1
 	now = datetime.datetime.now()
@@ -307,8 +315,8 @@ for file in filelist:
 		est = ((now - start) / (opened + skipped)) * len(filelist)
 		est_hr, est_rem = divmod(est.total_seconds(), 3600)
 		est_min, est_sec = divmod(est_rem, 60)
-		if len(file)> 52:
-			prog_file = file.split('/')[0] + "/.../" + file.split('/')[-1]
+		if len(file_displayname)> 52:
+			prog_file = file_displayname.split('/')[0] + "/.../" + file_displayname.split('/')[-1]
 			if len(prog_file) > 52:
 				prog_file = prog_file[0:52]
 		else:
@@ -321,13 +329,13 @@ for file in filelist:
 	try:
 		filecontents = f.read()
 	except UnicodeDecodeError:
-		print("possible binary: " + file)
+		print("possible binary: " + file_displayname)
 		continue
 	except OSError:
-		print("read error: " + file)
+		print("read error: " + file_displayname)
 		continue
 	datasize += len(filecontents)		
-	filenamescore = scoretext(wordlist, file, options.maxwholewordlength)
+	filenamescore = scoretext(wordlist, file_displayname, options.maxwholewordlength)
 	filecontentsscore = scoretext(wordlist, filecontents, options.maxwholewordlength)
 	report[file] = {}
 	for k in filecontentsscore.keys():
-- 
cgit v1.2.3