From 7fc88c0c901b68b6e223d6f1383bc1221eae8488 Mon Sep 17 00:00:00 2001 From: Daniel Gnoutcheff Date: Mon, 25 Jul 2016 18:24:16 -0400 Subject: Don't break on strange filenames --- avfs.py | 78 +++++++++++++++++++++++++++++++------------------------------- suspicious | 40 +++++++++++++++++++------------- 2 files changed, 63 insertions(+), 55 deletions(-) diff --git a/avfs.py b/avfs.py index 5e877ef..5ab7bea 100644 --- a/avfs.py +++ b/avfs.py @@ -9,7 +9,7 @@ import os.path import subprocess import stat -avfs_sys_mount = os.environ['HOME'] + '/.avfs' +avfs_sys_mount = os.fsencode(os.environ['HOME'] + '/.avfs') avfs_started = False def sys_name(fpath): @@ -35,47 +35,47 @@ def open(fpath, *pargs, **kwargs): # extensions we want to add. avfscmds = { - ('.gz', '#ugz'), - ('.tgz', '#ugz#utar'), - ('.tar.bz2', '#ubz2#utar'), - ('.bz2', '#ubz2'), - ('.bz', '#ubz2'), - ('.tbz2', '#ubz2#utar'), - ('.tbz', '#ubz2#utar'), - ('.Z', '#uz'), - ('.tpz', '#uz#utar'), - ('.tz', '#uz#utar'), - ('.taz', '#uz#utar'), - ('.a', '#uar'), - ('.deb', '#uar'), - ('.tar', '#utar'), - ('.gem', '#utar'), # Add upstream - ('.rar', '#urar'), - ('.sfx', '#urar'), - ('.zip', '#uzip'), - ('.jar', '#uzip'), - ('.ear', '#uzip'), - ('.war', '#uzip'), - ('.nupkg', '#uzip'), # Add upstream - ('.whl', '#uzip'), # Add upstream - ('.7z', '#u7z'), - ('.zoo', '#uzoo'), - ('.lha', '#ulha'), - ('.lhz', '#ulha'), - ('.arj', '#uarj'), - ('.cpio', '#ucpio'), - ('.rpm', '#rpm'), - ('.tar.xz', '#uxze#utar'), - ('.txz', '#uxze#utar'), - ('.xz', '#uxze'), - ('.lzma', '#uxze'), + (b'.gz', b'#ugz'), + (b'.tgz', b'#ugz#utar'), + (b'.tar.bz2', b'#ubz2#utar'), + (b'.bz2', b'#ubz2'), + (b'.bz', b'#ubz2'), + (b'.tbz2', b'#ubz2#utar'), + (b'.tbz', b'#ubz2#utar'), + (b'.Z', b'#uz'), + (b'.tpz', b'#uz#utar'), + (b'.tz', b'#uz#utar'), + (b'.taz', b'#uz#utar'), + (b'.a', b'#uar'), + (b'.deb', b'#uar'), + (b'.tar', b'#utar'), + (b'.gem', b'#utar'), # Add upstream + (b'.rar', b'#urar'), + (b'.sfx', b'#urar'), + (b'.zip', b'#uzip'), + (b'.jar', b'#uzip'), + (b'.ear', b'#uzip'), + (b'.war', b'#uzip'), + (b'.nupkg', b'#uzip'), # Add upstream + (b'.whl', b'#uzip'), # Add upstream + (b'.7z', b'#u7z'), + (b'.zoo', b'#uzoo'), + (b'.lha', b'#ulha'), + (b'.lhz', b'#ulha'), + (b'.arj', b'#uarj'), + (b'.cpio', b'#ucpio'), + (b'.rpm', b'#rpm'), + (b'.tar.xz', b'#uxze#utar'), + (b'.txz', b'#uxze#utar'), + (b'.xz', b'#uxze'), + (b'.lzma', b'#uxze'), } def guesscmd(filename): for ext, cmd in avfscmds: if filename.endswith(ext): return cmd + guesscmd(filename[:-len(ext)]) - return '' + return b'' def get_lstat_mode(filename): """ @@ -115,11 +115,11 @@ def find(path, excludes): if stat.S_ISDIR(mode): for entry in os.listdir(sys_path): - yield from find(path + '/' + entry, excludes) + yield from find(path + b'/' + entry, excludes) elif stat.S_ISREG(mode): yield path if __name__ == "__main__": import sys - for f in find(sys.argv[1], {'.git'}): - print(f) + for f in find(os.fsencode(sys.argv[1]), {b'.git'}): + print(f.decode('utf-8', 'replace')) diff --git a/suspicious b/suspicious index 178ae4b..72dd9b0 100755 --- a/suspicious +++ b/suspicious @@ -43,6 +43,11 @@ opened = 0 datasize = 0 progresstext = "" + + +fsencoding = sys.getfilesystemencoding() +def fsdecode_display(bytestring): + return str(bytestring, encoding=fsencoding, errors='replace') def sortscore(score, reverse=True): sortedscore = sorted(score.items(), key=lambda score: score[1], reverse=reverse) @@ -54,8 +59,8 @@ def sortscore(score, reverse=True): return returnscore def printscore(report): - for i in report: - print(i[0] + ':' + str(i[1])) + for filename, filescore in report: + print(fsdecode_display(filename) + ':' + str(filescore)) def scorewords(report): for file in report.keys(): @@ -101,11 +106,11 @@ def scorefile(report): def summary(report): filescore = scorefile(report) text = "" - for file in sortscore(filescore): - text += file[0] + '(' + str(file[1]) + '):' - for word in report[file[0]].keys(): - if report[file[0]][word] > 0: - text += word + '(' + str(report[file[0]][word]) + ');' + for filename, filescore in sortscore(filescore): + text += fsdecode_display(filename) + '(' + str(filescore) + '):' + for word, wordfreq in report[filename].items(): + if wordfreq > 0: + text += word + '(' + str(wordfreq) + ');' text += '\n' return text @@ -285,20 +290,23 @@ if options.displaysummary and options.summaryfile: exit() #Run a search if not displaying a existing report -prunedirs = {'CVS', '.git', '.bzr', '.hg', '.svn'} +prunedirs = {b'CVS', b'.git', b'.bzr', b'.hg', b'.svn'} if len(args) > 0: for a in args: - filelist.extend(avfs.find(a, prunedirs)) + filelist.extend(avfs.find(os.fsencode(a), prunedirs)) + +skipfileexts_bytes = [os.fsencode(e) for e in options.skipfileextensions] start = datetime.datetime.now() for file in filelist: - if skipfile(file, options.skipfileextensions): + file_displayname = fsdecode_display(file) + if skipfile(file, skipfileexts_bytes): skipped += 1 continue try: f = avfs.open(file) except: - print("failed to open: " + file) + print("failed to open: " + file_displayname) continue opened +=1 now = datetime.datetime.now() @@ -307,8 +315,8 @@ for file in filelist: est = ((now - start) / (opened + skipped)) * len(filelist) est_hr, est_rem = divmod(est.total_seconds(), 3600) est_min, est_sec = divmod(est_rem, 60) - if len(file)> 52: - prog_file = file.split('/')[0] + "/.../" + file.split('/')[-1] + if len(file_displayname)> 52: + prog_file = file_displayname.split('/')[0] + "/.../" + file_displayname.split('/')[-1] if len(prog_file) > 52: prog_file = prog_file[0:52] else: @@ -321,13 +329,13 @@ for file in filelist: try: filecontents = f.read() except UnicodeDecodeError: - print("possible binary: " + file) + print("possible binary: " + file_displayname) continue except OSError: - print("read error: " + file) + print("read error: " + file_displayname) continue datasize += len(filecontents) - filenamescore = scoretext(wordlist, file, options.maxwholewordlength) + filenamescore = scoretext(wordlist, file_displayname, options.maxwholewordlength) filecontentsscore = scoretext(wordlist, filecontents, options.maxwholewordlength) report[file] = {} for k in filecontentsscore.keys(): -- cgit v1.2.3