From d7c84268a26a4391af55a3533422367b02737435 Mon Sep 17 00:00:00 2001
From: Daniel Gnoutcheff
Date: Mon, 23 Jan 2017 18:38:22 -0500
Subject: Misc. unrecorded changes

---
 suspicious | 52 ++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 38 insertions(+), 14 deletions(-)

(limited to 'suspicious')

diff --git a/suspicious b/suspicious
index 72dd9b0..2ab782b 100755
--- a/suspicious
+++ b/suspicious
@@ -15,10 +15,11 @@
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
-#Authors: Marc Jones , Daniel Gnoutcheff
-#Date: June 2016
-#Version 0.3.0
-#Added AVFS support
+# Authors: Marc Jones ,
+#          Daniel Gnoutcheff
+# Date: June 2016
+# Version 0.3.0
+# Added AVFS support
 
 ##TODO
 #need to verify that word score counts each instance, not just 0 or 1
@@ -32,6 +33,8 @@ import sys
 import datetime
 import string
 import subprocess
+import chardet
+
 import avfs
 
 report = {}
@@ -48,6 +51,32 @@ progresstext = ""
 fsencoding = sys.getfilesystemencoding()
 def fsdecode_display(bytestring):
     return str(bytestring, encoding=fsencoding, errors='replace')
+
+
+_def_file_encoding = sys.getdefaultencoding()
+
+def decode_file(filename):
+    """
+    Return the contents of the file at the given path as a (Unicode) string.
+    Return None if the file appears to be a binary.
+    """
+
+    with avfs.open(filename, 'rb') as filehandle:
+        contents_raw = filehandle.read()
+
+    try:
+        return str(contents_raw, encoding=_def_file_encoding)
+    except UnicodeDecodeError:
+        pass
+
+    guessed_encoding = chardet.detect(contents_raw)['encoding']
+    if not guessed_encoding:
+        return None
+
+    try:
+        return str(contents_raw, encoding=guessed_encoding)
+    except UnicodeDecodeError:
+        return None
 
 def sortscore(score, reverse=True):
     sortedscore = sorted(score.items(), key=lambda score: score[1], reverse=reverse)
@@ -304,10 +333,13 @@ for file in filelist:
         skipped += 1
         continue
     try:
-        f = avfs.open(file)
-    except:
+        filecontents = decode_file(file)
+    except OSError:
         print("failed to open: " + file_displayname)
         continue
+    if filecontents is None:
+        print("possible binary: " + file_displayname)
+        continue
     opened +=1
     now = datetime.datetime.now()
     if options.display_progress:
@@ -326,14 +358,6 @@ for file in filelist:
                 .format(frac_done, est_hr, est_min, est_sec, prog_file)
         print(progresstext, end='', file=sys.stderr)
         sys.stdout.flush()
-    try:
-        filecontents = f.read()
-    except UnicodeDecodeError:
-        print("possible binary: " + file_displayname)
-        continue
-    except OSError:
-        print("read error: " + file_displayname)
-        continue
     datasize += len(filecontents)
     filenamescore = scoretext(wordlist, file_displayname, options.maxwholewordlength)
     filecontentsscore = scoretext(wordlist, filecontents, options.maxwholewordlength)
-- 
cgit v1.2.3