diff options
author | Daniel Gnoutcheff <gnoutchd@softwarefreedom.org> | 2017-01-23 18:38:22 -0500 |
---|---|---|
committer | Daniel Gnoutcheff <gnoutchd@softwarefreedom.org> | 2017-01-23 18:49:28 -0500 |
commit | d7c84268a26a4391af55a3533422367b02737435 (patch) | |
tree | 7dd461cc35542d9ca155ced836c0bda53450bc0d /suspicious | |
parent | 7fc88c0c901b68b6e223d6f1383bc1221eae8488 (diff) |
Misc. unrecorded changes
Diffstat (limited to 'suspicious')
-rwxr-xr-x | suspicious | 52 |
1 file changed, 38 insertions, 14 deletions
@@ -15,10 +15,11 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. -#Authors: Marc Jones <mjones@softwarefreedom.org>, Daniel Gnoutcheff <gnoutchd@softwarefreedom.org> -#Date: June 2016 -#Version 0.3.0 -#Added AVFS support +# Authors: Marc Jones <mjones@softwarefreedom.org>, +# Daniel Gnoutcheff <gnoutchd@softwarefreedom.org> +# Date: June 2016 +# Version 0.3.0 +# Added AVFS support ##TODO #need to verify that word score counts each instance, not just 0 or 1 @@ -32,6 +33,8 @@ import sys import datetime import string import subprocess +import chardet + import avfs report = {} @@ -48,6 +51,32 @@ progresstext = "" fsencoding = sys.getfilesystemencoding() def fsdecode_display(bytestring): return str(bytestring, encoding=fsencoding, errors='replace') + + +_def_file_encoding = sys.getdefaultencoding() + +def decode_file(filename): + """ + Return the contents of the file at the given path as a (Unicode) string. + Return None if the file appears to be a binary. 
+ """ + + with avfs.open(filename, 'rb') as filehandle: + contents_raw = filehandle.read() + + try: + return str(contents_raw, encoding=_def_file_encoding) + except UnicodeDecodeError: + pass + + guessed_encoding = chardet.detect(contents_raw)['encoding'] + if not guessed_encoding: + return None + + try: + return str(contents_raw, encoding=guessed_encoding) + except UnicodeDecodeError: + return None def sortscore(score, reverse=True): sortedscore = sorted(score.items(), key=lambda score: score[1], reverse=reverse) @@ -304,10 +333,13 @@ for file in filelist: skipped += 1 continue try: - f = avfs.open(file) - except: + filecontents = decode_file(file) + except OSError: print("failed to open: " + file_displayname) continue + if filecontents is None: + print("possible binary: " + file_displayname) + continue opened +=1 now = datetime.datetime.now() if options.display_progress: @@ -326,14 +358,6 @@ for file in filelist: .format(frac_done, est_hr, est_min, est_sec, prog_file) print(progresstext, end='', file=sys.stderr) sys.stdout.flush() - try: - filecontents = f.read() - except UnicodeDecodeError: - print("possible binary: " + file_displayname) - continue - except OSError: - print("read error: " + file_displayname) - continue datasize += len(filecontents) filenamescore = scoretext(wordlist, file_displayname, options.maxwholewordlength) filecontentsscore = scoretext(wordlist, filecontents, options.maxwholewordlength) |