summary | refs | log | tree | commit | diff
path: root/suspicious
diff options
context:
space:
mode:
Diffstat (limited to 'suspicious')
-rwxr-xr-x suspicious 52
1 files changed, 38 insertions, 14 deletions
diff --git a/suspicious b/suspicious
index 72dd9b0..2ab782b 100755
--- a/suspicious
+++ b/suspicious
@@ -15,10 +15,11 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
-#Authors: Marc Jones <mjones@softwarefreedom.org>, Daniel Gnoutcheff <gnoutchd@softwarefreedom.org>
-#Date: June 2016
-#Version 0.3.0
-#Added AVFS support
+# Authors: Marc Jones <mjones@softwarefreedom.org>,
+# Daniel Gnoutcheff <gnoutchd@softwarefreedom.org>
+# Date: June 2016
+# Version 0.3.0
+# Added AVFS support
##TODO
#need to verify that word score counts each instance, not just 0 or 1
@@ -32,6 +33,8 @@ import sys
import datetime
import string
import subprocess
+import chardet
+
import avfs
report = {}
@@ -48,6 +51,32 @@ progresstext = ""
fsencoding = sys.getfilesystemencoding()
def fsdecode_display(bytestring):
return str(bytestring, encoding=fsencoding, errors='replace')
+
+
+_def_file_encoding = sys.getdefaultencoding()
+
+def decode_file(filename):
+    """
+    Return the contents of the file at the given path as a (Unicode) string,
+    or None if the file appears to be a binary.  Open/read errors propagate.
+    """
+    with avfs.open(filename, 'rb') as filehandle:
+        contents_raw = filehandle.read()
+
+    # First attempt: decode with the interpreter's default encoding.
+    try:
+        return str(contents_raw, encoding=_def_file_encoding)
+    except UnicodeDecodeError:
+        pass
+    # Otherwise ask chardet for a guess; a falsy result means "no guess".
+    guessed_encoding = chardet.detect(contents_raw)['encoding']
+    if not guessed_encoding:
+        return None
+
+    try:
+        return str(contents_raw, encoding=guessed_encoding)
+    # LookupError: the guessed name may have no Python codec (e.g. EUC-TW).
+    except (UnicodeDecodeError, LookupError):
+        return None
def sortscore(score, reverse=True):
sortedscore = sorted(score.items(), key=lambda score: score[1], reverse=reverse)
@@ -304,10 +333,13 @@ for file in filelist:
skipped += 1
continue
try:
- f = avfs.open(file)
- except:
+ filecontents = decode_file(file)
+ except OSError:
print("failed to open: " + file_displayname)
continue
+ if filecontents is None:
+ print("possible binary: " + file_displayname)
+ continue
opened +=1
now = datetime.datetime.now()
if options.display_progress:
@@ -326,14 +358,6 @@ for file in filelist:
.format(frac_done, est_hr, est_min, est_sec, prog_file)
print(progresstext, end='', file=sys.stderr)
sys.stdout.flush()
- try:
- filecontents = f.read()
- except UnicodeDecodeError:
- print("possible binary: " + file_displayname)
- continue
- except OSError:
- print("read error: " + file_displayname)
- continue
datasize += len(filecontents)
filenamescore = scoretext(wordlist, file_displayname, options.maxwholewordlength)
filecontentsscore = scoretext(wordlist, filecontents, options.maxwholewordlength)