Strip numbers from wordlists.

Many wordlists use the format "NNNNNN<tab>some-term". We detect lines in this format and parse them to extract the term, removing the numbers and whitespace.

Strip numbers from wordlists.
9dd17582 · ulif · d9f918aa · 9dd17582 · 9dd17582
Commit 9dd17582 authored 9 years ago by ulif
--- a/diceware/wordlist.py
+++ b/diceware/wordlist.py
@@ -100,6 +100,9 @@ def get_signed_wordlist(file_descriptor):
            break
        if line.startswith('- '):
            line = line[2:]
+        match = RE_NUMBERED_WORDLIST_ENTRY.match(line)
+        if match:
+            line = match.groups()[0]
        if not line:
            continue
        result += [line, ]

--- a/tests/test_wordlist.py
+++ b/tests/test_wordlist.py
@@ -67,8 +67,8 @@ class Test_GetSignedWordList(object):
        with open(wlist_path, 'r') as fd:
            result = get_signed_wordlist(fd)
        assert len(result) == 7776
-        assert "11111\ta" == result[0]
-        assert "66666\t@" == result[-1]
+        assert "a" == result[0]
+        assert "@" == result[-1]


 class TestWordlistModule(object):