From 6e7c5a93c6bf8ac2b0d84ecd1c5d8070650fe788 Mon Sep 17 00:00:00 2001
From: ulif <uli@gnufix.de>
Date: Sat, 25 Jul 2015 14:23:17 +0200
Subject: [PATCH] Add regexp for numbered wlist entries.

We need this regular expression to strip the leading numbers from
entries in numbered wordlists, such as the original 7776-term wordlist
from diceware.com.
---
 diceware/wordlist.py   |  3 +++
 tests/test_wordlist.py | 15 +++++++++++++--
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/diceware/wordlist.py b/diceware/wordlist.py
index 77caee0..4347c33 100644
--- a/diceware/wordlist.py
+++ b/diceware/wordlist.py
@@ -26,6 +26,9 @@ WORDLISTS_DIR = os.path.abspath(
 #: allow names that cannot easily mess up filesystems.
 RE_WORDLIST_NAME = re.compile('^[a-zA-Z0-9_-]+$')
 
+#: A regular expression matching numbered entries in wordlists.
+RE_NUMBERED_WORDLIST_ENTRY = re.compile(r'^[0-9]+\s+([^\s]+)$')
+
 
 def get_wordlist_names():
     """Get a all names of wordlists stored locally.
diff --git a/tests/test_wordlist.py b/tests/test_wordlist.py
index 1815b38..d9fd16f 100644
--- a/tests/test_wordlist.py
+++ b/tests/test_wordlist.py
@@ -1,8 +1,9 @@
 import os
 import pytest
 from diceware.wordlist import (
-    WORDLISTS_DIR, RE_WORDLIST_NAME, get_wordlist, get_signed_wordlist,
-    get_wordlist_path, get_wordlist_names, is_signed_wordlist,
+    WORDLISTS_DIR, RE_WORDLIST_NAME, RE_NUMBERED_WORDLIST_ENTRY, get_wordlist,
+    get_signed_wordlist, get_wordlist_path, get_wordlist_names,
+    is_signed_wordlist,
 )
 
 
@@ -81,6 +82,16 @@ class TestWordlistModule(object):
         assert RE_WORDLIST_NAME.match('with.dot') is None
         assert RE_WORDLIST_NAME.match('with/slash') is None
 
+    def test_re_numbered_wordlist_entry(self):
+        assert RE_NUMBERED_WORDLIST_ENTRY.match('11111   a') is not None
+        assert RE_NUMBERED_WORDLIST_ENTRY.match(
+            '11111   a').groups() == ('a', )
+        assert RE_NUMBERED_WORDLIST_ENTRY.match('12211\t 1') is not None
+        assert RE_NUMBERED_WORDLIST_ENTRY.match(
+            '12211\t 1').groups() == ('1', )
+        assert RE_NUMBERED_WORDLIST_ENTRY.match('12a11 foo') is None
+        assert RE_NUMBERED_WORDLIST_ENTRY.match('foo bar') is None
+
     def test_get_wordlist_path(self):
         # we can get valid wordlist paths
         assert os.path.exists(get_wordlist_path('en_8k'))
-- 
GitLab