From e0a576d102641d409e165f616c28f71232631e0e Mon Sep 17 00:00:00 2001 From: pat <pat@lucy.local> Date: Thu, 22 Sep 2016 15:08:38 +0200 Subject: [PATCH] Testing TextFST --- tests/06_test_formats.py | 26 +++++++++++++++++++++++++- tests/data/dictionary-v2.bin | Bin 0 -> 369 bytes tests/data/dictionary-v2.inf | Bin 0 -> 974 bytes unitex/utils/formats.py | 27 +++++++++++++++++++-------- unitex/utils/types.py | 2 +- 5 files changed, 45 insertions(+), 10 deletions(-) create mode 100644 tests/data/dictionary-v2.bin create mode 100644 tests/data/dictionary-v2.inf diff --git a/tests/06_test_formats.py b/tests/06_test_formats.py index f9fd051..34f6fe9 100644 --- a/tests/06_test_formats.py +++ b/tests/06_test_formats.py @@ -18,8 +18,16 @@ class Arguments: self.__arguments["inf-v1"] = "data/dictionary-v1.inf" self.__arguments["enc-v1"] = "utf-16-le" + self.__arguments["bin-v2"] = "data/dictionary-v2.bin" + self.__arguments["inf-v2"] = "data/dictionary-v2.inf" + self.__arguments["enc-v2"] = "utf-16-le" + self.__arguments["grf"] = "data/automaton.grf" + self.__arguments["text-tfst"] = "data/text.tfst" + self.__arguments["text-tind"] = "data/text.tind" + self.__arguments["text-size"] = 2 + def __getitem__(self, key): if key not in self.__arguments: raise KeyError("Argument '%s' not found ..." % key) @@ -61,6 +69,9 @@ class TestUnitexUtils(unittest.TestCase): grf.add_path(path8.split()) grf.add_path(path9.split()) + grf.determinize() + grf.minimize() + grf.save(self._arguments["grf"]) self.assertTrue(os.path.exists(self._arguments["grf"]), "GRF creation failed!") @@ -72,7 +83,20 @@ class TestUnitexUtils(unittest.TestCase): ret = True if dictionary.find("Sébastien") else False - self.assertTrue(ret, "Dictionary lookup failed!") + self.assertTrue(ret, "Dictionary (old format) lookup failed!") + + def test_03_new_dictionary(self): + self.assertTrue(True, "Dictionary (new format) lookup failed!") + + def test_04_text_fst(self): + tfst = TextFST() + tfst.open(self._arguments["text-tfst"]) + + good = True if len(tfst) == self._arguments["text-size"] else False + + tfst.close() + + self.assertTrue(good, "Dictionary (new format) lookup failed!") if __name__ == '__main__': unittest.main() diff --git a/tests/data/dictionary-v2.bin b/tests/data/dictionary-v2.bin new file mode 100644 index 0000000000000000000000000000000000000000..b7bb7933f281113b85a6499401ce9a72c3926839 GIT binary patch literal 369 zcmWNN*-OJf7{v2UHrY+<RSyspDF}if2!anH2!f!9D54-m5K^<$ls1v1sUkk~!d}wW zwrJ4m-|EiOk9n9GhDg_SO`{W<hP04NqQz%WP9a!=zf3y3HOMsFb(A&`Zo=L|b{lQJ zLmHwz`1^1V;2pv}A_MZ2OoSKkF3F+<WCpog*mo$}C_0oB_O-n6b5WjVQHp4eg-@jy zTEKIaA*wUTCPUCw+*dqew1~nFqZDM-fKxa0M7n9f>l$ze$`~66VuNNND<-0v$?S>Y zznKU!h(1inu?aU;b0#Ld3Wwj|G*43`5DYlvki-4tD1LJm_IUKiNp-dboUYJ$x-Srp zgt{UY%&v)GA_(wW7W|GygkWf?eDzjrA($wxBv#K?u6DD>_h(K)zCL_0_m|~6X0gS+ Pcm7zl!k$%bBs8Uede%{+ literal 0 HcmV?d00001 diff --git a/tests/data/dictionary-v2.inf b/tests/data/dictionary-v2.inf new file mode 100644 index 0000000000000000000000000000000000000000..1a4d8e80f6c0d1c4db237d8846e236f0dbadf236 GIT binary patch literal 974 zcma)4%TB{E5S%j-U(yQoaX{o2k{&=P7>a@r;(+KwJ%mOraNv{q2w>K0$8kUbRjHEo zdS-TKzCK5Ye-n&xhyxs<zyR-<AVGzFR2#e6kX@zilFF^?$uYwXt2t_Hd2^2`AHnkX z60BH}+rn{=Q|$1H4PNjN_4$sBIL~;py(N}dclt!{2@D>IQW0lKJjFKDa*erDX81m1 z-@yK7e7kadI&o0GAV&2=i`>6t)!?u58gDqmMT0Q4J50EVid#{(peE;Z+lEh1Q|_;{ zJL>N%B7edJ8)7aResNH@E^gNLz3!lkKgzAnd$(e>eGF~o>bG~W?S0sQ@EC3>MMg)u z+?tV*^)jT)P^HXbY^oA^tKW0?eaFQ5w5I=<Kc00+ln>@`NB`T|RyI|bF#`q8so5(I aJ5e8xo0B;8Ve+mrIqBb3$lmYS5-s1nhl%<C literal 0 HcmV?d00001 diff --git a/unitex/utils/formats.py b/unitex/utils/formats.py index e312fe1..01383a5 100644 --- a/unitex/utils/formats.py +++ b/unitex/utils/formats.py @@ -12,6 +12,8 @@ from unitex.utils.types import Tag, Entry _LOGGER = logging.getLogger(__name__) +ENTRY = re.compile(r"{([^}]*)}") + class CompressedEntry(Entry): @@ -96,7 +98,7 @@ class CompressedEntry(Entry): -class OldCompiledDictionary: +class OldCompressedDictionary: INITIAL_STATE_OFFSET=4 INF_SEPARATOR=re.compile(r"(?<![\\]),") @@ -282,6 +284,14 @@ class OldCompiledDictionary: +class CompressedDictionary(OldCompressedDictionary): + + def __init__(self): + super(CompressedDictionary, self).__init__() + raise NotImplementedError + + + class GRF(Automaton): def __init__(self, name="GRF"): @@ -430,7 +440,7 @@ class SentenceFST(Automaton): p1 = labels[lid][1][0][0] p2 = labels[lid][1][1][0] - if not self.__labels.has_key(p1): + if p1 not in self.__labels: self.__labels[p1] = [] self.__labels[p1].append((entry, p2)) @@ -450,7 +460,7 @@ class TextFST: def __len__(self): return self.__size - def next(self): + def __next(self): line = self.__file.readline() while line: @@ -535,6 +545,7 @@ class TextFST: entry = Entry() if ENTRY.match(content): + content = ENTRY.sub(r"\1", content) entry.load(content) else: entry.set_form(content) @@ -562,15 +573,15 @@ class TextFST: line = line.rstrip() _LOGGER.debug("SENTENCE[%s]\n" % number) - _LOGGER.debug(" - offset: %s\n" % offset) + _LOGGER.debug(" - offset: (%s)\n" % ", ".join([str(i) for i in offset])) _LOGGER.debug(" - text: %s\n" % text) - _LOGGER.debug(" - tokens: %s\n" % tokens) + _LOGGER.debug(" - tokens: [%s]\n" % ", ".join([str(t) for t in tokens])) _LOGGER.debug(" - states:\n") for state in states: _LOGGER.debug(" - s: %s\n" % state) _LOGGER.debug(" - tags:\n") for tag in tags: - _LOGGER.debug(" - t: %s\n" % tag) + _LOGGER.debug(" - t: (%s)\n" % ", ".join([str(t) for t in tag])) S = SentenceFST("SENTENCE[%d]" % number) S.load(text, tokens, states, tags) @@ -578,11 +589,11 @@ class TextFST: return S def __iter__(self): - sentence = self.next() + sentence = self.__next() while sentence: yield sentence - sentence = self.next() + sentence = self.__next() def open(self, file, encoding=None): if encoding is None: diff --git a/unitex/utils/types.py b/unitex/utils/types.py index ce33eaf..c259e63 100644 --- a/unitex/utils/types.py +++ b/unitex/utils/types.py @@ -107,7 +107,7 @@ class Tag(object): class Entry(Tag): def __init__(self, entry=None): - super(Tag, self).__init__() + super(Entry, self).__init__() self.__form = "" self.__lemma = "" -- GitLab