diff --git a/tests/06_test_formats.py b/tests/06_test_formats.py index f9fd0512f0988ab523afcc3d394508451199ecd6..34f6fe9384e010c6ac9e7be8516864b30caadbe2 100644 --- a/tests/06_test_formats.py +++ b/tests/06_test_formats.py @@ -18,8 +18,16 @@ class Arguments: self.__arguments["inf-v1"] = "data/dictionary-v1.inf" self.__arguments["enc-v1"] = "utf-16-le" + self.__arguments["bin-v2"] = "data/dictionary-v2.bin" + self.__arguments["inf-v2"] = "data/dictionary-v2.inf" + self.__arguments["enc-v2"] = "utf-16-le" + self.__arguments["grf"] = "data/automaton.grf" + self.__arguments["text-tfst"] = "data/text.tfst" + self.__arguments["text-tind"] = "data/text.tind" + self.__arguments["text-size"] = 2 + def __getitem__(self, key): if key not in self.__arguments: raise KeyError("Argument '%s' not found ..." % key) @@ -61,6 +69,9 @@ class TestUnitexUtils(unittest.TestCase): grf.add_path(path8.split()) grf.add_path(path9.split()) + grf.determinize() + grf.minimize() + grf.save(self._arguments["grf"]) self.assertTrue(os.path.exists(self._arguments["grf"]), "GRF creation failed!") @@ -72,7 +83,20 @@ class TestUnitexUtils(unittest.TestCase): ret = True if dictionary.find("Sébastien") else False - self.assertTrue(ret, "Dictionary lookup failed!") + self.assertTrue(ret, "Dictionary (old format) lookup failed!") + + def test_03_new_dictionary(self): + self.assertTrue(True, "Dictionary (new format) lookup failed!") + + def test_04_text_fst(self): + tfst = TextFST() + tfst.open(self._arguments["text-tfst"]) + + good = True if len(tfst) == self._arguments["text-size"] else False + + tfst.close() + + self.assertTrue(good, "Dictionary (new format) lookup failed!") if __name__ == '__main__': unittest.main() diff --git a/tests/data/dictionary-v2.bin b/tests/data/dictionary-v2.bin new file mode 100644 index 0000000000000000000000000000000000000000..b7bb7933f281113b85a6499401ce9a72c3926839 Binary files /dev/null and b/tests/data/dictionary-v2.bin differ diff --git a/tests/data/dictionary-v2.inf b/tests/data/dictionary-v2.inf new file mode 100644 index 0000000000000000000000000000000000000000..1a4d8e80f6c0d1c4db237d8846e236f0dbadf236 Binary files /dev/null and b/tests/data/dictionary-v2.inf differ diff --git a/unitex/utils/formats.py b/unitex/utils/formats.py index e312fe127b3c325d34351b1a72842524feaaa2d5..01383a556f27d54ac119c8ee685936756afbcff5 100644 --- a/unitex/utils/formats.py +++ b/unitex/utils/formats.py @@ -12,6 +12,8 @@ from unitex.utils.types import Tag, Entry _LOGGER = logging.getLogger(__name__) +ENTRY = re.compile(r"{([^}]*)}") + class CompressedEntry(Entry): @@ -96,7 +98,7 @@ class CompressedEntry(Entry): -class OldCompiledDictionary: +class OldCompressedDictionary: INITIAL_STATE_OFFSET=4 INF_SEPARATOR=re.compile(r"(?<![\\]),") @@ -282,6 +284,14 @@ class OldCompiledDictionary: +class CompressedDictionary(OldCompressedDictionary): + + def __init__(self): + super(CompressedDictionary, self).__init__() + raise NotImplementedError + + + class GRF(Automaton): def __init__(self, name="GRF"): @@ -430,7 +440,7 @@ class SentenceFST(Automaton): p1 = labels[lid][1][0][0] p2 = labels[lid][1][1][0] - if not self.__labels.has_key(p1): + if p1 not in self.__labels: self.__labels[p1] = [] self.__labels[p1].append((entry, p2)) @@ -450,7 +460,7 @@ class TextFST: def __len__(self): return self.__size - def next(self): + def __next(self): line = self.__file.readline() while line: @@ -535,6 +545,7 @@ class TextFST: entry = Entry() if ENTRY.match(content): + content = ENTRY.sub(r"\1", content) entry.load(content) else: entry.set_form(content) @@ -562,15 +573,15 @@ class TextFST: line = line.rstrip() _LOGGER.debug("SENTENCE[%s]\n" % number) - _LOGGER.debug(" - offset: %s\n" % offset) + _LOGGER.debug(" - offset: (%s)\n" % ", ".join([str(i) for i in offset])) _LOGGER.debug(" - text: %s\n" % text) - _LOGGER.debug(" - tokens: %s\n" % tokens) + _LOGGER.debug(" - tokens: [%s]\n" % ", ".join([str(t) for t in tokens])) _LOGGER.debug(" - states:\n") for state in states: _LOGGER.debug(" - s: %s\n" % state) _LOGGER.debug(" - tags:\n") for tag in tags: - _LOGGER.debug(" - t: %s\n" % tag) + _LOGGER.debug(" - t: (%s)\n" % ", ".join([str(t) for t in tag])) S = SentenceFST("SENTENCE[%d]" % number) S.load(text, tokens, states, tags) @@ -578,11 +589,11 @@ class TextFST: return S def __iter__(self): - sentence = self.next() + sentence = self.__next() while sentence: yield sentence - sentence = self.next() + sentence = self.__next() def open(self, file, encoding=None): if encoding is None: diff --git a/unitex/utils/types.py b/unitex/utils/types.py index ce33eafd9b03c83097184298220964b92d420858..c259e63f5d45324e384a191452cb5e6dfa4c917f 100644 --- a/unitex/utils/types.py +++ b/unitex/utils/types.py @@ -107,7 +107,7 @@ class Tag(object): class Entry(Tag): def __init__(self, entry=None): - super(Tag, self).__init__() + super(Entry, self).__init__() self.__form = "" self.__lemma = ""