Testing TextFST

e0a576d1 · pat · f045a208 · e0a576d1 · e0a576d1 · e0a576d1
--- a/tests/06_test_formats.py
+++ b/tests/06_test_formats.py
@@ -18,8 +18,16 @@ class Arguments:
        self.__arguments["inf-v1"] = "data/dictionary-v1.inf"
        self.__arguments["enc-v1"] = "utf-16-le"
+        self.__arguments["bin-v2"] = "data/dictionary-v2.bin"
+        self.__arguments["inf-v2"] = "data/dictionary-v2.inf"
+        self.__arguments["enc-v2"] = "utf-16-le"
        self.__arguments["grf"] = "data/automaton.grf"
+        self.__arguments["text-tfst"] = "data/text.tfst"
+        self.__arguments["text-tind"] = "data/text.tind"
+        self.__arguments["text-size"] = 2
    def __getitem__(self, key):
        if key not in self.__arguments:
            raise KeyError("Argument '%s' not found ..." % key)
@@ -61,6 +69,9 @@ class TestUnitexUtils(unittest.TestCase):
        grf.add_path(path8.split())
        grf.add_path(path9.split())
+        grf.determinize()
+        grf.minimize()
        grf.save(self._arguments["grf"])
        self.assertTrue(os.path.exists(self._arguments["grf"]), "GRF creation failed!")
@@ -72,7 +83,20 @@ class TestUnitexUtils(unittest.TestCase):
        ret = True if dictionary.find("Sébastien") else False
-        self.assertTrue(ret, "Dictionary lookup failed!")
+        self.assertTrue(ret, "Dictionary (old format) lookup failed!")
+    def test_03_new_dictionary(self):
+        # Placeholder: nothing is looked up in a new-format dictionary yet, so
+        # report the test as skipped instead of letting it pass unconditionally
+        # (an always-true assertion would claim coverage that does not exist).
+        self.skipTest("Dictionary (new format) lookup is not implemented yet!")
+    def test_04_text_fst(self):
+        # Open the sample text automaton and check that the length it reports
+        # matches the expected "text-size" argument.
+        tfst = TextFST()
+        tfst.open(self._arguments["text-tfst"])
+        try:
+            size = len(tfst)
+        finally:
+            # Always close, even if len() raises.
+            tfst.close()
+        # assertEqual reports both values on failure; the message now names
+        # the component under test instead of the dictionary test's message.
+        self.assertEqual(size, self._arguments["text-size"], "TextFST loading failed!")
 if __name__ == '__main__':
    unittest.main()
--- a/tests/data/dictionary-v2.bin
+++ b/tests/data/dictionary-v2.bin
--- a/tests/data/dictionary-v2.inf
+++ b/tests/data/dictionary-v2.inf
--- a/unitex/utils/formats.py
+++ b/unitex/utils/formats.py
@@ -12,6 +12,8 @@ from unitex.utils.types import Tag, Entry
 _LOGGER = logging.getLogger(__name__)
+ENTRY = re.compile(r"{([^}]*)}")
 class CompressedEntry(Entry):
@@ -96,7 +98,7 @@ class CompressedEntry(Entry):
-class OldCompiledDictionary:
+class OldCompressedDictionary:
    INITIAL_STATE_OFFSET=4
    INF_SEPARATOR=re.compile(r"(?<![\\]),")
@@ -282,6 +284,14 @@ class OldCompiledDictionary:
+class CompressedDictionary(OldCompressedDictionary):
+    """Stub subclass of OldCompressedDictionary.
+
+    Not implemented yet: instantiation always raises NotImplementedError.
+    """
+    def __init__(self):
+        super(CompressedDictionary, self).__init__()
+        # Fail loudly with an explicit message so callers know the class is a
+        # stub rather than hitting an anonymous NotImplementedError.
+        raise NotImplementedError("CompressedDictionary is not implemented yet.")
 class GRF(Automaton):
    def __init__(self, name="GRF"):
@@ -430,7 +440,7 @@ class SentenceFST(Automaton):
                p1 = labels[lid][1][0][0]
                p2 = labels[lid][1][1][0]
-                if not self.__labels.has_key(p1):
+                if p1 not in self.__labels:
                    self.__labels[p1] = []
                self.__labels[p1].append((entry, p2))
@@ -450,7 +460,7 @@ class TextFST:
    def __len__(self):
        return self.__size
-    def next(self):
+    def __next(self):
        line = self.__file.readline()
        while line:
@@ -535,6 +545,7 @@ class TextFST:
                    entry = Entry()
                    if ENTRY.match(content):
+                        content = ENTRY.sub(r"\1", content)
                        entry.load(content)
                    else:
                        entry.set_form(content)
@@ -562,15 +573,15 @@ class TextFST:
                line = line.rstrip()
            _LOGGER.debug("SENTENCE[%s]\n" % number)
-            _LOGGER.debug(" - offset: %s\n" % offset)
+            _LOGGER.debug(" - offset: (%s)\n" % ", ".join([str(i) for i in offset]))
            _LOGGER.debug(" - text: %s\n" % text)
-            _LOGGER.debug(" - tokens: %s\n" % tokens)
+            _LOGGER.debug(" - tokens: [%s]\n" % ", ".join([str(t) for t in tokens]))
            _LOGGER.debug(" - states:\n")
            for state in states:
                _LOGGER.debug("   - s: %s\n" % state)
            _LOGGER.debug(" - tags:\n")
            for tag in tags:
-                _LOGGER.debug("   - t: %s\n" % tag)
+                _LOGGER.debug("   - t: (%s)\n" % ", ".join([str(t) for t in tag]))
            S = SentenceFST("SENTENCE[%d]" % number)
            S.load(text, tokens, states, tags)
@@ -578,11 +589,11 @@ class TextFST:
            return S
    def __iter__(self):
-        sentence = self.next()
+        sentence = self.__next()
        while sentence:
            yield sentence
-            sentence = self.next()
+            sentence = self.__next()
    def open(self, file, encoding=None):
        if encoding is None:

--- a/unitex/utils/types.py
+++ b/unitex/utils/types.py
@@ -107,7 +107,7 @@ class Tag(object):
 class Entry(Tag):
    def __init__(self, entry=None):
-        super(Tag, self).__init__()
+        super(Entry, self).__init__()
        self.__form = ""
        self.__lemma = ""