integration of the tind file (for iteration) in the TextFST object

b7d50378 · pat · e0a576d1 · b7d50378 · b7d50378 · b7d50378
--- a/tests/06_test_formats.py
+++ b/tests/06_test_formats.py
@@ -26,6 +26,7 @@ class Arguments:
        self.__arguments["text-tfst"] = "data/text.tfst"
        self.__arguments["text-tind"] = "data/text.tind"
+        self.__arguments["text-encoding"] = "utf-8"
        self.__arguments["text-size"] = 2
    def __getitem__(self, key):
@@ -76,7 +77,7 @@ class TestUnitexUtils(unittest.TestCase):
        self.assertTrue(os.path.exists(self._arguments["grf"]), "GRF creation failed!")
    def test_02_old_dictionary(self):
-        dictionary = OldCompiledDictionary()
+        dictionary = OldCompressedDictionary()
        dictionary.load(self._arguments["bin-v1"],\
                        self._arguments["inf-v1"],\
                        self._arguments["enc-v1"])
@@ -90,12 +91,11 @@ class TestUnitexUtils(unittest.TestCase):
    def test_04_text_fst(self):
        tfst = TextFST()
-        tfst.open(self._arguments["text-tfst"])
+        tfst.load(self._arguments["text-tfst"], self._arguments["text-tind"],\
+                  self._arguments["text-encoding"])
        good = True if len(tfst) == self._arguments["text-size"] else False
-        tfst.close()
        self.assertTrue(good, "Dictionary (new format) lookup failed!")
 if __name__ == '__main__':

--- a/unitex/processor.py
+++ b/unitex/processor.py
@@ -391,6 +391,13 @@ class UnitexProcessor(object):
        self.__dir = None
    def tofst(self):
+        """
+        This function builds the text automaton.
+        *Return [TextFST]:*
+          The function returns a TextFST object.
+        """
        pass
    def iter(self, grammar, **kwargs):

--- a/unitex/tools.py
+++ b/unitex/tools.py
@@ -1213,13 +1213,13 @@ def txt2tfst(text, alphabet, **kwargs):
    - **clean [bool]** -- indicates whether the rule of conservation of
      the best paths (see section 7.2.4) should be applied
      (default: False).
    - **normalization_grammar [str]** -- name of a normalization grammar
      that is to be applied to the text automaton.
    - **tagset [str]** -- Elag tagset file to use to normalize
      dictionary entries.
    - **korean [bool]** -- tells the function that it works on Korean
      (default: False).

--- a/unitex/utils/formats.py
+++ b/unitex/utils/formats.py
@@ -6,14 +6,15 @@ import logging
 import re
 import struct
+from builtins import chr
+from io import open
 from unitex import UnitexException, UnitexConstants
 from unitex.utils.fsa import FSAConstants, Automaton
-from unitex.utils.types import Tag, Entry
+from unitex.utils.types import BRACKETED_ENTRY, Tag, Entry
 _LOGGER = logging.getLogger(__name__)
-ENTRY = re.compile(r"{([^}]*)}")
 class CompressedEntry(Entry):
@@ -454,31 +455,39 @@ class SentenceFST(Automaton):
 class TextFST:
    def __init__(self):
-        self.__file = None
+        self.__tfst = None
+        self.__tind = None
        self.__size = 0
+    def __del__(self):
+        self.__tfst.close()
    def __len__(self):
-        return self.__size
+        return len(self.__tind)
+    def __getitem__(self, i):
+        position = self.__tind[i]
-    def __next(self):
+        self.__tfst.seek(position)
-        line = self.__file.readline()
+        line = self.__tfst.readline()
        while line:
            line = line.rstrip()
            if line[0] != "$":
-                raise UnitexException("File '%s' is corrupted ..." % self.__file.name)
+                raise UnitexException("File '%s' is corrupted ..." % self.__tfst.name)
            # The sentence number (format '$n')
            number = int(line[1:])
-            line = self.__file.readline()
+            line = self.__tfst.readline()
            line = line.rstrip()
            # The text of the sentence
            text = line
-            line = self.__file.readline()
+            line = self.__tfst.readline()
            line = line.rstrip()
            # The tokens of the text
@@ -488,7 +497,7 @@ class TextFST:
            #        - y: length of the token (in characters)
            tokens = [tuple(int(t) for t in token.split("/")) for token in line.split(" ")]
-            line = self.__file.readline()
+            line = self.__tfst.readline()
            line = line.rstrip()
            # The offset of the sentence (from the begining of the text)
@@ -498,13 +507,13 @@ class TextFST:
            #        - Y: the offset in characters
            offset = tuple(int(o) for o in line.split("_"))
-            line = self.__file.readline()
+            line = self.__tfst.readline()
            line = line.rstrip()
            states = []
            while line != "t":
                if line[0] != ":":
-                    raise UnitexException("File '%s' is corrupted ..." % self.__file.name)
+                    raise UnitexException("File '%s' is corrupted ..." % self.__tfst.name)
                line = line[1:].strip()
                line = line.split()
@@ -514,21 +523,21 @@ class TextFST:
                    state.append((int(line[i]), int(line[i+1])))
                states.append(state)
-                line = self.__file.readline()
+                line = self.__tfst.readline()
                line = line.rstrip()
                if not line:
-                    raise UnitexException("File '%s' is corrupted ..." % self.__file.name)
+                    raise UnitexException("File '%s' is corrupted ..." % self.__tfst.name)
            states.append(line)
-            line = self.__file.readline()
+            line = self.__tfst.readline()
            line = line.rstrip()
            if line[0] != "f":
-                raise UnitexException("File '%s' is corrupted ..." % self.__file.name)
+                raise UnitexException("File '%s' is corrupted ..." % self.__tfst.name)
-            line = self.__file.readline()
+            line = self.__tfst.readline()
            line = line.rstrip()
            tags = []
@@ -537,39 +546,39 @@ class TextFST:
                    tags.append(("<E>", None))
                elif line == "@STD":
-                    line = self.__file.readline()
+                    line = self.__tfst.readline()
                    line = line.rstrip()
                    content = line[1:]
                    entry = Entry()
-                    if ENTRY.match(content):
+                    if BRACKETED_ENTRY.match(content):
-                        content = ENTRY.sub(r"\1", content)
+                        content = BRACKETED_ENTRY.sub(r"\1", content)
                        entry.load(content)
                    else:
                        entry.set_form(content)
-                    line = self.__file.readline()
+                    line = self.__tfst.readline()
                    line = line.rstrip()
                    if line[0] != "@":
-                        raise UnitexException("File '%s' is corrupted ..." % self.__file.name)
+                        raise UnitexException("File '%s' is corrupted ..." % self.__tfst.name)
                    position = [tuple(int(i) for i in p.split(".")) for p in line[1:].split("-")]
                    tags.append((entry, position))
                else:
-                    raise UnitexException("File '%s' is corrupted ..." % self.__file.name)
+                    raise UnitexException("File '%s' is corrupted ..." % self.__tfst.name)
-                line = self.__file.readline()
+                line = self.__tfst.readline()
                line = line.rstrip()
                if line[0] != ".":
-                    raise UnitexException("File '%s' is corrupted ..." % self.__file.name)
+                    raise UnitexException("File '%s' is corrupted ..." % self.__tfst.name)
-                line = self.__file.readline()
+                line = self.__tfst.readline()
                line = line.rstrip()
            _LOGGER.debug("SENTENCE[%s]\n" % number)
@@ -589,24 +598,27 @@ class TextFST:
            return S
    def __iter__(self):
-        sentence = self.__next()
+        for i in range(len(self)):
-        while sentence:
+            yield self[i]
-            yield sentence
-            sentence = self.__next()
+    def load(self, fst, index, encoding=None):
-    def open(self, file, encoding=None):
        if encoding is None:
            encoding = UnitexConstants.DEFAULT_ENCODING
-        self.__file = open(file, "r", encoding=encoding)
+        self.__tfst = open(fst, "r", encoding=encoding)
-        line = self.__file.readline()
+        line = self.__tfst.readline()
        line = line.rstrip()
        # The number of sentence in the text fst (format: '000000000N')
        self.__size = int(line)
-    def close(self):
+        self.__tind = []
-        self.__file.close()
-        self.__size = 0
+        with open(index, "rb") as fin:
+            i = fin.read(4)
+            while i:
+                position = struct.unpack("<L", i)
+                self.__tind.append(position[0])
+                i = fin.read(4)
--- a/unitex/utils/types.py
+++ b/unitex/utils/types.py
@@ -8,19 +8,18 @@ from unitex import UnitexException
 _LOGGER = logging.getLogger(__name__)
+BRACKETED_ENTRY = re.compile(r"{([^}]*)}")
 class Tag(object):
-    def __init__(self, tag=None):
+    def __init__(self):
        self.__pos = ""
        self.__features = []
        self.__flexions = []
-        if tag is not None:
-            self.load(tag)
    def __str__(self):
        return self.get()
@@ -106,19 +105,18 @@ class Tag(object):
 class Entry(Tag):
-    def __init__(self, entry=None):
+    def __init__(self):
        super(Entry, self).__init__()
        self.__form = ""
        self.__lemma = ""
-        if entry is not None:
-            self.load(entry)
    def __str__(self):
        return self.get()
-    def load(self, entry):
+    def load(self, entry, bracketed=False):
+        if bracketed is True:
+            entry = BRACKETED_ENTRY.sub(r"\1", entry)
        i = 0
        escaped = False
@@ -177,15 +175,19 @@ class Entry(Tag):
        Tag.load(self, entry[i:])
-    def get(self):
+    def get(self, bracketed=False):
        form = self.get_form(escape=True)
        lemma = self.get_lemma(escape=True)
        if not lemma:
-            lemma = ""
+            lemma = form
        tag = Tag.get(self)
-        return "%s,%s.%s" % (form, lemma, tag)
+        if bracketed is True:
+            return "{%s,%s.%s}" % (form, lemma, tag)
+        else:
+            return "%s,%s.%s" % (form, lemma, tag)
    def set_form(self, form):
        self.__form = form