diff --git a/tests/06_test_formats.py b/tests/06_test_formats.py index 34f6fe9384e010c6ac9e7be8516864b30caadbe2..26f107754118575376ff5b598bdb9472784e641a 100644 --- a/tests/06_test_formats.py +++ b/tests/06_test_formats.py @@ -26,6 +26,7 @@ class Arguments: self.__arguments["text-tfst"] = "data/text.tfst" self.__arguments["text-tind"] = "data/text.tind" + self.__arguments["text-encoding"] = "utf-8" self.__arguments["text-size"] = 2 def __getitem__(self, key): @@ -76,7 +77,7 @@ class TestUnitexUtils(unittest.TestCase): self.assertTrue(os.path.exists(self._arguments["grf"]), "GRF creation failed!") def test_02_old_dictionary(self): - dictionary = OldCompiledDictionary() + dictionary = OldCompressedDictionary() dictionary.load(self._arguments["bin-v1"],\ self._arguments["inf-v1"],\ self._arguments["enc-v1"]) @@ -90,12 +91,11 @@ class TestUnitexUtils(unittest.TestCase): def test_04_text_fst(self): tfst = TextFST() - tfst.open(self._arguments["text-tfst"]) + tfst.load(self._arguments["text-tfst"], self._arguments["text-tind"],\ + self._arguments["text-encoding"]) good = True if len(tfst) == self._arguments["text-size"] else False - tfst.close() - self.assertTrue(good, "Dictionary (new format) lookup failed!") if __name__ == '__main__': diff --git a/unitex/processor.py b/unitex/processor.py index 50e06bbbbaf63b368e3c199eae808ba0e90ea3a7..47527a1e8fa54d8079a45187b217f4e68fcc99ea 100644 --- a/unitex/processor.py +++ b/unitex/processor.py @@ -391,6 +391,13 @@ class UnitexProcessor(object): self.__dir = None def tofst(self): + """ + This function builds the text automaton. + + *Return [TextFST]:* + + The function returns a TextFST object. 
+ """ pass def iter(self, grammar, **kwargs): diff --git a/unitex/tools.py b/unitex/tools.py index 0187ed0b9be180ccc75cea60b17cb21c3a75ec2e..21fc4f086255c7eb5f0420d38da378af6a0511e4 100644 --- a/unitex/tools.py +++ b/unitex/tools.py @@ -1213,13 +1213,13 @@ def txt2tfst(text, alphabet, **kwargs): - **clean [bool]** -- indicates whether the rule of conservation of the best paths (see section 7.2.4) should be applied (default: False). - + - **normalization_grammar [str]** -- name of a normalization grammar that is to be applied to the text automaton. - + - **tagset [str]** -- Elag tagset file to use to normalize dictionary entries. - + - **korean [bool]** -- tells the function that it works on Korean (default: False). diff --git a/unitex/utils/formats.py b/unitex/utils/formats.py index 01383a556f27d54ac119c8ee685936756afbcff5..7a6abb792c434b25a58944312d81c799a357a032 100644 --- a/unitex/utils/formats.py +++ b/unitex/utils/formats.py @@ -6,14 +6,15 @@ import logging import re import struct +from builtins import chr +from io import open + from unitex import UnitexException, UnitexConstants from unitex.utils.fsa import FSAConstants, Automaton -from unitex.utils.types import Tag, Entry +from unitex.utils.types import BRACKETED_ENTRY, Tag, Entry _LOGGER = logging.getLogger(__name__) -ENTRY = re.compile(r"{([^}]*)}") - class CompressedEntry(Entry): @@ -454,31 +455,39 @@ class SentenceFST(Automaton): class TextFST: def __init__(self): - self.__file = None + self.__tfst = None + self.__tind = None + self.__size = 0 + def __del__(self): + self.__tfst.close() + def __len__(self): - return self.__size + return len(self.__tind) + + def __getitem__(self, i): + position = self.__tind[i] - def __next(self): - line = self.__file.readline() + self.__tfst.seek(position) + line = self.__tfst.readline() while line: line = line.rstrip() if line[0] != "$": - raise UnitexException("File '%s' is corrupted ..." % self.__file.name) + raise UnitexException("File '%s' is corrupted ..." 
% self.__tfst.name) # The sentence number (format '$n') number = int(line[1:]) - line = self.__file.readline() + line = self.__tfst.readline() line = line.rstrip() # The text of the sentence text = line - line = self.__file.readline() + line = self.__tfst.readline() line = line.rstrip() # The tokens of the text @@ -488,7 +497,7 @@ class TextFST: # - y: length of the token (in characters) tokens = [tuple(int(t) for t in token.split("/")) for token in line.split(" ")] - line = self.__file.readline() + line = self.__tfst.readline() line = line.rstrip() # The offset of the sentence (from the begining of the text) @@ -498,13 +507,13 @@ class TextFST: # - Y: the offset in characters offset = tuple(int(o) for o in line.split("_")) - line = self.__file.readline() + line = self.__tfst.readline() line = line.rstrip() states = [] while line != "t": if line[0] != ":": - raise UnitexException("File '%s' is corrupted ..." % self.__file.name) + raise UnitexException("File '%s' is corrupted ..." % self.__tfst.name) line = line[1:].strip() line = line.split() @@ -514,21 +523,21 @@ class TextFST: state.append((int(line[i]), int(line[i+1]))) states.append(state) - line = self.__file.readline() + line = self.__tfst.readline() line = line.rstrip() if not line: - raise UnitexException("File '%s' is corrupted ..." % self.__file.name) + raise UnitexException("File '%s' is corrupted ..." % self.__tfst.name) states.append(line) - line = self.__file.readline() + line = self.__tfst.readline() line = line.rstrip() if line[0] != "f": - raise UnitexException("File '%s' is corrupted ..." % self.__file.name) + raise UnitexException("File '%s' is corrupted ..." 
% self.__tfst.name) - line = self.__file.readline() + line = self.__tfst.readline() line = line.rstrip() tags = [] @@ -537,39 +546,39 @@ class TextFST: tags.append(("<E>", None)) elif line == "@STD": - line = self.__file.readline() + line = self.__tfst.readline() line = line.rstrip() content = line[1:] entry = Entry() - if ENTRY.match(content): - content = ENTRY.sub(r"\1", content) + if BRACKETED_ENTRY.match(content): + content = BRACKETED_ENTRY.sub(r"\1", content) entry.load(content) else: entry.set_form(content) - line = self.__file.readline() + line = self.__tfst.readline() line = line.rstrip() if line[0] != "@": - raise UnitexException("File '%s' is corrupted ..." % self.__file.name) + raise UnitexException("File '%s' is corrupted ..." % self.__tfst.name) position = [tuple(int(i) for i in p.split(".")) for p in line[1:].split("-")] tags.append((entry, position)) else: - raise UnitexException("File '%s' is corrupted ..." % self.__file.name) + raise UnitexException("File '%s' is corrupted ..." % self.__tfst.name) - line = self.__file.readline() + line = self.__tfst.readline() line = line.rstrip() if line[0] != ".": - raise UnitexException("File '%s' is corrupted ..." % self.__file.name) + raise UnitexException("File '%s' is corrupted ..." 
% self.__tfst.name) - line = self.__file.readline() + line = self.__tfst.readline() line = line.rstrip() _LOGGER.debug("SENTENCE[%s]\n" % number) @@ -589,24 +598,27 @@ class TextFST: return S def __iter__(self): - sentence = self.__next() - while sentence: - yield sentence + for i in range(len(self)): + yield self[i] - sentence = self.__next() - - def open(self, file, encoding=None): + def load(self, fst, index, encoding=None): if encoding is None: encoding = UnitexConstants.DEFAULT_ENCODING - self.__file = open(file, "r", encoding=encoding) + self.__tfst = open(fst, "r", encoding=encoding) - line = self.__file.readline() + line = self.__tfst.readline() line = line.rstrip() # The number of sentence in the text fst (format: '000000000N') self.__size = int(line) - def close(self): - self.__file.close() - self.__size = 0 + self.__tind = [] + + with open(index, "rb") as fin: + i = fin.read(4) + while i: + position = struct.unpack("<L", i) + self.__tind.append(position[0]) + + i = fin.read(4) diff --git a/unitex/utils/types.py b/unitex/utils/types.py index c259e63f5d45324e384a191452cb5e6dfa4c917f..167ad7ff8becc57c75c92a278f74303d7365f6cb 100644 --- a/unitex/utils/types.py +++ b/unitex/utils/types.py @@ -8,19 +8,18 @@ from unitex import UnitexException _LOGGER = logging.getLogger(__name__) +BRACKETED_ENTRY = re.compile(r"{([^}]*)}") + class Tag(object): - def __init__(self, tag=None): + def __init__(self): self.__pos = "" self.__features = [] self.__flexions = [] - if tag is not None: - self.load(tag) - def __str__(self): return self.get() @@ -106,19 +105,18 @@ class Tag(object): class Entry(Tag): - def __init__(self, entry=None): + def __init__(self): super(Entry, self).__init__() self.__form = "" self.__lemma = "" - if entry is not None: - self.load(entry) - def __str__(self): return self.get() - def load(self, entry): + def load(self, entry, bracketed=False): + if bracketed is True: + entry = BRACKETED_ENTRY.sub(r"\1", entry) i = 0 escaped = False @@ -177,15 +175,19 
@@ class Entry(Tag): Tag.load(self, entry[i:]) - def get(self): + def get(self, bracketed=False): form = self.get_form(escape=True) + lemma = self.get_lemma(escape=True) if not lemma: - lemma = "" + lemma = form tag = Tag.get(self) - return "%s,%s.%s" % (form, lemma, tag) + if bracketed is True: + return "{%s,%s.%s}" % (form, lemma, tag) + else: + return "%s,%s.%s" % (form, lemma, tag) def set_form(self, form): self.__form = form