Skip to content
Extraits de code Groupes Projets
Valider b7d50378 rédigé par pat's avatar pat
Parcourir les fichiers

integration of the tind file (for iteration) in TextFST object

parent e0a576d1
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
...@@ -26,6 +26,7 @@ class Arguments: ...@@ -26,6 +26,7 @@ class Arguments:
self.__arguments["text-tfst"] = "data/text.tfst" self.__arguments["text-tfst"] = "data/text.tfst"
self.__arguments["text-tind"] = "data/text.tind" self.__arguments["text-tind"] = "data/text.tind"
self.__arguments["text-encoding"] = "utf-8"
self.__arguments["text-size"] = 2 self.__arguments["text-size"] = 2
def __getitem__(self, key): def __getitem__(self, key):
...@@ -76,7 +77,7 @@ class TestUnitexUtils(unittest.TestCase): ...@@ -76,7 +77,7 @@ class TestUnitexUtils(unittest.TestCase):
self.assertTrue(os.path.exists(self._arguments["grf"]), "GRF creation failed!") self.assertTrue(os.path.exists(self._arguments["grf"]), "GRF creation failed!")
def test_02_old_dictionary(self): def test_02_old_dictionary(self):
dictionary = OldCompiledDictionary() dictionary = OldCompressedDictionary()
dictionary.load(self._arguments["bin-v1"],\ dictionary.load(self._arguments["bin-v1"],\
self._arguments["inf-v1"],\ self._arguments["inf-v1"],\
self._arguments["enc-v1"]) self._arguments["enc-v1"])
...@@ -90,12 +91,11 @@ class TestUnitexUtils(unittest.TestCase): ...@@ -90,12 +91,11 @@ class TestUnitexUtils(unittest.TestCase):
def test_04_text_fst(self): def test_04_text_fst(self):
tfst = TextFST() tfst = TextFST()
tfst.open(self._arguments["text-tfst"]) tfst.load(self._arguments["text-tfst"], self._arguments["text-tind"],\
self._arguments["text-encoding"])
good = True if len(tfst) == self._arguments["text-size"] else False good = True if len(tfst) == self._arguments["text-size"] else False
tfst.close()
self.assertTrue(good, "Dictionary (new format) lookup failed!") self.assertTrue(good, "Dictionary (new format) lookup failed!")
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -391,6 +391,13 @@ class UnitexProcessor(object): ...@@ -391,6 +391,13 @@ class UnitexProcessor(object):
self.__dir = None self.__dir = None
def tofst(self): def tofst(self):
"""
This function builds the text automaton.
*Return [TextFST]:*
The function returns a TextFST object.
"""
pass pass
def iter(self, grammar, **kwargs): def iter(self, grammar, **kwargs):
......
...@@ -1213,13 +1213,13 @@ def txt2tfst(text, alphabet, **kwargs): ...@@ -1213,13 +1213,13 @@ def txt2tfst(text, alphabet, **kwargs):
- **clean [bool]** -- indicates whether the rule of conservation of - **clean [bool]** -- indicates whether the rule of conservation of
the best paths (see section 7.2.4) should be applied the best paths (see section 7.2.4) should be applied
(default: False). (default: False).
- **normalization_grammar [str]** -- name of a normalization grammar - **normalization_grammar [str]** -- name of a normalization grammar
that is to be applied to the text automaton. that is to be applied to the text automaton.
- **tagset [str]** -- Elag tagset file to use to normalize - **tagset [str]** -- Elag tagset file to use to normalize
dictionary entries. dictionary entries.
- **korean [bool]** -- tells the function that it works on Korean - **korean [bool]** -- tells the function that it works on Korean
(default: False). (default: False).
......
...@@ -6,14 +6,15 @@ import logging ...@@ -6,14 +6,15 @@ import logging
import re import re
import struct import struct
from builtins import chr
from io import open
from unitex import UnitexException, UnitexConstants from unitex import UnitexException, UnitexConstants
from unitex.utils.fsa import FSAConstants, Automaton from unitex.utils.fsa import FSAConstants, Automaton
from unitex.utils.types import Tag, Entry from unitex.utils.types import BRACKETED_ENTRY, Tag, Entry
_LOGGER = logging.getLogger(__name__) _LOGGER = logging.getLogger(__name__)
ENTRY = re.compile(r"{([^}]*)}")
class CompressedEntry(Entry): class CompressedEntry(Entry):
...@@ -454,31 +455,39 @@ class SentenceFST(Automaton): ...@@ -454,31 +455,39 @@ class SentenceFST(Automaton):
class TextFST: class TextFST:
def __init__(self): def __init__(self):
self.__file = None self.__tfst = None
self.__tind = None
self.__size = 0 self.__size = 0
def __del__(self):
self.__tfst.close()
def __len__(self): def __len__(self):
return self.__size return len(self.__tind)
def __getitem__(self, i):
position = self.__tind[i]
def __next(self): self.__tfst.seek(position)
line = self.__file.readline()
line = self.__tfst.readline()
while line: while line:
line = line.rstrip() line = line.rstrip()
if line[0] != "$": if line[0] != "$":
raise UnitexException("File '%s' is corrupted ..." % self.__file.name) raise UnitexException("File '%s' is corrupted ..." % self.__tfst.name)
# The sentence number (format '$n') # The sentence number (format '$n')
number = int(line[1:]) number = int(line[1:])
line = self.__file.readline() line = self.__tfst.readline()
line = line.rstrip() line = line.rstrip()
# The text of the sentence # The text of the sentence
text = line text = line
line = self.__file.readline() line = self.__tfst.readline()
line = line.rstrip() line = line.rstrip()
# The tokens of the text # The tokens of the text
...@@ -488,7 +497,7 @@ class TextFST: ...@@ -488,7 +497,7 @@ class TextFST:
# - y: length of the token (in characters) # - y: length of the token (in characters)
tokens = [tuple(int(t) for t in token.split("/")) for token in line.split(" ")] tokens = [tuple(int(t) for t in token.split("/")) for token in line.split(" ")]
line = self.__file.readline() line = self.__tfst.readline()
line = line.rstrip() line = line.rstrip()
# The offset of the sentence (from the beginning of the text) # The offset of the sentence (from the beginning of the text)
...@@ -498,13 +507,13 @@ class TextFST: ...@@ -498,13 +507,13 @@ class TextFST:
# - Y: the offset in characters # - Y: the offset in characters
offset = tuple(int(o) for o in line.split("_")) offset = tuple(int(o) for o in line.split("_"))
line = self.__file.readline() line = self.__tfst.readline()
line = line.rstrip() line = line.rstrip()
states = [] states = []
while line != "t": while line != "t":
if line[0] != ":": if line[0] != ":":
raise UnitexException("File '%s' is corrupted ..." % self.__file.name) raise UnitexException("File '%s' is corrupted ..." % self.__tfst.name)
line = line[1:].strip() line = line[1:].strip()
line = line.split() line = line.split()
...@@ -514,21 +523,21 @@ class TextFST: ...@@ -514,21 +523,21 @@ class TextFST:
state.append((int(line[i]), int(line[i+1]))) state.append((int(line[i]), int(line[i+1])))
states.append(state) states.append(state)
line = self.__file.readline() line = self.__tfst.readline()
line = line.rstrip() line = line.rstrip()
if not line: if not line:
raise UnitexException("File '%s' is corrupted ..." % self.__file.name) raise UnitexException("File '%s' is corrupted ..." % self.__tfst.name)
states.append(line) states.append(line)
line = self.__file.readline() line = self.__tfst.readline()
line = line.rstrip() line = line.rstrip()
if line[0] != "f": if line[0] != "f":
raise UnitexException("File '%s' is corrupted ..." % self.__file.name) raise UnitexException("File '%s' is corrupted ..." % self.__tfst.name)
line = self.__file.readline() line = self.__tfst.readline()
line = line.rstrip() line = line.rstrip()
tags = [] tags = []
...@@ -537,39 +546,39 @@ class TextFST: ...@@ -537,39 +546,39 @@ class TextFST:
tags.append(("<E>", None)) tags.append(("<E>", None))
elif line == "@STD": elif line == "@STD":
line = self.__file.readline() line = self.__tfst.readline()
line = line.rstrip() line = line.rstrip()
content = line[1:] content = line[1:]
entry = Entry() entry = Entry()
if ENTRY.match(content): if BRACKETED_ENTRY.match(content):
content = ENTRY.sub(r"\1", content) content = BRACKETED_ENTRY.sub(r"\1", content)
entry.load(content) entry.load(content)
else: else:
entry.set_form(content) entry.set_form(content)
line = self.__file.readline() line = self.__tfst.readline()
line = line.rstrip() line = line.rstrip()
if line[0] != "@": if line[0] != "@":
raise UnitexException("File '%s' is corrupted ..." % self.__file.name) raise UnitexException("File '%s' is corrupted ..." % self.__tfst.name)
position = [tuple(int(i) for i in p.split(".")) for p in line[1:].split("-")] position = [tuple(int(i) for i in p.split(".")) for p in line[1:].split("-")]
tags.append((entry, position)) tags.append((entry, position))
else: else:
raise UnitexException("File '%s' is corrupted ..." % self.__file.name) raise UnitexException("File '%s' is corrupted ..." % self.__tfst.name)
line = self.__file.readline() line = self.__tfst.readline()
line = line.rstrip() line = line.rstrip()
if line[0] != ".": if line[0] != ".":
raise UnitexException("File '%s' is corrupted ..." % self.__file.name) raise UnitexException("File '%s' is corrupted ..." % self.__tfst.name)
line = self.__file.readline() line = self.__tfst.readline()
line = line.rstrip() line = line.rstrip()
_LOGGER.debug("SENTENCE[%s]\n" % number) _LOGGER.debug("SENTENCE[%s]\n" % number)
...@@ -589,24 +598,27 @@ class TextFST: ...@@ -589,24 +598,27 @@ class TextFST:
return S return S
def __iter__(self): def __iter__(self):
sentence = self.__next() for i in range(len(self)):
while sentence: yield self[i]
yield sentence
sentence = self.__next() def load(self, fst, index, encoding=None):
def open(self, file, encoding=None):
if encoding is None: if encoding is None:
encoding = UnitexConstants.DEFAULT_ENCODING encoding = UnitexConstants.DEFAULT_ENCODING
self.__file = open(file, "r", encoding=encoding) self.__tfst = open(fst, "r", encoding=encoding)
line = self.__file.readline() line = self.__tfst.readline()
line = line.rstrip() line = line.rstrip()
# The number of sentences in the text fst (format: '000000000N') # The number of sentences in the text fst (format: '000000000N')
self.__size = int(line) self.__size = int(line)
def close(self): self.__tind = []
self.__file.close()
self.__size = 0 with open(index, "rb") as fin:
i = fin.read(4)
while i:
position = struct.unpack("<L", i)
self.__tind.append(position[0])
i = fin.read(4)
...@@ -8,19 +8,18 @@ from unitex import UnitexException ...@@ -8,19 +8,18 @@ from unitex import UnitexException
_LOGGER = logging.getLogger(__name__) _LOGGER = logging.getLogger(__name__)
BRACKETED_ENTRY = re.compile(r"{([^}]*)}")
class Tag(object): class Tag(object):
def __init__(self, tag=None): def __init__(self):
self.__pos = "" self.__pos = ""
self.__features = [] self.__features = []
self.__flexions = [] self.__flexions = []
if tag is not None:
self.load(tag)
def __str__(self): def __str__(self):
return self.get() return self.get()
...@@ -106,19 +105,18 @@ class Tag(object): ...@@ -106,19 +105,18 @@ class Tag(object):
class Entry(Tag): class Entry(Tag):
def __init__(self, entry=None): def __init__(self):
super(Entry, self).__init__() super(Entry, self).__init__()
self.__form = "" self.__form = ""
self.__lemma = "" self.__lemma = ""
if entry is not None:
self.load(entry)
def __str__(self): def __str__(self):
return self.get() return self.get()
def load(self, entry): def load(self, entry, bracketed=False):
if bracketed is True:
entry = BRACKETED_ENTRY.sub(r"\1", entry)
i = 0 i = 0
escaped = False escaped = False
...@@ -177,15 +175,19 @@ class Entry(Tag): ...@@ -177,15 +175,19 @@ class Entry(Tag):
Tag.load(self, entry[i:]) Tag.load(self, entry[i:])
def get(self): def get(self, bracketed=False):
form = self.get_form(escape=True) form = self.get_form(escape=True)
lemma = self.get_lemma(escape=True) lemma = self.get_lemma(escape=True)
if not lemma: if not lemma:
lemma = "" lemma = form
tag = Tag.get(self) tag = Tag.get(self)
return "%s,%s.%s" % (form, lemma, tag) if bracketed is True:
return "{%s,%s.%s}" % (form, lemma, tag)
else:
return "%s,%s.%s" % (form, lemma, tag)
def set_form(self, form): def set_form(self, form):
self.__form = form self.__form = form
......
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous connecter pour commenter