Skip to content
Extraits de code Groupes Projets
Valider e0a576d1 rédigé par pat
Parcourir les fichiers

Testing TextFST

parent f045a208
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
...@@ -18,8 +18,16 @@ class Arguments: ...@@ -18,8 +18,16 @@ class Arguments:
self.__arguments["inf-v1"] = "data/dictionary-v1.inf" self.__arguments["inf-v1"] = "data/dictionary-v1.inf"
self.__arguments["enc-v1"] = "utf-16-le" self.__arguments["enc-v1"] = "utf-16-le"
self.__arguments["bin-v2"] = "data/dictionary-v2.bin"
self.__arguments["inf-v2"] = "data/dictionary-v2.inf"
self.__arguments["enc-v2"] = "utf-16-le"
self.__arguments["grf"] = "data/automaton.grf" self.__arguments["grf"] = "data/automaton.grf"
self.__arguments["text-tfst"] = "data/text.tfst"
self.__arguments["text-tind"] = "data/text.tind"
self.__arguments["text-size"] = 2
def __getitem__(self, key): def __getitem__(self, key):
if key not in self.__arguments: if key not in self.__arguments:
raise KeyError("Argument '%s' not found ..." % key) raise KeyError("Argument '%s' not found ..." % key)
...@@ -61,6 +69,9 @@ class TestUnitexUtils(unittest.TestCase): ...@@ -61,6 +69,9 @@ class TestUnitexUtils(unittest.TestCase):
grf.add_path(path8.split()) grf.add_path(path8.split())
grf.add_path(path9.split()) grf.add_path(path9.split())
grf.determinize()
grf.minimize()
grf.save(self._arguments["grf"]) grf.save(self._arguments["grf"])
self.assertTrue(os.path.exists(self._arguments["grf"]), "GRF creation failed!") self.assertTrue(os.path.exists(self._arguments["grf"]), "GRF creation failed!")
...@@ -72,7 +83,20 @@ class TestUnitexUtils(unittest.TestCase): ...@@ -72,7 +83,20 @@ class TestUnitexUtils(unittest.TestCase):
ret = True if dictionary.find("Sébastien") else False ret = True if dictionary.find("Sébastien") else False
self.assertTrue(ret, "Dictionary lookup failed!") self.assertTrue(ret, "Dictionary (old format) lookup failed!")
def test_03_new_dictionary(self):
    # Placeholder: the asserted expression is the constant True, so this
    # test always passes and exercises no dictionary code yet.
    failure_message = "Dictionary (new format) lookup failed!"
    self.assertTrue(True, failure_message)
def test_04_text_fst(self):
    # Open the text automaton named by the "text-tfst" argument and check
    # that its reported length matches the configured "text-size".
    tfst = TextFST()
    tfst.open(self._arguments["text-tfst"])
    try:
        size = len(tfst)
    finally:
        # Always release the underlying file, even if len() raises.
        tfst.close()

    # assertEqual reports both values on failure; the message now names
    # the component under test instead of the dictionary test's message.
    self.assertEqual(size, self._arguments["text-size"], "TextFST loading failed!")
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
Fichier ajouté
Le fichier a été supprimé par une entrée .gitattributes, ou son encodage n'est pas pris en charge.
...@@ -12,6 +12,8 @@ from unitex.utils.types import Tag, Entry ...@@ -12,6 +12,8 @@ from unitex.utils.types import Tag, Entry
_LOGGER = logging.getLogger(__name__) _LOGGER = logging.getLogger(__name__)
ENTRY = re.compile(r"{([^}]*)}")
class CompressedEntry(Entry): class CompressedEntry(Entry):
...@@ -96,7 +98,7 @@ class CompressedEntry(Entry): ...@@ -96,7 +98,7 @@ class CompressedEntry(Entry):
class OldCompiledDictionary: class OldCompressedDictionary:
INITIAL_STATE_OFFSET=4 INITIAL_STATE_OFFSET=4
INF_SEPARATOR=re.compile(r"(?<![\\]),") INF_SEPARATOR=re.compile(r"(?<![\\]),")
...@@ -282,6 +284,14 @@ class OldCompiledDictionary: ...@@ -282,6 +284,14 @@ class OldCompiledDictionary:
class CompressedDictionary(OldCompressedDictionary):
    """Placeholder subclass of OldCompressedDictionary.

    Not implemented yet: the constructor runs the parent initializer and
    then unconditionally raises NotImplementedError, so no instance can
    be created.
    """

    def __init__(self):
        # The parent initializer runs first, exactly as before, then the
        # constructor reports that this class is not implemented.
        super(CompressedDictionary, self).__init__()
        raise NotImplementedError()
class GRF(Automaton): class GRF(Automaton):
def __init__(self, name="GRF"): def __init__(self, name="GRF"):
...@@ -430,7 +440,7 @@ class SentenceFST(Automaton): ...@@ -430,7 +440,7 @@ class SentenceFST(Automaton):
p1 = labels[lid][1][0][0] p1 = labels[lid][1][0][0]
p2 = labels[lid][1][1][0] p2 = labels[lid][1][1][0]
if not self.__labels.has_key(p1): if p1 not in self.__labels:
self.__labels[p1] = [] self.__labels[p1] = []
self.__labels[p1].append((entry, p2)) self.__labels[p1].append((entry, p2))
...@@ -450,7 +460,7 @@ class TextFST: ...@@ -450,7 +460,7 @@ class TextFST:
def __len__(self): def __len__(self):
return self.__size return self.__size
def next(self): def __next(self):
line = self.__file.readline() line = self.__file.readline()
while line: while line:
...@@ -535,6 +545,7 @@ class TextFST: ...@@ -535,6 +545,7 @@ class TextFST:
entry = Entry() entry = Entry()
if ENTRY.match(content): if ENTRY.match(content):
content = ENTRY.sub(r"\1", content)
entry.load(content) entry.load(content)
else: else:
entry.set_form(content) entry.set_form(content)
...@@ -562,15 +573,15 @@ class TextFST: ...@@ -562,15 +573,15 @@ class TextFST:
line = line.rstrip() line = line.rstrip()
_LOGGER.debug("SENTENCE[%s]\n" % number) _LOGGER.debug("SENTENCE[%s]\n" % number)
_LOGGER.debug(" - offset: %s\n" % offset) _LOGGER.debug(" - offset: (%s)\n" % ", ".join([str(i) for i in offset]))
_LOGGER.debug(" - text: %s\n" % text) _LOGGER.debug(" - text: %s\n" % text)
_LOGGER.debug(" - tokens: %s\n" % tokens) _LOGGER.debug(" - tokens: [%s]\n" % ", ".join([str(t) for t in tokens]))
_LOGGER.debug(" - states:\n") _LOGGER.debug(" - states:\n")
for state in states: for state in states:
_LOGGER.debug(" - s: %s\n" % state) _LOGGER.debug(" - s: %s\n" % state)
_LOGGER.debug(" - tags:\n") _LOGGER.debug(" - tags:\n")
for tag in tags: for tag in tags:
_LOGGER.debug(" - t: %s\n" % tag) _LOGGER.debug(" - t: (%s)\n" % ", ".join([str(t) for t in tag]))
S = SentenceFST("SENTENCE[%d]" % number) S = SentenceFST("SENTENCE[%d]" % number)
S.load(text, tokens, states, tags) S.load(text, tokens, states, tags)
...@@ -578,11 +589,11 @@ class TextFST: ...@@ -578,11 +589,11 @@ class TextFST:
return S return S
def __iter__(self): def __iter__(self):
sentence = self.next() sentence = self.__next()
while sentence: while sentence:
yield sentence yield sentence
sentence = self.next() sentence = self.__next()
def open(self, file, encoding=None): def open(self, file, encoding=None):
if encoding is None: if encoding is None:
......
...@@ -107,7 +107,7 @@ class Tag(object): ...@@ -107,7 +107,7 @@ class Tag(object):
class Entry(Tag): class Entry(Tag):
def __init__(self, entry=None): def __init__(self, entry=None):
super(Tag, self).__init__() super(Entry, self).__init__()
self.__form = "" self.__form = ""
self.__lemma = "" self.__lemma = ""
......
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous connecter pour commenter