Skip to content
Extraits de code Groupes Projets
Valider e0a576d1 rédigé par pat's avatar pat
Parcourir les fichiers

Testing TextFST

parent f045a208
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
......@@ -18,8 +18,16 @@ class Arguments:
self.__arguments["inf-v1"] = "data/dictionary-v1.inf"
self.__arguments["enc-v1"] = "utf-16-le"
self.__arguments["bin-v2"] = "data/dictionary-v2.bin"
self.__arguments["inf-v2"] = "data/dictionary-v2.inf"
self.__arguments["enc-v2"] = "utf-16-le"
self.__arguments["grf"] = "data/automaton.grf"
self.__arguments["text-tfst"] = "data/text.tfst"
self.__arguments["text-tind"] = "data/text.tind"
self.__arguments["text-size"] = 2
def __getitem__(self, key):
if key not in self.__arguments:
raise KeyError("Argument '%s' not found ..." % key)
......@@ -61,6 +69,9 @@ class TestUnitexUtils(unittest.TestCase):
grf.add_path(path8.split())
grf.add_path(path9.split())
grf.determinize()
grf.minimize()
grf.save(self._arguments["grf"])
self.assertTrue(os.path.exists(self._arguments["grf"]), "GRF creation failed!")
......@@ -72,7 +83,20 @@ class TestUnitexUtils(unittest.TestCase):
ret = True if dictionary.find("Sébastien") else False
self.assertTrue(ret, "Dictionary lookup failed!")
self.assertTrue(ret, "Dictionary (old format) lookup failed!")
def test_03_new_dictionary(self):
    """Placeholder for the new-format dictionary lookup test.

    Nothing is exercised yet: the asserted value is a constant ``True``,
    so this test always passes.
    """
    # NOTE(review): presumably waiting on a working new-format dictionary
    # class before a real lookup can be asserted here -- confirm.
    always_ok = True
    self.assertTrue(always_ok, "Dictionary (new format) lookup failed!")
def test_04_text_fst(self):
    """Open the sample text automaton and check its reported length.

    The file configured under ``self._arguments["text-tfst"]`` is opened
    with ``TextFST`` and ``len(tfst)`` is compared with the expected
    value stored under ``self._arguments["text-size"]``.
    """
    tfst = TextFST()
    tfst.open(self._arguments["text-tfst"])
    try:
        size = len(tfst)
    finally:
        # Always release the file, even if querying the length raises.
        tfst.close()

    # assertEqual reports both values on failure, and the message names
    # the component actually under test (it previously blamed the
    # dictionary lookup).
    self.assertEqual(size, self._arguments["text-size"], "TextFST loading failed!")
# Run the test suite only when this module is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    unittest.main()
Fichier ajouté
Le fichier a été supprimé par une entrée .gitattributes, ou son encodage n'est pas pris en charge.
......@@ -12,6 +12,8 @@ from unitex.utils.types import Tag, Entry
_LOGGER = logging.getLogger(__name__)
ENTRY = re.compile(r"{([^}]*)}")
class CompressedEntry(Entry):
......@@ -96,7 +98,7 @@ class CompressedEntry(Entry):
class OldCompiledDictionary:
class OldCompressedDictionary:
INITIAL_STATE_OFFSET=4
INF_SEPARATOR=re.compile(r"(?<![\\]),")
......@@ -282,6 +284,14 @@ class OldCompiledDictionary:
class CompressedDictionary(OldCompressedDictionary):
    """Stub subclass of ``OldCompressedDictionary``.

    Instantiation is not supported yet: the constructor runs the parent
    initializer and then unconditionally raises ``NotImplementedError``.
    """
    # NOTE(review): presumably intended for the newer (v2) dictionary
    # format -- confirm against the rest of the module.

    def __init__(self):
        # The parent initializer runs first, so any error it raises takes
        # precedence over the NotImplementedError below.
        super(CompressedDictionary, self).__init__()
        raise NotImplementedError
class GRF(Automaton):
def __init__(self, name="GRF"):
......@@ -430,7 +440,7 @@ class SentenceFST(Automaton):
p1 = labels[lid][1][0][0]
p2 = labels[lid][1][1][0]
if not self.__labels.has_key(p1):
if p1 not in self.__labels:
self.__labels[p1] = []
self.__labels[p1].append((entry, p2))
......@@ -450,7 +460,7 @@ class TextFST:
def __len__(self):
    """Return the size stored in the private ``__size`` attribute."""
    size = self.__size
    return size
def next(self):
def __next(self):
line = self.__file.readline()
while line:
......@@ -535,6 +545,7 @@ class TextFST:
entry = Entry()
if ENTRY.match(content):
content = ENTRY.sub(r"\1", content)
entry.load(content)
else:
entry.set_form(content)
......@@ -562,15 +573,15 @@ class TextFST:
line = line.rstrip()
_LOGGER.debug("SENTENCE[%s]\n" % number)
_LOGGER.debug(" - offset: %s\n" % offset)
_LOGGER.debug(" - offset: (%s)\n" % ", ".join([str(i) for i in offset]))
_LOGGER.debug(" - text: %s\n" % text)
_LOGGER.debug(" - tokens: %s\n" % tokens)
_LOGGER.debug(" - tokens: [%s]\n" % ", ".join([str(t) for t in tokens]))
_LOGGER.debug(" - states:\n")
for state in states:
_LOGGER.debug(" - s: %s\n" % state)
_LOGGER.debug(" - tags:\n")
for tag in tags:
_LOGGER.debug(" - t: %s\n" % tag)
_LOGGER.debug(" - t: (%s)\n" % ", ".join([str(t) for t in tag]))
S = SentenceFST("SENTENCE[%d]" % number)
S.load(text, tokens, states, tags)
......@@ -578,11 +589,11 @@ class TextFST:
return S
def __iter__(self):
sentence = self.next()
sentence = self.__next()
while sentence:
yield sentence
sentence = self.next()
sentence = self.__next()
def open(self, file, encoding=None):
if encoding is None:
......
......@@ -107,7 +107,7 @@ class Tag(object):
class Entry(Tag):
def __init__(self, entry=None):
super(Tag, self).__init__()
super(Entry, self).__init__()
self.__form = ""
self.__lemma = ""
......
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous connecter pour commenter