diff --git a/setup.py b/setup.py
index c40ce57d0a000203ba3bf2dca86825f0e530a036..20761be75cbaae3ed2c66bdc24b51d531897dc64 100644
--- a/setup.py
+++ b/setup.py
@@ -107,8 +107,11 @@ setup(name = "unitex",
       license = "GPLv3",
 
       install_requires = [],
 
-      package_dir = {"unitex":"unitex"},
-      packages = ["unitex"],
+      package_dir = {"unitex": "unitex",
+                     "unitex.utils": "unitex/utils"},
+
+      packages = ["unitex",
+                  "unitex.utils"],
 
       data_files = [],
diff --git a/tests/05_test_utils.py b/tests/05_test_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..682ee92970daa4c82b386cfc8751407b587e8f0e
--- /dev/null
+++ b/tests/05_test_utils.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os
+import shutil
+import unittest
+
+from unitex.utils.fsa import Automaton
+
+
+
+class Arguments:
+
+    def __init__(self, language=None):
+        self.__arguments = {}
+
+        self.__arguments["raw"] = "data/grf-raw.dot"
+        self.__arguments["determinized"] = "data/grf-determinized.dot"
+        self.__arguments["minimized"] = "data/grf-minimized.dot"
+
+        self.__arguments["automaton"] = None
+
+    def __getitem__(self, key):
+        if key not in self.__arguments:
+            raise KeyError("Argument '%s' not found ..." % key)
+        return self.__arguments[key]
+
+    def __setitem__(self, key, value):
+        self.__arguments[key] = value
+
+
+
+class TestUnitexUtils(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(self):
+        self._arguments = Arguments()
+
+    @classmethod
+    def tearDownClass(self):
+        if os.path.exists(self._arguments["raw"]):
+            os.remove(self._arguments["raw"])
+
+        if os.path.exists(self._arguments["determinized"]):
+            os.remove(self._arguments["determinized"])
+
+        if os.path.exists(self._arguments["minimized"]):
+            os.remove(self._arguments["minimized"])
+
+    def test_01_automaton_build(self):
+        self._arguments["automaton"] = Automaton("MWU Test")
+
+        path1 = "président français de la république"
+        path2 = "président de la république"
+        path3 = "ministre islandais de la défense"
+        path4 = "ministre islandais à la défense"
+        path5 = "secrétaire d'état à la défense"
+        path6 = "secrétaire d'état"
+        path7 = "secrétaire"
+        path8 = "adjoint au secrétaire d'état"
+        path9 = "adjoint au secrétaire d'état à la défense"
+
+        self._arguments["automaton"].add_path(path1.split())
+        self._arguments["automaton"].add_path(path2.split())
+        self._arguments["automaton"].add_path(path3.split())
+        self._arguments["automaton"].add_path(path4.split())
+        self._arguments["automaton"].add_path(path5.split())
+        self._arguments["automaton"].add_path(path6.split())
+        self._arguments["automaton"].add_path(path7.split())
+        self._arguments["automaton"].add_path(path8.split())
+        self._arguments["automaton"].add_path(path9.split())
+
+        self._arguments["automaton"].todot(self._arguments["raw"])
+        self.assertTrue(os.path.exists(self._arguments["raw"]), "Automaton building failed!")
+
+    def test_02_automaton_determinize(self):
+        self._arguments["automaton"].determinize()
+        self._arguments["automaton"].todot(self._arguments["determinized"])
+
+        self.assertTrue(os.path.exists(self._arguments["determinized"]), "Automaton determinization failed!")
+
+    def test_03_automaton_minimize(self):
+        self._arguments["automaton"].minimize()
+        self._arguments["automaton"].todot(self._arguments["minimized"])
+
+        self.assertTrue(os.path.exists(self._arguments["minimized"]), "Automaton minimization failed!")
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/06_test_formats.py b/tests/06_test_formats.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9fd0512f0988ab523afcc3d394508451199ecd6
--- /dev/null
+++ b/tests/06_test_formats.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os
+import shutil
+import unittest
+
+from unitex.utils.formats import *
+
+
+
+class Arguments:
+
+    def __init__(self, language=None):
+        self.__arguments = {}
+
+        self.__arguments["bin-v1"] = "data/dictionary-v1.bin"
+        self.__arguments["inf-v1"] = "data/dictionary-v1.inf"
+        self.__arguments["enc-v1"] = "utf-16-le"
+
+        self.__arguments["grf"] = "data/automaton.grf"
+
+    def __getitem__(self, key):
+        if key not in self.__arguments:
+            raise KeyError("Argument '%s' not found ..." % key)
+        return self.__arguments[key]
+
+
+
+class TestUnitexUtils(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(self):
+        self._arguments = Arguments()
+
+    @classmethod
+    def tearDownClass(self):
+        if os.path.exists(self._arguments["grf"]):
+            os.remove(self._arguments["grf"])
+
+    def test_01_grf_build(self):
+        grf = GRF("GRF")
+
+        path1 = "président français de la république"
+        path2 = "président de la république"
+        path3 = "ministre islandais de la défense"
+        path4 = "ministre islandais à la défense"
+        path5 = "secrétaire d'état à la défense"
+        path6 = "secrétaire d'état"
+        path7 = "secrétaire"
+        path8 = "adjoint au secrétaire d'état"
+        path9 = "adjoint au secrétaire d'état à la défense"
+
+        grf.add_path(path1.split())
+        grf.add_path(path2.split())
+        grf.add_path(path3.split())
+        grf.add_path(path4.split())
+        grf.add_path(path5.split())
+        grf.add_path(path6.split())
+        grf.add_path(path7.split())
+        grf.add_path(path8.split())
+        grf.add_path(path9.split())
+
+        grf.save(self._arguments["grf"])
+        self.assertTrue(os.path.exists(self._arguments["grf"]), "GRF creation failed!")
+
+    def test_02_old_dictionary(self):
+        dictionary = OldCompiledDictionary()
+        dictionary.load(self._arguments["bin-v1"],
+                        self._arguments["inf-v1"],
+                        self._arguments["enc-v1"])
+
+        ret = True if dictionary.find("Sébastien") else False
+
+        self.assertTrue(ret, "Dictionary lookup failed!")
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/data/dictionary-v1.bin b/tests/data/dictionary-v1.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1fdbe060565129d4b7f737e3598c05ac24caa528
Binary files /dev/null and b/tests/data/dictionary-v1.bin differ
diff --git a/tests/data/dictionary-v1.inf b/tests/data/dictionary-v1.inf
new file mode 100644
index 0000000000000000000000000000000000000000..1a4d8e80f6c0d1c4db237d8846e236f0dbadf236
Binary files /dev/null and b/tests/data/dictionary-v1.inf differ
diff --git a/tests/data/text.tfst b/tests/data/text.tfst
new file mode 100644
index 0000000000000000000000000000000000000000..4fb9e17e4d7cdbb5f22acc50d0ed790c5e9d5c11
--- /dev/null
+++ b/tests/data/text.tfst
@@ -0,0 +1,273 @@
+0000000002
+$1
+Dans son jardin de Norvège, Sébastien joue du biniou près de son verger.
+0/4 1/1 2/3 1/1 3/6 1/1 4/2 1/1 5/7 6/1 1/1 7/9 1/1 8/4 1/1 9/2 1/1 10/6 1/1 11/4 1/1 4/2 1/1 2/3 1/1 12/6 13/1 1/1
+0_0
+: 1 1
+: 2 2 3 2 4 2
+: 5 5 6 3
+: 7 4 8 5 9 5
+: 11 5 12 5 13 5
+: 10 6
+: 14 7
+: 15 8
+: 16 9 17 9
+: 18 10 19 11
+: 21 11
+: 20 12
+: 22 15 23 13 24 13
+: 25 14 26 15 27 15
+: 31 15 32 15 33 15
+: 28 16 29 16 30 16
+: 34 17 35 17
+: 36 18
+t
+f
+@<E>
+.
+@STD
+@{Dans,dans.PREP+Dnom+z1}
+@0.0.0-0.3.0
+.
+@STD
+@{son,son.N+Conc+[Veg]+z1:ms}
+@2.0.0-2.2.0
+.
+@STD
+@{son,son.N+[Bruit]+z1:ms}
+@2.0.0-2.2.0
+.
+@STD
+@{son,son.DET+Dposs3s+z1:ms:fs}
+@2.0.0-2.2.0
+.
+@STD
+@{jardin de,jardin de.NDET+Dnom7}
+@4.0.0-6.1.0
+.
+@STD
+@{jardin,jardin.N+z1:ms}
+@4.0.0-4.5.0
+.
+@STD
+@{de,.PREP+z1}
+@6.0.0-6.1.0
+.
+@STD
+@{de,de.PREP+z1}
+@6.0.0-6.1.0
+.
+@STD
+@{de,de.DET+Dind+z1:ms:fs:mp:fp}
+@6.0.0-6.1.0
+.
+@STD
+@Norvège
+@8.0.0-8.6.0
+.
+@STD
+@{du,.DET+Dind+z1:ms}
+@6.0.0-6.1.0
+.
+@STD
+@{des,un.DET+Dind+z1:mp:fp}
+@6.0.0-6.1.0
+.
+@STD
+@{de la,du.DET+Dind+z1:fs}
+@6.0.0-6.1.0
+.
+@STD
+@,
+@9.0.0-9.0.0
+.
+@STD
+@Sébastien
+@11.0.0-11.8.0
+.
+@STD
+@{joue,jouer.V+z1:P1s:P3s:S1s:S3s:Y2s}
+@13.0.0-13.3.0
+.
+@STD
+@{joue,joue.N+z1:fs}
+@13.0.0-13.3.0
+.
+@STD
+@{de,.PREP+z1}
+@15.0.0-15.1.0
+.
+@STD
+@{du,du.DET+Dind+z1:ms}
+@15.0.0-15.1.0
+.
+@STD
+@{biniou,biniou.N+z2:ms}
+@17.0.0-17.5.0
+.
+@STD
+@{le,.DET+Ddef+z1:ms}
+@15.0.0-15.1.0
+.
+@STD
+@{près de,près de.PREP+EPCPQ+z1}
+@19.0.0-21.1.0
+.
+@STD
+@{près,près.PREP+Dnom+z1}
+@19.0.0-19.3.0
+.
+@STD
+@{près,près.ADV}
+@19.0.0-19.3.0
+.
+@STD
+@{de,.PREP+z1}
+@21.0.0-21.1.0
+.
+@STD
+@{de,de.PREP+z1}
+@21.0.0-21.1.0
+.
+@STD
+@{de,de.DET+Dind+z1:ms:fs:mp:fp}
+@21.0.0-21.1.0
+.
+@STD
+@{son,son.N+Conc+[Veg]+z1:ms}
+@23.0.0-23.2.0
+.
+@STD
+@{son,son.N+[Bruit]+z1:ms}
+@23.0.0-23.2.0
+.
+@STD
+@{son,son.DET+Dposs3s+z1:ms:fs}
+@23.0.0-23.2.0
+.
+@STD
+@{du,.DET+Dind+z1:ms}
+@21.0.0-21.1.0
+.
+@STD
+@{des,un.DET+Dind+z1:mp:fp}
+@21.0.0-21.1.0
+.
+@STD
+@{de la,du.DET+Dind+z1:fs}
+@21.0.0-21.1.0
+.
+@STD
+@{verger,verger.V+z1:W}
+@25.0.0-25.5.0
+.
+@STD
+@{verger,verger.N+z1:ms}
+@25.0.0-25.5.0
+.
+@STD
+@.
+@26.0.0-26.0.0
+.
+f
+$2
+Il est heureux Monsieur Paumier et ce n'est pas dommage.
+15/2 1/1 16/3 1/1 17/7 1/1 18/8 1/1 19/7 1/1 20/2 1/1 21/2 1/1 22/1 23/1 16/3 1/1 24/3 1/1 25/7 13/1 1/1
+29_77
+: 1 1
+: 2 2 3 2 4 2
+: 5 3 6 3
+: 7 4
+: 8 5
+: 9 6
+: 10 7 11 7 12 7
+: 13 8
+: 14 9 15 9 16 9
+: 17 10 18 10
+: 19 11
+: 20 12
+t
+f
+@<E>
+.
+@STD
+@{Il,il.PRO+PpvIL+z1:3ms}
+@0.0.0-0.1.0
+.
+@STD
+@{est,être.V+z1:P3s}
+@2.0.0-2.2.0
+.
+@STD
+@{est,est.N+z1:ms}
+@2.0.0-2.2.0
+.
+@STD
+@{est,est.A+z1:ms:fs:mp:fp}
+@2.0.0-2.2.0
+.
+@STD
+@{heureux,heureux.N+z1:ms:mp}
+@4.0.0-4.6.0
+.
+@STD
+@{heureux,heureux.A+z1:ms:mp}
+@4.0.0-4.6.0
+.
+@STD
+@{Monsieur,monsieur.N+z1:ms}
+@6.0.0-6.7.0
+.
+@STD
+@{Paumier,paumier.N:ms}
+@8.0.0-8.6.0
+.
+@STD
+@{et,et.CONJC}
+@10.0.0-10.1.0
+.
+@STD
+@{ce,ce.PRO+PpvIL+z1:3ms:3mp}
+@12.0.0-12.1.0
+.
+@STD
+@{ce,ce.PRO+Pdem+z1:ms}
+@12.0.0-12.1.0
+.
+@STD
+@{ce,ce.DET+Ddem+z1:ms}
+@12.0.0-12.1.0
+.
+@STD
+@{ne,.XI+z1}
+@14.0.0-15.0.0
+.
+@STD
+@{est,être.V+z1:P3s}
+@16.0.0-16.2.0
+.
+@STD
+@{est,est.N+z1:ms}
+@16.0.0-16.2.0
+.
+@STD
+@{est,est.A+z1:ms:fs:mp:fp}
+@16.0.0-16.2.0
+.
+@STD
+@{pas,pas.N+z1:ms:mp}
+@18.0.0-18.2.0
+.
+@STD
+@{pas,pas.ADV+z1}
+@18.0.0-18.2.0
+.
+@STD
+@{dommage,dommage.N+z1:ms}
+@20.0.0-20.6.0
+.
+@STD
+@.
+@21.0.0-21.0.0
+.
+f
diff --git a/tests/data/text.tind b/tests/data/text.tind
new file mode 100644
index 0000000000000000000000000000000000000000..f3f1dca1d4a84042d99df8e84574e2756812fc1f
Binary files /dev/null and b/tests/data/text.tind differ
diff --git a/unitex/processor.py b/unitex/processor.py
index 665d070885d60c878b8c7f9b2c1ddd3c09b8bdaf..50e06bbbbaf63b368e3c199eae808ba0e90ea3a7 100644
--- a/unitex/processor.py
+++ b/unitex/processor.py
@@ -390,6 +390,9 @@ class UnitexProcessor(object):
         self.__snt = None
         self.__dir = None
 
+    def tofst(self):
+        pass
+
     def iter(self, grammar, **kwargs):
         """
         This function iters over the grammar matches.
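The `tofst()` method added above is committed as an empty stub. For reference, a minimal sketch of what it could grow into once wired to the `TextFST` reader introduced in `unitex/utils/formats.py` below; the `text.tfst` filename, the `encoding` parameter, and the error message are illustrative assumptions, not part of this patch, and the sketch presumes `processor.py` already imports `os` and `UnitexException`:

    from unitex.utils.formats import TextFST

    def tofst(self, encoding=None):
        # Hypothetical sketch: open the sentence automata produced by
        # Txt2Tfst ('text.tfst') from the processor's working directory.
        if self.__dir is None:
            raise UnitexException("No text loaded ...")

        fst = TextFST()
        fst.open(os.path.join(self.__dir, "text.tfst"), encoding=encoding)

        return fst
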
diff --git a/unitex/utils/__init__.py b/unitex/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..586a5fd94ea213fa09f346a788b78c5db0b9646b
--- /dev/null
+++ b/unitex/utils/__init__.py
@@ -0,0 +1,4 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+
diff --git a/unitex/utils/formats.py b/unitex/utils/formats.py
new file mode 100644
index 0000000000000000000000000000000000000000..e312fe127b3c325d34351b1a72842524feaaa2d5
--- /dev/null
+++ b/unitex/utils/formats.py
@@ -0,0 +1,601 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import array
+import logging
+import re
+import struct
+
+from io import open
+
+from unitex import UnitexException, UnitexConstants
+from unitex.utils.fsa import FSAConstants, Automaton
+from unitex.utils.types import Tag, Entry
+
+_LOGGER = logging.getLogger(__name__)
+
+# Pattern used by TextFST to tell dictionary-entry tags
+# ('{form,lemma.TAG}') apart from bare word forms. The exact pattern is
+# an assumption: it only needs to separate the two cases.
+ENTRY = re.compile(r"^\{.+,.*\..+\}$")
+
+
+
+class CompressedEntry(Entry):
+
+    SEPARATORS = (" ", "-")
+    SPLITTER = re.compile(r"([-\s])")
+
+    def __init__(self):
+        super(CompressedEntry, self).__init__()
+
+    def compute(self, lemma, form):
+        n, i = "", 0
+
+        while i < len(lemma) and lemma[i].isdigit():
+            n = n + lemma[i]
+            i = i + 1
+
+        if i > 0:
+            prefix = form[:len(form)-int(n)]
+        else:
+            prefix = form
+
+        suffix = lemma[i:]
+
+        return "%s%s" % (prefix, suffix)
+
+    def uncompress(self, lemma):
+        form = self.get_form()
+        if not lemma:
+            return form
+
+        # If the two words don't have the same number of elements,
+        # the compressed lemma is preceded by '_'
+        if lemma[0] == '_':
+            return self.compute(lemma[1:], form)
+
+        wtab = self.SPLITTER.split(form)
+        ltab = self.SPLITTER.split(lemma)
+
+        l = []
+        for i in range(len(ltab)):
+            if not ltab[i]:
+                continue
+            elif ltab[i] in self.SEPARATORS:
+                l.append(ltab[i])
+            else:
+                l.append(self.compute(ltab[i], wtab[i]))
+
+        return "".join(l)
+
+    def load(self, form, data, lemmatize=True):
+        data = data.rstrip()
+
+        self.set_form(form)
+
+        i = 0
+
+        lemma, escaped = "", False
+        try:
+            while True:
+                if data[i] == "." and escaped is False:
+                    break
+                elif data[i] == "\\":
+                    if escaped is True:
+                        lemma += data[i]
+                        escaped = False
+                    else:
+                        lemma += data[i]
+                        escaped = True
+                else:
+                    lemma += data[i]
+                    escaped = False
+                i += 1
+        except IndexError:
+            raise UnitexException("Wrong lemma for entry '%s' ..." % data)
+
+        if lemmatize is True:
+            self.set_lemma(self.uncompress(lemma))
+
+        Tag.load(self, data[i+1:])
+
+
+
+class OldCompiledDictionary:
+
+    INITIAL_STATE_OFFSET = 4
+    INF_SEPARATOR = re.compile(r"(?<![\\]),")
+
+    def __init__(self):
+        self.__bin = None
+        self.__inf = None
+
+        self.__buffer = None
+
+    def lookup(self, token, i=None, pos=None):
+        if i is None:
+            i = 0
+
+        if pos is None:
+            pos = self.INITIAL_STATE_OFFSET
+
+        tnbr = self.__bin[pos] * 256 + self.__bin[pos+1]
+        pos = pos + 2
+
+        _LOGGER.debug("Lookup Start: token[%s|%s] -- pos(%s) -- tnbr(%s)\n" % (token[:i], token[i:], pos, tnbr))
+
+        if i == len(token):
+            data = []
+
+            _LOGGER.debug("   Check Final State: pos(%s) -- tnbr(%s)\n" % (pos, tnbr))
+            if not (tnbr & 32768):
+                _LOGGER.debug("      -> Final\n")
+                index = self.__bin[pos] * 256 * 256 + self.__bin[pos+1] * 256 + self.__bin[pos+2]
+
+                for inf in self.INF_SEPARATOR.split(self.__inf[index]):
+                    E = CompressedEntry()
+                    E.load(token, inf)
+
+                    data.append(E)
+            else:
+                _LOGGER.debug("      -> Not final\n")
+
+            return data, pos-2
+        elif tnbr & 32768:
+            tnbr = tnbr - 32768
+        else:
+            pos = pos + 3
+
+        for j in range(tnbr):
+            char = chr(self.__bin[pos] * 256 + self.__bin[pos+1])
+            _LOGGER.debug("   Matching char[%s] -- pos(%s) -> current[%s]\n" % (token[i], pos, char))
+
+            pos = pos + 2
+
+            offset = self.__bin[pos] * 256 * 256 + self.__bin[pos+1] * 256 + self.__bin[pos+2]
+            pos = pos + 3
+
+            if char == token[i]:
+                _LOGGER.debug("      -> Char found\n")
+                return self.lookup(token, i+1, offset)
+
+            # Special case: skip a whitespace transition so that the match
+            # function can handle MWU dictionaries, e.g.
+            # ["Conseil", "d'", "administration"] == "Conseil d'administration"
+            elif char == u" " and i == 0:
+                _LOGGER.debug("      -> Char is whitespace [pass]\n")
+                return self.lookup(token, i, offset)
+
+        return None, pos
+
+    def find(self, token):
+        entries, pos = self.lookup(token)
+        return entries
+
+    def match(self, sequence, i=None, mode=None, separator=None):
+        if i is None:
+            i = 0
+
+        if mode is None:
+            mode = UnitexConstants.MATCH_MODE_LONGEST
+        elif mode not in [UnitexConstants.MATCH_MODE_LONGEST,
+                          UnitexConstants.MATCH_MODE_SHORTEST,
+                          UnitexConstants.MATCH_MODE_ALL]:
+            raise UnitexException("Wrong match mode: %s ..." % mode)
+
+        matches = []
+
+        buffer, pos = [], None
+        for j in range(i, len(sequence)):
+            _LOGGER.debug("Match Token: '%s'\n" % sequence[j])
+
+            entries, pos = self.lookup(sequence[j], pos=pos)
+            if entries is None:
+                _LOGGER.debug("   -> No entry found ...\n")
+                break
+            _LOGGER.debug("   -> Entries found: pos[%s]\n" % pos)
+
+            buffer.append(j)
+
+            if entries:
+                matches.append((entries, buffer[:]))
+                if mode == UnitexConstants.MATCH_MODE_SHORTEST:
+                    return matches
+
+            if separator is not None:
+                _LOGGER.debug("Match Separator: '%s'\n" % separator)
+                entries, pos = self.lookup(separator, pos=pos)
+                if entries is None:
+                    _LOGGER.debug("   -> No separator found ...\n")
+                    break
+                _LOGGER.debug("   -> Separator found\n")
+
+        if not matches:
+            return None
+        elif mode == UnitexConstants.MATCH_MODE_LONGEST:
+            return [matches[-1]]
+        elif mode == UnitexConstants.MATCH_MODE_ALL:
+            return matches
+
+    def dump(self, pos=None):
+        if pos is None:
+            pos = self.INITIAL_STATE_OFFSET
+            self.__buffer = []
+
+        tnbr = self.__bin[pos] * 256 + self.__bin[pos+1]
+        pos = pos + 2
+
+        if not (tnbr & 32768):
+            index = self.__bin[pos] * 256 * 256 + self.__bin[pos+1] * 256 + self.__bin[pos+2]
+
+            form = "".join(self.__buffer)
+
+            for inf in self.INF_SEPARATOR.split(self.__inf[index]):
+                E = CompressedEntry()
+                E.load(form, inf)
+                yield E
+
+            pos = pos + 3
+
+        else:
+            tnbr = tnbr - 32768
+
+        for j in range(tnbr):
+            self.__buffer.append(chr(self.__bin[pos] * 256 + self.__bin[pos+1]))
+            pos = pos + 2
+
+            offset = self.__bin[pos] * 256 * 256 + self.__bin[pos+1] * 256 + self.__bin[pos+2]
+            pos = pos + 3
+
+            for E in self.dump(offset):
+                yield E
+
+            if self.__buffer:
+                self.__buffer.pop()
+
+    def load(self, bin, inf, encoding=None):
+        if encoding is None:
+            encoding = UnitexConstants.DEFAULT_ENCODING
+
+        INF = open(inf, "r", encoding=encoding)
+
+        self.__inf = INF.readlines()
+        self.__inf.pop(0) # Remove number information
+
+        INF.close()
+
+        BIN = open(bin, "rb")
+
+        # The first four bytes of the header encode the total size of
+        # the '.bin' file (big-endian); kept here for reference.
+        a = struct.unpack('B', BIN.read(1))[0]
+        b = struct.unpack('B', BIN.read(1))[0]
+        c = struct.unpack('B', BIN.read(1))[0]
+        d = struct.unpack('B', BIN.read(1))[0]
+
+        size = d + (256*c) + (256*256*b) + (256*256*256*a)
+
+        BIN.close()
+
+        BIN = open(bin, "rb")
+
+        self.__bin = array.array('B')
+
+        byte = BIN.read(1)
+        while byte:
+            tmp = struct.unpack('B', byte)[0]
+
+            self.__bin.append(tmp)
+
+            byte = BIN.read(1)
+
+        BIN.close()
+
+
+
+class GRF(Automaton):
+
+    def __init__(self, name="GRF"):
+        super(GRF, self).__init__(name)
+
+    def load(self, file, encoding=None):
+        if encoding is None:
+            encoding = UnitexConstants.DEFAULT_ENCODING
+        raise NotImplementedError
+
+    def save(self, file, encoding=None):
+        if encoding is None:
+            encoding = UnitexConstants.DEFAULT_ENCODING
+
+        X = 1000
+        Y = 1000
+        GAP = 20
+
+        transitions = []
+        transitions.append({"label": FSAConstants.EPSILON, "targets": set([])})
+        transitions.append({"label": "", "targets": set([])})
+
+        nmap = {}
+
+        for edge, sid, tid in self.iter("dfs"):
+            source = self[sid]
+            target = self[tid]
+
+            index = 0
+
+            key = (str(edge), tid)
+            if key in nmap:
+                index = nmap[key]
+            else:
+                index = len(transitions)
+                nmap[key] = index
+                transitions.append({"label": str(edge), "targets": set([])})
+
+            if sid == self.get_initial():
+                transitions[0]["targets"].add(str(index))
+            if target.is_final() is True:
+                transitions[index]["targets"].add("1")
+
+            for _edge in target:
+                for _target in target[_edge]:
+                    _index = 0
+
+                    _key = (str(_edge), _target.get_id())
+                    if _key in nmap:
+                        _index = nmap[_key]
+                    else:
+                        _index = len(transitions)
+                        nmap[_key] = _index
+                        transitions.append({"label": str(_edge), "targets": set([])})
+
+                    transitions[index]["targets"].add(str(_index))
+
+        with open(file, "w", encoding=encoding) as output:
+            output.write("#Unigraph\r\n")
+            output.write("SIZE %s %s\r\n" % (X+GAP, Y+GAP))
+            output.write("FONT Times New Roman:B 10\r\n")
+            output.write("OFONT Monospaced:B 8\r\n")
+            output.write("BCOLOR 16777215\r\n")
+            output.write("FCOLOR 0\r\n")
+            output.write("ACOLOR 13487565\r\n")
+            output.write("SCOLOR 16711680\r\n")
+            output.write("CCOLOR 255\r\n")
+            output.write("DBOXES y\r\n")
+            output.write("DFRAME y\r\n")
+            output.write("DDATE y\r\n")
+            output.write("DFILE y\r\n")
+            output.write("DDIR n\r\n")
+            output.write("DRIG n\r\n")
+            output.write("DRST n\r\n")
+            output.write("FITS 100\r\n")
+            output.write("PORIENT L\r\n")
+            output.write("#\r\n")
+            output.write("%s\r\n" % len(transitions))
+
+            for transition in transitions:
+                label = transition["label"]
+                size = len(transition["targets"])
+                targets = " ".join(list(transition["targets"]))
+
+                if size == 0:
+                    output.write('"%s" %s %s %s \r\n' % (label, GAP, GAP, size))
+                else:
+                    output.write('"%s" %s %s %s %s \r\n' % (label, GAP, GAP, size, targets))
+
+
+
+class SentenceFST(Automaton):
+
+    def __init__(self, name="SentenceFST"):
+        super(SentenceFST, self).__init__(name)
+
+        self.__sentence = None
+
+        self.__tokens = None
+        self.__labels = None
+
+    def get_sentence(self):
+        return self.__sentence
+
+    def get_tokens(self):
+        return self.__tokens
+
+    def get_token(self, i):
+        return self.__tokens[i]
+
+    def get_label(self, i):
+        return self.__labels[i]
+
+    def load(self, sentence, tokens, states, labels):
+        self.__sentence = sentence
+
+        self.__tokens = []
+        self.__labels = {}
+
+        start = 0
+        for index, length in tokens:
+            end = start + length
+
+            self.__tokens.append(self.__sentence[start:end])
+            start = end
+
+        transitions = []
+
+        for i in range(len(states)):
+            initial = False
+            if i == 0:
+                initial = True
+
+            final = False
+            if states[i] == "t":
+                final = True
+
+            sid = self.add_node(initial=initial, final=final)
+            if final is True:
+                break
+
+            for lid, tid in states[i]:
+                entry = labels[lid][0]
+
+                p1 = labels[lid][1][0][0]
+                p2 = labels[lid][1][1][0]
+
+                if p1 not in self.__labels:
+                    self.__labels[p1] = []
+                self.__labels[p1].append((entry, p2))
+
+                transitions.append((sid, lid, tid))
+
+        for sid, lid, tid in transitions:
+            self.add_edge(lid, sid, tid)
+
+
+
+class TextFST:
+
+    def __init__(self):
+        self.__file = None
+        self.__size = 0
+
+    def __len__(self):
+        return self.__size
+
+    def next(self):
+        line = self.__file.readline()
+
+        while line:
+            line = line.rstrip()
+
+            if line[0] != "$":
+                raise UnitexException("File '%s' is corrupted ..." % self.__file.name)
+
+            # The sentence number (format '$n')
+            number = int(line[1:])
+
+            line = self.__file.readline()
+            line = line.rstrip()
+
+            # The text of the sentence
+            text = line
+
+            line = self.__file.readline()
+            line = line.rstrip()
+
+            # The tokens of the text
+            # -> [(x1, y1), (x2, y2), ..., (xi, yi)]
+            # where,
+            #    - x: token index in file 'tokens.txt'
+            #    - y: length of the token (in characters)
+            tokens = [tuple(int(t) for t in token.split("/")) for token in line.split(" ")]
+
+            line = self.__file.readline()
+            line = line.rstrip()
+
+            # The offset of the sentence (from the beginning of the text)
+            # -> X_Y
+            # where,
+            #    - X: the offset in tokens
+            #    - Y: the offset in characters
+            offset = tuple(int(o) for o in line.split("_"))
+
+            line = self.__file.readline()
+            line = line.rstrip()
+
+            states = []
+            while line != "t":
+                if line[0] != ":":
+                    raise UnitexException("File '%s' is corrupted ..." % self.__file.name)
+
+                line = line[1:].strip()
+                line = line.split()
+
+                state = []
+                for i in range(0, len(line), 2):
+                    state.append((int(line[i]), int(line[i+1])))
+                states.append(state)
+
+                line = self.__file.readline()
+                line = line.rstrip()
+
+                if not line:
+                    raise UnitexException("File '%s' is corrupted ..." % self.__file.name)
+
+            states.append(line)
+
+            line = self.__file.readline()
+            line = line.rstrip()
+
+            if line[0] != "f":
+                raise UnitexException("File '%s' is corrupted ..." % self.__file.name)
+
+            line = self.__file.readline()
+            line = line.rstrip()
+
+            tags = []
+            while line != "f":
+                if line == "@<E>":
+                    tags.append(("<E>", None))
+
+                elif line == "@STD":
+                    line = self.__file.readline()
+                    line = line.rstrip()
+
+                    content = line[1:]
+
+                    entry = Entry()
+
+                    if ENTRY.match(content):
+                        # Dictionary-entry tag ('{form,lemma.TAG}'):
+                        # strip the braces before parsing.
+                        entry.load(content[1:-1])
+                    else:
+                        entry.set_form(content)
+
+                    line = self.__file.readline()
+                    line = line.rstrip()
+
+                    if line[0] != "@":
+                        raise UnitexException("File '%s' is corrupted ..." % self.__file.name)
+
+                    position = [tuple(int(i) for i in p.split(".")) for p in line[1:].split("-")]
+
+                    tags.append((entry, position))
+
+                else:
+                    raise UnitexException("File '%s' is corrupted ..." % self.__file.name)
+
+                line = self.__file.readline()
+                line = line.rstrip()
+
+                if line[0] != ".":
+                    raise UnitexException("File '%s' is corrupted ..." % self.__file.name)
+
+                line = self.__file.readline()
+                line = line.rstrip()
+
+            _LOGGER.debug("SENTENCE[%s]\n" % number)
+            _LOGGER.debug("   - offset: %s\n" % offset)
+            _LOGGER.debug("   - text: %s\n" % text)
+            _LOGGER.debug("   - tokens: %s\n" % tokens)
+            _LOGGER.debug("   - states:\n")
+            for state in states:
+                _LOGGER.debug("      - s: %s\n" % state)
+            _LOGGER.debug("   - tags:\n")
+            for tag in tags:
+                _LOGGER.debug("      - t: %s\n" % tag)
+
+            S = SentenceFST("SENTENCE[%d]" % number)
+            S.load(text, tokens, states, tags)
+
+            return S
+
+    def __iter__(self):
+        sentence = self.next()
+        while sentence:
+            yield sentence
+
+            sentence = self.next()
+
+    def open(self, file, encoding=None):
+        if encoding is None:
+            encoding = UnitexConstants.DEFAULT_ENCODING
+
+        self.__file = open(file, "r", encoding=encoding)
+
+        line = self.__file.readline()
+        line = line.rstrip()
+
+        # The number of sentences in the text fst (format: '000000000N')
+        self.__size = int(line)
+
+    def close(self):
+        self.__file.close()
+        self.__size = 0
diff --git a/unitex/utils/fsa.py b/unitex/utils/fsa.py
new file mode 100644
index 0000000000000000000000000000000000000000..de6250e4150ab03b4b21f9833d151c43b663392d
--- /dev/null
+++ b/unitex/utils/fsa.py
@@ -0,0 +1,574 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import logging
+
+from io import open
+
+from unitex import *
+
+_LOGGER = logging.getLogger(__name__)
+
+
+
+class FSAConstants:
+
+    EPSILON = "<E>"
+
+    DEPTH_FIRST_SEARCH = "dfs"
+    BREADTH_FIRST_SEARCH = "bfs"
+
+
+
+class Edge:
+
+    def __init__(self, label, targets=None, source=None):
+        self.__label = label
+
+        self.__source = source
+
+        # Default to an empty target list so that '__len__' and
+        # '__iter__' are safe on a freshly created edge.
+        self.__targets = targets if targets is not None else []
+        self.__tids = set([target.get_id() for target in self.__targets])
+
+    def __len__(self):
+        return len(self.__targets)
+
+    def __str__(self):
+        return self.get_label()
+
+    def __hash__(self):
+        return hash(self.get_label())
+
+    def __cmp__(self, e):
+        # Python 2 only: compare edges by label ('__cmp__' is ignored
+        # under Python 3).
+        return cmp(self.get_label(), e.get_label())
+
+    def __iter__(self):
+        for target in self.__targets:
+            yield target
+
+    def __contains__(self, target):
+        return True if target.get_id() in self.__tids else False
+
+    def __getitem__(self, i):
+        return self.__targets[i]
+
+    def get_label(self):
+        return self.__label
+
+    def get_source(self):
+        return self.__source
+
+    def set_source(self, source):
+        self.__source = source
+
+    def get_targets(self):
+        return self.__targets
+
+    def set_targets(self, targets):
+        self.__targets = targets
+        self.__tids = set([target.get_id() for target in targets])
+
+    def add_target(self, target):
+        if target.get_id() in self.__tids:
+            return
+        # Keep the id index in sync with the target list.
+        self.__tids.add(target.get_id())
+        self.__targets.append(target)
+
+    def del_target(self, target):
+        if target.get_id() not in self.__tids:
+            return
+
+        self.__tids.remove(target.get_id())
+
+        for i in range(len(self.__targets)):
+            _target = self.__targets[i]
+            if _target.get_id() == target.get_id():
+                del self.__targets[i]
+                break
+
+
+
+class Node:
+
+    def __init__(self, _id, final=False):
+        self.__id = _id
+
+        self.__final = final
+        self.__edges = {}
+
+        self.__depth = 0
+
+        self.__visited = False
+
+    def __len__(self):
+        return len(self.__edges)
+
+    def __contains__(self, label):
+        return label in self.__edges
+
+    def __getitem__(self, label):
+        return self.__edges.get(label, None)
+
+    def __iter__(self):
+        for label in self.__edges:
+            yield label
+
+    def __str__(self):
+        s = "NODE[%s]" % str(self.get_id())
+
+        if self.is_final():
+            s += " -- FINAL"
+
+        for label in self:
+            targets = " | ".join([str(target.get_id()) for target in self[label]])
+            s += "\n\t%s -> (%s)" % (label, targets)
+
+        return s
+
+    def get_id(self):
+        return self.__id
+
+    def set_id(self, i):
+        self.__id = i
+
+    def is_deterministic(self):
+        if FSAConstants.EPSILON in self.__edges:
+            return False
+
+        for label in self.__edges:
+            if len(self[label]) > 1:
+                return False
+
+        return True
+
+    def exists(self, label, node=None):
+        if label not in self:
+            return False
+
+        if node is not None and node not in self[label]:
+            return False
+
+        return True
+
+    def add(self, label, target):
+        if self.exists(label, target) is True:
+            return
+
+        if self.exists(label) is False:
+            edge = Edge(label, [target], self)
+            self.__edges[label] = edge
+        else:
+            self[label].add_target(target)
+
+    def delete(self, label, node=None):
+        if not self.exists(label, node):
+            raise UnitexException("Edge not found: %s" % label)
+
+        if node is None:
+            del self.__edges[label]
+        else:
+            self[label].del_target(node)
+
+    def set_depth(self, depth):
+        self.__depth = depth
+
+    def get_depth(self):
+        return self.__depth
+
+    def is_visited(self):
+        return self.__visited
+
+    def set_visited(self, visited=True):
+        self.__visited = visited
+
+    def is_final(self):
+        return self.__final
+
+    def set_final(self, final=True):
+        self.__final = final
+
+
+
+class NodeSets:
+
+    def __init__ (self):
+        self.__sets = {}
+
+    def __getitem__(self, _id):
+        return self.__sets[_id]
+
+    def __contains__(self, s):
+        return s in self.all()
+
+    def __iter__ (self):
+        return iter(self.all())
+
+    def all(self):
+        return set([tuple(l) for l in self.__sets.values()])
+
+    def add(self, s):
+        _set = tuple(sorted(set(s)))
+        for _id in s:
+            self.__sets[_id] = _set
+
+
+
+class Automaton:
+
+    def __init__(self, name="Automaton"):
+        self.__name = name
+
+        self.__nodes = []
+
+        self.__initial = 0
+        self.__finals = []
+
+        self.__nodes.append(Node(self.__initial, False))
+
+    def __len__(self):
+        return len(self.__nodes)
+
+    def __getitem__(self, _id):
+        try:
+            return self.__nodes[_id]
+        except IndexError:
+            return None
+
+    def __iter__(self):
+        for node in self.__nodes:
+            yield node
+
+    def __str__(self):
+        title = "# FSA -- %s #" % self.get_name()
+
+        s = "%s\n%s\n%s\n\n" % ("#" * len(title), title, "#" * len(title))
+
+        for node in self:
+            s += "%s\n\n" % node
+
+        return s
+
+    def get_name(self):
+        return self.__name
+
+    def set_name(self, name):
+        self.__name = name
+
+    def get_depth(self):
+        depth = 0
+        for nid in self.__finals:
+            final = self.__nodes[nid]
+
+            if final.get_depth() > depth:
+                depth = final.get_depth()
+
+        return depth
+
+    def get_initial(self):
+        return self.__initial
+
+    def set_initial(self, initial):
+        self.__initial = initial
+
+    def get_finals(self):
+        return self.__finals
+
+    def set_finals(self, finals):
+        self.__finals = finals
+
+    def get_nodes(self):
+        return self.__nodes
+
+    def set_nodes(self, nodes):
+        self.__nodes = nodes
+
+    def add_edge(self, label, sid, tid):
+        source = self[sid]
+        target = self[tid]
+
+        target.set_depth(source.get_depth() + 1)
+
+        source.add(label, target)
+
+    def add_node(self, initial=False, final=False):
+        if initial is True:
+            return self.__initial
+        elif final is True:
+            self.__finals.append(len(self.__nodes))
+            self.__nodes.append(Node(self.__finals[-1], True))
+            return self.__finals[-1]
+
+        nid = len(self.__nodes)
+
+        self.__nodes.append(Node(nid, final))
+
+        return nid
+
+    def add_path(self, path):
+        if len(path) == 0:
+            raise UnitexException("Empty path!")
+
+        sid = self.add_node(initial=True, final=False)
+
+        for label in path[:-1]:
+            tid = self.add_node(initial=False, final=False)
+            self.add_edge(label, sid, tid)
+
+            sid = tid
+
+        # The last label always reaches a (new) final node.
+        self.add_edge(path[-1], sid, self.add_node(initial=False, final=True))
+
+    def get_alphabet(self):
+        alphabet = set()
+
+        for node in self:
+            for label in node:
+                alphabet.add(label)
+
+        return tuple(alphabet)
+
+    def is_deterministic(self):
+        for node in self:
+            if not node.is_deterministic():
+                return False
+        return True
+
+    def __closure(self, nid):
+        stack = [nid]
+        result = set(stack)
+
+        while len(stack) > 0:
+            current = stack.pop()
+
+            if FSAConstants.EPSILON in self[current]:
+                # Follow every target of the epsilon edge (node ids only,
+                # so that the closure stays a set of states).
+                edge = self[current][FSAConstants.EPSILON]
+                for target in edge:
+                    tid = target.get_id()
+                    if tid not in result:
+                        stack.append(tid)
+                        result.add(tid)
+
+        # Sorted so that identical state sets always map to the same key.
+        return tuple(sorted(result))
+
+    def determinize(self):
+        dfa = Automaton("DETERMINIZED(%s)" % self.get_name())
+
+        alphabet = self.get_alphabet()
+
+        initials = self.__closure(self.get_initial())
+
+        hid = dfa.add_node(initial=True, final=False)
+
+        visited = {}
+        visited[initials] = hid
+
+        stack = [initials]
+        while len(stack) > 0:
+            current = stack.pop()
+
+            for label in alphabet:
+                new = set()
+                for node in current:
+                    if label not in self[node]:
+                        continue
+                    for _next in self[node][label]:
+                        new.update(self.__closure(_next.get_id()))
+                new = tuple(sorted(new))
+
+                if len(new) == 0:
+                    continue
+
+                if new not in visited:
+                    stack.append(new)
+
+                    final = True in [self[_id].is_final() for _id in new]
+                    nid = dfa.add_node(final=final)
+
+                    visited[new] = nid
+
+                dfa.add_edge(label, visited[current], visited[new])
+
+        self.set_name(dfa.get_name())
+
+        self.set_initial(dfa.get_initial())
+        self.set_finals(dfa.get_finals())
+
+        self.set_nodes(dfa.get_nodes())
+
+    def minimize(self):
+        mini = Automaton("MINIMIZED(%s)" % self.get_name())
+
+        alphabet = self.get_alphabet()
+
+        nodetoset = {}
+        settonode = {}
+
+        sets = NodeSets()
+
+        rest, final = [], []
+        for node in self:
+            if node.is_final():
+                final.append(node.get_id())
+            else:
+                rest.append(node.get_id())
+
+        sets.add(rest)
+        sets.add(final)
+
+        stack = [s for s in sets if len(s) > 1]
+
+        def target_set(_id, label):
+            edge = self[_id][label]
+
+            if edge is None:
+                return None
+            else:
+                return sets[edge[0].get_id()]
+
+        while len(stack) > 0:
+            current = stack.pop()
+
+            for label in alphabet:
+                target = target_set(current[0], label)
+
+                one, two = [current[0]], []
+                for _id in current[1:]:
+                    if target_set(_id, label) == target:
+                        one.append(_id)
+                    else:
+                        two.append(_id)
+
+                if len(two) > 0:
+                    sets.add(one)
+                    sets.add(two)
+
+                    if len(one) > 1:
+                        stack.append(one)
+                    if len(two) > 1:
+                        stack.append(two)
+
+                    break
+
+        for s in sets:
+            initial = self.get_initial() in s
+            final = True in [self[_id].is_final() for _id in s]
+
+            _id = mini.add_node(initial=initial, final=final)
+
+            nodetoset[_id] = s
+            settonode[s] = _id
+
+        for node in mini:
+            done = set()
+
+            s = nodetoset[node.get_id()]
+
+            source = self[s[0]]
+            for label in source:
+                edge = source[label]
+
+                if label in done:
+                    continue
+                done.add(label)
+
+                for target in edge:
+                    t = sets[target.get_id()]
+                    mini.add_edge(label, node.get_id(), settonode[t])
+
+        self.set_name(mini.get_name())
+
+        self.set_initial(mini.get_initial())
+        self.set_finals(mini.get_finals())
+
+        self.set_nodes(mini.get_nodes())
+
+    def reset(self):
+        for node in self:
+            node.set_visited(False)
+
+    def __expand(self, source):
+        L = []
+
+        source.set_visited(True)
+        for label in source:
+            edge = source[label]
+            for target in source[label]:
+                L.append((edge.get_label(), source.get_id(), target.get_id()))
+
+        return L
+
+    def iter(self, iter_type=None):
+        if iter_type is None:
+            iter_type = FSAConstants.BREADTH_FIRST_SEARCH
+
+        if len(self[self.get_initial()]) == 0:
+            raise UnitexException("Empty FSA")
+
+        i = None
+        if iter_type == FSAConstants.DEPTH_FIRST_SEARCH:
+            i = -1
+        elif iter_type == FSAConstants.BREADTH_FIRST_SEARCH:
+            i = 0
+        else:
+            raise UnitexException("Unknown iter type: %s" % iter_type)
+
+        root = self[self.get_initial()]
+        if root.is_visited():
+            self.reset()
+
+        L = self.__expand(root)
+        while L:
+            edge, sid, tid = L.pop(i)
+            yield (edge, sid, tid)
+
+            if not self[tid].is_visited():
+                L += self.__expand(self[tid])
+
+    def todot(self, file, encoding=None):
+        if encoding is None:
+            encoding = UnitexConstants.DEFAULT_ENCODING
+
+        with open(file, "w", encoding=encoding) as output:
+            output.write("digraph Automaton {\n\n")
+            output.write("\tcenter = 1;\n")
+            output.write("\tcharset = \"%s\";\n" % encoding)
+            output.write("\trankdir = LR;\n")
+            output.write("\tranksep = 1;\n")
+            output.write("\tedge [arrowhead = vee];\n\n")
+
+            nodes = set()
+            edges = set()
+
+            for node in self:
+                sid = node.get_id()
+                n1 = "node%s" % sid
+
+                if sid not in nodes:
+                    nodes.add(sid)
+
+                    if node.get_id() == self.get_initial():
+                        output.write("\t%s[shape = circle, label = \"\"];\n" % n1)
+                    elif node.is_final():
+                        output.write("\t%s[shape = doublecircle, label = \"\"];\n" % n1)
+                    else:
+                        output.write("\t%s[shape = point, label = \"\"];\n" % n1)
+
+                for label in node:
+                    for target in node[label]:
+                        if (node.get_id(), label, target.get_id()) in edges:
+                            continue
+                        edges.add((node.get_id(), label, target.get_id()))
+
+                        tid = target.get_id()
+                        n2 = "node%s" % tid
+
+                        if tid not in nodes:
+                            nodes.add(tid)
+
+                            if target.get_id() == self.get_initial():
+                                output.write("\t%s[shape = circle, label = \"\"];\n" % n2)
+                            elif target.is_final():
+                                output.write("\t%s[shape = doublecircle, label = \"\"];\n" % n2)
+                            else:
+                                output.write("\t%s[shape = point, label = \"\"];\n" % n2)
+
+                        output.write("\t%s -> %s [label = \"%s\"];\n" % (n1, n2, label))
+
+                output.write("\n")
+
+            output.write("}\n")
diff --git a/unitex/utils/types.py b/unitex/utils/types.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce33eafd9b03c83097184298220964b92d420858
--- /dev/null
+++ b/unitex/utils/types.py
@@ -0,0 +1,204 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import logging
+import re
+
+from unitex import UnitexException
+
+_LOGGER = logging.getLogger(__name__)
+
+
+
+class Tag(object):
+
+    def __init__(self, tag=None):
+        self.__pos = ""
+
+        self.__features = []
+        self.__flexions = []
+
+        if tag is not None:
+            self.load(tag)
+
+    def __str__(self):
+        return self.get()
+
+    def load(self, tag):
+        self.__pos = ""
+
+        self.__features = []
+        self.__flexions = []
+
+        i = 0
+
+        pos = ""
+        while i < len(tag) and tag[i] != '+' and tag[i] != ':':
+            pos = pos + tag[i]
+            i += 1
+
+        self.set_pos(pos)
+
+        while i < len(tag) and tag[i] == '+':
+            i += 1
+
+            tmp = ""
+            while i < len(tag) and tag[i] != '+' and tag[i] != ':':
+                tmp = tmp + tag[i]
+                i += 1
+
+            if tmp:
+                self.add_feature(tmp)
+
+        while i < len(tag) and tag[i] == ':':
+            i += 1
+
+            tmp = ""
+            while i < len(tag) and tag[i] != ':':
+                tmp = tmp + tag[i]
+                i += 1
+
+            if tmp:
+                self.add_flexion(tmp)
+
+    def get(self):
+        tag = self.get_pos()
+
+        features = "+".join(self.get_features())
+        if features:
+            tag += "+%s" % features
+
+        # Each inflectional code is reintroduced with its ':' prefix.
+        flexions = ":".join(self.get_flexions())
+        if flexions:
+            tag += ":%s" % flexions
+
+        return tag
+
+    def set_pos(self, pos):
+        self.__pos = pos
+
+    def get_pos(self):
+        return self.__pos
+
+    def set_features(self, features):
+        self.__features = features
+
+    def get_features(self):
+        return self.__features
+
+    def add_feature(self, feature):
+        self.__features.append(feature)
+
+    def set_flexions(self, flexions):
+        self.__flexions = flexions
+
+    def get_flexions(self):
+        return self.__flexions
+
+    def add_flexion(self, flexion):
+        self.__flexions.append(flexion)
+
+
+
+class Entry(Tag):
+
+    def __init__(self, entry=None):
+        # 'super(Entry, ...)' (not 'super(Tag, ...)') so that
+        # Tag.__init__ actually runs and initializes pos/features.
+        super(Entry, self).__init__()
+
+        self.__form = ""
+        self.__lemma = ""
+
+        if entry is not None:
+            self.load(entry)
+
+    def __str__(self):
+        return self.get()
+
+    def load(self, entry):
+        i = 0
+
+        escaped = False
+
+        form = ""
+        try:
+            while True:
+
+                if entry[i] == "," and escaped is False:
+                    i += 1
+                    break
+
+                elif entry[i] == "\\":
+                    if escaped is True:
+                        form += entry[i]
+                        escaped = False
+                    else:
+                        escaped = True
+
+                else:
+                    form += entry[i]
+                    escaped = False
+
+                i += 1
+        except IndexError:
+            raise UnitexException("Invalid entry format '%s'. No comma found." % entry)
+
+        self.set_form(form)
+
+        escaped = False
+
+        lemma = ""
+        try:
+            while True:
+
+                if entry[i] == "." and escaped is False:
+                    i += 1
+                    break
+
+                elif entry[i] == "\\":
+                    if escaped is True:
+                        lemma += entry[i]
+                        escaped = False
+                    else:
+                        escaped = True
+
+                else:
+                    lemma += entry[i]
+                    escaped = False
+
+                i += 1
+        except IndexError:
+            raise UnitexException("Invalid entry format '%s'. No dot found." % entry)
+
+        self.set_lemma(lemma)
+
+        Tag.load(self, entry[i:])
+
+    def get(self):
+        form = self.get_form(escape=True)
+        lemma = self.get_lemma(escape=True)
+        if not lemma:
+            lemma = ""
+
+        tag = Tag.get(self)
+
+        return "%s,%s.%s" % (form, lemma, tag)
+
+    def set_form(self, form):
+        self.__form = form
+
+    def get_form(self, escape=False):
+        if escape is False:
+            return self.__form
+        return self.__form.replace(",", "\\,")
+
+    def set_lemma(self, lemma):
+        self.__lemma = lemma
+
+    def get_lemma(self, escape=False):
+        if escape is False:
+            return self.__lemma
+        return self.__lemma.replace(",", "\\,")
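
Taken together, the new `unitex.utils` modules form a small self-contained FSA toolkit. A minimal end-to-end sketch of the public API introduced by this diff (the file paths are illustrative; the dictionary files mirror the test fixtures above):

    from unitex.utils.fsa import Automaton
    from unitex.utils.formats import GRF, OldCompiledDictionary

    # Build a word-level acceptor from whitespace-tokenized paths,
    # then normalize it and dump it in Graphviz format.
    automaton = Automaton("Demo")
    automaton.add_path("président de la république".split())
    automaton.add_path("président français de la république".split())

    automaton.determinize()
    automaton.minimize()
    automaton.todot("demo.dot")

    # The same 'add_path' API drives Unitex graph (.grf) serialization.
    grf = GRF("Demo")
    grf.add_path("président de la république".split())
    grf.save("demo.grf")

    # Lookup in an old-style (v1) compiled dictionary.
    dictionary = OldCompiledDictionary()
    dictionary.load("data/dictionary-v1.bin", "data/dictionary-v1.inf", "utf-16-le")

    for entry in dictionary.find("Sébastien") or []:
        print(entry)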