Commit d67bd9eb authored by pat

Adding unitex utilities (data types, formats, ...)
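
For reference, the new package is exercised by the tests below; a minimal
usage sketch of the FSA helper (mirroring test_01 through test_03):

    from unitex.utils.fsa import Automaton

    automaton = Automaton("MWU Test")
    automaton.add_path("président de la république".split())
    automaton.determinize()
    automaton.minimize()
    automaton.todot("grf-minimized.dot")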

parent f91b56bc
@@ -107,8 +107,11 @@ setup(name = "unitex",
       license = "GPLv3",
       install_requires = [],
-      package_dir = {"unitex":"unitex"},
-      packages = ["unitex"],
+      package_dir = {"unitex": "unitex",
+                     "unitex.utils": "unitex/utils"},
+      packages = ["unitex",
+                  "unitex.utils"],
       data_files = [],
...
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import unittest

from unitex.utils.fsa import Automaton
class Arguments:
def __init__(self, language=None):
self.__arguments = {}
self.__arguments["raw"] = "data/grf-raw.dot"
self.__arguments["determinized"] = "data/grf-determinized.dot"
self.__arguments["minimized"] = "data/grf-minimized.dot"
self.__arguments["automaton"] = None
def __getitem__(self, key):
if key not in self.__arguments:
raise KeyError("Argument '%s' not found ..." % key)
return self.__arguments[key]
def __setitem__(self, key, value):
self.__arguments[key] = value
class TestUnitexUtils(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls._arguments = Arguments()

    @classmethod
    def tearDownClass(cls):
        for argument in ("raw", "determinized", "minimized"):
            if os.path.exists(cls._arguments[argument]):
                os.remove(cls._arguments[argument])
    def test_01_automaton_build(self):
        self._arguments["automaton"] = Automaton("MWU Test")

        paths = ["président français de la république",
                 "président de la république",
                 "ministre islandais de la défense",
                 "ministre islandais à la défense",
                 "secrétaire d'état à la défense",
                 "secrétaire d'état",
                 "secrétaire",
                 "adjoint au secrétaire d'état",
                 "adjoint au secrétaire d'état à la défense"]
        for path in paths:
            self._arguments["automaton"].add_path(path.split())

        self._arguments["automaton"].todot(self._arguments["raw"])
        self.assertTrue(os.path.exists(self._arguments["raw"]), "Automaton building failed!")
def test_02_automaton_determinize(self):
self._arguments["automaton"].determinize()
self._arguments["automaton"].todot(self._arguments["determinized"])
self.assertTrue(os.path.exists(self._arguments["determinized"]), "Automaton determinization failed!")
def test_03_automaton_minimize(self):
self._arguments["automaton"].minimize()
self._arguments["automaton"].todot(self._arguments["minimized"])
self.assertTrue(os.path.exists(self._arguments["minimized"]), "Automaton minimization failed!")
if __name__ == '__main__':
unittest.main()
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import unittest

from unitex.utils.formats import GRF, OldCompiledDictionary
class Arguments:
def __init__(self, language=None):
self.__arguments = {}
self.__arguments["bin-v1"] = "data/dictionary-v1.bin"
self.__arguments["inf-v1"] = "data/dictionary-v1.inf"
self.__arguments["enc-v1"] = "utf-16-le"
self.__arguments["grf"] = "data/automaton.grf"
def __getitem__(self, key):
if key not in self.__arguments:
raise KeyError("Argument '%s' not found ..." % key)
return self.__arguments[key]
class TestUnitexUtils(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls._arguments = Arguments()

    @classmethod
    def tearDownClass(cls):
        if os.path.exists(cls._arguments["grf"]):
            os.remove(cls._arguments["grf"])
    def test_01_grf_build(self):
        grf = GRF("GRF")

        paths = ["président français de la république",
                 "président de la république",
                 "ministre islandais de la défense",
                 "ministre islandais à la défense",
                 "secrétaire d'état à la défense",
                 "secrétaire d'état",
                 "secrétaire",
                 "adjoint au secrétaire d'état",
                 "adjoint au secrétaire d'état à la défense"]
        for path in paths:
            grf.add_path(path.split())

        grf.save(self._arguments["grf"])
        self.assertTrue(os.path.exists(self._arguments["grf"]), "GRF creation failed!")
    def test_02_old_dictionary(self):
        dictionary = OldCompiledDictionary()
        dictionary.load(self._arguments["bin-v1"],
                        self._arguments["inf-v1"],
                        self._arguments["enc-v1"])

        ret = bool(dictionary.find("Sébastien"))
        self.assertTrue(ret, "Dictionary lookup failed!")
if __name__ == '__main__':
unittest.main()
File added
The file was suppressed by a .gitattributes entry, or its encoding is not supported.
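
The raw content below is the added text automaton test data (two French
sentences), in the format parsed by the TextFST class of
unitex/utils/formats.py: a sentence count, then per sentence the '$n'
header, the text, the token line, the offset line, the state lines and
the tag list.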
0000000002
$1
Dans son jardin de Norvège, Sébastien joue du biniou près de son verger.
0/4 1/1 2/3 1/1 3/6 1/1 4/2 1/1 5/7 6/1 1/1 7/9 1/1 8/4 1/1 9/2 1/1 10/6 1/1 11/4 1/1 4/2 1/1 2/3 1/1 12/6 13/1 1/1
0_0
: 1 1
: 2 2 3 2 4 2
: 5 5 6 3
: 7 4 8 5 9 5
: 11 5 12 5 13 5
: 10 6
: 14 7
: 15 8
: 16 9 17 9
: 18 10 19 11
: 21 11
: 20 12
: 22 15 23 13 24 13
: 25 14 26 15 27 15
: 31 15 32 15 33 15
: 28 16 29 16 30 16
: 34 17 35 17
: 36 18
t
f
@<E>
.
@STD
@{Dans,dans.PREP+Dnom+z1}
@0.0.0-0.3.0
.
@STD
@{son,son.N+Conc+[Veg]+z1:ms}
@2.0.0-2.2.0
.
@STD
@{son,son.N+[Bruit]+z1:ms}
@2.0.0-2.2.0
.
@STD
@{son,son.DET+Dposs3s+z1:ms:fs}
@2.0.0-2.2.0
.
@STD
@{jardin de,jardin de.NDET+Dnom7}
@4.0.0-6.1.0
.
@STD
@{jardin,jardin.N+z1:ms}
@4.0.0-4.5.0
.
@STD
@{de,.PREP+z1}
@6.0.0-6.1.0
.
@STD
@{de,de.PREP+z1}
@6.0.0-6.1.0
.
@STD
@{de,de.DET+Dind+z1:ms:fs:mp:fp}
@6.0.0-6.1.0
.
@STD
@Norvège
@8.0.0-8.6.0
.
@STD
@{du,.DET+Dind+z1:ms}
@6.0.0-6.1.0
.
@STD
@{des,un.DET+Dind+z1:mp:fp}
@6.0.0-6.1.0
.
@STD
@{de la,du.DET+Dind+z1:fs}
@6.0.0-6.1.0
.
@STD
@,
@9.0.0-9.0.0
.
@STD
@Sébastien
@11.0.0-11.8.0
.
@STD
@{joue,jouer.V+z1:P1s:P3s:S1s:S3s:Y2s}
@13.0.0-13.3.0
.
@STD
@{joue,joue.N+z1:fs}
@13.0.0-13.3.0
.
@STD
@{de,.PREP+z1}
@15.0.0-15.1.0
.
@STD
@{du,du.DET+Dind+z1:ms}
@15.0.0-15.1.0
.
@STD
@{biniou,biniou.N+z2:ms}
@17.0.0-17.5.0
.
@STD
@{le,.DET+Ddef+z1:ms}
@15.0.0-15.1.0
.
@STD
@{près de,près de.PREP+EPCPQ+z1}
@19.0.0-21.1.0
.
@STD
@{près,près.PREP+Dnom+z1}
@19.0.0-19.3.0
.
@STD
@{près,près.ADV}
@19.0.0-19.3.0
.
@STD
@{de,.PREP+z1}
@21.0.0-21.1.0
.
@STD
@{de,de.PREP+z1}
@21.0.0-21.1.0
.
@STD
@{de,de.DET+Dind+z1:ms:fs:mp:fp}
@21.0.0-21.1.0
.
@STD
@{son,son.N+Conc+[Veg]+z1:ms}
@23.0.0-23.2.0
.
@STD
@{son,son.N+[Bruit]+z1:ms}
@23.0.0-23.2.0
.
@STD
@{son,son.DET+Dposs3s+z1:ms:fs}
@23.0.0-23.2.0
.
@STD
@{du,.DET+Dind+z1:ms}
@21.0.0-21.1.0
.
@STD
@{des,un.DET+Dind+z1:mp:fp}
@21.0.0-21.1.0
.
@STD
@{de la,du.DET+Dind+z1:fs}
@21.0.0-21.1.0
.
@STD
@{verger,verger.V+z1:W}
@25.0.0-25.5.0
.
@STD
@{verger,verger.N+z1:ms}
@25.0.0-25.5.0
.
@STD
@.
@26.0.0-26.0.0
.
f
$2
Il est heureux Monsieur Paumier et ce n'est pas dommage.
15/2 1/1 16/3 1/1 17/7 1/1 18/8 1/1 19/7 1/1 20/2 1/1 21/2 1/1 22/1 23/1 16/3 1/1 24/3 1/1 25/7 13/1 1/1
29_77
: 1 1
: 2 2 3 2 4 2
: 5 3 6 3
: 7 4
: 8 5
: 9 6
: 10 7 11 7 12 7
: 13 8
: 14 9 15 9 16 9
: 17 10 18 10
: 19 11
: 20 12
t
f
@<E>
.
@STD
@{Il,il.PRO+PpvIL+z1:3ms}
@0.0.0-0.1.0
.
@STD
@{est,être.V+z1:P3s}
@2.0.0-2.2.0
.
@STD
@{est,est.N+z1:ms}
@2.0.0-2.2.0
.
@STD
@{est,est.A+z1:ms:fs:mp:fp}
@2.0.0-2.2.0
.
@STD
@{heureux,heureux.N+z1:ms:mp}
@4.0.0-4.6.0
.
@STD
@{heureux,heureux.A+z1:ms:mp}
@4.0.0-4.6.0
.
@STD
@{Monsieur,monsieur.N+z1:ms}
@6.0.0-6.7.0
.
@STD
@{Paumier,paumier.N:ms}
@8.0.0-8.6.0
.
@STD
@{et,et.CONJC}
@10.0.0-10.1.0
.
@STD
@{ce,ce.PRO+PpvIL+z1:3ms:3mp}
@12.0.0-12.1.0
.
@STD
@{ce,ce.PRO+Pdem+z1:ms}
@12.0.0-12.1.0
.
@STD
@{ce,ce.DET+Ddem+z1:ms}
@12.0.0-12.1.0
.
@STD
@{ne,.XI+z1}
@14.0.0-15.0.0
.
@STD
@{est,être.V+z1:P3s}
@16.0.0-16.2.0
.
@STD
@{est,est.N+z1:ms}
@16.0.0-16.2.0
.
@STD
@{est,est.A+z1:ms:fs:mp:fp}
@16.0.0-16.2.0
.
@STD
@{pas,pas.N+z1:ms:mp}
@18.0.0-18.2.0
.
@STD
@{pas,pas.ADV+z1}
@18.0.0-18.2.0
.
@STD
@{dommage,dommage.N+z1:ms}
@20.0.0-20.6.0
.
@STD
@.
@21.0.0-21.0.0
.
f
File added
@@ -390,6 +390,9 @@ class UnitexProcessor(object):
         self.__snt = None
         self.__dir = None
 
+    def tofst(self):
+        pass
+
     def iter(self, grammar, **kwargs):
         """
         This function iters over the grammar matches.
...
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import array
import logging
import re
import struct

from io import open

from unitex import UnitexException, UnitexConstants
from unitex.utils.fsa import FSAConstants, Automaton
from unitex.utils.types import Tag, Entry

_LOGGER = logging.getLogger(__name__)

# NOTE: TextFST.next() uses ENTRY without this file defining or importing
# it; this pattern for bracketed dictionary entries such as
# '{joue,jouer.V+z1:P1s}' is therefore an assumed reconstruction.
ENTRY = re.compile(r"^\{.+,.*\..+\}$")
class CompressedEntry(Entry):
SEPARATORS = (" ", "-")
    SPLITTER = re.compile(r"([-\s])")
def __init__(self):
super(CompressedEntry, self).__init__()
def compute(self, lemma, form):
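        # Compressed-lemma scheme, as decoded below: a leading number N
        # means "drop the last N characters of the inflected form", and
        # the rest is a suffix to append. E.g. the form 'jouent' with the
        # compressed lemma '3er' yields the lemma 'jouer'.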
n, i = "", 0
while i < len(lemma) and lemma[i].isdigit():
n = n + lemma[i]
i = i + 1
if i > 0:
prefix = form[:len(form)-int(n)]
else:
prefix = form
suffix = lemma[i:]
return "%s%s" % (prefix, suffix)
def uncompress(self, lemma):
form = self.get_form()
if not lemma:
return form
        # If the two words do not have the same number of elements,
        # the compressed lemma is prefixed with '_'.
if lemma[0] == '_':
return self.compute(lemma[1:], form)
wtab = self.SPLITTER.split(form)
ltab = self.SPLITTER.split(lemma)
        parts = []
        for i in range(len(ltab)):
            if not ltab[i]:
                continue
            elif ltab[i] in self.SEPARATORS:
                parts.append(ltab[i])
            else:
                parts.append(self.compute(ltab[i], wtab[i]))

        return "".join(parts)
def load(self, form, data, lemmatize=True):
data = data.rstrip()
self.set_form(form)
lemma = ""
i = 0
lemma, escaped = "", False
try:
while True:
if data[i] == "." and escaped is False:
break
elif data[i] == "\\":
if escaped is True:
lemma += data[i]
escaped = False
else:
lemma += data[i]
escaped = True
else:
lemma += data[i]
escaped = False
i += 1
except IndexError:
raise UnitexException("Wrong lemma for entry '%s' ..." % data)
if lemmatize is True:
self.set_lemma(self.uncompress(lemma))
Tag.load(self, data[i+1:])
class OldCompiledDictionary:

    INITIAL_STATE_OFFSET = 4
    INF_SEPARATOR = re.compile(r"(?<![\\]),")
def __init__(self):
self.__bin = None
self.__inf = None
self.__buffer = None
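
    # Layout of the old (v1) .bin transducer, as decoded by lookup(): each
    # state starts with a 2-byte transition count whose high bit (32768) is
    # *cleared* for final states; final states then carry a 3-byte index
    # into the .inf lines, and each transition is stored as a 2-byte
    # character followed by a 3-byte target offset.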
def lookup(self, token, i=None, pos=None):
if i is None:
i = 0
if pos is None:
pos = self.INITIAL_STATE_OFFSET
tnbr = self.__bin[pos] * 256 + self.__bin[pos+1]
pos = pos + 2
_LOGGER.debug("Lookup Start: token[%s|%s] -- pos(%s) -- tnbr(%s)\n" % (token[:i], token[i:], pos, tnbr))
if i == len(token):
data = []
_LOGGER.debug(" Check Final State: pos(%s) -- tnbr(%s)\n" % (pos, tnbr))
if not (tnbr & 32768):
_LOGGER.debug(" -> Final\n")
index = self.__bin[pos] * 256 * 256 + self.__bin[pos+1] * 256 + self.__bin[pos+2]
for inf in self.INF_SEPARATOR.split(self.__inf[index]):
E = CompressedEntry()
E.load(token, inf)
data.append(E)
else:
_LOGGER.debug(" -> Not final\n")
return data, pos-2
elif tnbr & 32768:
tnbr = tnbr - 32768
else:
pos = pos + 3
for j in range(tnbr):
char = chr(self.__bin[pos] * 256 + self.__bin[pos+1])
_LOGGER.debug(" Matching char[%s] -- pos(%s) -> current[%s]\n" % (token[i], pos, char))
pos = pos + 2
offset = self.__bin[pos] * 256 * 256 + self.__bin[pos+1] * 256 + self.__bin[pos+2]
pos = pos + 3
if char == token[i]:
_LOGGER.debug(" -> Char found\n")
return self.lookup(token, i+1, offset)
# WEIRD... Objective: handle whitespaces in MWU dictionaries for the match function
# -> ["Conseil", "d'", "administration"] == "Conseil d'administration"
elif char == u" " and i == 0:
_LOGGER.debug(" -> Char is whitespace [pass]\n")
return self.lookup(token, i, offset)
return None, pos
def find(self, token):
entries, pos = self.lookup(token)
return entries
def match(self, sequence, i=None, mode=None, separator=None):
if i is None:
i = 0
if mode is None:
mode = UnitexConstants.MATCH_MODE_LONGEST
        elif mode not in (UnitexConstants.MATCH_MODE_LONGEST,
                          UnitexConstants.MATCH_MODE_SHORTEST,
                          UnitexConstants.MATCH_MODE_ALL):
raise UnitexException("Wrong match mode: %s ..." % mode)
        matches = []
        buffer, pos = [], None
for j in range(i, len(sequence)):
_LOGGER.debug("Match Token: '%s'\n" % sequence[j])
entries, pos = self.lookup(sequence[j], pos=pos)
if entries is None:
_LOGGER.debug(" -> No entry found ...\n")
break
_LOGGER.debug(" -> Entries found: pos[%s] -- tnbr[%s]\n" % (pos, tnbr))
buffer.append(j)
if entries:
matches.append((entries, buffer[:]))
if mode == UnitexConstants.MATCH_MODE_SHORTEST:
return matches
if separator is not None:
_LOGGER.debug("Match Separator: '%s'\n" % separator)
entries, pos = self.lookup(separator, pos=pos)
if entries is None:
_LOGGER.debug(" -> No separator found ...\n")
break
_LOGGER.debug(" -> Separator found\n")
if not matches:
return None
elif mode == UnitexConstants.MATCH_MODE_LONGEST:
return [matches[-1]]
elif mode == UnitexConstants.MATCH_MODE_ALL:
return matches
def dump(self, pos=None):
if pos is None:
pos = self.INITIAL_STATE_OFFSET
self.__buffer = []
tnbr = self.__bin[pos] * 256 + self.__bin[pos+1]
pos = pos + 2
if not (tnbr & 32768):
index = self.__bin[pos] * 256 * 256 + self.__bin[pos+1] * 256 + self.__bin[pos+2]
form = "".join(self.__buffer)
for inf in self.INF_SEPARATOR.split(self.__inf[index]):
E = CompressedEntry()
E.load(form, inf)
yield E
pos = pos + 3
else:
tnbr = tnbr - 32768
for j in range(tnbr):
self.__buffer.append(chr(self.__bin[pos] * 256 + self.__bin[pos+1]))
pos = pos + 2
offset = self.__bin[pos] * 256 * 256 + self.__bin[pos+1] * 256 + self.__bin[pos+2]
pos = pos + 3
for E in self.dump(offset):
yield E
if self.__buffer:
self.__buffer.pop()
    def load(self, bin, inf, encoding=None):
        if encoding is None:
            encoding = UnitexConstants.DEFAULT_ENCODING

        with open(inf, "r", encoding=encoding) as INF:
            self.__inf = INF.readlines()
        # The first line of the .inf file holds the number of entries.
        self.__inf.pop(0)

        # Load the whole .bin transducer into a flat byte array; the 4-byte
        # size header is kept in place and skipped by lookup() via
        # INITIAL_STATE_OFFSET.
        self.__bin = array.array('B')
        with open(bin, "rb") as BIN:
            byte = BIN.read(1)
            while byte:
                self.__bin.append(struct.unpack('B', byte)[0])
                byte = BIN.read(1)
class GRF(Automaton):
def __init__(self, name="GRF"):
super(GRF, self).__init__(name)
    def load(self, file, encoding=None):
        raise NotImplementedError
def save(self, file, encoding=None):
if encoding is None:
encoding = UnitexConstants.DEFAULT_ENCODING
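
        # A .grf graph is a list of "boxes": box 0 is the initial state,
        # box 1 the final state, and every line after the header gives a
        # box label, its coordinates and its outgoing box numbers.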
X = 1000
Y = 1000
GAP = 20
transitions = []
transitions.append({"label": FSAConstants.EPSILON, "targets": set([])})
transitions.append({"label": "", "targets": set([])})
nmap = {}
root = []
for edge, sid, tid in self.iter("dfs"):
source = self[sid]
target = self[tid]
index = 0
key = (str(edge), tid)
if key in nmap:
index = nmap[key]
else:
index = len(transitions)
nmap[key] = index
transitions.append({"label": str(edge), "targets": set([])})
if sid == self.get_initial():
transitions[0]["targets"].add(str(index))
if target.is_final() is True:
transitions[index]["targets"].add("1")
for _edge in target:
for _target in target[_edge]:
_index = 0
_key = (str(_edge), _target.get_id())
if _key in nmap:
_index = nmap[_key]
else:
_index = len(transitions)
nmap[_key] = _index
transitions.append({"label": str(_edge), "targets": set([])})
transitions[index]["targets"].add(str(_index))
with open(file, "w", encoding=encoding) as output:
output.write("#Unigraph\r\n")
output.write("SIZE %s %s\r\n" % (X+GAP, Y+GAP))
output.write("FONT Times New Roman:B 10\r\n")
output.write("OFONT Monospaced:B 8\r\n")
output.write("BCOLOR 16777215\r\n")
output.write("FCOLOR 0\r\n")
output.write("ACOLOR 13487565\r\n")
output.write("SCOLOR 16711680\r\n")
output.write("CCOLOR 255\r\n")
output.write("DBOXES y\r\n")
output.write("DFRAME y\r\n")
output.write("DDATE y\r\n")
output.write("DFILE y\r\n")
output.write("DDIR n\r\n")
output.write("DRIG n\r\n")
output.write("DRST n\r\n")
output.write("FITS 100\r\n")
output.write("PORIENT L\r\n")
output.write("#\r\n")
output.write("%s\r\n" % len(transitions))
for transition in transitions:
label = transition["label"]
size = len(transition["targets"])
targets = " ".join(list(transition["targets"]))
if size == 0:
output.write('"%s" %s %s %s \r\n' % (label, GAP, GAP, size))
else:
output.write('"%s" %s %s %s %s \r\n' % (label, GAP, GAP, size, targets))
class SentenceFST(Automaton):
def __init__(self, name="SentenceFST"):
super(SentenceFST, self).__init__(name)
self.__sentence = None
self.__tokens = None
self.__labels = None
def get_sentence(self):
return self.__sentence
def get_tokens(self):
return self.__tokens
def get_token(self, i):
return self.__tokens[i]
def get_label(self, i):
return self.__labels[i]
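
    # 'states' and 'labels' come from TextFST.next(): state i is a list of
    # (tag index, target state) pairs, and labels[tag index] holds the
    # (entry, position) pair parsed from the corresponding '@' tag block.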
def load(self, sentence, tokens, states, labels):
self.__sentence = sentence
self.__tokens = []
self.__labels = {}
start = 0
for index, length in tokens:
end = start + length
self.__tokens.append(self.__sentence[start:end])
start = end
transitions = []
for i in range(len(states)):
initial = False
if i == 0:
initial = True
final = False
if states[i] == "t":
final = True
sid = self.add_node(initial=initial, final=final)
if final is True:
break
for lid, tid in states[i]:
entry = labels[lid][0]
p1 = labels[lid][1][0][0]
p2 = labels[lid][1][1][0]
                if p1 not in self.__labels:
                    self.__labels[p1] = []
                self.__labels[p1].append((entry, p2))
transitions.append((sid, lid, tid))
for sid, lid, tid in transitions:
self.add_edge(lid, sid, tid)
class TextFST:
def __init__(self):
self.__file = None
self.__size = 0
def __len__(self):
return self.__size
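
    # A text automaton file starts with the sentence count, followed by one
    # block per sentence: the '$n' header, the raw text, the token line,
    # the offset line, the state lines (':' ..., terminated by 't'), an 'f'
    # separator and the '@' tag blocks, closed by a final 'f'.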
def next(self):
line = self.__file.readline()
while line:
line = line.rstrip()
if line[0] != "$":
raise UnitexException("File '%s' is corrupted ..." % self.__file.name)
# The sentence number (format '$n')
number = int(line[1:])
line = self.__file.readline()
line = line.rstrip()
# The text of the sentence
text = line
line = self.__file.readline()
line = line.rstrip()
# The tokens of the text
# -> [(x1, y), (x2, y2), ..., (xi, yi)]
# where,
# - x: token index in file 'tokens.txt'
# - y: length of the token (in characters)
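            # e.g. '0/4 1/1 2/3 ...' in the test data above: token 0 is 4
            # characters long ('Dans'), token 1 is the single space, etc.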
tokens = [tuple(int(t) for t in token.split("/")) for token in line.split(" ")]
line = self.__file.readline()
line = line.rstrip()
            # The offset of the sentence (from the beginning of the text)
# -> X_Y
# where,
# - X: the offset in tokens
# - Y: the offset in characters
offset = tuple(int(o) for o in line.split("_"))
line = self.__file.readline()
line = line.rstrip()
states = []
while line != "t":
if line[0] != ":":
raise UnitexException("File '%s' is corrupted ..." % self.__file.name)
line = line[1:].strip()
line = line.split()
state = []
for i in range(0, len(line), 2):
state.append((int(line[i]), int(line[i+1])))
states.append(state)
line = self.__file.readline()
line = line.rstrip()
if not line:
raise UnitexException("File '%s' is corrupted ..." % self.__file.name)
states.append(line)
line = self.__file.readline()
line = line.rstrip()
if line[0] != "f":
raise UnitexException("File '%s' is corrupted ..." % self.__file.name)
line = self.__file.readline()
line = line.rstrip()
tags = []
while line != "f":
if line == "@<E>":
tags.append(("<E>", None))
elif line == "@STD":
line = self.__file.readline()
line = line.rstrip()
                    content = line[1:]

                    entry = Entry()

                    # The '@STD' content is either a bracketed dictionary
                    # entry such as '{joue,jouer.V+z1:P1s}' or a bare token
                    # such as 'Norvège'; the brace handling relies on the
                    # assumed ENTRY pattern defined at the top of this file.
                    if ENTRY.match(content):
                        entry.load(content[1:-1])
                    else:
                        entry.set_form(content)
line = self.__file.readline()
line = line.rstrip()
if line[0] != "@":
raise UnitexException("File '%s' is corrupted ..." % self.__file.name)
position = [tuple(int(i) for i in p.split(".")) for p in line[1:].split("-")]
tags.append((entry, position))
else:
raise UnitexException("File '%s' is corrupted ..." % self.__file.name)
line = self.__file.readline()
line = line.rstrip()
if line[0] != ".":
raise UnitexException("File '%s' is corrupted ..." % self.__file.name)
line = self.__file.readline()
line = line.rstrip()
_LOGGER.debug("SENTENCE[%s]\n" % number)
_LOGGER.debug(" - offset: %s\n" % offset)
_LOGGER.debug(" - text: %s\n" % text)
_LOGGER.debug(" - tokens: %s\n" % tokens)
_LOGGER.debug(" - states:\n")
for state in states:
_LOGGER.debug(" - s: %s\n" % state)
_LOGGER.debug(" - tags:\n")
for tag in tags:
_LOGGER.debug(" - t: %s\n" % tag)
S = SentenceFST("SENTENCE[%d]" % number)
S.load(text, tokens, states, tags)
return S
def __iter__(self):
sentence = self.next()
while sentence:
yield sentence
sentence = self.next()
def open(self, file, encoding=None):
if encoding is None:
encoding = UnitexConstants.DEFAULT_ENCODING
self.__file = open(file, "r", encoding=encoding)
line = self.__file.readline()
line = line.rstrip()
        # The number of sentences in the text FST (format: '000000000N')
self.__size = int(line)
def close(self):
self.__file.close()
self.__size = 0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import logging

from io import open

from unitex import *

_LOGGER = logging.getLogger(__name__)
class FSAConstants:
EPSILON = "<E>"
DEPTH_FIRST_SEARCH = "dfs"
BREADTH_FIRST_SEARCH = "bfs"
class Edge:
    def __init__(self, label, targets=None, source=None):
        self.__label = label
        self.__source = source
        self.__targets = targets

        self.__tids = set()
        if self.__targets is not None:
            self.__tids = set([target.get_id() for target in targets])
def __len__(self):
return len(self.__targets)
def __str__(self):
return self.get_label()
def __hash__(self):
return hash(self.get_label())
    def __cmp__(self, e):
        # Python 2 only: compare edges by their labels.
        return cmp(self.get_label(), e.get_label())
def __iter__(self):
for target in self.__targets:
yield target
    def __contains__(self, target):
        return target.get_id() in self.__tids
def __getitem__(self, i):
return self.__targets[i]
def get_label(self):
return self.__label
def get_source(self):
return self.__source
def set_source(self, source):
self.__source = source
def get_targets(self):
return self.__targets
def set_targets(self, targets):
self.__targets = targets
self.__tids = set([target.get_id() for target in targets])
    def add_target(self, target):
        if target.get_id() in self.__tids:
            return
        self.__tids.add(target.get_id())
        self.__targets.append(target)
def del_target(self, target):
if target.get_id() not in self.__tids:
return
self.__tids.remove(target.get_id())
for i in range(len(self.__targets)):
_target = self.__targets[i]
if _target.get_id() == target.get_id():
del self.__targets[i]
break
class Node:
def __init__(self, _id, final=False):
self.__id = _id
self.__final = final
self.__edges = {}
self.__depth = 0
self.__visited = False
def __len__(self):
return len(self.__edges)
def __contains__(self, label):
return label in self.__edges
def __getitem__(self, label):
return self.__edges.get(label, None)
def __iter__(self):
for label in self.__edges:
yield label
def __str__(self):
s = "NODE[%s]" % str(self.get_id())
if self.is_final():
s += " -- FINAL"
for label in self:
targets = " | ".join([str(target.get_id()) for target in self[label]])
s += "\n\t%s -> (%s)" % (label, targets)
return s
def get_id(self):
return self.__id
def set_id(self, i):
self.__id = i
def is_deterministic(self):
if FSAConstants.EPSILON in self.__edges:
return False
for label in self.__edges:
if len(self[label]) > 1:
return False
return True
    def exists(self, label, node=None):
        if label not in self:
            return False
        if node is not None and node not in self[label]:
            return False
        return True
def add(self, label, target):
if self.exists(label, target) is True:
return
if self.exists(label) is False:
edge = Edge(label, [target], self)
self.__edges[label] = edge
else:
self[label].add_target(target)
def delete(self, label, node=None):
if not self.exists(label, node):
raise UnitexException("Edge not found: %s" % label)
if node is None:
del self.__edges[label]
else:
self[label].del_target(node)
def set_depth(self, depth):
self.__depth = depth
def get_depth(self):
return self.__depth
def is_visited(self):
return self.__visited
def set_visited(self, visited=True):
self.__visited = visited
def is_final(self):
return self.__final
def set_final(self, final=True):
self.__final = final
class NodeSets:
    def __init__(self):
self.__sets = {}
def __getitem__(self, _id):
return self.__sets[_id]
def __contains__(self, s):
return s in self.all()
    def __iter__(self):
return iter(self.all())
def all(self):
return set([tuple(l) for l in self.__sets.values()])
def add(self, s):
_set = tuple(sorted(set(s)))
for _id in s:
self.__sets[_id] = _set
class Automaton:
def __init__(self, name="Automaton"):
self.__name = name
self.__nodes = []
self.__initial = 0
self.__finals = []
self.__nodes.append(Node(self.__initial, False))
def __len__(self):
return len(self.__nodes)
def __getitem__(self, _id):
try:
return self.__nodes[_id]
except IndexError:
return None
def __iter__(self):
for node in self.__nodes:
yield node
def __str__(self):
title = "# FSA -- %s #" % self.get_name()
s = "%s\n%s\n%s\n\n" % ("#" * len(title), title, "#" * len(title))
for node in self:
s += "%s\n\n" % node
return s
def get_name(self):
return self.__name
def set_name(self, name):
self.__name = name
def get_depth(self):
depth = 0
for nid in self.__finals:
final = self.__nodes[nid]
if final.get_depth() > depth:
depth = final.get_depth()
return depth
def get_initial(self):
return self.__initial
def set_initial(self, initial):
self.__initial = initial
def get_finals(self):
return self.__finals
def set_finals(self, finals):
self.__finals = finals
def get_nodes(self):
return self.__nodes
def set_nodes(self, nodes):
self.__nodes = nodes
def add_edge(self, label, sid, tid):
source = self[sid]
target = self[tid]
target.set_depth(source.get_depth() + 1)
source.add(label, target)
def add_node(self, initial=False, final=False):
if initial is True:
return self.__initial
elif final is True:
self.__finals.append(len(self.__nodes))
self.__nodes.append(Node(self.__finals[-1], True))
return self.__finals[-1]
nid = len(self.__nodes)
self.__nodes.append(Node(nid, final))
return nid
    def add_path(self, path):
        if len(path) == 0:
            raise UnitexException("Empty path!")

        sid = self.add_node(initial=True, final=False)
        for label in path[:-1]:
            tid = self.add_node(initial=False, final=False)
            self.add_edge(label, sid, tid)
            sid = tid

        # The last label leads to a new final node.
        self.add_edge(path[-1], sid, self.add_node(initial=False, final=True))
def get_alphabet(self):
alphabet = set()
for node in self:
for label in node:
alphabet.add(label)
return tuple(alphabet)
def is_deterministic(self):
for node in self:
if not node.is_deterministic():
return False
return True
    def __closure(self, nid):
        stack = [nid]
        result = set(stack)

        while len(stack) > 0:
            current = stack.pop()

            if FSAConstants.EPSILON in self[current]:
                # Follow every epsilon transition to its target nodes.
                for target in self[current][FSAConstants.EPSILON]:
                    tid = target.get_id()
                    if tid not in result:
                        stack.append(tid)
                        result.add(tid)

        return tuple(result)
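
    # determinize() is the classical subset construction: each node of the
    # DFA corresponds to the epsilon-closure of a set of source nodes, and
    # a DFA node is final iff its set contains a final node.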
def determinize(self):
dfa = Automaton("DETERMINIZED(%s)" % self.get_name())
alphabet = self.get_alphabet()
initials = self.__closure(self.get_initial())
hid = dfa.add_node(initial=True, final=False)
visited = {}
visited[initials] = hid
stack = [initials]
while len(stack) > 0:
current = stack.pop()
            for label in alphabet:
                new = set()
                for node in current:
                    if label not in self[node]:
                        continue
                    for target in self[node][label]:
                        new.update(self.__closure(target.get_id()))
                new = tuple(new)

                if len(new) == 0:
                    continue
if new not in visited:
stack.append(new)
final = True in [self[_id].is_final() for _id in new]
nid = dfa.add_node(final=final)
visited[new] = nid
dfa.add_edge(label, visited[current], visited[new])
self.set_name(dfa.get_name())
self.set_initial(dfa.get_initial())
self.set_finals(dfa.get_finals())
self.set_nodes(dfa.get_nodes())
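
    # minimize() refines the {final, non-final} partition (Moore's
    # algorithm): a block is split whenever two of its states reach
    # different blocks on the same label; each resulting block becomes a
    # single node of the minimized automaton.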
    def minimize(self):
        minimized = Automaton("MINIMIZED(%s)" % self.get_name())
alphabet = self.get_alphabet()
nodetoset = {}
settonode = {}
sets = NodeSets()
rest, final = [], []
for node in self:
if node.is_final():
final.append(node.get_id())
else:
rest.append(node.get_id())
sets.add(rest)
sets.add(final)
stack = [s for s in sets if len(s) > 1]
def target_set(_id, label):
edge = self[_id][label]
if edge is None:
return None
else:
return sets[edge[0].get_id()]
while len(stack) > 0:
current = stack.pop()
for label in alphabet:
target = target_set(current[0], label)
one, two = [current[0]], []
for _id in current[1:]:
if target_set(_id, label) == target:
one.append(_id)
else:
two.append(_id)
if len(two) > 0:
sets.add(one)
sets.add(two)
if len(one) > 1:
stack.append(one)
if len(two) > 1:
stack.append(two)
break
        for s in sets:
            initial = self.get_initial() in s
            final = True in [self[_id].is_final() for _id in s]

            _id = minimized.add_node(initial=initial, final=final)

            nodetoset[_id] = s
            settonode[s] = _id

        for node in minimized:
            done = set()

            s = nodetoset[node.get_id()]
            source = self[s[0]]

            for label in source:
                edge = source[label]
                if label in done:
                    continue
                done.add(label)

                for target in edge:
                    t = sets[target.get_id()]
                    minimized.add_edge(label, node.get_id(), settonode[t])

        self.set_name(minimized.get_name())
        self.set_initial(minimized.get_initial())
        self.set_finals(minimized.get_finals())
        self.set_nodes(minimized.get_nodes())
def reset(self):
for node in self:
node.set_visited(False)
def __expand(self, source):
L = []
source.set_visited(True)
for label in source:
edge = source[label]
for target in source[label]:
L.append((edge.get_label(), source.get_id(), target.get_id()))
return L
def iter(self, iter_type=None):
if iter_type is None:
iter_type = FSAConstants.BREADTH_FIRST_SEARCH
if len(self[self.get_initial()]) == 0:
raise UnitexException("Empty FSA")
i = None
if iter_type == FSAConstants.DEPTH_FIRST_SEARCH:
i = -1
elif iter_type == FSAConstants.BREADTH_FIRST_SEARCH:
i = 0
else:
raise UnitexException("Unknown iter type: %s" % iter_type)
root = self[self.get_initial()]
if root.is_visited():
self.reset()
L = self.__expand(root)
while L:
edge, sid, tid = L.pop(i)
yield (edge, sid, tid)
if not self[tid].is_visited():
L += self.__expand(self[tid])
def todot(self, file, encoding=None):
if encoding is None:
encoding = UnitexConstants.DEFAULT_ENCODING
with open(file, "w", encoding=encoding) as output:
output.write("digraph Automaton {\n\n")
output.write("\tcenter = 1;\n")
output.write("\tcharset = \"%s\";\n" % encoding)
output.write("\trankdir = LR;\n")
output.write("\tranksep = 1;\n")
output.write("\tedge [arrowhead = vee];\n\n")
nodes = set()
edges = set()
for node in self:
sid = node.get_id()
n1 = "node%s" % sid
                if sid not in nodes:
nodes.add(sid)
if node.get_id() == self.get_initial():
output.write("\t%s[shape = circle, label = \"\"];\n" % n1)
elif node.is_final():
output.write("\t%s[shape = doublecircle, label = \"\"];\n" % n1)
else:
output.write("\t%s[shape = point, label = \"\"];\n" % n1)
for label in node:
for target in node[label]:
if (node.get_id(), label, target.get_id()) in edges:
continue
edges.add((node.get_id(), label, target.get_id()))
tid = target.get_id()
n2 = "node%s" % tid
                        if tid not in nodes:
nodes.add(tid)
if target.get_id() == self.get_initial():
output.write("\t%s[shape = circle, label = \"\"];\n" % n2)
elif target.is_final():
output.write("\t%s[shape = doublecircle, label = \"\"];\n" % n2)
else:
output.write("\t%s[shape = point, label = \"\"];\n" % n2)
output.write("\t%s -> %s [label = \"%s\"];\n" % (n1, n2, label))
output.write("\n")
output.write("}\n")
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
import re
from unitex import UnitexException
_LOGGER = logging.getLogger(__name__)
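
# DELAF-style annotations: a tag has the form 'POS+feature1+...:flexion1:...',
# e.g. 'V+z1:P1s:P3s'; a full entry prepends the inflected form and the
# lemma, e.g. 'joue,jouer.V+z1:P1s:P3s'.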
class Tag(object):
def __init__(self, tag=None):
self.__pos = ""
self.__features = []
self.__flexions = []
if tag is not None:
self.load(tag)
def __str__(self):
return self.get()
def load(self, tag):
self.__pos = ""
self.__features = []
self.__flexions = []
i = 0
pos = ""
while i < len(tag) and tag[i] != '+' and tag[i] != ':':
pos = pos + tag[i]
i += 1
self.set_pos(pos)
        while i < len(tag) and tag[i] == '+':
            i += 1

            tmp = ""
            while i < len(tag) and tag[i] != '+' and tag[i] != ':':
                tmp = tmp + tag[i]
                i += 1
            if tmp:
                self.add_feature(tmp)

        while i < len(tag) and tag[i] == ':':
            i += 1

            tmp = ""
            while i < len(tag) and tag[i] != ':':
                tmp = tmp + tag[i]
                i += 1
            if tmp:
                self.add_flexion(tmp)
def get(self):
tag = self.get_pos()
features = "+".join(self.get_features())
if features:
tag += "+%s" % features
flexions = "".join(self.get_flexions())
if flexions:
tag += ":%s" % flexions
return tag
def set_pos(self, pos):
self.__pos = pos
def get_pos(self):
return self.__pos
def set_features(self, features):
self.__features = features
def get_features(self):
return self.__features
def add_feature(self, feature):
self.__features.append(feature)
def set_flexions(self, flexions):
self.__flexions = flexions
def get_flexions(self):
return self.__flexions
def add_flexion(self, flexion):
self.__flexions.append(flexion)
class Entry(Tag):
    def __init__(self, entry=None):
        super(Entry, self).__init__()
self.__form = ""
self.__lemma = ""
if entry is not None:
self.load(entry)
def __str__(self):
return self.get()
def load(self, entry):
i = 0
escaped = False
form = ""
try:
while True:
if entry[i] == "," and escaped is False:
i += 1
break
elif entry[i] == "\\":
if escaped is True:
form += entry[i]
escaped = False
else:
escaped = True
else:
form += entry[i]
escaped = False
i += 1
except IndexError:
raise UnitexException("Invalid entry format '%s'. No comma found." % entry)
self.set_form(form)
escaped = False
lemma = ""
try:
while True:
if entry[i] == "." and escaped is False:
i += 1
break
elif entry[i] == "\\":
if escaped is True:
lemma += entry[i]
escaped = False
else:
escaped = True
else:
lemma += entry[i]
escaped = False
i += 1
except IndexError:
raise UnitexException("Invalid entry format '%s'. No dot found." % entry)
self.set_lemma(lemma)
Tag.load(self, entry[i:])
def get(self):
form = self.get_form(escape=True)
lemma = self.get_lemma(escape=True)
if not lemma:
lemma = ""
tag = Tag.get(self)
return "%s,%s.%s" % (form, lemma, tag)
def set_form(self, form):
self.__form = form
def get_form(self, escape=False):
if escape is False:
return self.__form
        return self.__form.replace(",", "\\,")
def set_lemma(self, lemma):
self.__lemma = lemma
def get_lemma(self, escape=False):
if escape is False:
return self.__lemma
        return self.__lemma.replace(",", "\\,")