Newer
Older
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import shutil
import unittest
from unitex import *
from unitex.tools import *
class Arguments:
    """Read-only lookup table of the paths and constants used by the tests.

    Values are fetched with ``arguments[key]``; an unknown key raises
    ``KeyError`` with an explicit message.
    """

    def __init__(self, language=None):
        """Populate the lookup table.

        Args:
            language: Accepted by the constructor but not used when the
                table is built.
        """
        args = {}

        # Dictionary checking and compression.
        # NOTE(review): the tests also look up a "dic" key (the dictionary
        # handed to check_dic/compress) that is not set anywhere in this
        # view -- confirm its path and register it here.
        args["dic_type"] = UnitexConstants.DELAF
        args["dic_check"] = "data/CHECK_DIC.TXT"
        args["bin"] = "data/dictionary.bin"
        args["inf"] = "data/dictionary.inf"

        # Linguistic resources.
        args["alphabet"] = "data/Alphabet.txt"
        args["alphabet-sorted"] = "data/Alphabet_sort.txt"
        args["sentence"] = "data/Sentence.fst2"

        # Corpus and the files derived from it.
        corpus_dir = "data/corpus_snt"
        args["txt"] = "data/corpus.txt"
        args["snt"] = "data/corpus.snt"
        args["dir"] = corpus_dir
        args["xtr"] = "data/corpus.xtr"

        # Files inside the corpus directory whose key is the file name itself.
        same_named = (
            "text.cod",
            "tok_by_freq.txt",
            "tok_by_alph.txt",
            "stats.n",
            "enter.pos",
            "dlf",
            "dlc",
            "err",
            "tags_err",
            "tags.ind",
            "stat_dic.n",
            "text.tfst",
            "text.tind",
            "concord.n",
        )
        for name in same_named:
            args[name] = os.path.join(corpus_dir, name)

        # Files inside the corpus directory registered under a different key.
        args["ind"] = os.path.join(corpus_dir, "concord.ind")
        args["concordances"] = os.path.join(corpus_dir, "concord.txt")

        # Grammar source and its compiled form.
        args["grf"] = "data/grammar.grf"
        args["fst"] = "data/grammar.fst2"

        self.__arguments = args

    def __getitem__(self, key):
        """Return the value registered under *key*.

        Raises:
            KeyError: If *key* is not in the table.
        """
        if key not in self.__arguments:
            raise KeyError("Argument '%s' not found ..." % key)
        return self.__arguments[key]
class TestUnitexTools(unittest.TestCase):
    """Run the Unitex tool wrappers one after another on a small corpus.

    The tests form a pipeline: several of them read files that an earlier
    test checks for (for example ``test_06_dico`` applies the ``.bin``
    dictionary whose existence ``test_02_compress`` asserts).  The numeric
    prefixes keep them in that order, because unittest runs test methods
    sorted by name.
    """

    @classmethod
    def setUpClass(cls):
        """Build the shared table of paths and constants once for the class."""
        cls._arguments = Arguments()

    @classmethod
    def tearDownClass(cls):
        """Delete the files and the directory the tests may have produced."""
        # Output files of the 'check_dic', 'compress' (.bin/.inf),
        # 'normalize'/'fst2txt' (.snt), 'grf2fst2' and 'extract' commands.
        for key in ("dic_check", "bin", "inf", "snt", "fst", "xtr"):
            path = cls._arguments[key]
            if os.path.exists(path):
                os.remove(path)

        # Removing (recursively) the text directory.
        text_dir = cls._arguments["dir"]
        if os.path.exists(text_dir):
            shutil.rmtree(text_dir)

    def _outputs_exist(self, *keys):
        """Return True when the path registered under every key exists."""
        return all(os.path.exists(self._arguments[key]) for key in keys)

    def test_01_check_dic(self):
        """check_dic must succeed and leave its report file behind."""
        # NOTE(review): "dic" is not among the keys visibly registered by
        # Arguments in this file -- confirm it is defined.
        dictionary = self._arguments["dic"]
        dtype = self._arguments["dic_type"]
        alphabet = self._arguments["alphabet"]

        kwargs = {
            "strict": False,
            "no_space_warning": True,
        }

        ret = check_dic(dictionary, dtype, alphabet, **kwargs)

        ok = self._outputs_exist("dic_check") and ret
        self.assertTrue(ok, "Dictionary checking failed!")

    def test_02_compress(self):
        """compress must succeed and produce the .bin and .inf files."""
        # NOTE(review): "dic" is not among the keys visibly registered by
        # Arguments in this file -- confirm it is defined.
        dictionary = self._arguments["dic"]

        # The options dict has to be created here; the original code filled
        # in 'kwargs' without ever defining it.
        kwargs = {
            "output": None,
            "flip": False,
            "semitic": False,
            "version": UnitexConstants.DICTIONARY_VERSION_1,
        }

        ret = compress(dictionary, **kwargs)

        ok = self._outputs_exist("bin", "inf") and ret
        self.assertTrue(ok, "Compression failed!")

    def test_03_normalize(self):
        """normalize must succeed and produce the .snt file."""
        text = self._arguments["txt"]

        kwargs = {
            "no_carriage_return": False,
            "input_offsets": None,
            "output_offsets": None,
            "replacement_rules": None,
            "no_separator_normalization": False,
        }

        ret = normalize(text, **kwargs)

        ok = self._outputs_exist("snt") and ret
        self.assertTrue(ok, "Normalisation failed!")

    def test_04_fst2txt(self):
        """fst2txt must succeed when applying the sentence grammar."""
        grammar = self._arguments["sentence"]
        text = self._arguments["snt"]
        alphabet = self._arguments["alphabet"]

        kwargs = {
            "start_on_space": False,
            "char_by_char": False,
            "merge": True,
        }

        ret = fst2txt(grammar, text, alphabet, **kwargs)

        ok = ret
        self.assertTrue(ok, "FST application failed!")

    def test_05_tokenize(self):
        """tokenize must succeed and write its files into the text directory."""
        # The text directory is created here if it is not already present.
        if not os.path.exists(self._arguments["dir"]):
            os.mkdir(self._arguments["dir"])

        text = self._arguments["snt"]
        alphabet = self._arguments["alphabet"]

        kwargs = {
            "char_by_char": False,
            "tokens": None,
            "input_offsets": None,
            "output_offsets": None,
        }

        ret = tokenize(text, alphabet, **kwargs)

        ok = ret and self._outputs_exist(
            "text.cod",
            "tok_by_freq.txt",
            "tok_by_alph.txt",
            "stats.n",
            "enter.pos",
        )
        self.assertTrue(ok, "Tokenisation failed!")

    def test_06_dico(self):
        """dico must succeed and write its result files."""
        dictionaries = [self._arguments["bin"]]
        text = self._arguments["snt"]
        alphabet = self._arguments["alphabet"]

        kwargs = {
            "morpho": None,
            "korean": False,
            "semitic": False,
            "arabic_rules": None,
            "raw": None,
        }

        ret = dico(dictionaries, text, alphabet, **kwargs)

        ok = ret and self._outputs_exist(
            "dlf",
            "dlc",
            "err",
            "tags_err",
            "tags.ind",
            "stat_dic.n",
        )
        self.assertTrue(ok, "Dictionary application failed!")

    def test_07_sort_txt(self):
        """sort_txt must succeed on each of the dictionary result files."""
        files = [
            self._arguments["dlf"],
            self._arguments["dlc"],
            self._arguments["err"],
            self._arguments["tags_err"],
        ]

        kwargs = {
            "duplicates": False,
            "reverse": False,
            "sort_order": self._arguments["alphabet-sorted"],
            "line_info": self._arguments["stat_dic.n"],
            "thai": False,
            "factorize_inflectional_codes": False,
        }

        # Every file is sorted even if an earlier one failed; the verdict is
        # only computed once all calls have been made.
        results = [sort_txt(text, **kwargs) for text in files]

        ok = all(results)
        self.assertTrue(ok, "Sorting failed!")

    def test_08_grf2fst2(self):
        """grf2fst2 must succeed and produce the compiled grammar."""
        grammar = self._arguments["grf"]
        alphabet = self._arguments["alphabet"]

        kwargs = {
            "loop_check": False,
            "char_by_char": False,
            "pkgdir": None,
            "no_empty_graph_warning": False,
            "tfst_check": False,
            "silent_grf_name": False,
            "named_repositories": None,
            "debug": False,
            "check_variables": False,
        }

        ret = grf2fst2(grammar, alphabet, **kwargs)

        ok = self._outputs_exist("fst") and ret
        self.assertTrue(ok, "Grammar compilation failed!")

    def test_09_locate(self):
        """locate must succeed and write concord.ind and concord.n."""
        grammar = self._arguments["fst"]
        text = self._arguments["snt"]
        alphabet = self._arguments["alphabet"]

        kwargs = {
            "start_on_space": False,
            "char_by_char": False,
            "morpho": None,
            "korean": False,
            "arabic_rules": None,
            "sntdir": None,
            "negation_operator": UnitexConstants.NEGATION_OPERATOR,
            "number_of_matches": None,
            "stop_token_count": None,
            "match_mode": UnitexConstants.MATCH_MODE_LONGEST,
            "output_mode": UnitexConstants.OUTPUT_MODE_MERGE,
            "protect_dic_chars": True,
            "variable": None,
            "ambiguous_outputs": True,
            "variable_error": UnitexConstants.ON_ERROR_IGNORE,
        }

        ret = locate(grammar, text, alphabet, **kwargs)

        ok = self._outputs_exist("ind", "concord.n") and ret
        self.assertTrue(ok, "Locate failed!")

    def test_10_concord(self):
        """concord must succeed and write the concordance text file."""
        index = self._arguments["ind"]
        alphabet = self._arguments["alphabet-sorted"]

        kwargs = {
            "font": None,
            "fontsize": None,
            "only_ambiguous": False,
            "only_matches": False,
            "left": "1000s",
            "right": "1000s",
            "sort": UnitexConstants.SORT_CENTER_RIGHT,
            "format": UnitexConstants.FORMAT_TEXT,
            "script": None,
            "offsets": None,
            "unxmlize": None,
            "output": None,
            "directory": None,
            "thai": False,
        }

        ret = concord(index, alphabet, **kwargs)

        ok = self._outputs_exist("concordances") and ret
        self.assertTrue(ok, "Concord failed!")

    def test_11_txt2tfst(self):
        """txt2tfst must succeed and write text.tfst and text.tind."""
        text = self._arguments["snt"]
        alphabet = self._arguments["alphabet"]

        kwargs = {
            "clean": False,
            "normalization_grammar": None,
            "tagset": None,
            "korean": False,
        }

        ret = txt2tfst(text, alphabet, **kwargs)

        ok = ret and self._outputs_exist("text.tfst", "text.tind")
        self.assertTrue(ok, "Txt2Tfst failed!")

    def test_12_extract(self):
        """extract must succeed and write the .xtr file."""
        text = self._arguments["snt"]
        output = self._arguments["xtr"]
        index = self._arguments["ind"]

        kwargs = {
            "non_matching_sentences": False,
        }

        ret = extract(text, output, index, **kwargs)

        ok = ret and self._outputs_exist("xtr")
        self.assertTrue(ok, "Extract failed!")
if __name__ == '__main__':
    # Run the test suite when this file is executed as a script.
    unittest.main()