#!/usr/bin/env python # -*- coding: utf-8 -*- import os import shutil import unittest from unitex import * from unitex.tools import * class Arguments: def __init__(self, language=None): self.__arguments = {} self.__arguments["dic"] = "data/dictionary.dic" self.__arguments["dic_type"] = UnitexConstants.DELAF self.__arguments["dic_check"] = "data/CHECK_DIC.TXT" self.__arguments["bin"] = "data/dictionary.bin" self.__arguments["inf"] = "data/dictionary.inf" self.__arguments["alphabet"] = "data/Alphabet.txt" self.__arguments["alphabet-sorted"] = "data/Alphabet_sort.txt" self.__arguments["sentence"] = "data/Sentence.fst2" self.__arguments["txt"] = "data/corpus.txt" self.__arguments["snt"] = "data/corpus.snt" self.__arguments["dir"] = "data/corpus_snt" self.__arguments["xtr"] = "data/corpus.xtr" self.__arguments["text.cod"] = os.path.join(self.__arguments["dir"], "text.cod") self.__arguments["tok_by_freq.txt"] = os.path.join(self.__arguments["dir"], "tok_by_freq.txt") self.__arguments["tok_by_alph.txt"] = os.path.join(self.__arguments["dir"], "tok_by_alph.txt") self.__arguments["stats.n"] = os.path.join(self.__arguments["dir"], "stats.n") self.__arguments["enter.pos"] = os.path.join(self.__arguments["dir"], "enter.pos") self.__arguments["dlf"] = os.path.join(self.__arguments["dir"], "dlf") self.__arguments["dlc"] = os.path.join(self.__arguments["dir"], "dlc") self.__arguments["err"] = os.path.join(self.__arguments["dir"], "err") self.__arguments["tags_err"] = os.path.join(self.__arguments["dir"], "tags_err") self.__arguments["tags.ind"] = os.path.join(self.__arguments["dir"], "tags.ind") self.__arguments["stat_dic.n"] = os.path.join(self.__arguments["dir"], "stat_dic.n") self.__arguments["text.tfst"] = os.path.join(self.__arguments["dir"], "text.tfst") self.__arguments["text.tind"] = os.path.join(self.__arguments["dir"], "text.tind") self.__arguments["grf"] = "data/grammar.grf" self.__arguments["fst"] = "data/grammar.fst2" self.__arguments["ind"] = os.path.join(self.__arguments["dir"], "concord.ind") self.__arguments["concord.n"] = os.path.join(self.__arguments["dir"], "concord.n") self.__arguments["concordances"] = os.path.join(self.__arguments["dir"], "concord.txt") def __getitem__(self, key): if key not in self.__arguments: raise KeyError("Argument '%s' not found ..." % key) return self.__arguments[key] class TestUnitexTools(unittest.TestCase): @classmethod def setUpClass(self): self._arguments = Arguments() @classmethod def tearDownClass(self): # Removing output file from the 'check_dic' command. if os.path.exists(self._arguments["dic_check"]): os.remove(self._arguments["dic_check"]) # Removing output file from the 'compress' command. if os.path.exists(self._arguments["bin"]): os.remove(self._arguments["bin"]) if os.path.exists(self._arguments["inf"]): os.remove(self._arguments["inf"]) # Removing output file from the 'normalize' and 'fst2txt' commands. if os.path.exists(self._arguments["snt"]): os.remove(self._arguments["snt"]) # Removing (recursively) the text directory. if os.path.exists(self._arguments["dir"]): shutil.rmtree(self._arguments["dir"]) # Removing output file from the 'grf2fst2' command. if os.path.exists(self._arguments["fst"]): os.remove(self._arguments["fst"]) # Removing output file from the 'extract' command. if os.path.exists(self._arguments["xtr"]): os.remove(self._arguments["xtr"]) def test_01_check_dic(self): dictionary = self._arguments["dic"] dtype = self._arguments["dic_type"] alphabet = self._arguments["alphabet"] kwargs = {} kwargs["strict"] = False kwargs["no_space_warning"] = True ret = check_dic(dictionary, dtype, alphabet, **kwargs) ok = os.path.exists(self._arguments["dic_check"]) and ret self.assertTrue(ok, "Dictionary checking failed!") def test_02_compress(self): dictionary = self._arguments["dic"] kwargs = {} kwargs["output"] = None kwargs["flip"] = False kwargs["semitic"] = False kwargs["version"] = UnitexConstants.DICTIONARY_VERSION_1 ret = compress(dictionary, **kwargs) ok = os.path.exists(self._arguments["bin"]) and os.path.exists(self._arguments["inf"]) and ret self.assertTrue(ok, "Compression failed!") def test_03_normalize(self): text = self._arguments["txt"] kwargs = {} kwargs["no_carriage_return"] = False kwargs["input_offsets"] = None kwargs["output_offsets"] = None kwargs["replacement_rules"] = None kwargs["no_separator_normalization"] = False ret = normalize(text, **kwargs) ok = os.path.exists(self._arguments["snt"]) and ret self.assertTrue(ok, "Normalisation failed!") def test_04_fst2txt(self): grammar = self._arguments["sentence"] text = self._arguments["snt"] alphabet = self._arguments["alphabet"] kwargs = {} kwargs["start_on_space"] = False kwargs["char_by_char"] = False kwargs["merge"] = True ret = fst2txt(grammar, text, alphabet, **kwargs) ok = ret self.assertTrue(ok, "FST application failed!") def test_05_tokenize(self): if not os.path.exists(self._arguments["dir"]): os.mkdir(self._arguments["dir"]) text = self._arguments["snt"] alphabet = self._arguments["alphabet"] kwargs = {} kwargs["char_by_char"] = False kwargs["tokens"] = None kwargs["input_offsets"] = None kwargs["output_offsets"] = None ret = tokenize(text, alphabet, **kwargs) ok = ret ok = ok and os.path.exists(self._arguments["text.cod"]) ok = ok and os.path.exists(self._arguments["tok_by_freq.txt"]) ok = ok and os.path.exists(self._arguments["tok_by_alph.txt"]) ok = ok and os.path.exists(self._arguments["stats.n"]) ok = ok and os.path.exists(self._arguments["enter.pos"]) self.assertTrue(ok, "Tokenisation failed!") def test_06_dico(self): dictionaries = [self._arguments["bin"]] text = self._arguments["snt"] alphabet = self._arguments["alphabet"] kwargs = {} kwargs["morpho"] = None kwargs["korean"] = False kwargs["semitic"] = False kwargs["arabic_rules"] = None kwargs["raw"] = None ret = dico(dictionaries, text, alphabet, **kwargs) ok = ret ok = ok and os.path.exists(self._arguments["dlf"]) ok = ok and os.path.exists(self._arguments["dlc"]) ok = ok and os.path.exists(self._arguments["err"]) ok = ok and os.path.exists(self._arguments["tags_err"]) ok = ok and os.path.exists(self._arguments["tags.ind"]) ok = ok and os.path.exists(self._arguments["stat_dic.n"]) self.assertTrue(ok, "Dictionary application failed!") def test_07_sort_txt(self): files = [] files.append(self._arguments["dlf"]) files.append(self._arguments["dlc"]) files.append(self._arguments["err"]) files.append(self._arguments["tags_err"]) kwargs = {} kwargs["duplicates"] = False kwargs["reverse"] = False kwargs["sort_order"] = self._arguments["alphabet-sorted"] kwargs["line_info"] = self._arguments["stat_dic.n"] kwargs["thai"] = False kwargs["factorize_inflectional_codes"] = False ok = True for text in files: ret = sort_txt(text, **kwargs) ok = ok and ret self.assertTrue(ok, "Sorting failed!") def test_08_grf2fst2(self): grammar = self._arguments["grf"] alphabet = self._arguments["alphabet"] kwargs = {} kwargs["loop_check"] = False kwargs["char_by_char"] = False kwargs["pkgdir"] = None kwargs["no_empty_graph_warning"] = False kwargs["tfst_check"] = False kwargs["silent_grf_name"] = False kwargs["named_repositories"] = None kwargs["debug"] = False kwargs["check_variables"] = False ret = grf2fst2(grammar, alphabet, **kwargs) ok = os.path.exists(self._arguments["fst"]) and ret self.assertTrue(ok, "Grammar compilation failed!") def test_09_locate(self): grammar = self._arguments["fst"] text = self._arguments["snt"] alphabet = self._arguments["alphabet"] kwargs = {} kwargs["start_on_space"] = False kwargs["char_by_char"] = False kwargs["morpho"] = None kwargs["korean"] = False kwargs["arabic_rules"] = None kwargs["sntdir"] = None kwargs["negation_operator"] = UnitexConstants.NEGATION_OPERATOR kwargs["number_of_matches"] = None kwargs["stop_token_count"] = None kwargs["match_mode"] = UnitexConstants.MATCH_MODE_LONGEST kwargs["output_mode"] = UnitexConstants.OUTPUT_MODE_MERGE kwargs["protect_dic_chars"] = True kwargs["variable"] = None kwargs["ambiguous_outputs"] = True kwargs["variable_error"] = UnitexConstants.ON_ERROR_IGNORE ret = locate(grammar, text, alphabet, **kwargs) ok = os.path.exists(self._arguments["ind"]) and os.path.exists(self._arguments["concord.n"]) and ret self.assertTrue(ok, "Locate failed!") def test_10_concord(self): index = self._arguments["ind"] alphabet = self._arguments["alphabet-sorted"] kwargs = {} kwargs["font"] = None kwargs["fontsize"] = None kwargs["only_ambiguous"] = False kwargs["only_matches"] = False kwargs["left"] = "1000s" kwargs["right"] = "1000s" kwargs["sort"] = UnitexConstants.SORT_CENTER_RIGHT kwargs["format"] = UnitexConstants.FORMAT_TEXT kwargs["script"] = None kwargs["offsets"] = None kwargs["unxmlize"] = None kwargs["output"] = None kwargs["directory"] = None kwargs["thai"] = False ret = concord(index, alphabet, **kwargs) ok = os.path.exists(self._arguments["concordances"]) and ret self.assertTrue(ok, "Concord failed!") def test_11_txt2tfst(self): text = self._arguments["snt"] alphabet = self._arguments["alphabet"] kwargs = {} kwargs["clean"] = False kwargs["normalization_grammar"] = None kwargs["tagset"] = None kwargs["korean"] = False ret = txt2tfst(text, alphabet, **kwargs) ok = ret ok = ok and os.path.exists(self._arguments["text.tfst"]) ok = ok and os.path.exists(self._arguments["text.tind"]) self.assertTrue(ok, "Txt2Tfst failed!") def test_12_extract(self): text = self._arguments["snt"] output = self._arguments["xtr"] index = self._arguments["ind"] kwargs = {} kwargs["non_matching_sentences"] = False ret = extract(text, output, index, **kwargs) ok = ret ok = ok and os.path.exists(self._arguments["xtr"]) self.assertTrue(ok, "Extract failed!") if __name__ == '__main__': unittest.main()