Skip to content
Extraits de code Groupes Projets
01_test_tools.py 11,4 ko
Newer Older
  • Learn to ignore specific revisions
  • #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    
    import os
    import shutil
    import unittest
    
    
    from unitex.tools import *
    
    
    class Arguments:
        """Read-only table of the paths and constants used by the tests.

        Entries are read with ``arguments[key]``. Asking for a key that was
        never registered raises ``KeyError`` with an explicit message.
        """

        def __init__(self, language=None):
            """Fill the table with the fixed test resources.

            ``language`` is accepted for compatibility with callers but is not
            used when building the table.
            """
            args = {}

            # Dictionary source, its type and the report written next to it.
            args["dic"] = "data/dictionary.dic"
            args["dic_type"] = UnitexConstants.DELAF
            args["dic_check"] = "data/CHECK_DIC.TXT"

            # Compressed dictionary files.
            args["bin"] = "data/dictionary.bin"
            args["inf"] = "data/dictionary.inf"

            # Alphabet files.
            args["alphabet"] = "data/Alphabet.txt"
            args["alphabet-sorted"] = "data/Alphabet_sort.txt"

            # Sentence grammar.
            args["sentence"] = "data/Sentence.fst2"

            # Corpus and the files/directories derived from it.
            args["txt"] = "data/corpus.txt"
            args["snt"] = "data/corpus.snt"
            args["dir"] = "data/corpus_snt"
            args["xtr"] = "data/corpus.xtr"

            # Files located inside the corpus directory whose key is also
            # their file name.
            same_name_files = (
                "text.cod",
                "tok_by_freq.txt",
                "tok_by_alph.txt",
                "stats.n",
                "enter.pos",
                "dlf",
                "dlc",
                "err",
                "tags_err",
                "tags.ind",
                "stat_dic.n",
                "text.tfst",
                "text.tind",
                "concord.n",
            )
            for name in same_name_files:
                args[name] = os.path.join(args["dir"], name)

            # Grammar source and its compiled form.
            args["grf"] = "data/grammar.grf"
            args["fst"] = "data/grammar.fst2"

            # Files inside the corpus directory whose key differs from the
            # file name.
            args["ind"] = os.path.join(args["dir"], "concord.ind")
            args["concordances"] = os.path.join(args["dir"], "concord.txt")

            self.__arguments = args

        def __getitem__(self, key):
            """Return the value registered under ``key``.

            Raises:
                KeyError: if ``key`` is not part of the table.
            """
            if key not in self.__arguments:
                raise KeyError("Argument '%s' not found ..." % key)
            return self.__arguments[key]
    
    
    
    class TestUnitexTools(unittest.TestCase):
        """Exercise the Unitex tool wrappers on the resources listed in Arguments.

        The test methods are numbered so that unittest, which runs methods in
        alphabetical name order, executes them in sequence: several tests read
        files that an earlier test checks were produced (for instance the
        '.snt' file and the content of the corpus directory).
        """

        @classmethod
        def setUpClass(cls):
            """Create the argument table shared by all the tests."""
            cls._arguments = Arguments()

        @classmethod
        def tearDownClass(cls):
            """Delete every file and directory the tests may have produced."""
            args = cls._arguments

            # Output files from the 'check_dic', 'compress' ('bin' and 'inf'),
            # 'normalize' and 'fst2txt' commands.
            for key in ("dic_check", "bin", "inf", "snt"):
                if os.path.exists(args[key]):
                    os.remove(args[key])

            # Removing (recursively) the text directory.
            if os.path.exists(args["dir"]):
                shutil.rmtree(args["dir"])

            # Output files from the 'grf2fst2' and 'extract' commands.
            for key in ("fst", "xtr"):
                if os.path.exists(args[key]):
                    os.remove(args[key])

        def test_01_check_dic(self):
            """check_dic must succeed and leave its report file on disk."""
            dictionary = self._arguments["dic"]
            dtype = self._arguments["dic_type"]
            alphabet = self._arguments["alphabet"]

            kwargs = {
                "strict": False,
                "no_space_warning": True,
            }

            ret = check_dic(dictionary, dtype, alphabet, **kwargs)

            ok = os.path.exists(self._arguments["dic_check"]) and ret
            self.assertTrue(ok, "Dictionary checking failed!")

        def test_02_compress(self):
            """compress must succeed and produce both the 'bin' and 'inf' files."""
            dictionary = self._arguments["dic"]

            # The options dictionary has to be created before it is filled;
            # without it the test stops on a NameError.
            kwargs = {
                "flip": False,
                "semitic": False,
                "version": UnitexConstants.DICTIONARY_VERSION_1,
            }

            ret = compress(dictionary, **kwargs)

            ok = (
                os.path.exists(self._arguments["bin"])
                and os.path.exists(self._arguments["inf"])
                and ret
            )
            self.assertTrue(ok, "Compression failed!")

        def test_03_normalize(self):
            """normalize must succeed and produce the '.snt' file."""
            text = self._arguments["txt"]

            kwargs = {
                "no_carriage_return": False,
                "input_offsets": None,
                "output_offsets": None,
                "replacement_rules": None,
                "no_separator_normalization": False,
            }

            ret = normalize(text, **kwargs)

            ok = os.path.exists(self._arguments["snt"]) and ret
            self.assertTrue(ok, "Normalisation failed!")

        def test_04_fst2txt(self):
            """fst2txt must succeed when applying the sentence grammar."""
            grammar = self._arguments["sentence"]
            text = self._arguments["snt"]
            alphabet = self._arguments["alphabet"]

            kwargs = {
                "start_on_space": False,
                "char_by_char": False,
                "merge": True,
            }

            ret = fst2txt(grammar, text, alphabet, **kwargs)

            # The asserted flag must come from the call result.
            ok = ret
            self.assertTrue(ok, "FST application failed!")

        def test_05_tokenize(self):
            """tokenize must succeed and produce its files in the text directory."""
            if not os.path.exists(self._arguments["dir"]):
                os.mkdir(self._arguments["dir"])

            text = self._arguments["snt"]
            alphabet = self._arguments["alphabet"]

            kwargs = {
                "char_by_char": False,
                "tokens": None,
                "input_offsets": None,
                "output_offsets": None,
            }

            ret = tokenize(text, alphabet, **kwargs)

            expected = (
                "text.cod",
                "tok_by_freq.txt",
                "tok_by_alph.txt",
                "stats.n",
                "enter.pos",
            )
            ok = ret
            for key in expected:
                ok = ok and os.path.exists(self._arguments[key])

            self.assertTrue(ok, "Tokenisation failed!")

        def test_06_dico(self):
            """dico must succeed and produce its files in the text directory."""
            dictionaries = [self._arguments["bin"]]
            text = self._arguments["snt"]
            alphabet = self._arguments["alphabet"]

            kwargs = {
                "morpho": None,
                "korean": False,
                "semitic": False,
                "arabic_rules": None,
                "raw": None,
            }

            ret = dico(dictionaries, text, alphabet, **kwargs)

            expected = (
                "dlf",
                "dlc",
                "err",
                "tags_err",
                "tags.ind",
                "stat_dic.n",
            )
            ok = ret
            for key in expected:
                ok = ok and os.path.exists(self._arguments[key])

            self.assertTrue(ok, "Dictionary application failed!")

        def test_07_sort_txt(self):
            """sort_txt must succeed on each of the dictionary output files."""
            files = [
                self._arguments["dlf"],
                self._arguments["dlc"],
                self._arguments["err"],
                self._arguments["tags_err"],
            ]

            kwargs = {
                "duplicates": False,
                "reverse": False,
                "sort_order": self._arguments["alphabet-sorted"],
                "line_info": self._arguments["stat_dic.n"],
                "thai": False,
                "factorize_inflectional_codes": False,
            }

            # Every call has to succeed, so each result is folded into the
            # flag that is asserted at the end.
            ok = True
            for text in files:
                ret = sort_txt(text, **kwargs)
                ok = ok and ret

            self.assertTrue(ok, "Sorting failed!")

        def test_08_grf2fst2(self):
            """grf2fst2 must succeed and produce the compiled grammar file."""
            grammar = self._arguments["grf"]
            alphabet = self._arguments["alphabet"]

            kwargs = {
                "loop_check": False,
                "char_by_char": False,
                "pkgdir": None,
                "no_empty_graph_warning": False,
                "tfst_check": False,
                "silent_grf_name": False,
                "named_repositories": None,
                "debug": False,
                "check_variables": False,
            }

            ret = grf2fst2(grammar, alphabet, **kwargs)

            ok = os.path.exists(self._arguments["fst"]) and ret
            self.assertTrue(ok, "Grammar compilation failed!")

        def test_09_locate(self):
            """locate must succeed and produce 'concord.ind' and 'concord.n'."""
            grammar = self._arguments["fst"]
            text = self._arguments["snt"]
            alphabet = self._arguments["alphabet"]

            kwargs = {
                "start_on_space": False,
                "char_by_char": False,
                "morpho": None,
                "korean": False,
                "arabic_rules": None,
                "sntdir": None,
                "negation_operator": UnitexConstants.NEGATION_OPERATOR,
                "number_of_matches": None,
                "stop_token_count": None,
                "match_mode": UnitexConstants.MATCH_MODE_LONGEST,
                "output_mode": UnitexConstants.OUTPUT_MODE_MERGE,
                "protect_dic_chars": True,
                "variable": None,
                "ambiguous_outputs": True,
                "variable_error": UnitexConstants.ON_ERROR_IGNORE,
            }

            ret = locate(grammar, text, alphabet, **kwargs)

            ok = (
                os.path.exists(self._arguments["ind"])
                and os.path.exists(self._arguments["concord.n"])
                and ret
            )
            self.assertTrue(ok, "Locate failed!")

        def test_10_concord(self):
            """concord must succeed and produce the concordance text file."""
            index = self._arguments["ind"]
            alphabet = self._arguments["alphabet-sorted"]

            kwargs = {
                "font": None,
                "fontsize": None,
                "only_ambiguous": False,
                "only_matches": False,
                "left": "1000s",
                "right": "1000s",
                "sort": UnitexConstants.SORT_CENTER_RIGHT,
                "format": UnitexConstants.FORMAT_TEXT,
                "script": None,
                "offsets": None,
                "unxmlize": None,
                "output": None,
                "directory": None,
                "thai": False,
            }

            ret = concord(index, alphabet, **kwargs)

            ok = os.path.exists(self._arguments["concordances"]) and ret
            self.assertTrue(ok, "Concord failed!")

        def test_11_txt2tfst(self):
            """txt2tfst must succeed and produce 'text.tfst' and 'text.tind'."""
            text = self._arguments["snt"]
            alphabet = self._arguments["alphabet"]

            kwargs = {
                "clean": False,
                "normalization_grammar": None,
                "tagset": None,
                "korean": False,
            }

            ret = txt2tfst(text, alphabet, **kwargs)

            ok = ret
            ok = ok and os.path.exists(self._arguments["text.tfst"])
            ok = ok and os.path.exists(self._arguments["text.tind"])

            self.assertTrue(ok, "Txt2Tfst failed!")

        def test_12_extract(self):
            """extract must succeed and produce the '.xtr' output file."""
            text = self._arguments["snt"]
            output = self._arguments["xtr"]
            index = self._arguments["ind"]

            kwargs = {
                "non_matching_sentences": False,
            }

            ret = extract(text, output, index, **kwargs)

            ok = ret
            ok = ok and os.path.exists(self._arguments["xtr"])

            self.assertTrue(ok, "Extract failed!")
    
    
    
    # Run the test suite only when this file is executed as a script.
    if __name__ == "__main__":
        unittest.main()