Skip to content
Extraits de code Groupes Projets
tools.py 44,2 ko
Newer Older
  • Learn to ignore specific revisions
  • #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    
    Patrick Watrin's avatar
    Patrick Watrin a validé
    # NOTE: The documentation adapted for each function is extracted from
    #       the Unitex manual.
    
    from _unitex import unitex_tool
    
    
    from unitex import *
    from unitex.config import CheckDicOptions,\
                              CompressOptions,\
                              ConcordOptions,\
                              DicoOptions,\
                              ExtractOptions,\
                              Fst2TxtOptions,\
                              Grf2Fst2Options,\
                              LocateOptions,\
                              NormalizeOptions,\
                              SortTxtOptions,\
                              TokenizeOptions,\
                              Txt2TFstOptions
    
    from unitex.io import exists
    
    _LOGGER = logging.getLogger(__name__)
    
    def check_dic(dictionary, dtype, alphabet, **kwargs):
        """
        This function checks the format of <dela> and produces a file named
        CHECK_DIC.TXT that contains check result information. This file is
        stored in the <dela> directory.

        Positional arguments:
            dictionary [str] -- the dictionary file path.

            dtype [str] -- the dictionary type:
                - UnitexConstants.DELAF (inflected);
                - UnitexConstants.DELAS (non inflected).
                Any other value adds no type flag to the command.

            alphabet [str] -- the alphabet file path.

        Keyword arguments:
            strict [bool] -- strict syntax checking against unprotected dot
                and comma (default: False).

            no_space_warning [bool] -- tolerates spaces in grammatical,
                semantic and inflectional codes (default: True).

        Return [bool]:
            True if it succeeds and False otherwise.

        Raises:
            UnitexException -- if the dictionary file does not exist.
        """
        options = CheckDicOptions()
        options.load(kwargs)

        if exists(dictionary) is False:
            raise UnitexException("[CHECKDIC] Dictionary file '%s' doesn't exists" % dictionary)

        command = ["UnitexTool", "CheckDic"]

        if dtype == UnitexConstants.DELAF:
            command.append("--delaf")
        elif dtype == UnitexConstants.DELAS:
            command.append("--delas")

        if options["strict"] is True:
            command.append("--strict")
        if options["no_space_warning"] is True:
            command.append("--no_space_warning")

        command.append("--alphabet=%s" % alphabet)

        command.append(dictionary)

        command.append("-qutf8-no-bom")

        # The tool is invoked with the whole command line as a single string.
        command = " ".join(command)

        _LOGGER.info("Checking dic '%s'", dictionary)
        _LOGGER.debug("Command: %s", command)

        # Actually run the command: without this the function only logged the
        # command line and implicitly returned None.
        ret = unitex_tool(command)

        return ret
    
    def compress(dictionary, **kwargs):
        """
        This function takes a DELAF dictionary as a parameter and compresses
        it. The compression of a dictionary dico.dic produces two files:

            - dico.bin: a binary file containing the minimum automaton of
                the inflected forms of the dictionary;

            - dico.inf: a text file containing the compressed forms required
                for the reconstruction of the dictionary lines from the
                inflected forms contained in the automaton.

        Positional arguments:
            dictionary [str] -- the dictionary file path.

        Keyword arguments:
            output [str] -- sets the output file. By default, a file xxx.dic
                will produce a file xxx.bin.

            flip [bool] -- indicates that the inflected and canonical forms
                should be swapped in the compressed dictionary. This option
                is used to construct an inverse dictionary which is
                necessary for the program 'Reconstrucao' (default: False).

            semitic [bool] -- indicates that the semitic compression
                algorithm should be used. Setting this option with semitic
                languages like Arabic significantly reduces the size of the
                output dictionary (default: False).

            version [str] -- 'v1': produces an old style .bin file;
                             'v2': produces a new style .bin file, with no
                                   file size limitation to 16 Mb and a
                                   smaller size (default).

        Return [bool]:
            True if it succeeds and False otherwise.

        Raises:
            UnitexException -- if the dictionary file does not exist.
        """
        options = CompressOptions()
        options.load(kwargs)

        if exists(dictionary) is False:
            raise UnitexException("[COMPRESS] Dictionary file '%s' doesn't exists" % dictionary)

        command = ["UnitexTool", "Compress"]

        if options["output"] is not None:
            command.append("--output=%s" % options["output"])
        if options["flip"] is True:
            command.append("--flip")
        if options["semitic"] is True:
            command.append("--semitic")

        if options["version"] == UnitexConstants.DICTIONARY_VERSION_1:
            command.append("--v1")
        elif options["version"] == UnitexConstants.DICTIONARY_VERSION_2:
            command.append("--v2")

        command.append(dictionary)

        command.append("-qutf8-no-bom")

        # The tool is invoked with the whole command line as a single string.
        command = " ".join(command)

        _LOGGER.info("Compressing dic '%s'", dictionary)
        _LOGGER.debug("Command: %s", command)

        # Actually run the command: without this the function only logged the
        # command line and implicitly returned None.
        ret = unitex_tool(command)

        return ret
    
    def concord(index, alphabet, **kwargs):
        """
        This function takes a concordance index file produced by the
        function 'locate' and produces a concordance. It is also possible to
        produce a modified text version taking into account the transducer
        outputs associated to the occurrences.

        The result of the application of this function is a file called
        concord.txt if the concordance was constructed in text mode, a file
        called concord.html if 'format' is UnitexConstants.FORMAT_HTML,
        UnitexConstants.FORMAT_GLOSSANET or UnitexConstants.FORMAT_SCRIPT,
        and a text file with the name defined by the user of the function if
        the function has constructed a modified version of the text.

        In --html mode, the occurrence is coded as a hypertext link. The
        reference associated to this link is of the form <a href="X Y Z">.
        X and Y represent the beginning and ending positions of the
        occurrence in characters in the file text_name.snt. Z represents the
        number of the sentence in which the occurrence was found.

        Positional arguments:
            index [str] -- the index file path (produced by the 'locate'
                function).

            alphabet [str] -- alphabet file used for sorting.

        Keyword arguments:

          - Generic options:

                font [str] -- the name of the font to use if the output is
                    an HTML file.

                fontsize [int] -- the font size to use if the output is an
                    HTML file.

                only_ambiguous [bool] -- Only displays identical occurrences
                    with ambiguous outputs, in text order (default: False).

                only_matches [bool] -- this option will force empty right
                    and left contexts. Moreover, if used with
                    UnitexConstants.FORMAT_TEXT, the function will not
                    surround matches with tabulations (default: False).

                left [str] -- number of characters on the left of the
                    occurrences (default=0). In Thai mode, this means the
                    number of non-diacritic characters.

                right [str] -- number of characters (non-diacritic ones in
                    Thai mode) on the right of the occurrences (default=0).
                    If the occurrence is shorter than this value, the
                    concordance line is completed up to right. If the
                    occurrence is longer than the length defined by right,
                    it is nevertheless saved as whole.

                NOTE: For both --left and --right, you can add the 's'
                character to stop at the first {S} tag. For instance, if you
                set '40s' for the left value, the left context will end at
                40 characters at most, less if the {S} tag is found before.

          - Sort options:

                sort [str] -- specifies the sort order. Possible values:

                    - 'UnitexConstants.SORT_TEXT_ORDER': order in which the
                        occurrences appear in the text (default);

                    - 'UnitexConstants.SORT_LEFT_CENTER': left context for
                        primary sort, then occurrence for secondary sort;

                    - 'UnitexConstants.SORT_LEFT_RIGHT': left context, then
                        right context;

                    - 'UnitexConstants.SORT_CENTER_LEFT': occurrence, then
                        left context;

                    - 'UnitexConstants.SORT_CENTER_RIGHT': occurrence, then
                        right context;

                    - 'UnitexConstants.SORT_RIGHT_LEFT': right context, then
                        left context;

                    - 'UnitexConstants.SORT_RIGHT_CENTER': right context,
                        then occurrence.

          - Output options:

                format [str] -- specifies the output format. Possible values:

                    - UnitexConstants.FORMAT_HTML: produces a concordance in
                        HTML format encoded in UTF-8 (default);

                    - UnitexConstants.FORMAT_TEXT: produces a concordance in
                        Unicode text format;

                    - UnitexConstants.FORMAT_GLOSSANET: produces a
                        concordance for GlossaNet in HTML format where
                        occurrences are links described by the 'script'
                        argument (cf. Unitex manual p. 268). The HTML file
                        is encoded in UTF-8;

                    - UnitexConstants.FORMAT_SCRIPT: produces a HTML
                        concordance file where occurrences are links
                        described by the 'script' argument;

                    - UnitexConstants.FORMAT_INDEX: produces an index of the
                        concordance, made of the content of the occurrences
                        (with the grammar outputs, if any), preceded by the
                        positions of the occurrences in the text file given
                        in characters;

                    - UnitexConstants.FORMAT_UIMA: produces an index of the
                        concordance relative to the original text file,
                        before any Unitex operation. The 'offsets' argument
                        must be provided;

                    - UnitexConstants.FORMAT_PRLG: produces a concordance
                        for PRLG corpora where each line is prefixed by
                        information extracted with Unxmlize's 'prlg' option.
                        You must provide both the 'offsets' and the
                        'unxmlize' argument;

                    - UnitexConstants.FORMAT_XML: produces an xml index of
                        the concordance;

                    - UnitexConstants.FORMAT_XML_WITH_HEADERS: produces an
                        xml index of the concordance with full xml header;

                    - UnitexConstants.FORMAT_AXIS: quite the same as
                        'index', but the numbers represent the median
                        character of each occurrence;

                    - UnitexConstants.FORMAT_XALIGN: another index file,
                        used by the text alignment module. Each line is made
                        of 3 integers X Y Z followed by the content of the
                        occurrence. X is the sentence number, starting from
                        1. Y and Z are the starting and ending positions of
                        the occurrence in the sentence, given in characters;

                    - UnitexConstants.FORMAT_MERGE: indicates to the
                        function that it is supposed to produce a modified
                        version of the text and save it in a file.
                        The filename must be provided with the 'output'
                        argument.

                script [str] -- string describing the links format for
                    'glossanet' and 'script' output. For instance, if you
                    use 'http://www.google.com/search?q=', you will obtain a
                    HTML concordance file where occurrences are hyperlinks
                    to Google queries.

                offsets [str] -- the file produced by Tokenize's
                    output_offsets option (needed by the 'uima' and the
                    'prlg' format).

                unxmlize [str] -- file produced by Unxmlize's 'prlg' option
                    (needed by the 'prlg' format).

                output [str] -- the output filename (needed by the 'merge'
                    format).

          - Other options:

                directory [str] -- indicates to the function that it must
                    not work in the same directory as <index> but in
                    'directory'.

                thai [bool] -- option to use for Thai concordances
                    (default: False).

        Return [bool]:
            True if it succeeds and False otherwise.

        Raises:
            UnitexException -- if the index file or the alphabet file does
                not exist.
        """
        options = ConcordOptions()
        options.load(kwargs)

        if exists(index) is False:
            raise UnitexException("[CONCORD] Index file '%s' doesn't exists" % index)
        if exists(alphabet) is False:
            raise UnitexException("[CONCORD] Alphabet file '%s' doesn't exists" % alphabet)

        command = ["UnitexTool", "Concord"]

        if options["font"] is not None:
            command.append("--font=%s" % options["font"])
        if options["fontsize"] is not None:
            command.append("--fontsize=%s" % options["fontsize"])
        if options["only_ambiguous"] is True:
            command.append("--only_ambiguous")
        if options["only_matches"] is True:
            command.append("--only_matches")

        command.append("--left=%s" % options["left"])
        command.append("--right=%s" % options["right"])

        if options["sort"] == UnitexConstants.SORT_TEXT_ORDER:
            command.append("--TO")
        elif options["sort"] == UnitexConstants.SORT_LEFT_CENTER:
            command.append("--LC")
        elif options["sort"] == UnitexConstants.SORT_LEFT_RIGHT:
            command.append("--LR")
        elif options["sort"] == UnitexConstants.SORT_CENTER_LEFT:
            command.append("--CL")
        elif options["sort"] == UnitexConstants.SORT_CENTER_RIGHT:
            command.append("--CR")
        elif options["sort"] == UnitexConstants.SORT_RIGHT_LEFT:
            command.append("--RL")
        elif options["sort"] == UnitexConstants.SORT_RIGHT_CENTER:
            # This branch previously had no body, so the sort order was
            # never passed to the tool (and the block did not even parse).
            command.append("--RC")

        if options["format"] == UnitexConstants.FORMAT_HTML:
            command.append("--html")
        elif options["format"] == UnitexConstants.FORMAT_TEXT:
            command.append("--text")
        elif options["format"] == UnitexConstants.FORMAT_GLOSSANET:
            command.append("--glossanet=%s" % options["script"])
        elif options["format"] == UnitexConstants.FORMAT_SCRIPT:
            command.append("--script=%s" % options["script"])
        elif options["format"] == UnitexConstants.FORMAT_INDEX:
            command.append("--index")
        elif options["format"] == UnitexConstants.FORMAT_UIMA:
            command.append("--uima=%s" % options["offsets"])
        elif options["format"] == UnitexConstants.FORMAT_PRLG:
            # Both values must be passed as one tuple to the % operator;
            # without the parentheses the formatting raised a TypeError.
            command.append("--PRLG=%s,%s" % (options["unxmlize"], options["offsets"]))
        elif options["format"] == UnitexConstants.FORMAT_XML:
            command.append("--xml")
        elif options["format"] == UnitexConstants.FORMAT_XML_WITH_HEADERS:
            command.append("--xml-with-header")
        elif options["format"] == UnitexConstants.FORMAT_AXIS:
            command.append("--axis")
        elif options["format"] == UnitexConstants.FORMAT_XALIGN:
            command.append("--xalign")
        elif options["format"] == UnitexConstants.FORMAT_MERGE:
            command.append("--merge=%s" % options["output"])

        if options["directory"] is not None:
            command.append("--directory=%s" % options["directory"])

        command.append("--alphabet=%s" % alphabet)

        # Thai mode is opt-in; the previous 'is not True' test enabled it for
        # every non-Thai concordance.
        if options["thai"] is True:
            command.append("--thai")

        command.append(index)

        command.append("-qutf8-no-bom")

        # The tool is invoked with the whole command line as a single string.
        command = " ".join(command)

        _LOGGER.info("Create concordance for '%s'", index)
        _LOGGER.debug("Command: %s", command)

        # Actually run the command: without this the function only logged the
        # command line and implicitly returned None.
        ret = unitex_tool(command)

        return ret
    
    def dico(dictionaries, text, alphabet, **kwargs):
        """
        This function applies dictionaries to a text. The text must have
        been cut up into lexical units by the 'tokenize' function.

        The function 'dico' produces the following files, and saves them in
        the directory of the text:

            - dlf: dictionary of simple words in the text;
            - dlc: dictionary of compound words in the text;
            - err: list of unknown words in the text;
            - tags_err: unrecognized simple words that are not matched by
                        the tags.ind file;
            - tags.ind: sequences to be inserted in the text automaton (see
                        section 3.8.3, page 69 of the Unitex manual);
            - stat_dic.n: file containing the number of simple words, the
                          number of compound words, and the number of
                          unknown words in the text.

        NOTE: Files dlf, dlc, err and tags_err are not sorted. Use the
        function 'sort_txt' to sort them.

        Positional arguments:
            dictionaries [list(str)] -- list of dictionary paths ('bin' or
                'fst2' formats).

            text [str] -- text (snt format) file path.

            alphabet [str] -- alphabet file path.

        Keyword arguments:
            morpho [list(str)] -- this optional argument indicates which
                morphological mode dictionaries are to be used, if needed by
                some .fst2 dictionaries. The argument is a list of
                dictionary paths (bin format).

            korean [bool] -- specify the dictionary is in korean
                (default: False).

            semitic [bool] -- specify the dictionary is in a semitic
                language (default: False).

            arabic_rules [str] -- specifies the Arabic typographic rule
                configuration file path.

            raw [str] -- alternative output file path containing both simple
                and compound words, without requiring a text directory.

        Return [bool]:
            True if it succeeds and False otherwise.

        Raises:
            UnitexException -- if one of the dictionaries, the text file or
                the alphabet file does not exist.
        """
        options = DicoOptions()
        options.load(kwargs)

        for dictionary in dictionaries:
            if exists(dictionary) is False:
                raise UnitexException("[DICO] Dictionary file '%s' doesn't exists" % dictionary)

        if exists(text) is False:
            raise UnitexException("[DICO] Text file '%s' doesn't exists" % text)
        if exists(alphabet) is False:
            raise UnitexException("[DICO] Alphabet file '%s' doesn't exists" % alphabet)

        command = ["UnitexTool", "Dico"]

        command.append("--text=%s" % text)
        command.append("--alphabet=%s" % alphabet)

        if options["morpho"] is not None:
            command.append("--morpho=%s" % ",".join(options["morpho"]))
        if options["korean"] is True:
            command.append("--korean")
        if options["semitic"] is True:
            command.append("--semitic")
        if options["arabic_rules"] is not None:
            command.append("--arabic_rules=%s" % options["arabic_rules"])
        if options["raw"] is not None:
            # The value lives in the options mapping; the bare name 'raw' was
            # never defined and raised a NameError.
            command.append("--raw=%s" % options["raw"])

        command.extend(dictionaries)

        command.append("-qutf8-no-bom")

        # The tool is invoked with the whole command line as a single string.
        command = " ".join(command)

        _LOGGER.info("Applying dictionaries")
        _LOGGER.debug("Command: %s", command)

        # Actually run the command: without this the function only logged the
        # command line and implicitly returned None.
        ret = unitex_tool(command)

        return ret
    
    def extract(text, output, index, **kwargs):
        """
        This function extracts from the given text all sentences that
        contain at least one occurrence from the concordance. The parameter
        <text> represents the complete path of the text file, without
        omitting the extension .snt.

        Positional arguments:
            text [str] -- the text file (.snt format).

            output [str] -- the output text file.

            index [str] -- the index file path (produced by the 'locate'
                function).

        Keyword arguments:
            non_matching_sentences [bool] -- extracts all sentences that
                don't contain matching units (default: False).

        Return [bool]:
            True if it succeeds and False otherwise.

        Raises:
            UnitexException -- if the text file or the index file does not
                exist.
        """
        options = ExtractOptions()
        options.load(kwargs)

        if exists(text) is False:
            raise UnitexException("[EXTRACT] Text file '%s' doesn't exists" % text)
        if exists(index) is False:
            raise UnitexException("[EXTRACT] Index file '%s' doesn't exists" % index)

        command = ["UnitexTool", "Extract"]

        # '--yes' keeps the sentences that contain a match, '--no' keeps the
        # sentences that do not.
        if options["non_matching_sentences"] is False:
            command.append("--yes")
        else:
            command.append("--no")

        command.append("--output=%s" % output)
        command.append("--index=%s" % index)

        command.append(text)

        command.append("-qutf8-no-bom")

        # The tool is invoked with the whole command line as a single string.
        command = " ".join(command)

        _LOGGER.info("Extracting sentences")
        _LOGGER.debug("Command: %s", command)

        ret = unitex_tool(command)

        return ret
    
    
    
    def fst2txt(grammar, text, alphabet, **kwargs):
        """
        This function applies a transducer to a text in longest match mode
        at the preprocessing stage, when the text has not been cut into
        lexical units yet.

        NOTE: This function modifies the input text file.

        Positional arguments:
            grammar [str] -- the fst2 to apply on the text.

            text [str] -- the (.snt) text file to be modified.

            alphabet [str] -- the alphabet file of the language of the text.

        Keyword arguments:
            start_on_space [bool] -- this parameter indicates that the
                search will start at any position in the text, even before a
                space. This parameter should only be used to carry out
                morphological searches (default: False).

            char_by_char [bool] -- works in character by character
                tokenization mode. This is useful for languages like Thai
                (default: False).

            merge [bool] -- merge (instead of replace) transducer outputs
                with text inputs (default: True).

        Return [bool]:
            True if it succeeds and False otherwise.

        Raises:
            UnitexException -- if the grammar, the text file or the alphabet
                file does not exist.
        """
        options = Fst2TxtOptions()
        options.load(kwargs)

        if exists(grammar) is False:
            raise UnitexException("[FST2TXT] Grammar file '%s' doesn't exists" % grammar)
        if exists(text) is False:
            raise UnitexException("[FST2TXT] Text file '%s' doesn't exists" % text)
        if exists(alphabet) is False:
            raise UnitexException("[FST2TXT] Alphabet file '%s' doesn't exists" % alphabet)

        command = ["UnitexTool", "Fst2Txt"]

        command.append("--text=%s" % text)
        command.append("--alphabet=%s" % alphabet)

        # Each of the three switches below is always emitted explicitly, one
        # way or the other.
        if options["start_on_space"] is False:
            command.append("--dont_start_on_space")
        else:
            command.append("--start_on_space")

        if options["char_by_char"] is False:
            command.append("--word_by_word")
        else:
            command.append("--char_by_char")

        if options["merge"] is True:
            command.append("--merge")
        else:
            command.append("--replace")

        command.append(grammar)

        command.append("-qutf8-no-bom")

        # The tool is invoked with the whole command line as a single string.
        command = " ".join(command)

        _LOGGER.info("Applying grammar '%s'...", grammar)
        _LOGGER.debug("Command: %s", command)

        # Actually run the command: without this the function only logged the
        # command line and implicitly returned None.
        ret = unitex_tool(command)

        return ret
    
    def grf2fst2(grammar, alphabet, **kwargs):
        """
        This function compiles a grammar into a .fst2 file (for more details
        see section 6.2 of the Unitex manual). The parameter <grf> denotes
        the complete path of the main graph of the grammar, without omitting
        the extension .grf.

        The result is a file with the same name as the graph passed to the
        function as a parameter, but with extension .fst2. This file is
        saved in the same directory as <grf>.

        Positional arguments:
            grammar [str] -- the grf to compile.

            alphabet [str] -- specifies the alphabet file to be used for
                tokenizing the content of the grammar boxes into lexical
                units.

        Keyword arguments:
            loop_check [bool] -- enables error (loop) checking
                (default: False).

            char_by_char [bool] -- tokenization will be done character by
                character. If neither -c nor -a option is used, lexical
                units will be sequences of any Unicode letters
                (default: False).

            pkgdir [str] -- specifies the repository directory to use (see
                section 5.2.2, page 99 of the Unitex manual).

            no_empty_graph_warning [bool] -- no warning will be emitted when
                a graph matches the empty word. This option is used by
                MultiFlex in order not to scare users with meaningless error
                messages when they design an inflection grammar that matches
                the empty word (default: False).

            tfst_check [bool] -- checks whether the given graph can be
                considered as a valid sentence automaton or not
                (default: False).

            silent_grf_name [bool] -- does not print the graph names
                (default: True).

            named_repositories [list(str)] -- declaration of named
                repositories. This argument is a list of one or more X=Y
                sequences (joined with ';' on the command line), where X is
                the name of the repository denoted by pathname Y.

            debug [bool] -- compile graphs in debug mode (default: False).

            check_variables [bool] -- check output validity to avoid
                malformed variable expressions (default: True).

        Return [bool]:
            True if it succeeds and False otherwise.

        Raises:
            UnitexException -- if the grammar file or the alphabet file does
                not exist.
        """
        options = Grf2Fst2Options()
        options.load(kwargs)

        if exists(grammar) is False:
            raise UnitexException("[GRF2FST2] Grammar file '%s' doesn't exists" % grammar)
        if exists(alphabet) is False:
            raise UnitexException("[GRF2FST2] Alphabet file '%s' doesn't exists" % alphabet)

        command = ["UnitexTool", "Grf2Fst2"]

        # Loop checking is always stated explicitly, one way or the other.
        if options["loop_check"] is False:
            command.append("--no_loop_check")
        else:
            command.append("--loop_check")

        command.append("--alphabet=%s" % alphabet)

        if options["char_by_char"] is True:
            command.append("--char_by_char")
        if options["pkgdir"] is not None:
            command.append("--pkgdir=%s" % options["pkgdir"])
        if options["no_empty_graph_warning"] is True:
            command.append("--no_empty_graph_warning")
        if options["tfst_check"] is True:
            command.append("--tfst_check")
        if options["silent_grf_name"] is True:
            command.append("--silent_grf_name")
        if options["named_repositories"] is not None:
            command.append("--named_repositories=%s" % ";".join(options["named_repositories"]))
        if options["debug"] is True:
            command.append("--debug")
        if options["check_variables"] is True:
            command.append("--check_variables")

        command.append(grammar)

        command.append("-qutf8-no-bom")

        # The tool is invoked with the whole command line as a single string.
        command = " ".join(command)

        _LOGGER.info("Compiling grammar '%s'...", grammar)
        _LOGGER.debug("Command: %s", command)

        # Actually run the command: without this the function only logged the
        # command line and implicitly returned None.
        ret = unitex_tool(command)

        return ret
    
    def locate(grammar, text, alphabet, **kwargs):
        """
        This function applies a grammar to a text and constructs an index of
        the occurrences found.

        This function saves the references to the found occurrences in a
        file called concord.ind. The number of occurrences, the number of
        units belonging to those occurrences, as well as the percentage of
        recognized units within the text are saved in a file called
        concord.n. These two files are stored in the directory of the text.

        Arguments:
            grammar [str] -- the fst2 to apply on the text.

            text [str] -- the text file, with extension .snt.

            alphabet [str] -- the alphabet file of the language of the text.

        Keyword arguments:
          - Generic options:
                start_on_space [bool] -- this parameter indicates that the
                    search will start at any position in the text, even
                    before a space. This parameter should only be used to
                    carry out morphological searches (default: False).

                char_by_char [bool] -- works in character by character
                    tokenization mode. This is useful for languages like
                    Thai (default: False).

                morpho [list(str)] -- this optional argument indicates which
                    morphological mode dictionaries are to be used, if
                    needed by some .fst2 dictionaries. The argument is a
                    list of dictionary paths (bin format).

                korean [bool] -- specifies that the dictionary is in Korean
                    (default: False).

                arabic_rules [str] -- specifies the Arabic typographic rule
                    configuration file path.

                sntdir [str] -- puts produced files in 'sntdir' instead of
                    the text directory. Note that 'sntdir' must end with a
                    file separator ('\\' or '/').

                negation_operator [str] -- specifies the negation operator
                    to be used in Locate patterns. The two legal values are
                    'minus' and 'tilde' (default). Using 'minus' provides
                    backward compatibility with previous versions of Unitex.

                number_of_matches [int] -- stops after the first N matches
                    (default: all matches).

          - Maximum iterations per token options:
                stop_token_count [list(int_1, int_2)] -- emits a warning
                    after 'int_1' iterations on a token and stops after
                    'int_2' iterations. If 'int_1' is None, only the stop
                    threshold 'int_2' is passed to the tool.

          - Matching mode options:
                match_mode [str] -- Possible values are:
                    - UnitexConstants.MATCH_MODE_SHORTEST
                    - UnitexConstants.MATCH_MODE_LONGEST (default)
                    - UnitexConstants.MATCH_MODE_ALL

          - Output options:
                output_mode [str] -- Possible values are:
                    - UnitexConstants.OUTPUT_MODE_IGNORE (default)
                    - UnitexConstants.OUTPUT_MODE_MERGE
                    - UnitexConstants.OUTPUT_MODE_REPLACE

                protect_dic_chars [bool] -- when 'merge' or 'replace' mode
                    is used, this option protects some input characters with
                    a backslash. This is useful when Locate is called by
                    'dico' in order to avoid producing bad lines like:
                    3,14,.PI.NUM (default: True).

                variable [list(str_1, str_2)] -- sets an output variable
                    named str_1 with content str_2. Note that str_2 must be
                    ASCII.

          - Ambiguous output options:
                ambiguous_outputs [bool] -- allows the production of several
                    matches with same input but different outputs. If False,
                    in case of ambiguous outputs, one will be arbitrarily
                    chosen and kept, depending on the internal state of the
                    function (default: True).

                variable_error [str] -- Possible values are:
                    - UnitexConstants.ON_ERROR_EXIT
                    - UnitexConstants.ON_ERROR_IGNORE (default)
                    - UnitexConstants.ON_ERROR_BACKTRACK

        Return [bool]:
            True if it succeeds and False otherwise.

        Raises:
            UnitexException -- if the grammar, text or alphabet file does
                not exist.
        """
        options = LocateOptions()
        options.load(kwargs)

        # All three input files must be present before the tool is invoked.
        for label, path in (("Grammar", grammar),
                            ("Text", text),
                            ("Alphabet", alphabet)):
            if exists(path) is False:
                raise UnitexException("[LOCATE] %s file '%s' doesn't exists" % (label, path))

        command = ["UnitexTool", "Locate"]

        command.append("--text=%s" % text)
        command.append("--alphabet=%s" % alphabet)

        if options["morpho"] is not None:
            command.append("--morpho=%s" % ",".join(options["morpho"]))

        if options["start_on_space"] is False:
            command.append("--dont_start_on_space")
        else:
            command.append("--start_on_space")

        if options["char_by_char"] is False:
            command.append("--word_by_word")
        else:
            command.append("--char_by_char")

        if options["sntdir"] is not None:
            command.append("--sntdir=%s" % options["sntdir"])

        if options["korean"] is True:
            command.append("--korean")

        if options["arabic_rules"] is not None:
            command.append("--arabic_rules=%s" % options["arabic_rules"])

        if options["negation_operator"] is not None:
            command.append("--negation_operator=%s" % options["negation_operator"])

        if options["number_of_matches"] is None:
            command.append("--all")
        else:
            command.append("--number_of_matches=%s" % options["number_of_matches"])

        # The thresholds live inside the 'stop_token_count' option value;
        # index that value rather than looking up a composite key.
        stop_token_count = options["stop_token_count"]
        if stop_token_count is not None:
            if stop_token_count[0] is None:
                command.append("--stop_token_count=%s" % stop_token_count[1])
            else:
                command.append("--stop_token_count=%s,%s" % (stop_token_count[0], stop_token_count[1]))

        match_mode = options["match_mode"]
        if match_mode == UnitexConstants.MATCH_MODE_LONGEST:
            command.append("--longest_matches")
        elif match_mode == UnitexConstants.MATCH_MODE_SHORTEST:
            command.append("--shortest_matches")
        elif match_mode == UnitexConstants.MATCH_MODE_ALL:
            command.append("--all_matches")

        output_mode = options["output_mode"]
        if output_mode == UnitexConstants.OUTPUT_MODE_IGNORE:
            command.append("--ignore")
        elif output_mode == UnitexConstants.OUTPUT_MODE_MERGE:
            command.append("--merge")
        elif output_mode == UnitexConstants.OUTPUT_MODE_REPLACE:
            command.append("--replace")

        if options["protect_dic_chars"] is True:
            command.append("--protect_dic_chars")

        if options["variable"] is not None:
            command.append("--variable=%s=%s" % (options["variable"][0], options["variable"][1]))

        if options["ambiguous_outputs"] is True:
            command.append("--ambiguous_outputs")
        else:
            command.append("--no_ambiguous_outputs")

        variable_error = options["variable_error"]
        if variable_error == UnitexConstants.ON_ERROR_IGNORE:
            command.append("--ignore_variable_error")
        elif variable_error == UnitexConstants.ON_ERROR_EXIT:
            command.append("--exit_on_variable_error")
        elif variable_error == UnitexConstants.ON_ERROR_BACKTRACK:
            command.append("--backtrack_on_variable_error")

        # The grammar is a positional argument and goes after the options.
        command.append(grammar)

        command.append("-qutf8-no-bom")

        command = " ".join(command)

        _LOGGER.info("Locating pattern '%s'...", grammar)
        _LOGGER.debug("Command: %s", command)

        # Run the tool and hand its status back to the caller, as the
        # documented return value requires.
        ret = unitex_tool(command)

        return ret
    
    def normalize(text, **kwargs):
        """
        This function carries out a normalization of text separators. The
        separators are space, tab, and newline. Every sequence of separators
        that contains at least one newline is replaced by a unique newline.
        All other sequences of separators are replaced by a single space.

        This function also checks the syntax of lexical tags found in the
        text. All sequences in curly brackets should be either the sentence
        delimiter {S}, the stop marker {STOP}, or valid entries in the DELAF
        format (e.g. {aujourd'hui,.ADV}).

        NOTE: the function creates a modified version of the text that is
              saved in a file with extension .snt.

        WARNING: if you specify a normalization rule file, its rules will be
                 applied prior to anything else. So, you have to be very
                 careful if you manipulate separators in such rules.

        Arguments:
            text [str] -- the text file to normalize.

        Keyword arguments:
            no_carriage_return [bool] -- every separator sequence will be
                turned into a single space (default: False).

            input_offsets [str] -- base offset file to be used.

            output_offsets [str] -- offset file to be produced.

            replacement_rules [str] -- specifies the normalization rule file
                to be used. See section 14.13.6 of the Unitex manual for
                details about the format of this file. By default, the
                function only replaces { and } by [ and ].

            no_separator_normalization [bool] -- only applies replacement
                rules specified with the 'replacement_rules' option
                (default: False).

        Return [bool]:
            True if it succeeds and False otherwise.

        Raises:
            UnitexException -- if the text file does not exist.
        """
        options = NormalizeOptions()
        options.load(kwargs)

        if exists(text) is False:
            raise UnitexException("[NORMALIZE] Text file '%s' doesn't exists" % text)

        command = ["UnitexTool", "Normalize"]

        if options["no_carriage_return"] is True:
            command.append("--no_carriage_return")

        # Options that carry a value are only emitted when they were set.
        for name in ("input_offsets", "output_offsets", "replacement_rules"):
            value = options[name]
            if value is not None:
                command.append("--%s=%s" % (name, value))

        if options["no_separator_normalization"] is True:
            command.append("--no_separator_normalization")

        # The text is a positional argument and goes after the options.
        command.append(text)

        command.append("-qutf8-no-bom")

        command = " ".join(command)

        _LOGGER.info("Normalizing text '%s'...", text)
        _LOGGER.debug("Command: %s", command)

        # Run the tool and hand its status back to the caller, as the
        # documented return value requires.
        ret = unitex_tool(command)

        return ret
    
    def sort_txt(text, **kwargs):
    
    Patrick Watrin's avatar
    Patrick Watrin a validé
        This function carries out a lexicographical sorting of the lines of
        file <txt>. <txt> represents the complete path of the file to be
        sorted.
    
    Patrick Watrin's avatar
    Patrick Watrin a validé
        The input text file is modified. By default, the sorting is