diff --git a/unitex/tools.py b/unitex/tools.py
index 82c5df758e5635a75349009daf532a63d2a07f08..e2104267f9e0e9ad28500ee098dfadbc4c5b8e82 100644
--- a/unitex/tools.py
+++ b/unitex/tools.py
@@ -1,5 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
+# NOTE: The documentation for each function is adapted from the
+# Unitex manual.
 
 import logging
 
@@ -117,7 +119,7 @@ def compress(dictionary, **kwargs):
             smaller size (default).
 
     Return [bool]:
-        The function return 'True' if it succeeds and 'False' otherwise.
+        True if it succeeds and False otherwise.
     """
     options = CompressOptions()
     options.load(kwargs)
@@ -154,101 +156,172 @@ def compress(dictionary, **kwargs):
 
 def concord(index, alphabet, **kwargs):
     """
-    This function takes a concordance index file produced by the function Locate and
-    produces a concordance. It is also possible to produce a modified text version taking
-    into account the transducer outputs associated to the occurrences.
-
-    The result of the application of this function is a file called concord.txt if the concordance
-    was constructed in text mode, a file called concord.html if the output
-    mode was --html, --glossanet or --script, and a text file with the name de-
-    fined by the user of the function if the function has constructed a modified version
-    of the text.
-
-    In --html mode, the occurrence is coded as a hypertext link. The reference associated
-    to this link is of the form <a href="X Y Z">. X et Y represent the beginning
-    and ending positions of the occurrence in characters in the file text_name.snt. Z
-    represents the number of the sentence in which the occurrence was found.
+    This function takes a concordance index file produced by the
+    function 'locate' and produces a concordance. It is also possible to
+    produce a modified text version taking into account the transducer
+    outputs associated with the occurrences.
+
+    The result of the application of this function is a file called
+    concord.txt if the concordance was constructed in text mode, a file
+    called concord.html if 'format' is UnitexConstants.FORMAT_HTML,
+    UnitexConstants.FORMAT_GLOSSANET or UnitexConstants.FORMAT_SCRIPT,
+    and a text file with the name defined by the user of the function if
+    the function has constructed a modified version of the text.
+
+    In HTML mode, the occurrence is coded as a hypertext link. The
+    reference associated with this link is of the form <a href="X Y Z">.
+    X and Y represent the beginning and ending positions of the
+    occurrence in characters in the file text_name.snt. Z represents the
+    number of the sentence in which the occurrence was found.
 
     Arguments:
-        index [str] -- the index file path (produced by the 'locate' function)
-        alphabet [str] -- alphabet file used for sorting
+        index [str] -- the index file path (produced by the 'locate'
+            function).
+
+        alphabet [str] -- alphabet file used for sorting.
 
     Keyword arguments:
-      Generic options:
-        font [str] -- the name of the font to use if the output is an HTML
-            file.
-        fontsize [int] -- the font size to use if the output is an HTML file.
-        only_ambiguous [bool] -- Only displays identical occurrences with ambiguous
-            outputs, in text order (default: False)
-        only_matches [bool] -- this option will force empty right and left contexts. Moreover,
-            if used with -t/–text, Concord will not surround matches with
-            tabulations (default: False)
-        left [str] -- number of characters on the left of the occurrences (default=0).
-            In Thai mode, this means the number of non-diacritic characters.
-        right [str] -- number of characters (non-diacritic ones in Thai mode) on
-            the right of the occurrences (default=0). If the occurrence is
-            shorter than this value, the concordance line is completed up to
-            right. If the occurrence is longer than the length defined by
-            right, it is nevertheless saved as whole.
-
-        NOTE: For both --left and --right, you can add the s character to stop at
-            the first {S} tag. For instance, if you set 40s for the left value, the left context
-            will end at 40 characters at most, less if the {S} tag is found before.
+        font [str] -- the name of the font to use if the output is
+            an HTML file.
+
+        fontsize [int] -- the font size to use if the output is an
+            HTML file.
+
+        only_ambiguous [bool] -- only displays identical occurrences
+            with ambiguous outputs, in text order (default: False).
+
+        only_matches [bool] -- this option will force empty right
+            and left contexts. Moreover, if used with
+            UnitexConstants.FORMAT_TEXT, the function will not
+            surround matches with tabulations (default: False).
+
+        left [str] -- number of characters on the left of the
+            occurrences (default=0). In Thai mode, this means the
+            number of non-diacritic characters.
+
+        right [str] -- number of characters (non-diacritic ones in
+            Thai mode) on the right of the occurrences (default=0).
+            If the occurrence is shorter than this value, the
+            concordance line is completed up to right. If the
+            occurrence is longer than the length defined by right,
+            it is nevertheless saved as a whole.
+
+        NOTE: For both 'left' and 'right', you can add the 's'
+        character to stop at the first {S} tag. For instance, if you
+        set '40s' for the left value, the left context will end at
+        40 characters at most, less if the {S} tag is found before.
 
-      Sort options:
-        sort [str] -- 'UnitexConstants.SORT_TEXT_ORDER': order in which the occurrences appear in the text (default)
-                      'UnitexConstants.SORT_LEFT_CENTER': left context for primary sort, then occurrence for secondary sort
-                      'UnitexConstants.SORT_LEFT_RIGHT': left context, then right context
-                      'UnitexConstants.SORT_CENTER_LEFT': occurrence, then left context
-                      'UnitexConstants.SORT_CENTER_RIGHT': occurrence, then right context
-                      'UnitexConstants.SORT_RIGHT_LEFT': right context, then left context
-                      'UnitexConstants.SORT_RIGHT_CENTER': left context, then occurrence
+        sort [str] -- specifies the sort order. Possible values:
+
+            - 'UnitexConstants.SORT_TEXT_ORDER': order in which the
+              occurrences appear in the text (default);
+
+            - 'UnitexConstants.SORT_LEFT_CENTER': left context for
+              primary sort, then occurrence for secondary sort;
+
+            - 'UnitexConstants.SORT_LEFT_RIGHT': left context, then
+              right context;
+
+            - 'UnitexConstants.SORT_CENTER_LEFT': occurrence, then
+              left context;
+
+            - 'UnitexConstants.SORT_CENTER_RIGHT': occurrence, then
+              right context;
+
+            - 'UnitexConstants.SORT_RIGHT_LEFT': right context, then
+              left context;
+
+            - 'UnitexConstants.SORT_RIGHT_CENTER': right context,
+              then occurrence.
 
-      Output options:
-        format [str] -- UnitexConstants.FORMAT_HTML: produces a concordance in HTML format encoded in UTF-8 (default)
-                        UnitexConstants.FORMAT_TEXT: produces a concordance in Unicode text format
-                        UnitexConstants.FORMAT_GLOSSANET: produces a concordance for GlossaNet in HTML format where
-                            occurrences are links described by the 'script' argument
-                            (cf. Unitex manual p. 268).
-                            The HTML file is encoded in UTF-8
-                        UnitexConstants.FORMAT_SCRIPT: produces a HTML concordance file where occurrences are links
-                            described by the 'script' argument
-                        UnitexConstants.FORMAT_INDEX: produces an index of the concordance, made of the content of the
-                            occurrences (with the grammar outputs, if any), preceded by the
-                            positions of the occurrences in the text file given in characters
-                        UnitexConstants.FORMAT_UIMA: produces an index of the concordance relative to the original text
-                            file, before any Unitex operation. The 'offsets' argument must be
-                            provided
-                        UnitexConstants.FORMAT_PRLG: produces a concordance for PRLG corpora where each line is prefixed
-                            by information extracted with Unxmlize’s 'prlg' option. You must
-                            provide both the 'offsets' and the 'unxmlize' argument
-                        UnitexConstants.FORMAT_XML: produces an xml index of the concordance
-                        UnitexConstants.FORMAT_XML_WITH_HEADER: produces an xml index of the concordance with full xml header
-                        UnitexConstants.FORMAT_AXIS: quite the same as 'index', but the numbers represent the median
-                            character of each occurrence
-                        UnitexConstants.FORMAT_XALIGN: another index file, used by the text alignment module. Each line is
-                            made of 3 integers X Y Z followed by the content of the occurrence.
-                            X is the sentence number, starting from 1. Y and Z are the starting
-                            and ending positions of the occurrence in the sentence, given in
-                            characters
-                        UnitexConstants.FORMAT_MERGE: indicates to the function that it is supposed to produce a modified
-                            version of the text and save it in a file. The filename must be
-                            provided with the 'output' argument
-        script [str] -- string describing the links format for 'glossanet' and 'script' output. For instance,
-            if you use 'http://www.google.com/search?q=', you will obtain a HTML concordance
-            file where occurrences are hyperlinks to Google queries
-        offsets [str] -- the file produced by Tokenize’s output_offsets option (needed by the 'uima' and the
-            'prlg' format)
-        unxmlize [str] -- file produced by Unxmlize’s 'prlg' option (needed by the 'prlg' format)
-        output [str] -- the output filename (needed by the 'merge' format)
+        format [str] -- specifies the output format. Possible values:
+
+            - UnitexConstants.FORMAT_HTML: produces a concordance in
+              HTML format encoded in UTF-8 (default);
+
+            - UnitexConstants.FORMAT_TEXT: produces a concordance in
+              Unicode text format;
+
+            - UnitexConstants.FORMAT_GLOSSANET: produces a
+              concordance for GlossaNet in HTML format where
+              occurrences are links described by the 'script'
+              argument (cf. Unitex manual p. 268). The HTML file
+              is encoded in UTF-8;
+
+            - UnitexConstants.FORMAT_SCRIPT: produces an HTML
+              concordance file where occurrences are links
+              described by the 'script' argument;
+
+            - UnitexConstants.FORMAT_INDEX: produces an index of the
+              concordance, made of the content of the occurrences
+              (with the grammar outputs, if any), preceded by the
+              positions of the occurrences in the text file given
+              in characters;
+
+            - UnitexConstants.FORMAT_UIMA: produces an index of the
+              concordance relative to the original text file,
+              before any Unitex operation. The 'offsets' argument
+              must be provided;
+
+            - UnitexConstants.FORMAT_PRLG: produces a concordance
+              for PRLG corpora where each line is prefixed by
+              information extracted with Unxmlize’s 'prlg' option.
+              You must provide both the 'offsets' and the
+              'unxmlize' arguments;
+
+            - UnitexConstants.FORMAT_XML: produces an XML index of
+              the concordance;
+
+            - UnitexConstants.FORMAT_XML_WITH_HEADER: produces an
+              XML index of the concordance with a full XML header;
+
+            - UnitexConstants.FORMAT_AXIS: quite the same as
+              'index', but the numbers represent the median
+              character of each occurrence;
+
+            - UnitexConstants.FORMAT_XALIGN: another index file,
+              used by the text alignment module. Each line is made
+              of 3 integers X Y Z followed by the content of the
+              occurrence. X is the sentence number, starting from
+              1. Y and Z are the starting and ending positions of
+              the occurrence in the sentence, given in characters;
+
+            - UnitexConstants.FORMAT_MERGE: indicates to the
+              function that it is supposed to produce a modified
+              version of the text and save it in a file.
+              The filename must be provided with the 'output'
+              argument.
+
+        script [str] -- string describing the links format for
+            'glossanet' and 'script' output. For instance, if you
+            use 'http://www.google.com/search?q=', you will obtain an
+            HTML concordance file where occurrences are hyperlinks
+            to Google queries.
+
+        offsets [str] -- the file produced by Tokenize’s
+            output_offsets option (needed by the 'uima' and the
+            'prlg' formats).
+
+        unxmlize [str] -- file produced by Unxmlize’s 'prlg' option
+            (needed by the 'prlg' format).
+
+        output [str] -- the output filename (needed by the 'merge'
+            format).
 
-      Other options:
-        directory [str] -- indicates to the function that it must not work in the same directory
-            than <index> but in 'directory'
-        thai [bool] -- option to use for Thai concordances (default: False)
+        directory [str] -- indicates to the function that it must
+            not work in the same directory as <index> but in
+            'directory'.
+
+        thai [bool] -- option to use for Thai concordances
+            (default: False).
 
     Return [bool]:
-        The function return 'True' if it succeeds and 'False' otherwise.
+        True if it succeeds and False otherwise.
     """
     options = ConcordOptions()
     options.load(kwargs)
@@ -335,42 +408,54 @@ def concord(index, alphabet, **kwargs):
 
 def dico(dictionaries, text, alphabet, **kwargs):
     """
-    This function applies dictionaries to a text. The text must have been cut up into
-    lexical units by the 'tokenize' function.
-
-    The function 'dico' produces the following files, and saves them in the directory of
-    the text:
-    - dlf: dictionary of simple words in the text
-    - dlc: dictionary of compound words in the text
-    - err: list of unknown words in the text
-    - tags_err: unrecognized simple words that are not matched by the tags.ind
-      file
-    - tags.ind: sequences to be inserted in the text automaton (see section 3.8.3,
-      page 69)
-    - stat_dic.n: file containing the number of simple words, the number of compound
-      words, and the number of unknown words in the text
-
-    NOTE: Files dlf, dlc, err and tags_err are not sorted. Use the function sort_txt
-    to sort them
+    This function applies dictionaries to a text. The text must have
+    been cut up into lexical units by the 'tokenize' function.
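+
+    A minimal usage sketch (the dictionary, text and alphabet paths
+    below are illustrative, not files shipped with this package):
+
+        from unitex.tools import dico
+
+        dico(["dela-fr.bin"], "corpus.snt", "Alphabet.txt")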
+
+    The function 'dico' produces the following files, and saves them in
+    the directory of the text:
+
+        - dlf: dictionary of simple words in the text;
+        - dlc: dictionary of compound words in the text;
+        - err: list of unknown words in the text;
+        - tags_err: unrecognized simple words that are not matched by
+          the tags.ind file;
+        - tags.ind: sequences to be inserted in the text automaton (see
+          section 3.8.3, page 69);
+        - stat_dic.n: file containing the number of simple words, the
+          number of compound words, and the number of
+          unknown words in the text.
+
+    NOTE: Files dlf, dlc, err and tags_err are not sorted. Use the
+    function 'sort_txt' to sort them.
 
     Arguments:
-        dictionaries [list(str)] -- list of dictionary pathes ('bin' or 'fst2' formats)
-        text [str] -- text (snt format) file path
-        alphabet [str] -- alphabet file path
+        dictionaries [list(str)] -- list of dictionary paths ('bin' or
+            'fst2' formats).
+
+        text [str] -- text (.snt format) file path.
+
+        alphabet [str] -- alphabet file path.
 
     Keyword arguments:
-        morpho [list(str)] -- this optional argument indicates which morphological mode
-            dictionaries are to be used, if needed by some .fst2
-            dictionaries. The argument is a list of dictionary path
-            (bin format)
-        korean [bool] -- specify the dictionary is in korean (default: False)
-        semitic [bool] -- specify the dictionary is in a semitic language (default: False)
-        arabic_rules [str] -- specifies the Arabic typographic rule configuration file path
-        raw [str] -- alternative output file path containing both simple and compound
-            words, without requiring a text directory
+        morpho [list(str)] -- this optional argument indicates which
+            morphological mode dictionaries are to be used, if needed by
+            some .fst2 dictionaries. The argument is a list of
+            dictionary paths (bin format).
+
+        korean [bool] -- specifies that the dictionary is in Korean
+            (default: False).
+
+        semitic [bool] -- specifies that the dictionary is in a Semitic
+            language (default: False).
+
+        arabic_rules [str] -- specifies the Arabic typographic rule
+            configuration file path.
+
+        raw [str] -- alternative output file path containing both simple
+            and compound words, without requiring a text directory.
 
     Return [bool]:
-        The function return 'True' if it succeeds and 'False' otherwise.
+        True if it succeeds and False otherwise.
     """
     options = DicoOptions()
     options.load(kwargs)
@@ -414,21 +499,25 @@ def dico(dictionaries, text, alphabet, **kwargs):
 
 def extract(text, output, index, **kwargs):
     """
-    This function extracts from the given text all sentences that contain at least one
-    occurrence from the concordance. The parameter <text> represents the complete
-    path of the text file, without omitting the extension .snt.
+    This function extracts from the given text all sentences that
+    contain at least one occurrence from the concordance. The parameter
+    <text> represents the complete path of the text file, without
+    omitting the extension .snt.
 
     Arguments:
-        text [str] -- the text file (.snt format)
-        output [str] -- the output text file
-        index [str] -- the index file path (produced by the 'locate' function)
+        text [str] -- the text file (.snt format).
+
+        output [str] -- the output text file.
+
+        index [str] -- the index file path (produced by the 'locate'
+            function).
 
     Keyword arguments:
-        non_matching_sentences [bool] -- extracts all sentences that don’t contain matching
-            units (default: False)
+        non_matching_sentences [bool] -- extracts all sentences that
+            don’t contain matching units (default: False).
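+
+    A hedged usage sketch (file names are illustrative; the index is
+    a concord.ind file produced by 'locate'):
+
+        from unitex.tools import extract
+
+        extract("corpus.snt", "sentences.txt", "concord.ind",
+                non_matching_sentences=True)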
 
     Return [bool]:
-        The function return 'True' if it succeeds and 'False' otherwise.
+        True if it succeeds and False otherwise.
     """
     options = ExtractOptions()
    options.load(kwargs)
@@ -463,29 +552,34 @@ def extract(text, output, index, **kwargs):
 
 def fst2txt(grammar, text, alphabet, **kwargs):
     """
-    This function applies a transducer to a text in longest match mode at the preprocessing
-    stage, when the text has not been cut into lexical units yet. This function modifies the input
-    text file.
+    This function applies a transducer to a text in longest match mode
+    at the preprocessing stage, when the text has not been cut into
+    lexical units yet.
 
-    This function modifies the input text file.
+    NOTE: This function modifies the input text file.
 
     Arguments:
-        grammar [str] -- The fst2 to apply on the text
-        text [str] -- the text file to be modified, with extension .snt
-        alphabet [str] -- the alphabet file of the language of the text
+        grammar [str] -- the fst2 to apply to the text.
+
+        text [str] -- the (.snt) text file to be modified.
+
+        alphabet [str] -- the alphabet file of the language of the text.
 
     Keyword arguments:
-        start_on_space [bool] -- this parameter indicates that the search will start at
-            any position in the text, even before a space. This parameter
-            should only be used to carry out morphological searches
-            (default: False)
-        char_by_char [bool] -- works in character by character tokenization mode.
-            This is useful for languages like Thai (default: False)
-        merge [bool] -- merge (instead of replace) transducer outputs with text inputs
-            (default: True)
+        start_on_space [bool] -- this parameter indicates that the
+            search will start at any position in the text, even before a
+            space. This parameter should only be used to carry out
+            morphological searches (default: False).
+
+        char_by_char [bool] -- works in character by character
+            tokenization mode. This is useful for languages like Thai
+            (default: False).
+
+        merge [bool] -- merge (instead of replace) transducer outputs
+            with text inputs (default: True).
 
     Return [bool]:
-        The function return 'True' if it succeeds and 'False' otherwise.
+        True if it succeeds and False otherwise.
     """
     options = Fst2TxtOptions()
     options.load(kwargs)
@@ -532,45 +626,58 @@ def fst2txt(grammar, text, alphabet, **kwargs):
 
 def grf2fst2(grammar, alphabet, **kwargs):
     """
-    This function compiles a grammar into a .fst2 file (for more details see section
-    6.2). The parameter <grf> denotes the complete path of the main graph of the
-    grammar, without omitting the extension .grf.
-
-    The result is a file with the same name as the graph passed to the function as a
-    parameter, but with extension .fst2. This file is saved in the same directory as
-    <grf>.
+    This function compiles a grammar into a .fst2 file (for more details
+    see section 6.2). The parameter <grf> denotes the complete path of
+    the main graph of the grammar, without omitting the extension .grf.
+
+    The result is a file with the same name as the graph passed to the
+    function as a parameter, but with extension .fst2. This file is
+    saved in the same directory as <grf>.
 
     Arguments:
-        grammar [str] -- The grf to compile
-        alphabet [str] -- specifies the alphabet file to be used for tokenizing the content of
-            the grammar boxes into lexical units
+        grammar [str] -- the grf to compile.
+
+        alphabet [str] -- specifies the alphabet file to be used for
+            tokenizing the content of the grammar boxes into lexical
+            units.
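+
+    A minimal call sketch (the .grf and alphabet paths are
+    illustrative); the compiled .fst2 file is saved next to the input
+    graph:
+
+        from unitex.tools import grf2fst2
+
+        grf2fst2("grammar.grf", "Alphabet.txt", loop_check=True)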
 
     Keyword arguments:
-        loop_check [bool] -- enables error (loop) checking (default: False)
-        char_by_char [bool] -- tokenization will be done character by character.
-            If neither -c nor -a option is used, lexical units
-            will be sequences of any Unicode letters (default: False)
-        pkgdir [str] -- specifies the repository directory to use (see section
-            5.2.2, page 99)
-        no_empty_graph_warning [bool] -- no warning will be emitted when a graph matches the
-            empty word. This option is used by MultiFlex in order
-            not to scare users with meaningless error messages when
-            they design an inflection grammar that matches the
-            empty word (default: False)
-        tfst_check [bool] -- checks wether the given graph can be considered as a
-            valid sentence automaton or not (default: False)
-        silent_grf_name [bool] -- does not print the graph names (needed for consistent
-            log files across several systems; default: True)
-        named_repositories [list(str)] -- declaration of named repositories. This argument is made
-            of one or more X=Y sequences, separated by ‘;’, where X is
-            the name of the repository denoted by pathname Y. You can
-            use this option several times
-        debug [bool] -- compile graphs in debug mode (default: False)
-        check_variables [bool] -- check output validity to avoid malformed variable
-            expressions (default: True)
+        loop_check [bool] -- enables error (loop) checking
+            (default: False).
+
+        char_by_char [bool] -- tokenization will be done character by
+            character. If neither the 'char_by_char' option nor an
+            alphabet file is used, lexical units will be sequences of
+            any Unicode letters (default: False).
+
+        pkgdir [str] -- specifies the repository directory to use (see
+            section 5.2.2, page 99).
+
+        no_empty_graph_warning [bool] -- no warning will be emitted when
+            a graph matches the empty word. This option is used by
+            MultiFlex in order not to scare users with meaningless error
+            messages when they design an inflection grammar that matches
+            the empty word (default: False).
+
+        tfst_check [bool] -- checks whether the given graph can be
+            considered as a valid sentence automaton or not
+            (default: False).
+
+        silent_grf_name [bool] -- does not print the graph names
+            (default: True).
+
+        named_repositories [list(str)] -- declaration of named
+            repositories. This argument is made of one or more X=Y
+            sequences, separated by ‘;’, where X is the name of the
+            repository denoted by pathname Y.
+
+        debug [bool] -- compile graphs in debug mode (default: False).
+
+        check_variables [bool] -- check output validity to avoid
+            malformed variable expressions (default: True).
 
     Return [bool]:
-        The function return 'True' if it succeeds and 'False' otherwise.
+        True if it succeeds and False otherwise.
     """
     options = Grf2Fst2Options()
     options.load(kwargs)
@@ -621,75 +728,98 @@ def grf2fst2(grammar, alphabet, **kwargs):
 
 def locate(grammar, text, alphabet, **kwargs):
     """
-    This function applies a grammar to a text and constructs an index of the occurrences
-    found.
+    This function applies a grammar to a text and constructs an index of
+    the occurrences found.
 
-    This function saves the references to the found occurrences in a file called concord.ind.
-    The number of occurrences, the number of units belonging to those occurrences, as
-    well as the percentage of recognized units within the text are saved in a file called
+    This function saves the references to the found occurrences in a
+    file called concord.ind. The number of occurrences, the number of
+    units belonging to those occurrences, as well as the percentage of
+    recognized units within the text are saved in a file called
     concord.n. These two files are stored in the directory of the text.
 
     Arguments:
-        grammar [str] -- The fst2 to apply on the text
-        text [str] -- the text file, with extension .snt
-        alphabet [str] -- the alphabet file of the language of the text
+        grammar [str] -- the fst2 to apply to the text.
+
+        text [str] -- the text file, with extension .snt.
+
+        alphabet [str] -- the alphabet file of the language of the text.
 
     Keyword arguments:
-      Generic options:
-        start_on_space [bool] -- this parameter indicates that the search will start at
-            any position in the text, even before a space. This parameter
-            should only be used to carry out morphological searches
-            (default: False)
-        char_by_char [bool] -- works in character by character tokenization mode.
-            This is useful for languages like Thai (default: False)
-        morpho [list(str)] -- this optional argument indicates which morphological mode
-            dictionaries are to be used, if needed by some .fst2
-            dictionaries. The argument is a list of dictionary path
-            (bin format)
-        korean [bool] -- specify the dictionary is in korean (default: False)
-        arabic_rules [str] -- specifies the Arabic typographic rule configuration file path
-        sntdir [str] -- puts produced files in 'sntdir' instead of the text directory
-            Note that 'sntdir' must end with a file separator (\ or /);
-        negation_operator [str] -- specifies the negation operator to be used in Locate patterns.
-            The two legal values for X are minus and tilde (default).
-            Using minus provides backward compatibility with previous versions
-            of Unitex.
+        start_on_space [bool] -- this parameter indicates that the
+            search will start at any position in the text, even
+            before a space. This parameter should only be used to
+            carry out morphological searches (default: False).
+
+        char_by_char [bool] -- works in character by character
+            tokenization mode. This is useful for languages like
+            Thai (default: False).
+
+        morpho [list(str)] -- this optional argument indicates which
+            morphological mode dictionaries are to be used, if
+            needed by some .fst2 dictionaries. The argument is a
+            list of dictionary paths (bin format).
+
+        korean [bool] -- specifies that the dictionary is in Korean
+            (default: False).
+
+        arabic_rules [str] -- specifies the Arabic typographic rule
+            configuration file path.
+
+        sntdir [str] -- puts produced files in 'sntdir' instead of
+            the text directory. Note that 'sntdir' must end with a
+            file separator (\ or /).
+
+        negation_operator [str] -- specifies the negation operator
+            to be used in Locate patterns. The two legal values are
+            'minus' and 'tilde' (default). Using 'minus' provides
+            backward compatibility with previous versions of Unitex.
 
-      Search limit options:
-        number_of_matches [int] -- stops after the first N matches (default: all matches)
+        number_of_matches [int] -- stops after the first N matches
+            (default: all matches).
 
-      Maximum iterations per token options:
-        stop_token_count [list(int_1, int_2)] -- emits a warning after 'int_1' iterations on a
-            token and stops after 'int_2' iterations.
+        stop_token_count [list(int_1, int_2)] -- emits a warning
+            after 'int_1' iterations on a token and stops after
+            'int_2' iterations.
 
-      Matching mode options:
-        match_mode [str] -- UnitexConstants.MATCH_MODE_SHORTEST: shortest match mode
-                            UnitexConstants.MATCH_MODE_LONGEST: longest match mode (default)
-                            UnitexConstants.MATCH_MODE_ALL: all match mode
+        match_mode [str] -- Possible values are:
+            - UnitexConstants.MATCH_MODE_SHORTEST: shortest match
+              mode;
+            - UnitexConstants.MATCH_MODE_LONGEST: longest match
+              mode (default);
+            - UnitexConstants.MATCH_MODE_ALL: all match mode.
 
-      Output options:
-        output_mode [str] -- UnitexConstants.OUTPUT_MODE_IGNORE: ignore outputs (default)
-                             UnitexConstants.OUTPUT_MODE_MERGE: merge outputs with text
-                             UnitexConstants.OUTPUT_MODE_REPLACE: replace texts inputs with
-                                 corresponding transducer outputs
-        protect_dic_chars [bool] -- when 'merge' or 'replace' mode is used, this option protects some
-            input characters with a backslash. This is useful when Locate is
-            called by Dico in order to avoid producing bad lines like: 3,14,.PI.NUM
-            (default: True)
-        variable [list(str_1, str_2)] -- sets an output variable named str_1 with content str_2. Note that str_2
-            must be ASCII.
+        output_mode [str] -- Possible values are:
+            - UnitexConstants.OUTPUT_MODE_IGNORE: ignore outputs
+              (default);
+            - UnitexConstants.OUTPUT_MODE_MERGE: merge outputs with
+              the text;
+            - UnitexConstants.OUTPUT_MODE_REPLACE: replace text
+              inputs with the corresponding transducer outputs.
+
+        protect_dic_chars [bool] -- when 'merge' or 'replace' mode
+            is used, this option protects some input characters with
+            a backslash. This is useful when Locate is called by
+            'dico' in order to avoid producing bad lines like:
+            3,14,.PI.NUM (default: True).
+
+        variable [list(str_1, str_2)] -- sets an output variable
+            named str_1 with content str_2. Note that str_2 must be
+            ASCII.
 
-      Ambiguous output options:
-        ambiguous_outputs [bool] -- allows the production of several matches with same input but different
-            outputs. If False, in case of ambiguous outputs, one will be arbitrarily
-            chosen and kept, depending on the internal state of the function
-            (default: True)
-        variable_error [str] -- UnitexConstants.ON_ERROR_EXIT: kills the function if variable has an empty content
-                                UnitexConstants.ON_ERROR_IGNORE: ignore the errors (default)
-                                UnitexConstants.ON_ERROR_BACKTRACK: stop the current path exploration
+        ambiguous_outputs [bool] -- allows the production of several
+            matches with the same input but different outputs. If
+            False, in case of ambiguous outputs, one will be
+            arbitrarily chosen and kept, depending on the internal
+            state of the function (default: True).
+
+        variable_error [str] -- Possible values are:
+            - UnitexConstants.ON_ERROR_EXIT: kills the function if a
+              variable has an empty content;
+            - UnitexConstants.ON_ERROR_IGNORE: ignores the errors
+              (default);
+            - UnitexConstants.ON_ERROR_BACKTRACK: stops the current
+              path exploration.
 
     Return [bool]:
-        The function return 'True' if it succeeds and 'False' otherwise.
+        True if it succeeds and False otherwise.
     """
     options = LocateOptions()
     options.load(kwargs)
@@ -786,39 +916,45 @@ def locate(grammar, text, alphabet, **kwargs):
 
 def normalize(text, **kwargs):
     """
-    This function carries out a normalization of text separators. The separators are
-    space, tab, and newline. Every sequence of separators that contains at least one
-    newline is replaced by a unique newline. All other sequences of separators are replaced
-    by a single space.
+    This function carries out a normalization of text separators. The
+    separators are space, tab, and newline. Every sequence of separators
+    that contains at least one newline is replaced by a single newline.
+    All other sequences of separators are replaced by a single space.
 
-    This function also checks the syntax of lexical tags found in the text. All sequences in
-    curly brackets should be either the sentence delimiter {S}, the stop marker {STOP},
-    or valid entries in the DELAF format ({aujourd’hui,.ADV}).
+    This function also checks the syntax of lexical tags found in the
+    text. All sequences in curly brackets should be either the sentence
+    delimiter {S}, the stop marker {STOP}, or valid entries in the DELAF
+    format ({aujourd’hui,.ADV}).
 
-    Parameter <text> represents the complete path of the text file. The function creates
-    a modified version of the text that is saved in a file with extension .snt.
+    NOTE: The function creates a modified version of the text that is
+          saved in a file with extension .snt.
 
-    WARNING: if you specify a normalization rule file, its rules will be applied prior to
-    anything else. So, you have to be very careful if you manipulate separators in such
-    rules.
+    WARNING: if you specify a normalization rule file, its rules will be
+             applied prior to anything else. So, you have to be very
+             careful if you manipulate separators in such rules.
 
     Arguments:
-        text [str] -- The text file to normalize
+        text [str] -- the text file to normalize.
 
     Keyword arguments:
-        no_carriage_return [bool] -- every separator sequence will be turned into a single
-            space (default: False)
-        input_offsets [str] -- base offset file to be used
-        output_offsets [str] -- offset file to be produced
-        replacement_rules [str] -- specifies the normalization rule file
-            to be used. See section 14.13.6 for details about the
-            format of this file. By default, the function only
-            replaces { and } by [ and ]
-        no_separator_normalization [bool] -- only applies replacement rules specified with the 'replacement_rules'
-            option (default: False)
+        no_carriage_return [bool] -- every separator sequence will be
+            turned into a single space (default: False).
+
+        input_offsets [str] -- base offset file to be used.
+
+        output_offsets [str] -- offset file to be produced.
+
+        replacement_rules [str] -- specifies the normalization rule file
+            to be used. See section 14.13.6 for details about the format
+            of this file. By default, the function only replaces { and }
+            by [ and ].
+
+        no_separator_normalization [bool] -- only applies replacement
+            rules specified with the 'replacement_rules' option
+            (default: False).
 
     Return [bool]:
-        The function return 'True' if it succeeds and 'False' otherwise.
+        True if it succeeds and False otherwise.
     """
     options = NormalizeOptions()
     options.load(kwargs)
@@ -857,31 +993,37 @@ def normalize(text, **kwargs):
 
 def sort_txt(text, **kwargs):
     """
-    This function carries out a lexicographical sorting of the lines of file <txt>. <txt>
-    represents the complete path of the file to be sorted.
+    This function carries out a lexicographical sorting of the lines of
+    file <txt>. <txt> represents the complete path of the file to be
+    sorted.
 
-    The input text file is modified. By default, the sorting is performed in the order of
-    Unicode characters, removing duplicate lines.
+    The input text file is modified. By default, the sorting is
+    performed in the order of Unicode characters, removing duplicate
+    lines.
 
     Arguments:
-        text [str] -- The text file to sort
+        text [str] -- the text file to sort.
 
     Keyword arguments:
-        duplicates [bool] -- keep duplicate lines (default: False)
-        reverse [bool] -- sort in descending order (default: False)
-        sort_order [str] -- sorts using the alphabet order defined in this
-            file. If this parameter is missing, the sorting
-            is done according to the order of Unicode
-            characters
-        line_info [str] -- backup the number of lines of the result file
-            in this file
-        thai [bool] -- option for sorting Thai text (default: False)
-        factorize_inflectional_codes [bool] -- makes two entries XXX,YYY.ZZZ:A and XXX,YYY.ZZZ:B
-            become a single entry XXX,YYY.ZZZ:A:B
-            (default: False)
+        duplicates [bool] -- keep duplicate lines (default: False).
+
+        reverse [bool] -- sort in descending order (default: False).
+
+        sort_order [str] -- sorts using the alphabet order defined in
+            this file. If this parameter is missing, the sorting is done
+            according to the order of Unicode characters.
+
+        line_info [str] -- stores the number of lines of the result file
+            in this file.
+
+        thai [bool] -- option for sorting Thai text (default: False).
+
+        factorize_inflectional_codes [bool] -- makes two entries X,Y.Z:A
+            and X,Y.Z:B become a single entry X,Y.Z:A:B
+            (default: False).
 
     Return [bool]:
-        The function return 'True' if it succeeds and 'False' otherwise.
+        True if it succeeds and False otherwise.
     """
     options = SortTxtOptions()
     options.load(kwargs)
@@ -922,51 +1064,62 @@ def sort_txt(text, **kwargs):
 
 def tokenize(text, alphabet, **kwargs):
     """
-    This function tokenizes a tet text into lexical units. <txt> the complete path of the
-    text file, without omitting the .snt extension.
-
-    The function codes each unit as a whole. The list of units is saved in a text file called
-    tokens.txt. The sequence of codes representing the units now allows the coding
-    of the text. This sequence is saved in a binary file named text.cod. The function
-    also produces the following four files:
-    - tok_by_freq.txt: text file containing the units sorted by frequency
-    - tok_by_alph.txt: text file containing the units sorted alphabetically
-    - stats.n: text file containing information on the number of sentence separators,
-      the number of units, the number of simple words and the number of
-      numbers
-    - enter.pos: binary file containing the list of newline positions in the text. The
-      coded representation of the text does not contain newlines, but spaces.
-      Since a newline counts as two characters and a space as a single one,
-      it is necessary to know where newlines occur in the text when the
-      positions of occurrences located by the 'locate' function are to be
-      synchronized with the text file. File enter.pos is used for this by
-      the 'concord' function. Thanks to this, when clicking on an occurrence in
-      a concordance, it is correctly selected in the text. File enter.pos is
-      a binary file containing the list of the positions of newlines in the
-      text.
+    This function tokenizes a text into lexical units. <txt> is the
+    complete path of the text file, without omitting the .snt extension.
+
+    The function codes each unit as a whole. The list of units is saved
+    in a text file called tokens.txt. The sequence of codes representing
+    the units now allows the coding of the text. This sequence is saved
+    in a binary file named text.cod. The function also produces the
+    following four files:
+
+        - tok_by_freq.txt: text file containing the units sorted by
+          frequency.
+        - tok_by_alph.txt: text file containing the units sorted
+          alphabetically.
+        - stats.n: text file containing information on the number of
+          sentence separators, the number of units, the number
+          of simple words and the number of numbers.
+        - enter.pos: binary file containing the list of newline
+          positions in the text.
+          The coded representation of the text does not contain
+          newlines, but spaces. Since a newline counts as two
+          characters and a space as a single one, it is necessary
+          to know where newlines occur in the text when the
+          positions of occurrences located by the 'locate' function
+          are to be synchronized with the text file. File enter.pos
+          is used for this by the 'concord' function. Thanks to
+          this, when clicking on an occurrence in a concordance, it
+          is correctly selected in the text.
 
     All produced files are saved in the text directory
 
     Arguments:
-        text [str] -- the text file to tokenize (snt format)
-        alphabet [str] -- the alphabet file
+        text [str] -- the text file to tokenize (.snt format).
+
+        alphabet [str] -- the alphabet file.
 
     Keyword arguments:
-      Generic options:
-        char_by_char [bool] -- indicates whether the function is applied character by
-            character, with the exceptions of the sentence delimiter
-            {S}, the stop marker {STOP} and lexical tags like
-            {today,.ADV} which are considered to be single units
-            (default: False)
-        tokens [str] -- specifies a tokens.txt file to load and modify, instead
-            of creating a new one from scratch
+        char_by_char [bool] -- indicates whether the function is
+            applied character by character, with the exceptions of
+            the sentence delimiter {S}, the stop marker {STOP} and
+            lexical tags like {today,.ADV} which are considered to
+            be single units (default: False).
+
+        tokens [str] -- specifies a tokens.txt file to load and
+            modify, instead of creating a new one from scratch.
 
-      Offsets options:
-        input_offsets [str] -- base offset file to be used;
-        output_offsets [str] -- offset file to be produced;
+        input_offsets [str] -- base offset file to be used.
+
+        output_offsets [str] -- offset file to be produced.
 
     Return [bool]:
-        The function return 'True' if it succeeds and 'False' otherwise.
+        True if it succeeds and False otherwise.
     """
     options = TokenizeOptions()
     options.load(kwargs)
@@ -1010,29 +1163,35 @@ def txt2tfst(text, alphabet, **kwargs):
     """
     This function constructs an automaton of a text.
 
-    If the text is separated into sentences, the function constructs an automaton for each
-    sentence. If this is not the case, the function arbitrarily cuts the text into sequences
-    of 2000 tokens and produces an automaton for each of these sequences.
+    If the text is separated into sentences, the function constructs an
+    automaton for each sentence. If this is not the case, the function
+    arbitrarily cuts the text into sequences of 2000 tokens and produces
+    an automaton for each of these sequences.
 
-    The result is a file called text.tfst which is saved in the directory of the text.
-    Another file named text.tind is also produced.
+    The result is a file called text.tfst which is saved in the
+    directory of the text. Another file named text.tind is also
+    produced.
 
     Arguments:
-        text [str] -- the path to the text file in snt format.
-        alphabet [str] -- the alphabet file
+        text [str] -- the path to the text file in .snt format.
+
+        alphabet [str] -- the alphabet file.
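+
+    A minimal usage sketch (paths are illustrative; the text is
+    assumed to be already normalized and tokenized):
+
+        from unitex.tools import txt2tfst
+
+        txt2tfst("corpus.snt", "Alphabet.txt", clean=True)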
 
     Keyword arguments:
-        clean [bool] -- indicates whether the rule of conservation of the best
-            paths (see section 7.2.4) should be applied
-            (default: False)
-        normalization_grammar [str] -- name of a normalization grammar that is to be applied
-            to the text automaton
-        tagset [str] -- Elag tagset file to use to normalize dictionary entries
-        korean [bool] -- tells the function that it works on Korean
-            (default: False)
+        clean [bool] -- indicates whether the rule of conservation of
+            the best paths (see section 7.2.4) should be applied
+            (default: False).
+
+        normalization_grammar [str] -- name of a normalization grammar
+            that is to be applied to the text automaton.
+
+        tagset [str] -- Elag tagset file to use to normalize dictionary
+            entries.
+
+        korean [bool] -- tells the function that it works on Korean
+            (default: False).
 
     Return [bool]:
-        The function return 'True' if it succeeds and 'False' otherwise.
+        True if it succeeds and False otherwise.
     """
     options = Txt2TFstOptions()
     options.load(kwargs)