From 3971d2ec76d109d309ac69f7d9bede871fb5828d Mon Sep 17 00:00:00 2001 From: Patrick Watrin <pat@lucy.local> Date: Sun, 28 Feb 2016 20:32:39 +0100 Subject: [PATCH] documentation reformatting for sphinx --- unitex/tools.py | 344 +++++++++++++++++++++++++----------------------- 1 file changed, 176 insertions(+), 168 deletions(-) diff --git a/unitex/tools.py b/unitex/tools.py index a2abe57..44ce8b4 100644 --- a/unitex/tools.py +++ b/unitex/tools.py @@ -753,86 +753,88 @@ def locate(grammar, text, alphabet, **kwargs): recognized units within the text are saved in a file called concord.n. These two files are stored in the directory of the text. - Arguments: - grammar [str] -- the fst2 to apply on the text. - - text [str] -- the text file, with extension .snt. - - alphabet [str] -- the alphabet file of the language of the text. - - Keyword arguments: - - Generic options: - start_on_space [bool] -- this parameter indicates that the - search will start at any position in the text, even - before a space. This parameter should only be used to - carry out morphological searches (default: False). - - char_by_char [bool] -- works in character by character - tokenization mode. This is useful for languages like - Thai (default: False). - - morpho [list(str)] -- this optional argument indicates which - morphological mode dictionaries are to be used, if - needed by some .fst2 dictionaries. The argument is a - list of dictionary path (bin format). - - korean [bool] -- specify the dictionary is in korean - (default: False). - - arabic_rules [str] -- specifies the Arabic typographic rule - configuration file path. - - sntdir [str] -- puts produced files in 'sntdir' instead of - the text directory. Note that 'sntdir' must end with a - file separator (\ or /). - - negation_operator [str] -- specifies the negation operator - to be used in Locate patterns. The two legal values for - X are minus and tilde (default). Using minus provides - backward compatibility with previous versions of Unitex. - - - Search limit options: - number_of_matches [int] -- stops after the first N matches - (default: all matches). - - - Maximum iterations per token options: - stop_token_count [list(int_1, int_2)] -- emits a warning - after 'int_1' iterations on a token and stops after - 'int_2' iterations. - - - Matching mode options: - match_mode [str] -- Possible values are: - - UnitexConstants.MATCH_MODE_SHORTEST - - UnitexConstants.MATCH_MODE_LONGEST (default) - - UnitexConstants.MATCH_MODE_ALL - - - Output options: - output_mode [str] -- Possible values are: - - UnitexConstants.OUTPUT_MODE_IGNORE (default) - - UnitexConstants.OUTPUT_MODE_MERGE - - UnitexConstants.OUTPUT_MODE_REPLACE - - protect_dic_chars [bool] -- when 'merge' or 'replace' mode - is used, this option protects some input characters with - a backslash. This is useful when Locate is called by - 'dico' in order to avoid producing bad lines like: - 3,14,.PI.NUM (default: True). - - variable [list(str_1, str_2)] -- sets an output variable - named str_1 with content str_2. Note that str_2 must be - ASCII. - - - Ambiguous output options: - ambiguous_outputs [bool] -- allows the production of several - matches with same input but different outputs. If False, - in case of ambiguous outputs, one will be arbitrarily - chosen and kept, depending on the internal state of the - function (default: True). 
- - variable_error [str] -- Possible values are: - - UnitexConstants.ON_ERROR_EXIT - - UnitexConstants.ON_ERROR_IGNORE (default) - - UnitexConstants.ON_ERROR_BACKTRACK + *Arguments:* + + - **grammar [str]** -- the fst2 to apply on the text. + + - **text [str]** -- the text file, with extension .snt. + + - **alphabet [str]** -- the alphabet file of the language of the + text. + + *Keyword arguments:* + + - *Generic options:* + + - **start_on_space [bool]** -- this parameter indicates that the + search will start at any position in the text, even before a + space. This parameter should only be used to carry out + morphological searches (default: False). + + - **char_by_char [bool]** -- works in character by character + tokenization mode. This is useful for languages like Thai + (default: False). + + - **morpho [list(str)]** -- this optional argument indicates which + morphological mode dictionaries are to be used, if needed by + some .fst2 dictionaries. The argument is a list of dictionary + paths (bin format). + + - **korean [bool]** -- specifies that the dictionary is in Korean + (default: False). + + - **arabic_rules [str]** -- specifies the Arabic typographic rule + configuration file path. + + - **sntdir [str]** -- puts produced files in 'sntdir' instead of + the text directory. Note that 'sntdir' must end with a file + separator (\ or /). + + - **negation_operator [str]** -- specifies the negation operator + to be used in Locate patterns. The two legal values are + 'minus' and 'tilde' (default). Using 'minus' provides backward + compatibility with previous versions of Unitex. + + - *Search limit options:* + - **number_of_matches [int]** -- stops after the first N matches + (default: all matches). + + - *Maximum iterations per token options:* + - **stop_token_count [list(int_1, int_2)]** -- emits a warning + after 'int_1' iterations on a token and stops after 'int_2' + iterations. + + - *Matching mode options:* + - **match_mode [str]** -- Possible values are: + - UnitexConstants.MATCH_MODE_SHORTEST + - UnitexConstants.MATCH_MODE_LONGEST (default) + - UnitexConstants.MATCH_MODE_ALL + + - *Output options:* + - **output_mode [str]** -- Possible values are: + - UnitexConstants.OUTPUT_MODE_IGNORE (default) + - UnitexConstants.OUTPUT_MODE_MERGE + - UnitexConstants.OUTPUT_MODE_REPLACE + + - **protect_dic_chars [bool]** -- when 'merge' or 'replace' mode + is used, this option protects some input characters with a + backslash. This is useful when Locate is called by 'dico' in + order to avoid producing bad lines like: 3,14,.PI.NUM + (default: True). + + - **variable [list(str_1, str_2)]** -- sets an output variable + named str_1 with content str_2. Note that str_2 must be ASCII. + + - *Ambiguous output options:* + - **ambiguous_outputs [bool]** -- allows the production of several + matches with same input but different outputs. If False, in case + of ambiguous outputs, one will be arbitrarily chosen and kept, + depending on the internal state of the function (default: True). + + - **variable_error [str]** -- Possible values are: + - UnitexConstants.ON_ERROR_EXIT + - UnitexConstants.ON_ERROR_IGNORE (default) + - UnitexConstants.ON_ERROR_BACKTRACK *Return [bool]:* @@ -943,32 +945,34 @@ def normalize(text, **kwargs): delimiter {S}, the stop marker {STOP}, or valid entries in the DELAF format ({aujourd’hui,.ADV}). - NOTE: the function creates a modified version of the text that is - saved in a file with extension .snt.
- - WARNING: if you specify a normalization rule file, its rules will be - applied prior to anything else. So, you have to be very - careful if you manipulate separators in such rules. + **NOTE:** the function creates a modified version of the text that is + saved in a file with extension .snt. - Arguments: - text [str] -- the text file to normalize. + **WARNING:** if you specify a normalization rule file, its rules + will be applied prior to anything else. So, you have to be very + careful if you manipulate separators in such rules. - Keyword arguments: - no_carriage_return [bool] -- every separator sequence will be - turned into a single space (default: False). - - input_offsets [str] -- base offset file to be used. + *Arguments:* - output_offsets [str] -- offset file to be produced. + - **text [str]** -- the text file to normalize. - replacement_rules [str] -- specifies the normalization rule file - to be used. See section 14.13.6 for details about the format - of this file. By default, the function only replaces { and } - by [ and ]. + *Keyword arguments:* - no_separator_normalization [bool] -- only applies replacement - rules specified with the 'replacement_rules' option - (default: False). + - **no_carriage_return [bool]** -- every separator sequence will be + turned into a single space (default: False). + + - **input_offsets [str]** -- base offset file to be used. + + - **output_offsets [str]** -- offset file to be produced. + + - **replacement_rules [str]** -- specifies the normalization rule + file to be used. See section 14.13.6 for details about the format + of this file. By default, the function only replaces { and } by + [ and ]. + + - **no_separator_normalization [bool]** -- only applies replacement + rules specified with the 'replacement_rules' option + (default: False). *Return [bool]:* @@ -1019,26 +1023,28 @@ def sort_txt(text, **kwargs): performed in the order of Unicode characters, removing duplicate lines. - Arguments: - text [str] -- the text file to sort. - - Keyword arguments: - duplicates [bool] -- keep duplicate lines (default: False). - - reverse [bool] -- sort in descending order (default: False). - - sort_order [str] -- sorts using the alphabet order defined in - this file. If this parameter is missing, the sorting is done - according to the order of Unicode characters. + *Arguments:* - line_info [str] -- backup the number of lines of the result file - in this file. + - **text [str]** -- the text file to sort. - thai [bool] -- option for sorting Thai text (default: False). + *Keyword arguments:* - factorize_inflectional_codes [bool] -- makes two entries X,Y.Z:A - and X,Y.Z:B become a single entry X,Y.Z:A:B - (default: False). + - **duplicates [bool]** -- keep duplicate lines (default: False). + + - **reverse [bool]** -- sort in descending order (default: False). + + - **sort_order [str]** -- sorts using the alphabet order defined in + this file. If this parameter is missing, the sorting is done + according to the order of Unicode characters. + + - **line_info [str]** -- saves the number of lines of the result + file in this file. + + - **thai [bool]** -- option for sorting Thai text (default: False). + + - **factorize_inflectional_codes [bool]** -- makes two entries + X,Y.Z:A and X,Y.Z:B become a single entry X,Y.Z:A:B + (default: False). *Return [bool]:* @@ -1092,50 +1098,50 @@ def tokenize(text, alphabet, **kwargs): in a binary file named text.cod.
The function also produces the following four files: - - tok_by_freq.txt: text file containing the units sorted by - frequency. - - tok_by_alph.txt: text file containing the units sorted - alphabetically. - - stats.n: text file containing information on the number of - sentence separators, the number of units, the number - of simple words and the number of numbers. - - enter.pos: binary file containing the list of newline - positions in the text. The coded representation of - the text does not contain newlines, but spaces. - Since a newline counts as two characters and a - space as a single one, it is necessary to know - where newlines occur in the text when the positions - of occurrences located by the 'locate' function are - to be synchronized with the text file. File - enter.pos is used for this by the 'concord' - function. Thanks to this, when clicking on an - occurrence in a concordance, it is correctly - selected in the text. File enter.pos is a binary - file containing the list of the positions of - newlines in the text. + - tok_by_freq.txt: text file containing the units sorted by + frequency. + - tok_by_alph.txt: text file containing the units sorted + alphabetically. + - stats.n: text file containing information on the number of + sentence separators, the number of units, the number of simple + words and the number of numbers. + - enter.pos: binary file containing the list of newline positions in + the text. The coded representation of the text does not contain + newlines, but spaces. Since a newline counts as two characters and + a space as a single one, it is necessary to know where newlines + occur in the text when the positions of occurrences located by the + 'locate' function are to be synchronized with the text file. File + enter.pos is used for this by the 'concord' function. Thanks to + this, when clicking on an occurrence in a concordance, it is + correctly selected in the text. File enter.pos is a binary file + containing the list of the positions of newlines in the text. All produced files are saved in the text directory - Arguments: - text [str] -- the text file to tokenize (.snt format). + *Arguments:* - alphabet [str] -- the alphabet file. + - **text [str]** -- the text file to tokenize (.snt format). - Keyword arguments: - - Generic options: - char_by_char [bool] -- indicates whether the function is - applied character by character, with the exceptions of - the sentence delimiter {S}, the stop marker {STOP} and - lexical tags like {today,.ADV} which are considered to - be single units (default: False). + - **alphabet [str]** -- the alphabet file. - tokens [str] -- specifies a tokens.txt file to load and - modify, instead of creating a new one from scratch. + *Keyword arguments:* + + - *Generic options:* - - Offsets options: - input_offsets [str] -- base offset file to be used. + - **char_by_char [bool]** -- indicates whether the function is + applied character by character, with the exceptions of the + sentence delimiter {S}, the stop marker {STOP} and lexical + tags like {today,.ADV} which are considered to be single units + (default: False). + + - **tokens [str]** -- specifies a tokens.txt file to load and + modify, instead of creating a new one from scratch. + + - *Offsets options:* - output_offsets [str] -- offset file to be produced. + - **input_offsets [str]** -- base offset file to be used. + + - **output_offsets [str]** -- offset file to be produced. 
*Return [bool]:* @@ -1191,24 +1197,26 @@ def txt2tfst(text, alphabet, **kwargs): The result is a file called text.tfst which is saved in the directory of the text. Another file named text.tind is also produced. - Arguments: - text [str] -- the path to the text file in .snt format. - - alphabet [str] -- the alphabet file. + *Arguments:* - Keyword arguments: - clean [bool] -- indicates whether the rule of conservation of - the best paths (see section 7.2.4) should be applied - (default: False). + - **text [str]** -- the path to the text file in .snt format. - normalization_grammar [str] -- name of a normalization grammar - that is to be applied to the text automaton. + - **alphabet [str]** -- the alphabet file. - tagset [str] -- Elag tagset file to use to normalize dictionary - entries. + *Keyword arguments:* - korean [bool] -- tells the function that it works on Korean - (default: False). + - **clean [bool]** -- indicates whether the rule of conservation of + the best paths (see section 7.2.4) should be applied + (default: False). + + - **normalization_grammar [str]** -- name of a normalization grammar + that is to be applied to the text automaton. + + - **tagset [str]** -- Elag tagset file to use to normalize + dictionary entries. + + - **korean [bool]** -- tells the function that it works on Korean + (default: False). *Return [bool]:* -- GitLab
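A minimal usage sketch may help tie the reformatted docstrings together. It is not part of the patch: the import of UnitexConstants from the top-level unitex package is an assumption, and the file names (corpus.txt, Alphabet.txt, pattern.fst2) are placeholders rather than files shipped with the library; the keyword arguments and return values mirror the docstrings above.

    # Sketch of the documented pipeline: normalize the corpus, tokenize it,
    # then run a locate pass. All paths are placeholders.
    from unitex import UnitexConstants  # assumed location of the constants
    from unitex.tools import locate, normalize, tokenize

    text = "corpus.txt"        # raw text; normalize() writes corpus.snt
    snt = "corpus.snt"
    alphabet = "Alphabet.txt"  # alphabet file of the text's language
    grammar = "pattern.fst2"   # compiled grammar to apply

    # Each helper returns True on success (see the *Return [bool]* sections).
    if not normalize(text, no_carriage_return=True):
        raise RuntimeError("normalization failed")
    if not tokenize(snt, alphabet, char_by_char=False):
        raise RuntimeError("tokenization failed")

    ok = locate(grammar, snt, alphabet,
                match_mode=UnitexConstants.MATCH_MODE_LONGEST,
                output_mode=UnitexConstants.OUTPUT_MODE_MERGE)
    if ok:
        # Matches are written to concord.ind in the directory of the text.
        print("locate succeeded")

    # The text automaton (text.tfst/text.tind) can then be built with:
    # txt2tfst(snt, alphabet, clean=True)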