diff --git a/config/unitex-template.yaml b/config/unitex-template.yaml index 46b04df25ff7180aa81dc76a340af5ce38189558..a4c7c03d2f7d822f4a7a135fa25e7915fa755884 100644 --- a/config/unitex-template.yaml +++ b/config/unitex-template.yaml @@ -1,66 +1,68 @@ -# Do not modify this file. Use the 'build-config-file.py' script to generate a -# working version adapted to you local Unitex installation or copy this file -# before editing. +# Do not modify this file. Use the 'build-config-file.py' script to +# generate a working version adapted to your local Unitex installation +# or copy this file before editing. # The 'global' section contains the global configuration parameters. global: # There is 3 'debug' level: # 0: the error output is disabled; - # 1: the error output is limited to the logging system implemented in the - # bindings; - # 2: the error output is activated for both the bindings and the Unitex - # processor. - # NOTE: if you activate the debug for level >= 1, the verbose level is - # automatically activated at level 2. + # 1: the error output is limited to the logging system implemented + # in the bindings; + # 2: the error output is activated for both the bindings and the + # Unitex processor. + # NOTE: if you activate the debug for level >= 1, the verbose level + # is automatically activated at level 2. debug: 0 # There is 4 'verbose' level: # 0: the standard output is disabled; - # 1: the standard output shows 'warnings' emitted by the bindings logging - # system; + # 1: the standard output shows 'warnings' emitted by the bindings + # logging system; # 2: the standard output shows 'warnings' and various processing # informations emitted by the bindings logging system; - # 3: the full standard output is activated for both the bindings and the - # Unitex processor. + # 3: the full standard output is activated for both the bindings and + # the Unitex processor. 
verbose: 0 - # If not 'null', the error and standard outputs are redirected to the file - # specified by this parameters. Be sure to have write access to this file. + # If not 'null', the error and standard outputs are redirected to + # the file specified by this parameter. Be sure to have write + # access to this file. #log: /var/log/unitex.log log: null # If you are using the high-level 'Processor' class, this parameter # activate or deactivate the resource persistence. If persistency is - # activated, dictionaries, grammar and alphabet are loaded during the - # object initialization and kept in memory in order to improve + # activated, dictionaries, grammar and alphabet are loaded during + # the object initialization and kept in memory in order to improve # performances. # NOTE: you can manually activate the persistence by using the # 'load_persistent_X' functions from 'unitex.resources'. persistence: True - # The Unitex library implements a virtual filesystem which avoids a lot - # of I/O and improves the performance. If this parameter is set to True, - # The high-level 'Processor' class will activate automatically this virtual - # filesystem. - # NOTE: as for the persistence, you can activate manually the VFS by using - # the functions from 'unitex.io'. + # The Unitex library implements a virtual filesystem which avoids a + # lot of I/O and improves the performance. If this parameter is set + # to True, the high-level 'Processor' class will activate + # automatically this virtual filesystem. + # NOTE: as for the persistence, you can activate manually the VFS by + # using the functions from 'unitex.io'. virtualization: True -# The 'resources' section is automatically filled by the 'build-config-file.py' -# script. If you want to do it manually, be sure to give the absolute path of -# each resource as shown below. -# NOTE: the 'dictionaries' parameter is a list of path. As required by the YAML -# format, each item must be prefixed by the '-' character (cf. 
example). -# resources: -# language: fr -# alphabet: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Alphabet.txt -# alphabet-sorted: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Alphabet_sort.txt -# sentence: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Graphs/Preprocessing/Sentence/Sentence.fst2 -# replace: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Graphs/Preprocessing/Replace/Replace.fst2 -# dictionaries: -# - /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/dela-fr-public.bin -# - /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/ajouts80jours.bin -# - /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/motsGramf-.bin +# The 'resources' section is automatically filled by the +# 'build-config-file.py' script. If you want to do it manually, be sure +# to give the absolute path of each resource as shown below. +# NOTE: the 'dictionaries' parameter is a list of path. As required by +# the YAML format, each item must be prefixed by the '-' character +# (cf. example). 
+#resources: +# language: fr +# alphabet: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Alphabet.txt +# alphabet-sorted: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Alphabet_sort.txt +# sentence: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Graphs/Preprocessing/Sentence/Sentence.fst2 +# replace: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Graphs/Preprocessing/Replace/Replace.fst2 +# dictionaries: +# - /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/dela-fr-public.bin +# - /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/ajouts80jours.bin +# - /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/motsGramf-.bin resources: language: null @@ -71,25 +73,25 @@ resources: dictionaries: null -# The 'tools' section can contain any of the arguments used by the unitex -# tools. -# Most of the times, these parameters are the same than the one used by the -# original Unitex tools (as described in the Unitex manual). Changes are -# explained in the comments of this file. -# NOTE: if you use the 'Processor' high-level class some parameters will be -# overriden to fit the 'tag' functions behaviour. For instance, there is -# no point to define a font or a context for 'concord'. +# The 'tools' section can contain any of the arguments used by the +# unitex tools. +# Most of the times, these parameters are the same than the one used by +# the original Unitex tools (as described in the Unitex manual). Changes +# are explained in the comments of this file. +# NOTE: if you use the 'Processor' high-level class some parameters will +# be overriden to fit the 'tag' functions behaviour. For instance, +# there is no point to define a font or a context for 'concord'. # NOTE: ALL FILE PATH MUST BE ABSOLUTE!!! 
tools: # CheckDic command (Unitex manual, p.266) check_dic: - # If set to True, the function will use a strict syntax checking + # If set to True, the function will use a strict syntax checking # against unprotected dot and comma. strict: False - # If set to True, the function will tolerate spaces in grammatical, - # semantic and inflectional codes. + # If set to True, the function will tolerate spaces in + # grammatical, semantic and inflectional codes. no_space_warning: False # Compress command (Unitex manual, p.266) @@ -98,9 +100,10 @@ tools: # produce a file xxx.bin. output: null - # If set to True, 'flip' indicates that the inflected and canonical - # forms should be swapped in the compressed dictionary. This option is - # used to construct an inverse dictionary. + # If set to True, 'flip' indicates that the inflected and + # canonical forms should be swapped in the compressed + # dictionary. This option is used to construct an inverse + # dictionary. flip: False # If set to True, the function will use the semitic compression @@ -108,45 +111,48 @@ tools: semitic: False # 'version: v1' produces an old style .bin file. - # 'version: v2' produces a new style .bin file, with no file size - # limitation to 16 Mb and a smaller resulting size. + # 'version: v2' produces a new style .bin file, with no file + # size limitation to 16 Mb and a smaller resulting + # size. version: v2 # Concord command (Unitex manual, p.267) concord: - # 'font' specifies the name of the font to use if the output is an - # HTML file. + # 'font' specifies the name of the font to use if the output is + # an HTML file. #font: "Courier new" font: null - # 'fontsize' specifies the font size to use if the output is an HTML - # file. + # 'fontsize' specifies the font size to use if the output is an + # HTML file. #fontsize: 12 fontsize: null - # If 'only_ambiguous' is set to True, the function will only displays - # identical occurrences with ambiguous outputs, in text order. 
+ # If 'only_ambiguous' is set to True, the function will only + # display identical occurrences with ambiguous outputs, in text + # order. only_ambiguous: False - # If 'only_matches' is set to True, the function will force empty right - # and left contexts. Moreover, if used with -t/–text, the function will - # not surround matches with tabulations. + # If 'only_matches' is set to True, the function will force + # empty right and left contexts. Moreover, if used with 'text', + # the function will not surround matches with tabulations. only_matches: False # 'left' specifies the number of characters on the left of the - # occurrences. In Thai mode, this means the number of non-diacritic - # characters. For both 'left' and 'right' parameters, you can add the - # 's' character to stop at the first {S} tag. For instance, if you set - # '40s' for the left value, the left context will end at 40 characters - # at most, less if the {S} tag is found before. + # occurrences. In Thai mode, this means the number of + # non-diacritic characters. For both 'left' and 'right' + # parameters, you can add the 's' character to stop at the first + # {S} tag. For instance, if you set '40s' for the left value, + # the left context will end at 40 characters at most, less if + # the {S} tag is found before. # NOTE: the number must be quoted to avoid integer conversion. left: "0" - # 'right' specifies the number of characters (non-diacritic ones in - # Thai mode) on the right of the occurrences. If the occurrence is - # shorter than this value, the concordance line is completed up to - # right. If the occurrence is longer than the length defined by right, - # it is nevertheless saved as whole. + # 'right' specifies the number of characters (non-diacritic ones + # in Thai mode) on the right of the occurrences. If the + # occurrence is shorter than this value, the concordance line is + # completed up to right. 
If the occurrence is longer than the + # length defined by right, it is nevertheless saved as whole. # NOTE: the number must be quoted to avoid integer conversion. right: "0" @@ -164,101 +170,107 @@ tools: # 'format' specifies the output format. Possible values are: # - html: produces a concordance in HTML format; # - text: produces a concordance in text format; - # - glossanet: produces a concordance for GlossaNet in HTML format - # where occurrences are links described by the 'script' - # parameter; - # - script: produces a HTML concordance file where occurrences are - # links described by the 'script' parameter; - # - index: produces an index of the concordance, made of the content - # of the occurrences (with the grammar outputs, if any), - # preceded by the positions of the occurrences in the text - # file given in characters; + # - glossanet: produces a concordance for GlossaNet in HTML + # format where occurrences are links described by + # the 'script' parameter; + # - script: produces a HTML concordance file where occurrences + # are links described by the 'script' parameter; + # - index: produces an index of the concordance, made of the + # content of the occurrences (with the grammar + # outputs, if any), preceded by the positions of the + # occurrences in the text file given in characters; # - uima: produces an index of the concordance relative to the # original text file, before any Unitex operation. The # 'offsets' parameter must be provided; - # - prlg: produces a concordance for PRLG corpora where each line is - # prefixed by information extracted with Unxmlize’s 'prlg' - # option. You must provide both the 'offsets' and the - # 'unxmlize' parameter; + # - prlg: produces a concordance for PRLG corpora where each + # line is prefixed by information extracted with + # Unxmlize’s 'prlg' option. 
You must provide both the + # 'offsets' and the 'unxmlize' parameter; # - xml: produces xml index of the concordance; - # - xml-with-header: produces an xml index of the concordance with - # full xml header; - # - axis: quite the same as 'index', but the numbers represent the - # median character of each occurrence; - # - xalign: another index file, used by the text alignment module. - # Each line is made of 3 integers X Y Z followed by the - # content of the occurrence. X is the sentence number, - # starting from 1. Y and Z are the starting and ending - # positions of the occurrence in the sentence, given in - # characters; - # - merge: indicates to the function that it is supposed to produce - # a modified version of the text and save it in a file. The - # filename must be provided with the 'output' parameter. + # - xml-with-header: produces an xml index of the concordance + # with full xml header; + # - axis: quite the same as 'index', but the numbers represent + # the median character of each occurrence; + # - xalign: another index file, used by the text alignment + # module. Each line is made of 3 integers X Y Z + # followed by the content of the occurrence. X is + # the sentence number, starting from 1. Y and Z are + # the starting and ending positions of the + # occurrence in the sentence, given in characters; + # - merge: indicates to the function that it is supposed to + # produce a modified version of the text and save it + # in a file. The filename must be provided with the + # 'output' parameter. format: "text" - # 'script' describes the links format for 'glossanet' and 'script' - # output. For instance, if you use 'http://www.google.com/search?q=', - # you will obtain a HTML concordance file where occurrences are - # hyperlinks to Google queries. + # 'script' describes the links format for 'glossanet' and + # 'script' output. 
For instance, if you use + # 'http://www.google.com/search?q=', you will obtain a + # HTML concordance file where occurrences are hyperlinks to + # Google queries. script: null - # 'offsets' provides the file produced by tokenize’s output_offsets - # option (needed by the 'uima' and the 'prlg' format). + # 'offsets' provides the file produced by tokenize’s + # output_offsets option (needed by the 'uima' and the 'prlg' + # format). offsets: null - # 'unxmlize' provides the file produced by Unxmlize’s 'prlg' option - # (needed by the 'prlg' format). + # 'unxmlize' provides the file produced by Unxmlize’s 'prlg' + # option (needed by the 'prlg' format). unxmlize: null - # 'directory' indicates to the function that it must not work in the - # same directory than <index> but in 'directory' + # 'directory' indicates to the function that it must not work in + # the same directory than <index> but in 'directory'. directory: null - # If set to True, 'thai' indicates that the input text is in Thai - # language + # If set to True, 'thai' indicates that the input text is in + # Thai language. thai: False # Dico command (Unitex manual, p.272) dico: - # 'morpho' lists dictionaries to load in morphological mode, if needed - # by some .fst2 dictionaries. + # 'morpho' lists dictionaries to load in morphological mode, if + # needed by some .fst2 dictionaries. morpho: null - # If set to True, 'korean' indicates that the input text is in korean - # language. + # If set to True, 'korean' indicates that the input text is in + # korean language. korean: False - # If set to True, 'semitic' indicates that the input text is in a - # semitic language. + # If set to True, 'semitic' indicates that the input text is in + # a semitic language. semitic: False - # 'arabic_rules' specifies the Arabic typographic rule configuration - # file path. + # 'arabic_rules' specifies the Arabic typographic rule + # configuration file path. 
arabic_rules: null - # 'raw' specifies and alternative output file path containing both - # simple and compound words, without requiring a text directory. + # 'raw' specifies an alternative output file path containing + # both simple and compound words, without requiring a text + # directory. raw: null # Extract command (Unitex manual, p.277) extract: - # If set to True, 'non_matching_sentences' indicates to the function - # to extract all sentences that don’t contain matching units. + # If set to True, 'non_matching_sentences' indicates to the + # function to extract all sentences that don’t contain matching + # units. non_matching_sentences: False # Fst2Txt command (Unitex manual, p.280) fst2txt: - # If set to True, the search will start at any position in the text, - # even before a space. This parameter should only be used to carry out - # morphological searches. + # If set to True, the search will start at any position in the + # text, even before a space. This parameter should only be used + # to carry out morphological searches. start_on_space: False - # If set to True, the function will work in character by character - # tokenization mode. This is useful for languages like Thai. + # If set to True, the function will work in character by + # character tokenization mode. This is useful for languages like + # Thai. word_by_word: False - # If set to True, the function merge (instead of replace) transducer - # outputs with text inputs. + # If set to True, the function merges (instead of replacing) + # transducer outputs with text inputs. merge: True # Grf2Fst2 command (Unitex manual, p.280) @@ -266,26 +278,28 @@ tools: # If set to True, 'loop_check' enables error (loop) checking. loop_check: False - # If set to True, tokenization will be done character by character. + # If set to True, tokenization will be done character by + # character. char_by_char: False - # 'pkgdir' specifies the repository directory to use (see section - # 5.2.2, p.99). 
+ # 'pkgdir' specifies the repository directory to use (see + # section 5.2.2, p.99). pkgdir: null - # If set to True, no warning will be emitted when a graph matches the - # empty word. + # If set to True, no warning will be emitted when a graph + # matches the empty word. no_empty_graph_warning: False - # If set to True, the function checks wether the given graph can be - # considered as a valid sentence automaton or not. + # If set to True, the function checks whether the given graph can + # be considered as a valid sentence automaton or not. tfst_check: False # If set to True, the function does not print the graph names. silent_grf_name: True - # 'named_repository' must be a list of X=Y sequences, separated by ‘;’, - # where X is the name of the repository denoted by pathname Y. + # 'named_repository' must be a list of X=Y sequences, separated + # by ‘;’, where X is the name of the repository denoted by + # pathname Y. named_repository: null # If set to True, the graph is compiled in debug mode. @@ -297,95 +311,104 @@ tools: # Locate command (Unitex manual, p.283) locate: - # If set to True, the search will start at any position in the text, - # even before a space. This parameter should only be used to carry out - # morphological searches. + # If set to True, the search will start at any position in the + # text, even before a space. This parameter should only be used + # to carry out morphological searches. start_on_space: False - # If set to True, tokenization will be done character by character. + # If set to True, tokenization will be done character by + # character. char_by_char: False - # 'morpho' lists dictionaries to load in morphological mode, if needed - # by some .fst2 dictionaries. + # 'morpho' lists dictionaries to load in morphological mode, if + # needed by some .fst2 dictionaries. morpho: null - # If set to True, 'korean' indicates that the input text is in korean - # language. 
+ # If set to True, 'korean' indicates that the input text is in + # korean language. korean: False - # 'arabic_rules' specifies the Arabic typographic rule configuration - # file path. + # 'arabic_rules' specifies the Arabic typographic rule + # configuration file path. arabic_rules: null - # If not null, the function puts produced files in 'sntdir' instead - # of the text directory. Note that 'sntdir' must end with a file - # separator (\ or /). + # If not null, the function puts produced files in 'sntdir' + # instead of the text directory. Note that 'sntdir' must end + # with a file separator (\ or /). sntdir: null - # This parameter specifies the negation operator to be used in Locate - # patterns. The two legal values for X are 'minus' and 'tilde'. + # This parameter specifies the negation operator to be used in + # Locate patterns. The two legal values for X are 'minus' and + # 'tilde'. negation_operator: "tilde" # If not null, the function stops after the first N matches. By # default, the function searches for all matches. number_of_matches: null - # 'stop_token_count' is a list of two integers. If specified, the - # function will emit a warning after 'int_1' iterations on a token and - # stops after 'int_2' iterations. + # 'stop_token_count' is a list of two integers. If specified, + # the function will emit a warning after 'int_1' iterations on a + # token and stops after 'int_2' iterations. #stop_token_count=[3,5] stop_token_count: null - # Possible values for 'match_mode' are: 'longest', 'shortest' and 'all' + # Possible values for 'match_mode' are: 'longest', 'shortest' + # and 'all'. match_mode: "longest" # Possible values for 'output_mode' are: # - 'ignore': the transducer outputs will be ignored; - # - 'merge': the transducer outputs will be merged with the input - # text; - # - 'replace': the transducer outputs replaces the matching text. 
+ # - 'merge': the transducer outputs will be merged with the + # input text; + # - 'replace': the transducer outputs replace the matching + # text. output_mode: "merge" - # If set to True, this parameter enables special characters protection - # when 'merge' or 'replace' mode is used. This is useful when Locate is - # called by Dico in order to avoid producing bad lines like: + # If set to True, this parameter enables special characters + # protection when 'merge' or 'replace' mode is used. This is + # useful when Locate is called by Dico in order to avoid + # producing bad lines like: # 3,14,.PI.NUM protect_dic_chars: True - # If not null, this parameter must be a list of two strings, where: - # 'str_1' is a variable name whith content 'str_2'. + # If not null, this parameter must be a list of two strings, + # where: 'str_1' is a variable name with content 'str_2'. # NOTE: 'str_2' must be ASCII. variable: null # If set to True, the function allows the production of several - # matches with same input but different outputs. If False, in case of - # ambiguous outputs, one will be arbitrarily chosen and kept, - # depending on the internal state of the function. + # matches with same input but different outputs. If False, in + # case of ambiguous outputs, one will be arbitrarily chosen and + # kept, depending on the internal state of the function. ambiguous_outputs: True # Possible values are: - # - 'exit': kills the function if variable has an empty content; + # - 'exit': kills the function if variable has an empty + # content; # - 'ignore': ignore the errors; # - 'backtrack': stop the current path exploration. variable_error: "ignore" # Normalize command (Unitex manual, p.287) normalize: - # If set to True, every separator sequence will be turned into a single - # space. + # If set to True, every separator sequence will be turned into a + # single space. no_carriage_return: False - # 'input_offsets' specifies the base offset file path to be used. 
+ # 'input_offsets' specifies the base offset file path to be + # used. input_offsets: null - # 'output_offsets' specifies the offset file path to be produced. + # 'output_offsets' specifies the offset file path to be + # produced. output_offsets: null - # 'replacement_rules' specifies the normalization rule file to be used. + # 'replacement_rules' specifies the normalization rule file to + # be used. replacement_rules: null - # If set to True, the function only applies replacement rules specified - # with the 'replacement_rules' parameter. + # If set to True, the function only applies replacement rules + # specified with the 'replacement_rules' parameter. no_separator_normalization: False # SortTxt command (Unitex manual, p.291) @@ -397,54 +420,57 @@ tools: reverse: False # The 'sort_order' parameter specifies the file path of the - # 'Alphabet_sort.txt' file or any other file defining the alphabet - # order. + # 'Alphabet_sort.txt' file or any other file defining the + # alphabet order. sort_order: null - # If not null, the function backups the number of lines of the result - # file in the file specified by this parameter. + # If not null, the function backups the number of lines of the + # result file in the file specified by this parameter. line_info: null - # If set to True, 'thai' indicates that the input text is in Thai - # language + # If set to True, 'thai' indicates that the input text is in + # Thai language. 
thai: False - # If set to True, the function makes two entries XXX,YYY.ZZZ:A and - # XXX,YYY.ZZZ:B become a single entry: XXX,YYY.ZZZ:A:B + # If set to True, the function makes two entries X,Y.Z:A and + # X,Y.Z:B become a single entry: X,Y.Z:A:B factorize_inflectional_codes: False # Tokenize command (Unitex manual, p.294) tokenize: - # If set to True, the function is applied character by character, with - # the exceptions of the sentence delimiter {S}, the stop marker {STOP} - # and lexical tags like {today,.ADV} which are considered to be single - # units. + # If set to True, the function is applied character by + # character, with the exceptions of the sentence delimiter {S}, + # the stop marker {STOP} and lexical tags like {today,.ADV} + # which are considered to be single units. char_by_char: False - # 'tokens' specifies the path of the 'tokens.txt' file to loa and - # modify, instead of creating a new one from scratch. + # 'tokens' specifies the path of the 'tokens.txt' file to load + # and modify, instead of creating a new one from scratch. tokens: null - # 'input_offsets' specifies the base offset file path to be used. + # 'input_offsets' specifies the base offset file path to be + # used. input_offsets: null - # 'output_offsets' specifies the offset file path to be produced. + # 'output_offsets' specifies the offset file path to be + # produced. output_offsets: null # Txt2Tfst command (Unitex manual, p.296) txt2tfst: - # If set to True, 'clean' indicates whether the rule of conservation of - # the best paths (see section 7.2.4) should be applied. + # If set to True, 'clean' indicates whether the rule of + # conservation of the best paths (see section 7.2.4) should be + # applied. clean: False - # This parameter specifies the file path of a normalization grammar - # that is to be applied to the text automaton. + # This parameter specifies the file path of a normalization + # grammar that is to be applied to the text automaton. 
normalization_grammar: null - # This parameter specifies the Elag tagset file to use to normalize - # dictionary entries. + # This parameter specifies the Elag tagset file to use to + # normalize dictionary entries. tagset: null - # If set to True, 'korean' indicates that the input text is in korean - # language. + # If set to True, 'korean' indicates that the input text is + # in korean language. korean: False