Skip to content
Extraits de code Groupes Projets
Valider a1c97108 rédigé par Patrick Watrin's avatar Patrick Watrin
Parcourir les fichiers

PEP 0008 cosmetics

parent ef167363
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
# Do not modify this file. Use the 'build-config-file.py' script to generate a # Do not modify this file. Use the 'build-config-file.py' script to
# working version adapted to you local Unitex installation or copy this file # generate a working version adapted to you local Unitex installation
# before editing. # or copy this file before editing.
# The 'global' section contains the global configuration parameters. # The 'global' section contains the global configuration parameters.
global: global:
# There are 3 'debug' levels: # There are 3 'debug' levels:
# 0: the error output is disabled; # 0: the error output is disabled;
# 1: the error output is limited to the logging system implemented in the # 1: the error output is limited to the logging system implemented
# bindings; # in the bindings;
# 2: the error output is activated for both the bindings and the Unitex # 2: the error output is activated for both the bindings and the
# processor. # Unitex processor.
# NOTE: if you activate the debug for level >= 1, the verbose level is # NOTE: if you activate the debug for level >= 1, the verbose level
# automatically activated at level 2. # is automatically activated at level 2.
debug: 0 debug: 0
# There are 4 'verbose' levels: # There are 4 'verbose' levels:
# 0: the standard output is disabled; # 0: the standard output is disabled;
# 1: the standard output shows 'warnings' emitted by the bindings logging # 1: the standard output shows 'warnings' emitted by the bindings
# system; # logging system;
# 2: the standard output shows 'warnings' and various processing # 2: the standard output shows 'warnings' and various processing
# information emitted by the bindings logging system; # information emitted by the bindings logging system;
# 3: the full standard output is activated for both the bindings and the # 3: the full standard output is activated for both the bindings and
# Unitex processor. # the Unitex processor.
verbose: 0 verbose: 0
# If not 'null', the error and standard outputs are redirected to the file # If not 'null', the error and standard outputs are redirected to
# specified by this parameter. Be sure to have write access to this file. # the file specified by this parameter. Be sure to have write
# access to this file.
#log: /var/log/unitex.log #log: /var/log/unitex.log
log: null log: null
# If you are using the high-level 'Processor' class, this parameter # If you are using the high-level 'Processor' class, this parameter
# activate or deactivate the resource persistence. If persistency is # activate or deactivate the resource persistence. If persistency is
# activated, dictionaries, grammar and alphabet are loaded during the # activated, dictionaries, grammar and alphabet are loaded during
# object initialization and kept in memory in order to improve # the object initialization and kept in memory in order to improve
# performances. # performances.
# NOTE: you can manually activate the persistence by using the # NOTE: you can manually activate the persistence by using the
# 'load_persistent_X' functions from 'unitex.resources'. # 'load_persistent_X' functions from 'unitex.resources'.
persistence: True persistence: True
# The Unitex library implements a virtual filesystem which avoids a lot # The Unitex library implements a virtual filesystem which avoids a
# of I/O and improves the performance. If this parameter is set to True, # lot of I/O and improves the performance. If this parameter is set
# The high-level 'Processor' class will activate automatically this virtual # to True, the high-level 'Processor' class will activate
# filesystem. # automatically this virtual filesystem.
# NOTE: as for the persistence, you can activate manually the VFS by using # NOTE: as for the persistence, you can activate manually the VFS by
# the functions from 'unitex.io'. # using the functions from 'unitex.io'.
virtualization: True virtualization: True
# The 'resources' section is automatically filled by the 'build-config-file.py' # The 'resources' section is automatically filled by the
# script. If you want to do it manually, be sure to give the absolute path of # 'build-config-file.py' script. If you want to do it manually, be sure
# each resource as shown below. # to give the absolute path of each resource as shown below.
# NOTE: the 'dictionaries' parameter is a list of paths. As required by the YAML # NOTE: the 'dictionaries' parameter is a list of paths. As required by
# format, each item must be prefixed by the '-' character (cf. example). # the YAML format, each item must be prefixed by the '-' character
# resources: # (cf. example).
# language: fr #resources:
# alphabet: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Alphabet.txt # language: fr
# alphabet-sorted: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Alphabet_sort.txt # alphabet: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Alphabet.txt
# sentence: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Graphs/Preprocessing/Sentence/Sentence.fst2 # alphabet-sorted: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Alphabet_sort.txt
# replace: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Graphs/Preprocessing/Replace/Replace.fst2 # sentence: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Graphs/Preprocessing/Sentence/Sentence.fst2
# dictionaries: # replace: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Graphs/Preprocessing/Replace/Replace.fst2
# - /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/dela-fr-public.bin # dictionaries:
# - /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/ajouts80jours.bin # - /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/dela-fr-public.bin
# - /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/motsGramf-.bin # - /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/ajouts80jours.bin
# - /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/motsGramf-.bin
resources: resources:
language: null language: null
...@@ -71,25 +73,25 @@ resources: ...@@ -71,25 +73,25 @@ resources:
dictionaries: null dictionaries: null
# The 'tools' section can contain any of the arguments used by the unitex # The 'tools' section can contain any of the arguments used by the
# tools. # unitex tools.
# Most of the time, these parameters are the same as the ones used by the # Most of the time, these parameters are the same as the ones used by
# original Unitex tools (as described in the Unitex manual). Changes are # the original Unitex tools (as described in the Unitex manual). Changes
# explained in the comments of this file. # are explained in the comments of this file.
# NOTE: if you use the 'Processor' high-level class some parameters will be # NOTE: if you use the 'Processor' high-level class some parameters will
# overridden to fit the 'tag' functions behaviour. For instance, there is # be overridden to fit the 'tag' functions behaviour. For instance,
# no point to define a font or a context for 'concord'. # there is no point to define a font or a context for 'concord'.
# NOTE: ALL FILE PATH MUST BE ABSOLUTE!!! # NOTE: ALL FILE PATH MUST BE ABSOLUTE!!!
tools: tools:
# CheckDic command (Unitex manual, p.266) # CheckDic command (Unitex manual, p.266)
check_dic: check_dic:
# If set to True, the function will use a strict syntax checking # If set to True, the function will use a strict syntax checking
# against unprotected dot and comma. # against unprotected dot and comma.
strict: False strict: False
# If set to True, the function will tolerate spaces in grammatical, # If set to True, the function will tolerate spaces in
# semantic and inflectional codes. # grammatical, semantic and inflectional codes.
no_space_warning: False no_space_warning: False
# Compress command (Unitex manual, p.266) # Compress command (Unitex manual, p.266)
...@@ -98,9 +100,10 @@ tools: ...@@ -98,9 +100,10 @@ tools:
# produce a file xxx.bin. # produce a file xxx.bin.
output: null output: null
# If set to True, 'flip' indicates that the inflected and canonical # If set to True, 'flip' indicates that the inflected and
# forms should be swapped in the compressed dictionary. This option is # canonical forms should be swapped in the compressed
# used to construct an inverse dictionary. # dictionary. This option is used to construct an inverse
# dictionary.
flip: False flip: False
# If set to True, the function will use the semitic compression # If set to True, the function will use the semitic compression
...@@ -108,45 +111,48 @@ tools: ...@@ -108,45 +111,48 @@ tools:
semitic: False semitic: False
# 'version: v1' produces an old style .bin file. # 'version: v1' produces an old style .bin file.
# 'version: v2' produces a new style .bin file, with no file size # 'version: v2' produces a new style .bin file, with no file
# limitation to 16 Mb and a smaller resulting size. # size limitation to 16 Mb and a smaller resulting
# size.
version: v2 version: v2
# Concord command (Unitex manual, p.267) # Concord command (Unitex manual, p.267)
concord: concord:
# 'font' specifies the name of the font to use if the output is an # 'font' specifies the name of the font to use if the output is
# HTML file. # an HTML file.
#font: "Courier new" #font: "Courier new"
font: null font: null
# 'fontsize' specifies the font size to use if the output is an HTML # 'fontsize' specifies the font size to use if the output is an
# file. # HTML file.
#fontsize: 12 #fontsize: 12
fontsize: null fontsize: null
# If 'only_ambiguous' is set to True, the function will only displays # If 'only_ambiguous' is set to True, the function will only
# identical occurrences with ambiguous outputs, in text order. # displays identical occurrences with ambiguous outputs, in text
# order.
only_ambiguous: False only_ambiguous: False
# If 'only_matches' is set to True, the function will force empty right # If 'only_matches' is set to True, the function will force
# and left contexts. Moreover, if used with -t/–text, the function will # empty right and left contexts. Moreover, if used with 'text',
# not surround matches with tabulations. # the function will not surround matches with tabulations.
only_matches: False only_matches: False
# 'left' specifies the number of characters on the left of the # 'left' specifies the number of characters on the left of the
# occurrences. In Thai mode, this means the number of non-diacritic # occurrences. In Thai mode, this means the number of
# characters. For both 'left' and 'right' parameters, you can add the # non-diacritic characters. For both 'left' and 'right'
# 's' character to stop at the first {S} tag. For instance, if you set # parameters, you can add the 's' character to stop at the first
# '40s' for the left value, the left context will end at 40 characters # {S} tag. For instance, if you set '40s' for the left value,
# at most, less if the {S} tag is found before. # the left context will end at 40 characters at most, less if
# the {S} tag is found before.
# NOTE: the number must be quoted to avoid integer conversion. # NOTE: the number must be quoted to avoid integer conversion.
left: "0" left: "0"
# 'right' specifies the number of characters (non-diacritic ones in # 'right' specifies the number of characters (non-diacritic ones
# Thai mode) on the right of the occurrences. If the occurrence is # in Thai mode) on the right of the occurrences. If the
# shorter than this value, the concordance line is completed up to # occurrence is shorter than this value, the concordance line is
# right. If the occurrence is longer than the length defined by right, # completed up to right. If the occurrence is longer than the
# it is nevertheless saved as whole. # length defined by right, it is nevertheless saved as whole.
# NOTE: the number must be quoted to avoid integer conversion. # NOTE: the number must be quoted to avoid integer conversion.
right: "0" right: "0"
...@@ -164,101 +170,107 @@ tools: ...@@ -164,101 +170,107 @@ tools:
# 'format' specifies the output format. Possible values are: # 'format' specifies the output format. Possible values are:
# - html: produces a concordance in HTML format; # - html: produces a concordance in HTML format;
# - text: produces a concordance in text format; # - text: produces a concordance in text format;
# - glossanet: produces a concordance for GlossaNet in HTML format # - glossanet: produces a concordance for GlossaNet in HTML
# where occurrences are links described by the 'script' # format where occurrences are links described by
# parameter; # the 'script' parameter;
# - script: produces a HTML concordance file where occurrences are # - script: produces a HTML concordance file where occurrences
# links described by the 'script' parameter; # are links described by the 'script' parameter;
# - index: produces an index of the concordance, made of the content # - index: produces an index of the concordance, made of the
# of the occurrences (with the grammar outputs, if any), # content of the occurrences (with the grammar
# preceded by the positions of the occurrences in the text # outputs, if any), preceded by the positions of the
# file given in characters; # occurrences in the text file given in characters;
# - uima: produces an index of the concordance relative to the # - uima: produces an index of the concordance relative to the
# original text file, before any Unitex operation. The # original text file, before any Unitex operation. The
# 'offsets' parameter must be provided; # 'offsets' parameter must be provided;
# - prlg: produces a concordance for PRLG corpora where each line is # - prlg: produces a concordance for PRLG corpora where each
# prefixed by information extracted with Unxmlize’s 'prlg' # line is prefixed by information extracted with
# option. You must provide both the 'offsets' and the # Unxmlize’s 'prlg' option. You must provide both the
# 'unxmlize' parameter; # 'offsets' and the 'unxmlize' parameter;
# - xml: produces xml index of the concordance; # - xml: produces xml index of the concordance;
# - xml-with-header: produces an xml index of the concordance with # - xml-with-header: produces an xml index of the concordance
# full xml header; # with full xml header;
# - axis: quite the same as 'index', but the numbers represent the # - axis: quite the same as 'index', but the numbers represent
# median character of each occurrence; # the median character of each occurrence;
# - xalign: another index file, used by the text alignment module. # - xalign: another index file, used by the text alignment
# Each line is made of 3 integers X Y Z followed by the # module. Each line is made of 3 integers X Y Z
# content of the occurrence. X is the sentence number, # followed by the content of the occurrence. X is
# starting from 1. Y and Z are the starting and ending # the sentence number, starting from 1. Y and Z are
# positions of the occurrence in the sentence, given in # the starting and ending positions of the
# characters; # occurrence in the sentence, given in characters;
# - merge: indicates to the function that it is supposed to produce # - merge: indicates to the function that it is supposed to
# a modified version of the text and save it in a file. The # produce a modified version of the text and save it
# filename must be provided with the 'output' parameter. # in a file. The filename must be provided with the
# 'output' parameter.
format: "text" format: "text"
# 'script' describes the links format for 'glossanet' and 'script' # 'script' describes the links format for 'glossanet' and
# output. For instance, if you use 'http://www.google.com/search?q=', # 'script' output. For instance, if you use
# you will obtain a HTML concordance file where occurrences are # 'http://www.google.com/search?q=', you will obtain a
# hyperlinks to Google queries. # HTML concordance file where occurrences are hyperlinks to
# Google queries.
script: null script: null
# 'offsets' provides the file produced by tokenize’s output_offsets # 'offsets' provides the file produced by tokenize’s
# option (needed by the 'uima' and the 'prlg' format). # output_offsets option (needed by the 'uima' and the 'prlg'
# format).
offsets: null offsets: null
# 'unxmlize' provides the file produced by Unxmlize’s 'prlg' option # 'unxmlize' provides the file produced by Unxmlize’s 'prlg'
# (needed by the 'prlg' format). # option (needed by the 'prlg' format).
unxmlize: null unxmlize: null
# 'directory' indicates to the function that it must not work in the # 'directory' indicates to the function that it must not work in
# same directory than <index> but in 'directory' # the same directory than <index> but in 'directory'.
directory: null directory: null
# If set to True, 'thai' indicates that the input text is in Thai # If set to True, 'thai' indicates that the input text is in
# language # Thai language.
thai: False thai: False
# Dico command (Unitex manual, p.272) # Dico command (Unitex manual, p.272)
dico: dico:
# 'morpho' lists dictionaries to load in morphological mode, if needed # 'morpho' lists dictionaries to load in morphological mode, if
# by some .fst2 dictionaries. # needed by some .fst2 dictionaries.
morpho: null morpho: null
# If set to True, 'korean' indicates that the input text is in korean # If set to True, 'korean' indicates that the input text is in
# language. # korean language.
korean: False korean: False
# If set to True, 'semitic' indicates that the input text is in a # If set to True, 'semitic' indicates that the input text is in
# semitic language. # a semitic language.
semitic: False semitic: False
# 'arabic_rules' specifies the Arabic typographic rule configuration # 'arabic_rules' specifies the Arabic typographic rule
# file path. # configuration file path.
arabic_rules: null arabic_rules: null
# 'raw' specifies an alternative output file path containing both # 'raw' specifies an alternative output file path containing
# simple and compound words, without requiring a text directory. # both simple and compound words, without requiring a text
# directory.
raw: null raw: null
# Extract command (Unitex manual, p.277) # Extract command (Unitex manual, p.277)
extract: extract:
# If set to True, 'non_matching_sentences' indicates to the function # If set to True, 'non_matching_sentences' indicates to the
# to extract all sentences that don’t contain matching units. # function to extract all sentences that don’t contain matching
# units.
non_matching_sentences: False non_matching_sentences: False
# Fst2Txt command (Unitex manual, p.280) # Fst2Txt command (Unitex manual, p.280)
fst2txt: fst2txt:
# If set to True, the search will start at any position in the text, # If set to True, the search will start at any position in the
# even before a space. This parameter should only be used to carry out # text, even before a space. This parameter should only be used
# morphological searches. # to carry out morphological searches.
start_on_space: False start_on_space: False
# If set to True, the function will work in character by character # If set to True, the function will work in character by
# tokenization mode. This is useful for languages like Thai. # character tokenization mode. This is useful for languages like
# Thai.
word_by_word: False word_by_word: False
# If set to True, the function merge (instead of replace) transducer # If set to True, the function merge (instead of replace)
# outputs with text inputs. # transducer outputs with text inputs.
merge: True merge: True
# Grf2Fst2 command (Unitex manual, p.280) # Grf2Fst2 command (Unitex manual, p.280)
...@@ -266,26 +278,28 @@ tools: ...@@ -266,26 +278,28 @@ tools:
# If set to True, 'loop_check' enables error (loop) checking. # If set to True, 'loop_check' enables error (loop) checking.
loop_check: False loop_check: False
# If set to True, tokenization will be done character by character. # If set to True, tokenization will be done character by
# character.
char_by_char: False char_by_char: False
# 'pkgdir' specifies the repository directory to use (see section # 'pkgdir' specifies the repository directory to use (see
# 5.2.2, p.99). # section 5.2.2, p.99).
pkgdir: null pkgdir: null
# If set to True, no warning will be emitted when a graph matches the # If set to True, no warning will be emitted when a graph
# empty word. # matches the empty word.
no_empty_graph_warning: False no_empty_graph_warning: False
# If set to True, the function checks whether the given graph can be # If set to True, the function checks whether the given graph can
# considered as a valid sentence automaton or not. # be considered as a valid sentence automaton or not.
tfst_check: False tfst_check: False
# If set to True, the function does not print the graph names. # If set to True, the function does not print the graph names.
silent_grf_name: True silent_grf_name: True
# 'named_repository' must be a list of X=Y sequences, separated by ‘;’, # 'named_repository' must be a list of X=Y sequences, separated
# where X is the name of the repository denoted by pathname Y. # by ‘;’, where X is the name of the repository denoted by
# pathname Y.
named_repository: null named_repository: null
# If set to True, the graph is compiled in debug mode. # If set to True, the graph is compiled in debug mode.
...@@ -297,95 +311,104 @@ tools: ...@@ -297,95 +311,104 @@ tools:
# Locate command (Unitex manual, p.283) # Locate command (Unitex manual, p.283)
locate: locate:
# If set to True, the search will start at any position in the text, # If set to True, the search will start at any position in the
# even before a space. This parameter should only be used to carry out # text, even before a space. This parameter should only be used
# morphological searches. # to carry out morphological searches.
start_on_space: False start_on_space: False
# If set to True, tokenization will be done character by character. # If set to True, tokenization will be done character by
# character.
char_by_char: False char_by_char: False
# 'morpho' lists dictionaries to load in morphological mode, if needed # 'morpho' lists dictionaries to load in morphological mode, if
# by some .fst2 dictionaries. # needed by some .fst2 dictionaries.
morpho: null morpho: null
# If set to True, 'korean' indicates that the input text is in korean # If set to True, 'korean' indicates that the input text is in
# language. # korean language.
korean: False korean: False
# 'arabic_rules' specifies the Arabic typographic rule configuration # 'arabic_rules' specifies the Arabic typographic rule
# file path. # configuration file path.
arabic_rules: null arabic_rules: null
# If not null, the function puts produced files in 'sntdir' instead # If not null, the function puts produced files in 'sntdir'
# of the text directory. Note that 'sntdir' must end with a file # instead of the text directory. Note that 'sntdir' must end
# separator (\ or /). # with a file separator (\ or /).
sntdir: null sntdir: null
# This parameter specifies the negation operator to be used in Locate # This parameter specifies the negation operator to be used in
# patterns. The two legal values for X are 'minus' and 'tilde'. # Locate patterns. The two legal values for X are 'minus' and
# 'tilde'.
negation_operator: "tilde" negation_operator: "tilde"
# If not null, the function stops after the first N matches. By # If not null, the function stops after the first N matches. By
# default, the function searches for all matches. # default, the function searches for all matches.
number_of_matches: null number_of_matches: null
# 'stop_token_count' is a list of two integers. If specified, the # 'stop_token_count' is a list of two integers. If specified,
# function will emit a warning after 'int_1' iterations on a token and # the function will emit a warning after 'int_1' iterations on a
# stops after 'int_2' iterations. # token and stops after 'int_2' iterations.
#stop_token_count=[3,5] #stop_token_count=[3,5]
stop_token_count: null stop_token_count: null
# Possible values for 'match_mode' are: 'longest', 'shortest' and 'all' # Possible values for 'match_mode' are: 'longest', 'shortest'
# and 'all'.
match_mode: "longest" match_mode: "longest"
# Possible values for 'output_mode' are: # Possible values for 'output_mode' are:
# - 'ignore': the transducer outputs will be ignored; # - 'ignore': the transducer outputs will be ignored;
# - 'merge': the transducer outputs will be merged with the input # - 'merge': the transducer outputs will be merged with the
# text; # input text;
# - 'replace': the transducer outputs replaces the matching text. # - 'replace': the transducer outputs replaces the matching
# text.
output_mode: "merge" output_mode: "merge"
# If set to True, this parameter enables special characters protection # If set to True, this parameter enables special characters
# when 'merge' or 'replace' mode is used. This is useful when Locate is # protection when 'merge' or 'replace' mode is used. This is
# called by Dico in order to avoid producing bad lines like: # useful when Locate is called by Dico in order to avoid
# producing bad lines like:
# 3,14,.PI.NUM # 3,14,.PI.NUM
protect_dic_chars: True protect_dic_chars: True
# If not null, this parameter must be a list of two strings, where: # If not null, this parameter must be a list of two strings,
# 'str_1' is a variable name with content 'str_2'. # where: 'str_1' is a variable name with content 'str_2'.
# NOTE: 'str_2' must be ASCII. # NOTE: 'str_2' must be ASCII.
variable: null variable: null
# If set to True, the function allows the production of several # If set to True, the function allows the production of several
# matches with same input but different outputs. If False, in case of # matches with same input but different outputs. If False, in
# ambiguous outputs, one will be arbitrarily chosen and kept, # case of ambiguous outputs, one will be arbitrarily chosen and
# depending on the internal state of the function. # kept, depending on the internal state of the function.
ambiguous_outputs: True ambiguous_outputs: True
# Possible values are: # Possible values are:
# - 'exit': kills the function if variable has an empty content; # - 'exit': kills the function if variable has an empty
# content;
# - 'ignore': ignore the errors; # - 'ignore': ignore the errors;
# - 'backtrack': stop the current path exploration. # - 'backtrack': stop the current path exploration.
variable_error: "ignore" variable_error: "ignore"
# Normalize command (Unitex manual, p.287) # Normalize command (Unitex manual, p.287)
normalize: normalize:
# If set to True, every separator sequence will be turned into a single # If set to True, every separator sequence will be turned into a
# space. # single space.
no_carriage_return: False no_carriage_return: False
# 'input_offsets' specifies the base offset file path to be used. # 'input_offsets' specifies the base offset file path to be
# used.
input_offsets: null input_offsets: null
# 'output_offsets' specifies the offset file path to be produced. # 'output_offsets' specifies the offset file path to be
# produced.
output_offsets: null output_offsets: null
# 'replacement_rules' specifies the normalization rule file to be used. # 'replacement_rules' specifies the normalization rule file to
# be used.
replacement_rules: null replacement_rules: null
# If set to True, the function only applies replacement rules specified # If set to True, the function only applies replacement rules
# with the 'replacement_rules' parameter. # specified with the 'replacement_rules' parameter.
no_separator_normalization: False no_separator_normalization: False
# SortTxt command (Unitex manual, p.291) # SortTxt command (Unitex manual, p.291)
...@@ -397,54 +420,57 @@ tools: ...@@ -397,54 +420,57 @@ tools:
reverse: False reverse: False
# The 'sort_order' parameter specifies the file path of the # The 'sort_order' parameter specifies the file path of the
# 'Alphabet_sort.txt' file or any other file defining the alphabet # 'Alphabet_sort.txt' file or any other file defining the
# order. # alphabet order.
sort_order: null sort_order: null
# If not null, the function backups the number of lines of the result # If not null, the function backups the number of lines of the
# file in the file specified by this parameter. # result file in the file specified by this parameter.
line_info: null line_info: null
# If set to True, 'thai' indicates that the input text is in Thai # If set to True, 'thai' indicates that the input text is in
# language # Thai language.
thai: False thai: False
# If set to True, the function makes two entries XXX,YYY.ZZZ:A and # If set to True, the function makes two entries X,Y.Z:A and
# XXX,YYY.ZZZ:B become a single entry: XXX,YYY.ZZZ:A:B # X,Y.Z:B become a single entry: X,Y.Z:A:B
factorize_inflectional_codes: False factorize_inflectional_codes: False
# Tokenize command (Unitex manual, p.294) # Tokenize command (Unitex manual, p.294)
tokenize: tokenize:
# If set to True, the function is applied character by character, with # If set to True, the function is applied character by
# the exceptions of the sentence delimiter {S}, the stop marker {STOP} # character, with the exceptions of the sentence delimiter {S},
# and lexical tags like {today,.ADV} which are considered to be single # the stop marker {STOP} and lexical tags like {today,.ADV}
# units. # which are considered to be single units.
char_by_char: False char_by_char: False
# 'tokens' specifies the path of the 'tokens.txt' file to load and # 'tokens' specifies the path of the 'tokens.txt' file to load
# modify, instead of creating a new one from scratch. # and modify, instead of creating a new one from scratch.
tokens: null tokens: null
# 'input_offsets' specifies the base offset file path to be used. # 'input_offsets' specifies the base offset file path to be
# used.
input_offsets: null input_offsets: null
# 'output_offsets' specifies the offset file path to be produced. # 'output_offsets' specifies the offset file path to be
# produced.
output_offsets: null output_offsets: null
# Txt2Tfst command (Unitex manual, p.296) # Txt2Tfst command (Unitex manual, p.296)
txt2tfst: txt2tfst:
# If set to True, 'clean' indicates whether the rule of conservation of # If set to True, 'clean' indicates whether the rule of
# the best paths (see section 7.2.4) should be applied. # conservation of the best paths (see section 7.2.4) should be
# applied.
clean: False clean: False
# This parameter specifies the file path of a normalization grammar # This parameter specifies the file path of a normalization
# that is to be applied to the text automaton. # grammar that is to be applied to the text automaton.
normalization_grammar: null normalization_grammar: null
# This parameter specifies the Elag tagset file to use to normalize # This parameter specifies the Elag tagset file to use to
# dictionary entries. # normalize dictionary entries.
tagset: null tagset: null
# If set to True, 'korean' indicates that the input text is in korean # If set to True, 'korean' indicates that the input text is
# language. # in korean language.
korean: False korean: False
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous pour commenter