Skip to content
Extraits de code Groupes Projets
Valider a1c97108 rédigé par Patrick Watrin's avatar Patrick Watrin
Parcourir les fichiers

PEP 0008 cosmetics

parent ef167363
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
# Do not modify this file. Use the 'build-config-file.py' script to generate a
# working version adapted to your local Unitex installation or copy this file
# before editing.
# Do not modify this file. Use the 'build-config-file.py' script to
# generate a working version adapted to your local Unitex installation
# or copy this file before editing.
# The 'global' section contains the global configuration parameters.
global:
# There are 3 'debug' levels:
# 0: the error output is disabled;
# 1: the error output is limited to the logging system implemented in the
# bindings;
# 2: the error output is activated for both the bindings and the Unitex
# processor.
# NOTE: if you activate the debug for level >= 1, the verbose level is
# automatically activated at level 2.
# 1: the error output is limited to the logging system implemented
# in the bindings;
# 2: the error output is activated for both the bindings and the
# Unitex processor.
# NOTE: if you activate the debug for level >= 1, the verbose level
# is automatically activated at level 2.
debug: 0
# There are 4 'verbose' levels:
# 0: the standard output is disabled;
# 1: the standard output shows 'warnings' emitted by the bindings logging
# system;
# 1: the standard output shows 'warnings' emitted by the bindings
# logging system;
# 2: the standard output shows 'warnings' and various processing
# information emitted by the bindings logging system;
# 3: the full standard output is activated for both the bindings and the
# Unitex processor.
# 3: the full standard output is activated for both the bindings and
# the Unitex processor.
verbose: 0
# If not 'null', the error and standard outputs are redirected to the file
# specified by this parameter. Be sure to have write access to this file.
# If not 'null', the error and standard outputs are redirected to
# the file specified by this parameter. Be sure to have write
# access to this file.
#log: /var/log/unitex.log
log: null
# If you are using the high-level 'Processor' class, this parameter
# activates or deactivates the resource persistence. If persistence is
# activated, dictionaries, grammar and alphabet are loaded during the
# object initialization and kept in memory in order to improve
# activated, dictionaries, grammar and alphabet are loaded during
# the object initialization and kept in memory in order to improve
# performances.
# NOTE: you can manually activate the persistence by using the
# 'load_persistent_X' functions from 'unitex.resources'.
persistence: True
# The Unitex library implements a virtual filesystem which avoids a lot
# of I/O and improves the performance. If this parameter is set to True,
# The high-level 'Processor' class will activate automatically this virtual
# filesystem.
# NOTE: as for the persistence, you can activate manually the VFS by using
# the functions from 'unitex.io'.
# The Unitex library implements a virtual filesystem which avoids a
# lot of I/O and improves the performance. If this parameter is set
# to True, the high-level 'Processor' class will activate
# automatically this virtual filesystem.
# NOTE: as for the persistence, you can activate manually the VFS by
# using the functions from 'unitex.io'.
virtualization: True
# The 'resources' section is automatically filled by the 'build-config-file.py'
# script. If you want to do it manually, be sure to give the absolute path of
# each resource as shown below.
# NOTE: the 'dictionaries' parameter is a list of paths. As required by the YAML
# format, each item must be prefixed by the '-' character (cf. example).
# resources:
# language: fr
# alphabet: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Alphabet.txt
# alphabet-sorted: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Alphabet_sort.txt
# sentence: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Graphs/Preprocessing/Sentence/Sentence.fst2
# replace: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Graphs/Preprocessing/Replace/Replace.fst2
# dictionaries:
# - /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/dela-fr-public.bin
# - /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/ajouts80jours.bin
# - /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/motsGramf-.bin
# The 'resources' section is automatically filled by the
# 'build-config-file.py' script. If you want to do it manually, be sure
# to give the absolute path of each resource as shown below.
# NOTE: the 'dictionaries' parameter is a list of paths. As required by
# the YAML format, each item must be prefixed by the '-' character
# (cf. example).
#resources:
# language: fr
# alphabet: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Alphabet.txt
# alphabet-sorted: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Alphabet_sort.txt
# sentence: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Graphs/Preprocessing/Sentence/Sentence.fst2
# replace: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Graphs/Preprocessing/Replace/Replace.fst2
# dictionaries:
# - /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/dela-fr-public.bin
# - /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/ajouts80jours.bin
# - /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/motsGramf-.bin
resources:
language: null
......@@ -71,25 +73,25 @@ resources:
dictionaries: null
# The 'tools' section can contain any of the arguments used by the unitex
# tools.
# Most of the time, these parameters are the same as the ones used by the
# original Unitex tools (as described in the Unitex manual). Changes are
# explained in the comments of this file.
# NOTE: if you use the 'Processor' high-level class some parameters will be
# overridden to fit the 'tag' functions behaviour. For instance, there is
# no point to define a font or a context for 'concord'.
# The 'tools' section can contain any of the arguments used by the
# unitex tools.
# Most of the time, these parameters are the same as the ones used by
# the original Unitex tools (as described in the Unitex manual). Changes
# are explained in the comments of this file.
# NOTE: if you use the 'Processor' high-level class some parameters will
# be overridden to fit the 'tag' functions behaviour. For instance,
# there is no point to define a font or a context for 'concord'.
# NOTE: ALL FILE PATH MUST BE ABSOLUTE!!!
tools:
# CheckDic command (Unitex manual, p.266)
check_dic:
# If set to True, the function will use a strict syntax checking
# If set to True, the function will use a strict syntax checking
# against unprotected dot and comma.
strict: False
# If set to True, the function will tolerate spaces in grammatical,
# semantic and inflectional codes.
# If set to True, the function will tolerate spaces in
# grammatical, semantic and inflectional codes.
no_space_warning: False
# Compress command (Unitex manual, p.266)
......@@ -98,9 +100,10 @@ tools:
# produce a file xxx.bin.
output: null
# If set to True, 'flip' indicates that the inflected and canonical
# forms should be swapped in the compressed dictionary. This option is
# used to construct an inverse dictionary.
# If set to True, 'flip' indicates that the inflected and
# canonical forms should be swapped in the compressed
# dictionary. This option is used to construct an inverse
# dictionary.
flip: False
# If set to True, the function will use the semitic compression
......@@ -108,45 +111,48 @@ tools:
semitic: False
# 'version: v1' produces an old style .bin file.
# 'version: v2' produces a new style .bin file, with no file size
# limitation to 16 Mb and a smaller resulting size.
# 'version: v2' produces a new style .bin file, with no file
# size limitation to 16 Mb and a smaller resulting
# size.
version: v2
# Concord command (Unitex manual, p.267)
concord:
# 'font' specifies the name of the font to use if the output is an
# HTML file.
# 'font' specifies the name of the font to use if the output is
# an HTML file.
#font: "Courier new"
font: null
# 'fontsize' specifies the font size to use if the output is an HTML
# file.
# 'fontsize' specifies the font size to use if the output is an
# HTML file.
#fontsize: 12
fontsize: null
# If 'only_ambiguous' is set to True, the function will only display
# identical occurrences with ambiguous outputs, in text order.
# If 'only_ambiguous' is set to True, the function will only
# display identical occurrences with ambiguous outputs, in text
# order.
only_ambiguous: False
# If 'only_matches' is set to True, the function will force empty right
# and left contexts. Moreover, if used with -t/–text, the function will
# not surround matches with tabulations.
# If 'only_matches' is set to True, the function will force
# empty right and left contexts. Moreover, if used with 'text',
# the function will not surround matches with tabulations.
only_matches: False
# 'left' specifies the number of characters on the left of the
# occurrences. In Thai mode, this means the number of non-diacritic
# characters. For both 'left' and 'right' parameters, you can add the
# 's' character to stop at the first {S} tag. For instance, if you set
# '40s' for the left value, the left context will end at 40 characters
# at most, less if the {S} tag is found before.
# occurrences. In Thai mode, this means the number of
# non-diacritic characters. For both 'left' and 'right'
# parameters, you can add the 's' character to stop at the first
# {S} tag. For instance, if you set '40s' for the left value,
# the left context will end at 40 characters at most, less if
# the {S} tag is found before.
# NOTE: the number must be quoted to avoid integer conversion.
left: "0"
# 'right' specifies the number of characters (non-diacritic ones in
# Thai mode) on the right of the occurrences. If the occurrence is
# shorter than this value, the concordance line is completed up to
# right. If the occurrence is longer than the length defined by right,
# it is nevertheless saved as whole.
# 'right' specifies the number of characters (non-diacritic ones
# in Thai mode) on the right of the occurrences. If the
# occurrence is shorter than this value, the concordance line is
# completed up to right. If the occurrence is longer than the
# length defined by right, it is nevertheless saved as whole.
# NOTE: the number must be quoted to avoid integer conversion.
right: "0"
......@@ -164,101 +170,107 @@ tools:
# 'format' specifies the output format. Possible values are:
# - html: produces a concordance in HTML format;
# - text: produces a concordance in text format;
# - glossanet: produces a concordance for GlossaNet in HTML format
# where occurrences are links described by the 'script'
# parameter;
# - script: produces a HTML concordance file where occurrences are
# links described by the 'script' parameter;
# - index: produces an index of the concordance, made of the content
# of the occurrences (with the grammar outputs, if any),
# preceded by the positions of the occurrences in the text
# file given in characters;
# - glossanet: produces a concordance for GlossaNet in HTML
# format where occurrences are links described by
# the 'script' parameter;
# - script: produces a HTML concordance file where occurrences
# are links described by the 'script' parameter;
# - index: produces an index of the concordance, made of the
# content of the occurrences (with the grammar
# outputs, if any), preceded by the positions of the
# occurrences in the text file given in characters;
# - uima: produces an index of the concordance relative to the
# original text file, before any Unitex operation. The
# 'offsets' parameter must be provided;
# - prlg: produces a concordance for PRLG corpora where each line is
# prefixed by information extracted with Unxmlize’s 'prlg'
# option. You must provide both the 'offsets' and the
# 'unxmlize' parameter;
# - prlg: produces a concordance for PRLG corpora where each
# line is prefixed by information extracted with
# Unxmlize’s 'prlg' option. You must provide both the
# 'offsets' and the 'unxmlize' parameter;
# - xml: produces xml index of the concordance;
# - xml-with-header: produces an xml index of the concordance with
# full xml header;
# - axis: quite the same as 'index', but the numbers represent the
# median character of each occurrence;
# - xalign: another index file, used by the text alignment module.
# Each line is made of 3 integers X Y Z followed by the
# content of the occurrence. X is the sentence number,
# starting from 1. Y and Z are the starting and ending
# positions of the occurrence in the sentence, given in
# characters;
# - merge: indicates to the function that it is supposed to produce
# a modified version of the text and save it in a file. The
# filename must be provided with the 'output' parameter.
# - xml-with-header: produces an xml index of the concordance
# with full xml header;
# - axis: quite the same as 'index', but the numbers represent
# the median character of each occurrence;
# - xalign: another index file, used by the text alignment
# module. Each line is made of 3 integers X Y Z
# followed by the content of the occurrence. X is
# the sentence number, starting from 1. Y and Z are
# the starting and ending positions of the
# occurrence in the sentence, given in characters;
# - merge: indicates to the function that it is supposed to
# produce a modified version of the text and save it
# in a file. The filename must be provided with the
# 'output' parameter.
format: "text"
# 'script' describes the links format for 'glossanet' and 'script'
# output. For instance, if you use 'http://www.google.com/search?q=',
# you will obtain a HTML concordance file where occurrences are
# hyperlinks to Google queries.
# 'script' describes the links format for 'glossanet' and
# 'script' output. For instance, if you use
# 'http://www.google.com/search?q=', you will obtain a
# HTML concordance file where occurrences are hyperlinks to
# Google queries.
script: null
# 'offsets' provides the file produced by tokenize’s output_offsets
# option (needed by the 'uima' and the 'prlg' format).
# 'offsets' provides the file produced by tokenize’s
# output_offsets option (needed by the 'uima' and the 'prlg'
# format).
offsets: null
# 'unxmlize' provides the file produced by Unxmlize’s 'prlg' option
# (needed by the 'prlg' format).
# 'unxmlize' provides the file produced by Unxmlize’s 'prlg'
# option (needed by the 'prlg' format).
unxmlize: null
# 'directory' indicates to the function that it must not work in the
# same directory than <index> but in 'directory'
# 'directory' indicates to the function that it must not work in
# the same directory than <index> but in 'directory'.
directory: null
# If set to True, 'thai' indicates that the input text is in Thai
# language
# If set to True, 'thai' indicates that the input text is in
# Thai language.
thai: False
# Dico command (Unitex manual, p.272)
dico:
# 'morpho' lists dictionaries to load in morphological mode, if needed
# by some .fst2 dictionaries.
# 'morpho' lists dictionaries to load in morphological mode, if
# needed by some .fst2 dictionaries.
morpho: null
# If set to True, 'korean' indicates that the input text is in korean
# language.
# If set to True, 'korean' indicates that the input text is in
# korean language.
korean: False
# If set to True, 'semitic' indicates that the input text is in a
# semitic language.
# If set to True, 'semitic' indicates that the input text is in
# a semitic language.
semitic: False
# 'arabic_rules' specifies the Arabic typographic rule configuration
# file path.
# 'arabic_rules' specifies the Arabic typographic rule
# configuration file path.
arabic_rules: null
# 'raw' specifies an alternative output file path containing both
# simple and compound words, without requiring a text directory.
# 'raw' specifies an alternative output file path containing
# both simple and compound words, without requiring a text
# directory.
raw: null
# Extract command (Unitex manual, p.277)
extract:
# If set to True, 'non_matching_sentences' indicates to the function
# to extract all sentences that don’t contain matching units.
# If set to True, 'non_matching_sentences' indicates to the
# function to extract all sentences that don’t contain matching
# units.
non_matching_sentences: False
# Fst2Txt command (Unitex manual, p.280)
fst2txt:
# If set to True, the search will start at any position in the text,
# even before a space. This parameter should only be used to carry out
# morphological searches.
# If set to True, the search will start at any position in the
# text, even before a space. This parameter should only be used
# to carry out morphological searches.
start_on_space: False
# If set to True, the function will work in character by character
# tokenization mode. This is useful for languages like Thai.
# If set to True, the function will work in character by
# character tokenization mode. This is useful for languages like
# Thai.
word_by_word: False
# If set to True, the function merge (instead of replace) transducer
# outputs with text inputs.
# If set to True, the function merge (instead of replace)
# transducer outputs with text inputs.
merge: True
# Grf2Fst2 command (Unitex manual, p.280)
......@@ -266,26 +278,28 @@ tools:
# If set to True, 'loop_check' enables error (loop) checking.
loop_check: False
# If set to True, tokenization will be done character by character.
# If set to True, tokenization will be done character by
# character.
char_by_char: False
# 'pkgdir' specifies the repository directory to use (see section
# 5.2.2, p.99).
# 'pkgdir' specifies the repository directory to use (see
# section 5.2.2, p.99).
pkgdir: null
# If set to True, no warning will be emitted when a graph matches the
# empty word.
# If set to True, no warning will be emitted when a graph
# matches the empty word.
no_empty_graph_warning: False
# If set to True, the function checks whether the given graph can be
# considered as a valid sentence automaton or not.
# If set to True, the function checks whether the given graph can
# be considered as a valid sentence automaton or not.
tfst_check: False
# If set to True, the function does not print the graph names.
silent_grf_name: True
# 'named_repository' must be a list of X=Y sequences, separated by ‘;’,
# where X is the name of the repository denoted by pathname Y.
# 'named_repository' must be a list of X=Y sequences, separated
# by ‘;’, where X is the name of the repository denoted by
# pathname Y.
named_repository: null
# If set to True, the graph is compiled in debug mode.
......@@ -297,95 +311,104 @@ tools:
# Locate command (Unitex manual, p.283)
locate:
# If set to True, the search will start at any position in the text,
# even before a space. This parameter should only be used to carry out
# morphological searches.
# If set to True, the search will start at any position in the
# text, even before a space. This parameter should only be used
# to carry out morphological searches.
start_on_space: False
# If set to True, tokenization will be done character by character.
# If set to True, tokenization will be done character by
# character.
char_by_char: False
# 'morpho' lists dictionaries to load in morphological mode, if needed
# by some .fst2 dictionaries.
# 'morpho' lists dictionaries to load in morphological mode, if
# needed by some .fst2 dictionaries.
morpho: null
# If set to True, 'korean' indicates that the input text is in korean
# language.
# If set to True, 'korean' indicates that the input text is in
# korean language.
korean: False
# 'arabic_rules' specifies the Arabic typographic rule configuration
# file path.
# 'arabic_rules' specifies the Arabic typographic rule
# configuration file path.
arabic_rules: null
# If not null, the function puts produced files in 'sntdir' instead
# of the text directory. Note that 'sntdir' must end with a file
# separator (\ or /).
# If not null, the function puts produced files in 'sntdir'
# instead of the text directory. Note that 'sntdir' must end
# with a file separator (\ or /).
sntdir: null
# This parameter specifies the negation operator to be used in Locate
# patterns. The two legal values for X are 'minus' and 'tilde'.
# This parameter specifies the negation operator to be used in
# Locate patterns. The two legal values for X are 'minus' and
# 'tilde'.
negation_operator: "tilde"
# If not null, the function stops after the first N matches. By
# default, the function searches for all matches.
number_of_matches: null
# 'stop_token_count' is a list of two integers. If specified, the
# function will emit a warning after 'int_1' iterations on a token and
# stops after 'int_2' iterations.
# 'stop_token_count' is a list of two integers. If specified,
# the function will emit a warning after 'int_1' iterations on a
# token and stops after 'int_2' iterations.
#stop_token_count=[3,5]
stop_token_count: null
# Possible values for 'match_mode' are: 'longest', 'shortest' and 'all'
# Possible values for 'match_mode' are: 'longest', 'shortest'
# and 'all'.
match_mode: "longest"
# Possible values for 'output_mode' are:
# - 'ignore': the transducer outputs will be ignored;
# - 'merge': the transducer outputs will be merged with the input
# text;
# - 'replace': the transducer outputs replaces the matching text.
# - 'merge': the transducer outputs will be merged with the
# input text;
# - 'replace': the transducer outputs replaces the matching
# text.
output_mode: "merge"
# If set to True, this parameter enables special characters protection
# when 'merge' or 'replace' mode is used. This is useful when Locate is
# called by Dico in order to avoid producing bad lines like:
# If set to True, this parameter enables special characters
# protection when 'merge' or 'replace' mode is used. This is
# useful when Locate is called by Dico in order to avoid
# producing bad lines like:
# 3,14,.PI.NUM
protect_dic_chars: True
# If not null, this parameter must be a list of two strings, where:
# 'str_1' is a variable name with content 'str_2'.
# If not null, this parameter must be a list of two strings,
# where: 'str_1' is a variable name with content 'str_2'.
# NOTE: 'str_2' must be ASCII.
variable: null
# If set to True, the function allows the production of several
# matches with same input but different outputs. If False, in case of
# ambiguous outputs, one will be arbitrarily chosen and kept,
# depending on the internal state of the function.
# matches with same input but different outputs. If False, in
# case of ambiguous outputs, one will be arbitrarily chosen and
# kept, depending on the internal state of the function.
ambiguous_outputs: True
# Possible values are:
# - 'exit': kills the function if variable has an empty content;
# - 'exit': kills the function if variable has an empty
# content;
# - 'ignore': ignore the errors;
# - 'backtrack': stop the current path exploration.
variable_error: "ignore"
# Normalize command (Unitex manual, p.287)
normalize:
# If set to True, every separator sequence will be turned into a single
# space.
# If set to True, every separator sequence will be turned into a
# single space.
no_carriage_return: False
# 'input_offsets' specifies the base offset file path to be used.
# 'input_offsets' specifies the base offset file path to be
# used.
input_offsets: null
# 'output_offsets' specifies the offset file path to be produced.
# 'output_offsets' specifies the offset file path to be
# produced.
output_offsets: null
# 'replacement_rules' specifies the normalization rule file to be used.
# 'replacement_rules' specifies the normalization rule file to
# be used.
replacement_rules: null
# If set to True, the function only applies replacement rules specified
# with the 'replacement_rules' parameter.
# If set to True, the function only applies replacement rules
# specified with the 'replacement_rules' parameter.
no_separator_normalization: False
# SortTxt command (Unitex manual, p.291)
......@@ -397,54 +420,57 @@ tools:
reverse: False
# The 'sort_order' parameter specifies the file path of the
# 'Alphabet_sort.txt' file or any other file defining the alphabet
# order.
# 'Alphabet_sort.txt' file or any other file defining the
# alphabet order.
sort_order: null
# If not null, the function backups the number of lines of the result
# file in the file specified by this parameter.
# If not null, the function backups the number of lines of the
# result file in the file specified by this parameter.
line_info: null
# If set to True, 'thai' indicates that the input text is in Thai
# language
# If set to True, 'thai' indicates that the input text is in
# Thai language.
thai: False
# If set to True, the function makes two entries XXX,YYY.ZZZ:A and
# XXX,YYY.ZZZ:B become a single entry: XXX,YYY.ZZZ:A:B
# If set to True, the function makes two entries X,Y.Z:A and
# X,Y.Z:B become a single entry: X,Y.Z:A:B
factorize_inflectional_codes: False
# Tokenize command (Unitex manual, p.294)
tokenize:
# If set to True, the function is applied character by character, with
# the exceptions of the sentence delimiter {S}, the stop marker {STOP}
# and lexical tags like {today,.ADV} which are considered to be single
# units.
# If set to True, the function is applied character by
# character, with the exceptions of the sentence delimiter {S},
# the stop marker {STOP} and lexical tags like {today,.ADV}
# which are considered to be single units.
char_by_char: False
# 'tokens' specifies the path of the 'tokens.txt' file to load and
# modify, instead of creating a new one from scratch.
# 'tokens' specifies the path of the 'tokens.txt' file to load
# and modify, instead of creating a new one from scratch.
tokens: null
# 'input_offsets' specifies the base offset file path to be used.
# 'input_offsets' specifies the base offset file path to be
# used.
input_offsets: null
# 'output_offsets' specifies the offset file path to be produced.
# 'output_offsets' specifies the offset file path to be
# produced.
output_offsets: null
# Txt2Tfst command (Unitex manual, p.296)
txt2tfst:
# If set to True, 'clean' indicates whether the rule of conservation of
# the best paths (see section 7.2.4) should be applied.
# If set to True, 'clean' indicates whether the rule of
# conservation of the best paths (see section 7.2.4) should be
# applied.
clean: False
# This parameter specifies the file path of a normalization grammar
# that is to be applied to the text automaton.
# This parameter specifies the file path of a normalization
# grammar that is to be applied to the text automaton.
normalization_grammar: null
# This parameter specifies the Elag tagset file to use to normalize
# dictionary entries.
# This parameter specifies the Elag tagset file to use to
# normalize dictionary entries.
tagset: null
# If set to True, 'korean' indicates that the input text is in korean
# language.
# If set to True, 'korean' indicates that the input text is
# in korean language.
korean: False
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous pour commenter