diff --git a/config/unitex-example.yaml b/config/unitex-example.yaml
index 60538783f3b1f6afdb38a14348c45c382bd2ba75..d616d596585e6d8e273dd0a97d5b8a8ef569a0fc 100644
--- a/config/unitex-example.yaml
+++ b/config/unitex-example.yaml
@@ -11,7 +11,7 @@ resources:
- /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/dela-fr-public.bin
- /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/ajouts80jours.bin
- /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/motsGramf-.bin
- language: null
+ language: fr
replace: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Graphs/Preprocessing/Replace/Replace.fst2
sentence: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Graphs/Preprocessing/Sentence/Sentence.fst2
tools:
diff --git a/config/unitex-template.yaml b/config/unitex-template.yaml
index 8a3484e1a96f733cdc0a716704e3010af06299fd..e422f9b1fc67a4ec2aae84cb030d5f73bfd140d9 100644
--- a/config/unitex-template.yaml
+++ b/config/unitex-template.yaml
@@ -1,11 +1,60 @@
+# Do not modify this file. Use the 'build-config-file.py' script to generate a
+# working version adapted to your local Unitex installation or copy this file
+# before editing.
+
+# The 'global' section contains the global configuration parameters.
global:
+ # There are 3 'debug' levels:
+ # 0: the error output is disabled;
+ # 1: the error output is limited to the logging system implemented in the
+ # bindings;
+ # 2: the error output is activated for both the bindings and the Unitex
+ # processor.
+ # NOTE: if you activate the debug for level >= 1, the verbose level is
+ # automatically activated at level 2.
debug: 0
+
+ # There are 4 'verbose' levels:
+ # 0: the standard output is disabled;
+ # 1: the standard output shows 'warnings' emitted by the bindings logging
+ # system;
+ # 2: the standard output shows 'warnings' and various other information
+ # emitted by the bindings logging system;
+ # 3: the standard output is activated for both the bindings and the Unitex
+ # processor.
verbose: 0
+
+ # If not 'null', the error and standard outputs are redirected to the file
+ # specified by this parameter.
+ #log: /var/log/unitex.log
log: null
+ # If you are using the high-level 'Processor' class, this parameter
+ # activates or deactivates the resource persistence. If persistence is
+ # activated, dictionaries, grammar and alphabet are loaded during the
+ # object initialization and kept in memory in order to improve
+ # performance.
persistence: True
+
+ # The Unitex library implements a virtual filesystem which avoids a lot
+ # of I/O and improves the performance. If this parameter is set to 'True',
+ # the high-level 'Processor' class will automatically activate this virtual
+ # filesystem.
virtualization: True
+# The 'resources' section is automatically filled by the 'build-config-file.py'
+# script. If you want to do it manually, be sure to give the absolute path of
+# each resource as shown below:
+# resources:
+# language: fr
+# alphabet: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Alphabet.txt
+# alphabet-sorted: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Alphabet_sort.txt
+# sentence: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Graphs/Preprocessing/Sentence/Sentence.fst2
+# replace: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Graphs/Preprocessing/Replace/Replace.fst2
+# dictionaries:
+# - /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/dela-fr-public.bin
+# - /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/ajouts80jours.bin
+# - /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/motsGramf-.bin
resources:
language: null
@@ -16,35 +65,126 @@ resources:
dictionaries: null
-# The 'tools' section can contain any of the argument used by the unitex tools
-# functions. Note that, if you use the 'Processor' high-level class some argument
-# could be overriden to fit the 'tag', 'extract' and 'search' functions
-# behaviour. For intance, there is no point to define a font or a context for
-# 'concord'.
+# The 'tools' section can contain any of the arguments used by the unitex tools.
+# Note that, if you use the 'Processor' high-level class, some parameters will
+# be overridden to fit the 'tag' function's behaviour. For instance, there is no
+# point in defining a font or a context for 'concord'.
tools:
+ # CheckDic command (Unitex manual, p.266)
check_dic:
+ # If set to True, Unitex will use a strict syntax checking against
+ # unprotected dot and comma
strict: False
+ # If set to 'True', 'no_space_warning' tells Unitex to tolerate spaces
+ # in grammatical, semantic and inflectional codes.
no_space_warning: False
+ # Compress command (Unitex manual, p.266)
compress:
+ # 'output' sets the output file. By default, a file xxx.dic will
+ # produce a file xxx.bin.
output: null
+ # If set to 'True', 'flip' indicates that the inflected and canonical
+ # forms should be swapped in the compressed dictionary. This option is
+ # used to construct an inverse dictionary.
flip: False
+ # If set to 'True', 'semitic' indicates that the semitic compression
+ # algorithm should be used. Setting this option with semitic languages
+ # like Arabic significantly reduces the size of the output dictionary.
semitic: False
+ # 'version: v1' produces an old style .bin file
+ # 'version: v2' produces a new style .bin file, with no file size
+ # limitation to 16 Mb and a smaller resulting size
version: "v2"
+ # Concord command (Unitex manual, p.267)
concord:
+ # 'font' specifies the name of the font to use if the output is an
+ # HTML file.
+ #font: "Courier new"
font: null
+ # 'fontsize' specifies the font size to use if the output is an HTML
+ # file.
+ #fontsize: 12
fontsize: null
+ # If 'only_ambiguous' is set to 'True', Unitex will only display
+ # identical occurrences with ambiguous outputs, in text order
only_ambiguous: False
+ # If 'only_matches' is set to 'True', Unitex will force empty right
+ # and left contexts. Moreover, if used with -t/--text, Concord will
+ # not surround matches with tabulations
only_matches: False
+ # 'left' specifies the number of characters on the left of the
+ # occurrences. In Thai mode, this means the number of non-diacritic
+ # characters. For both 'left' and 'right' parameters, you can add the
+ # 's' character to stop at the first {S} tag. For instance, if you set
+ # 40s for the left value, the left context will end at 40 characters at
+ # most, less if the {S} tag is found before.
left: "0"
+ # 'right' specifies the number of characters (non-diacritic ones in
+ # Thai mode) on the right of the occurrences (default=0). If the
+ # occurrence is shorter than this value, the concordance line is
+ # completed up to right. If the occurrence is longer than the length
+ # defined by right, it is nevertheless saved as whole.
right: "0"
+ # 'sort' specifies the sort order. Possible values are:
+ # - TO: text order
+ # - LC: first left context then center
+ # - LR: first left context then right context
+ # - CL: first center then left context
+ # - CR: first center then right context
+ # - RL: first right context then left context
+ # - RC: first right context then center
sort: "TO"
+ # 'format' specifies the output format. Possible values are:
+ # - html: produces a concordance in HTML format
+ # - text: produces a concordance in text format
+ # - glossanet: produces a concordance for GlossaNet in HTML format
+ # where occurrences are links described by the 'script'
+ # parameter
+ # - script: produces a HTML concordance file where occurrences are
+ # links described by the 'script' parameter
+ # - index: produces an index of the concordance, made of the content
+ # of the occurrences (with the grammar outputs, if any),
+ # preceded by the positions of the occurrences in the text
+ # file given in characters
+ # - uima: produces an index of the concordance relative to the
+ # original text file, before any Unitex operation. The
+ # 'offsets' parameter must be provided
+ # - prlg: produces a concordance for PRLG corpora where each line is
+ # prefixed by information extracted with Unxmlize’s 'prlg'
+ # option. You must provide both the 'offsets' and the
+ # 'unxmlize' parameter
+ # - xml: produces an xml index of the concordance
+ # - xml-with-header: produces an xml index of the concordance with
+ # full xml header
+ # - axis: quite the same as 'index', but the numbers represent the
+ # median character of each occurrence
+ # - xalign: another index file, used by the text alignment module.
+ # Each line is made of 3 integers X Y Z followed by the
+ # content of the occurrence. X is the sentence number,
+ # starting from 1. Y and Z are the starting and ending
+ # positions of the occurrence in the sentence, given in
+ # characters
+ # - merge: indicates to the function that it is supposed to produce
+ # a modified version of the text and save it in a file. The
+ # filename must be provided with the 'output' parameter
format: "text"
+ # 'script' describes the links format for 'glossanet' and 'script'
+ # output. For instance, if you use 'http://www.google.com/search?q=',
+ # you will obtain a HTML concordance file where occurrences are
+ # hyperlinks to Google queries.
script: null
+ # 'offsets' provides the file produced by Tokenize’s output_offsets
+ # option (needed by the 'uima' and the 'prlg' format).
offsets: null
+ # 'unxmlize' provides the file produced by Unxmlize’s 'prlg' option
+ # (needed by the 'prlg' format).
unxmlize: null
+ # 'directory' indicates to the function that it must not work in the
+ # same directory as <index> but in 'directory'
directory: null
+ # 'thai' indicates that the input text is in Thai language
thai: False
dico:
diff --git a/examples/build-config-file.py b/examples/build-config-file.py
index b789904c9e5b3c5f646101c84ce7ae876b6e6041..c6fbef5f4714697db736cfe58f37f90d8d885b68 100644
--- a/examples/build-config-file.py
+++ b/examples/build-config-file.py
@@ -161,6 +161,7 @@ if __name__ == "__main__":
sentence, replace = load_preprocessing_fsts(directory)
alphabet, alphabet_sorted = load_alphabets(directory)
+ options["resources"]["language"] = language
options["resources"]["dictionaries"] = dictionaries
options["resources"]["sentence"] = sentence
options["resources"]["replace"] = replace
diff --git a/unitex/tools.py b/unitex/tools.py
index 2770d17aa55ec409ee4eb6614d2ae0cc10c8a779..8747f03a630b85b5d5d8715c3df3c4c8df2a51d9 100644
--- a/unitex/tools.py
+++ b/unitex/tools.py
@@ -164,9 +164,8 @@ def concord(index, alphabet, **kwargs):
- Generic options:
font [str] -- the name of the font to use if the output is an HTML
- file
- fontsize [int] -- the font size to use if the output is an HTML file. The
- font parameters are required if the output is an HTML file;
+ file.
+ fontsize [int] -- the font size to use if the output is an HTML file.
only_ambiguous [bool] -- Only displays identical occurrences with ambiguous
outputs, in text order (default: False)
only_matches [bool] -- this option will force empty right and left contexts. Moreover,
@@ -210,8 +209,8 @@ def concord(index, alphabet, **kwargs):
UnitexConstants.FORMAT_PRLG: produces a concordance for PRLG corpora where each line is prefixed
by information extracted with Unxmlize’s 'prlg' option. You must
provide both the 'offsets' and the 'unxmlize' argument
- UnitexConstants.FORMAT_XML: produces xml index of the concordance
- UnitexConstants.FORMAT_XML_WITH_HEADER: produces xml index of the concordance with full xml header
+ UnitexConstants.FORMAT_XML: produces an xml index of the concordance
+ UnitexConstants.FORMAT_XML_WITH_HEADER: produces an xml index of the concordance with full xml header
UnitexConstants.FORMAT_AXIS: quite the same as 'index', but the numbers represent the median
character of each occurrence
UnitexConstants.FORMAT_XALIGN: another index file, used by the text alignment module. Each line is