From 6af98b9337467b668f7ac45f20279d2525025e4f Mon Sep 17 00:00:00 2001 From: Patrick Watrin <pat@lucy.local> Date: Wed, 24 Feb 2016 17:32:24 +0100 Subject: [PATCH] adding documentation to the parameters in the config file --- config/unitex-example.yaml | 2 +- config/unitex-template.yaml | 150 ++++++++++++++++++++++++++++++++-- examples/build-config-file.py | 1 + unitex/tools.py | 9 +- 4 files changed, 151 insertions(+), 11 deletions(-) diff --git a/config/unitex-example.yaml b/config/unitex-example.yaml index 6053878..d616d59 100644 --- a/config/unitex-example.yaml +++ b/config/unitex-example.yaml @@ -11,7 +11,7 @@ resources: - /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/dela-fr-public.bin - /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/ajouts80jours.bin - /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/motsGramf-.bin - language: null + language: fr replace: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Graphs/Preprocessing/Replace/Replace.fst2 sentence: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Graphs/Preprocessing/Sentence/Sentence.fst2 tools: diff --git a/config/unitex-template.yaml b/config/unitex-template.yaml index 8a3484e..e422f9b 100644 --- a/config/unitex-template.yaml +++ b/config/unitex-template.yaml @@ -1,11 +1,60 @@ +# Do not modify this file. Use the 'build-config-file.py' script to generate a +# working version adapted to your local Unitex installation or copy this file +# before editing. + +# The 'global' section contains the global configuration parameters. global: + # There are 3 'debug' levels: + # 0: the error output is disabled; + # 1: the error output is limited to the logging system implemented in the + # bindings; + # 2: the error output is activated for both the bindings and the Unitex + # processor.
+ # NOTE: if you activate the debug for level >= 1, the verbose level is + # automatically activated at level 2. debug: 0 + + # There are 4 'verbose' levels: + # 0: the standard output is disabled; + # 1: the standard output shows 'warnings' emitted by the bindings logging + # system; + # 2: the standard output shows 'warnings' and various other information + # emitted by the bindings logging system; + # 3: the standard output is activated for both the bindings and the Unitex + # processor. verbose: 0 + + # If not 'null', the error and standard outputs are redirected to the file + # specified by this parameter. + #log: /var/log/unitex.log log: null + # If you are using the high-level 'Processor' class, this parameter + # activates or deactivates the resource persistence. If persistence is + # activated, dictionaries, grammar and alphabet are loaded during the + # object initialization and kept in memory in order to improve + # performance. persistence: True + + # The Unitex library implements a virtual filesystem which avoids a lot + # of I/O and improves the performance. If this parameter is set to 'True', + # the high-level 'Processor' class will automatically activate this virtual + # filesystem. virtualization: True +# The 'resources' section is automatically filled by the 'build-config-file.py' +# script.
If you want to do it manually, be sure to give the absolute path of +# each resource as shown below: +# resources: +# language: fr +# alphabet: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Alphabet.txt +# alphabet-sorted: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Alphabet_sort.txt +# sentence: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Graphs/Preprocessing/Sentence/Sentence.fst2 +# replace: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Graphs/Preprocessing/Replace/Replace.fst2 +# dictionaries: +# - /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/dela-fr-public.bin +# - /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/ajouts80jours.bin +# - /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/motsGramf-.bin resources: language: null @@ -16,35 +65,126 @@ resources: dictionaries: null -# The 'tools' section can contain any of the argument used by the unitex tools -# functions. Note that, if you use the 'Processor' high-level class some argument -# could be overriden to fit the 'tag', 'extract' and 'search' functions -# behaviour. For intance, there is no point to define a font or a context for -# 'concord'. +# The 'tools' section can contain any of the arguments used by the unitex tools. +# Note that, if you use the 'Processor' high-level class some parameters will +# be overridden to fit the 'tag' functions behaviour. For instance, there is no +# point in defining a font or a context for 'concord'. tools: + # CheckDic command (Unitex manual, p.266) check_dic: + # If set to True, Unitex will use strict syntax checking against + # unprotected dots and commas strict: False + # If set to 'True', 'no_space_warning' tells Unitex to tolerate spaces + # in grammatical, semantic and inflectional codes.
no_space_warning: False + # Compress command (Unitex manual, p.266) compress: + # 'output' sets the output file. By default, a file xxx.dic will + # produce a file xxx.bin. output: null + # If set to 'True', 'flip' indicates that the inflected and canonical + # forms should be swapped in the compressed dictionary. This option is + # used to construct an inverse dictionary. flip: False + # If set to 'True', 'semitic' indicates that the semitic compression + # algorithm should be used. Setting this option with semitic languages + # like Arabic significantly reduces the size of the output dictionary. semitic: False + # 'version: v1' produces an old style .bin file + # 'version: v2' produces a new style .bin file, with no file size + # limitation to 16 Mb and a smaller resulting size version: "v2" + # Concord command (Unitex manual, p.267) concord: + # 'font' specifies the name of the font to use if the output is an + # HTML file. + #font: "Courier new" font: null + # 'fontsize' specifies the font size to use if the output is an HTML + # file. + #fontsize: 12 fontsize: null + # If 'only_ambiguous' is set to 'True', Unitex will only display + # identical occurrences with ambiguous outputs, in text order only_ambiguous: False + # If 'only_matches' is set to 'True', Unitex will force empty right + # and left contexts. Moreover, if used with -t/--text, Concord will + # not surround matches with tabulations only_matches: False + # 'left' specifies the number of characters on the left of the + # occurrences. In Thai mode, this means the number of non-diacritic + # characters. For both 'left' and 'right' parameters, you can add the + # 's' character to stop at the first {S} tag. For instance, if you set + # 40s for the left value, the left context will end at 40 characters at + # most, less if the {S} tag is found before. left: "0" + # 'right' specifies the number of characters (non-diacritic ones in + # Thai mode) on the right of the occurrences (default=0).
If the + # occurrence is shorter than this value, the concordance line is + # completed up to right. If the occurrence is longer than the length + # defined by right, it is nevertheless saved as a whole. right: "0" + # 'sort' specifies the sort order. Possible values are: + # - TO: text order + # - LC: first left context then center + # - LR: first left context then right + # - CL: first center then left context + # - CR: first center then right context + # - RL: first right context then left context + # - RC: first right context then center sort: "TO" + # 'format' specifies the output format. Possible values are: + # - html: produces a concordance in HTML format + # - text: produces a concordance in text format + # - glossanet: produces a concordance for GlossaNet in HTML format + # where occurrences are links described by the 'script' + # parameter + # - script: produces an HTML concordance file where occurrences are + # links described by the 'script' parameter + # - index: produces an index of the concordance, made of the content + # of the occurrences (with the grammar outputs, if any), + # preceded by the positions of the occurrences in the text + # file given in characters + # - uima: produces an index of the concordance relative to the + # original text file, before any Unitex operation. The + # 'offsets' parameter must be provided + # - prlg: produces a concordance for PRLG corpora where each line is + # prefixed by information extracted with Unxmlize’s 'prlg' + # option. You must provide both the 'offsets' and the + # 'unxmlize' parameter + # - xml: produces an xml index of the concordance + # - xml-with-header: produces an xml index of the concordance with + # full xml header + # - axis: quite the same as 'index', but the numbers represent the + # median character of each occurrence + # - xalign: another index file, used by the text alignment module. + # Each line is made of 3 integers X Y Z followed by the + # content of the occurrence.
X is the sentence number, + # starting from 1. Y and Z are the starting and ending + # positions of the occurrence in the sentence, given in + # characters + # - merge: indicates to the function that it is supposed to produce + # a modified version of the text and save it in a file. The + # filename must be provided with the 'output' parameter format: "text" + # 'script' describes the links format for 'glossanet' and 'script' + # output. For instance, if you use 'http://www.google.com/search?q=', + # you will obtain an HTML concordance file where occurrences are + # hyperlinks to Google queries. script: null + # 'offsets' provides the file produced by Tokenize’s output_offsets + # option (needed by the 'uima' and the 'prlg' format). offsets: null + # 'unxmlize' provides the file produced by Unxmlize’s 'prlg' option + # (needed by the 'prlg' format). unxmlize: null + # 'directory' indicates to the function that it must not work in the + # same directory as <index> but in 'directory' directory: null + # 'thai' indicates that the input text is in Thai language thai: False dico: diff --git a/examples/build-config-file.py b/examples/build-config-file.py index b789904..c6fbef5 100644 --- a/examples/build-config-file.py +++ b/examples/build-config-file.py @@ -161,6 +161,7 @@ if __name__ == "__main__": sentence, replace = load_preprocessing_fsts(directory) alphabet, alphabet_sorted = load_alphabets(directory) + options["resources"]["language"] = language options["resources"]["dictionaries"] = dictionaries options["resources"]["sentence"] = sentence options["resources"]["replace"] = replace diff --git a/unitex/tools.py b/unitex/tools.py index 2770d17..8747f03 100644 --- a/unitex/tools.py +++ b/unitex/tools.py @@ -164,9 +164,8 @@ def concord(index, alphabet, **kwargs): - Generic options: font [str] -- the name of the font to use if the output is an HTML - file - fontsize [int] -- the font size to use if the output is an HTML file.
The - font parameters are required if the output is an HTML file; + file. + fontsize [int] -- the font size to use if the output is an HTML file. only_ambiguous [bool] -- Only displays identical occurrences with ambiguous outputs, in text order (default: False) only_matches [bool] -- this option will force empty right and left contexts. Moreover, @@ -210,8 +209,8 @@ def concord(index, alphabet, **kwargs): UnitexConstants.FORMAT_PRLG: produces a concordance for PRLG corpora where each line is prefixed by information extracted with Unxmlize’s 'prlg' option. You must provide both the 'offsets' and the 'unxmlize' argument - UnitexConstants.FORMAT_XML: produces xml index of the concordance - UnitexConstants.FORMAT_XML_WITH_HEADER: produces xml index of the concordance with full xml header + UnitexConstants.FORMAT_XML: produces an xml index of the concordance + UnitexConstants.FORMAT_XML_WITH_HEADER: produces an xml index of the concordance with full xml header UnitexConstants.FORMAT_AXIS: quite the same as 'index', but the numbers represent the median character of each occurrence UnitexConstants.FORMAT_XALIGN: another index file, used by the text alignment module. Each line is -- GitLab