#!/usr/bin/env python
# -*- coding: utf-8 -*-
# NOTE: The documentation below is largely adapted from the Unitex
# manual.
import logging
from unitex import *
from unitex.config import CheckDicOptions,\
CompressOptions,\
ConcordOptions,\
DicoOptions,\
ExtractOptions,\
Fst2TxtOptions,\
Grf2Fst2Options,\
LocateOptions,\
NormalizeOptions,\
SortTxtOptions,\
TokenizeOptions,\
Txt2TFstOptions
from unitex.io import exists
_LOGGER = logging.getLogger(__name__)
def check_dic(dictionary, dtype, alphabet, **kwargs):
"""
This function checks the format of <dela> and produces a file named
CHECK_DIC.TXT that contains the check results. This file is
stored in the <dela> directory.
*Arguments:*
- **dictionary [str]** -- the dictionary file path.
- **dtype [str]** -- the dictionary type:
- UnitexConstants.DELAF (inflected);
- UnitexConstants.DELAS (non-inflected).
- **alphabet [str]** -- the alphabet file path.
*Keyword arguments:*
- **strict [bool]** -- strict syntax checking against unprotected
dot and comma (default: False).
- **no_space_warning [bool]** -- tolerates spaces in grammatical,
semantic and inflectional codes (default: True).
*Return [bool]:*
**True** if it succeeds, **False** otherwise.
"""
options = CheckDicOptions()
options.load(kwargs)
if exists(dictionary) is False:
raise UnitexException("[CHECKDIC] Dictionary file '%s' doesn't exists" % dictionary)
command = ["UnitexTool", "CheckDic"]
if dtype == UnitexConstants.DELAF:
command.append("--delaf")
elif dtype == UnitexConstants.DELAS:
command.append("--delas")
if options["strict"] is True:
command.append("--strict")
if options["no_space_warning"] is True:
command.append("--no_space_warning")
command.append("--alphabet=%s" % alphabet)
command.append(dictionary)
command.append("-qutf8-no-bom")
command = " ".join(command)
_LOGGER.info("Checking dic '%s'" % dictionary)
_LOGGER.debug("Command: %s", command)
def compress(dictionary, **kwargs):
"""
This function takes a DELAF dictionary as a parameter and compresses
it. The compression of a dictionary dico.dic produces two files:
- dico.bin: a binary file containing the minimum automaton of
the inflected forms of the dictionary;
- dico.inf: a text file containing the compressed forms required
for the reconstruction of the dictionary lines from the inflected
forms contained in the automaton.
*Arguments:*
- **dictionary [str]** -- the dictionary file path.
*Keyword arguments:*
- **output [str]** -- sets the output file. By default, a file
xxx.dic will produce a file xxx.bin.
- **flip [bool]** -- indicates that the inflected and canonical
forms should be swapped in the compressed dictionary. This option
is used to construct an inverse dictionary which is necessary for
the program 'Reconstrucao' (default: False).
- **semitic [bool]** -- indicates that the semitic compression
algorithm should be used. Setting this option with semitic
languages like Arabic significantly reduces the size of the output
dictionary (default: False).
- **version [str]** -- Possible values are:
- UnitexConstants.DICTIONARY_VERSION_1: produces an old style
.bin file;
- UnitexConstants.DICTIONARY_VERSION_2: produces a new style
.bin file, with no 16 Mb file size limitation and a smaller
size (default).
*Return [bool]:*
**True** if it succeeds, **False** otherwise.
"""
options = CompressOptions()
options.load(kwargs)
if exists(dictionary) is False:
raise UnitexException("[COMPRESS] Dictionary file '%s' doesn't exists" % dictionary)
command = ["UnitexTool", "Compress"]
if options["output"] is not None:
command.append("--output=%s" % options["output"])
if options["flip"] is True:
command.append("--flip")
if options["semitic"] is True:
command.append("--semitic")
if options["version"] == UnitexConstants.DICTIONARY_VERSION_1:
command.append("--v1")
elif options["version"] == UnitexConstants.DICTIONARY_VERSION_2:
command.append("--v2")
command.append(dictionary)
command.append("-qutf8-no-bom")
command = " ".join(command)
_LOGGER.info("Compressing dic '%s'" % dictionary)
_LOGGER.debug("Command: %s", command)
return ret
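# Illustrative (hedged) usage sketch for compress -- the path is hypothetical;
# the call should produce 'dela-fr.bin' and 'dela-fr.inf' next to the input:
#
#   ok = compress("dela-fr.dic",
#                 version=UnitexConstants.DICTIONARY_VERSION_2)
#   if ok is False:
#       _LOGGER.error("Compression failed for 'dela-fr.dic'")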
def concord(index, alphabet, **kwargs):
"""
This function takes a concordance index file produced by the
function 'locate' and produces a concordance. It is also possible to
produce a modified text version taking into account the transducer
outputs associated to the occurrences.
The result of the application of this function is a file called
concord.txt if the concordance was constructed in text mode, a file
called concord.html if 'format' is UnitexConstants.FORMAT_HTML,
UnitexConstants.FORMAT_GLOSSANET or UnitexConstants.FORMAT_SCRIPT,
and a text file with the name defined by the user of the function if
the function has constructed a modified version of the text.
In html mode, the occurrence is coded as a hypertext link. The
reference associated to this link is of the form <a href="X Y Z">.
X and Y represent the beginning and ending positions of the
occurrence in characters in the file text_name.snt. Z represents the
number of the sentence in which the occurrence was found.
*Arguments:*
- **index [str]** -- the index file path (produced by the 'locate'
function).
- **alphabet [str]** -- alphabet file used for sorting.
*Keyword arguments:*
- *Generic options:*
- **font [str]** -- the name of the font to use if the output is
an HTML file.
- **fontsize [int]** -- the font size to use if the output is an
HTML file.
- **only_ambiguous [bool]** -- Only displays identical occurrences
with ambiguous outputs, in text order (default: False).
- **only_matches [bool]** -- this option will force empty right
and left contexts. Moreover, if used with
UnitexConstants.FORMAT_TEXT, the function will not surround
matches with tabulations (default: False).
- **left [str]** -- number of characters on the left of the
occurrences (default=0). In Thai mode, this means the number of
non-diacritic characters.
- **right [str]** -- number of characters (non-diacritic ones in
Thai mode) on the right of the occurrences (default=0). If the
occurrence is shorter than this value, the concordance line is
completed up to right. If the occurrence is longer than the
length defined by right, it is nevertheless saved as a whole.
**NOTE:** For both 'left' and 'right', you can add the 's'
character to stop at the first {S} tag. For instance, if you set
'40s' for the left value, the left context will end at 40
characters at most, less if the {S} tag is found before.
- *Sort options:*
- **sort [str]** -- specifies the sort order. Possible values:
- UnitexConstants.SORT_TEXT_ORDER: order in which the
occurrences appear in the text (default);
- UnitexConstants.SORT_LEFT_CENTER: left context for primary
sort, then occurrence for secondary sort;
- UnitexConstants.SORT_LEFT_RIGHT: left context, then right
context;
- UnitexConstants.SORT_CENTER_LEFT: occurrence, then left
context;
- UnitexConstants.SORT_CENTER_RIGHT: occurrence, then right
context;
- UnitexConstants.SORT_RIGHT_LEFT: right context, then left
context;
- UnitexConstants.SORT_RIGHT_CENTER: right context, then
occurrence.
- *Output options:*
- **format [str]** -- specifies the output format. Possible values:
- UnitexConstants.FORMAT_HTML: produces a concordance in HTML
format encoded in UTF-8 (default);
- UnitexConstants.FORMAT_TEXT: produces a concordance in Unicode
text format;
- UnitexConstants.FORMAT_GLOSSANET: produces a concordance for
GlossaNet in HTML format where occurrences are links described
by the 'script' argument (cf. Unitex manual p. 268). The HTML
file is encoded in UTF-8;
- UnitexConstants.FORMAT_SCRIPT: produces an HTML concordance
file where occurrences are links described by the 'script'
argument;
- UnitexConstants.FORMAT_INDEX: produces an index of the
concordance, made of the content of the occurrences (with the
grammar outputs, if any), preceded by the positions of the
occurrences in the text file given in characters;
- UnitexConstants.FORMAT_UIMA: produces an index of the
concordance relative to the original text file, before any
Unitex operation. The 'offsets' argument must be provided;
- UnitexConstants.FORMAT_PRLG: produces a concordance for PRLG
corpora where each line is prefixed by information extracted
with Unxmlize’s 'prlg' option. You must provide both the
'offsets' and the 'unxmlize' argument;
- UnitexConstants.FORMAT_XML: produces an xml index of the
concordance;
- UnitexConstants.FORMAT_XML_WITH_HEADER: produces an xml index
of the concordance with full xml header;
- UnitexConstants.FORMAT_AXIS: quite the same as 'index', but
the numbers represent the median character of each occurrence;
- UnitexConstants.FORMAT_XALIGN: another index file, used by the
text alignment module. Each line is made of 3 integers X Y Z
followed by the content of the occurrence. X is the sentence
number, starting from 1. Y and Z are the starting and ending
positions of the occurrence in the sentence, given in
characters;
- UnitexConstants.FORMAT_MERGE: indicates to the function that
it is supposed to produce a modified version of the text and
save it in a file. The filename must be provided with the
'output' argument.
- **script [str]** -- string describing the links format for
'glossanet' and 'script' output. For instance, if you use
'http://www.google.com/search?q=', you will obtain an HTML
concordance file where occurrences are hyperlinks to Google
queries.
- **offsets [str]** -- the file produced by Tokenize’s
output_offsets option (needed by the 'uima' and the 'prlg'
format).
- **unxmlize [str]** -- file produced by Unxmlize’s 'prlg' option
(needed by the 'prlg' format).
- **output [str]** -- the output filename (needed by the 'merge'
format).
- *Other options:*
- **directory [str]** -- indicates that the function must not
work in the same directory as <index> but in 'directory'.
- **thai [bool]** -- option to use for Thai concordances
(default: False).
*Return [bool]:*
**True** if it succeeds, **False** otherwise.
"""
options = ConcordOptions()
options.load(kwargs)
if exists(index) is False:
raise UnitexException("[CONCORD] Index file '%s' doesn't exist" % index)
if exists(alphabet) is False:
raise UnitexException("[CONCORD] Alphabet file '%s' doesn't exist" % alphabet)
command = ["UnitexTool", "Concord"]
if options["font"] is not None:
command.append("--font=%s" % options["font"])
if options["fontsize"] is not None:
command.append("--fontsize=%s" % options["fontsize"])
if options["only_ambiguous"] is True:
command.append("--only_ambiguous")
if options["only_matches"] is True:
command.append("--only_matches")
command.append("--left=%s" % options["left"])
command.append("--right=%s" % options["right"])
if options["sort"] == UnitexConstants.SORT_TEXT_ORDER:
command.append("--TO")
elif options["sort"] == UnitexConstants.SORT_LEFT_CENTER:
command.append("--LC")
elif options["sort"] == UnitexConstants.SORT_LEFT_RIGHT:
command.append("--LR")
elif options["sort"] == UnitexConstants.SORT_CENTER_LEFT:
command.append("--CL")
elif options["sort"] == UnitexConstants.SORT_CENTER_RIGHT:
command.append("--CR")
elif options["sort"] == UnitexConstants.SORT_RIGHT_LEFT:
command.append("--RL")
elif options["sort"] == UnitexConstants.SORT_RIGHT_CENTER:
command.append("--RC")
if options["format"] == UnitexConstants.FORMAT_HTML:
command.append("--html")
elif options["format"] == UnitexConstants.FORMAT_TEXT:
command.append("--text")
elif options["format"] == UnitexConstants.FORMAT_GLOSSANET:
command.append("--glossanet=%s" % options["script"])
elif options["format"] == UnitexConstants.FORMAT_SCRIPT:
command.append("--script=%s" % options["script"])
elif options["format"] == UnitexConstants.FORMAT_INDEX:
command.append("--index")
elif options["format"] == UnitexConstants.FORMAT_UIMA:
command.append("--uima=%s" % options["offsets"])
elif options["format"] == UnitexConstants.FORMAT_PRLG:
command.append("--PRLG=%s,%s" % options["unxmlize"], options["offsets"])
elif options["format"] == UnitexConstants.FORMAT_XML:
command.append("--xml")
elif options["format"] == UnitexConstants.FORMAT_XML_WITH_HEADERS:
command.append("--xml-with-header")
elif options["format"] == UnitexConstants.FORMAT_AXIS:
command.append("--axis")
elif options["format"] == UnitexConstants.FORMAT_XALIGN:
command.append("--xalign")
elif options["format"] == UnitexConstants.FORMAT_MERGE:
command.append("--merge=%s" % options["output"])
if options["directory"] is not None:
command.append("--directory=%s" % options["directory"])
command.append("--alphabet=%s" % alphabet)
if options["thai"] is not True:
command.append("--thai")
command.append(index)
command.append("-qutf8-no-bom")
command = " ".join(command)
_LOGGER.info("Create concordance for '%s'" % index)
_LOGGER.debug("Command: %s", command)
return ret
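# Illustrative (hedged) usage sketch for concord -- it assumes 'concord.ind'
# was produced beforehand by the 'locate' function (paths are hypothetical):
#
#   ok = concord("corpus_snt/concord.ind", "Alphabet.txt",
#                format=UnitexConstants.FORMAT_TEXT,
#                left="40s", right="40s",
#                sort=UnitexConstants.SORT_CENTER_LEFT)
#   if ok is False:
#       _LOGGER.error("Concordance generation failed")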
def dico(dictionaries, text, alphabet, **kwargs):
"""
This function applies dictionaries to a text. The text must have
been cut up into lexical units by the 'tokenize' function.
The function 'dico' produces the following files, and saves them in
the directory of the text:
- dlf: dictionary of simple words in the text;
- dlc: dictionary of compound words in the text;
- err: list of unknown words in the text;
- tags_err: unrecognized simple words that are not matched by the
tags.ind file;
- tags.ind: sequences to be inserted in the text automaton (see
section 3.8.3, page 69);
- stat_dic.n: file containing the number of simple words, the number
of compound words, and the number of unknown words in the text.
**NOTE:** Files dlf, dlc, err and tags_err are not sorted. Use the
'sort_txt' function to sort them.
*Arguments:*
- **dictionaries [list(str)]** -- list of dictionary paths ('bin'
or 'fst2' formats).
- **text [str]** -- text (snt format) file path.
- **alphabet [str]** -- alphabet file path.
*Keyword arguments:*
- **morpho [list(str)]** -- this optional argument indicates which
morphological mode dictionaries are to be used, if needed by some
.fst2 dictionaries. The argument is a list of dictionary paths
(bin format).
- **korean [bool]** -- specifies that the dictionary is in Korean
(default: False).
- **semitic [bool]** -- specifies that the dictionary is in a Semitic
language (default: False).
- **arabic_rules [str]** -- specifies the Arabic typographic rule
configuration file path.
- **raw [str]** -- alternative output file path containing both
simple and compound words, without requiring a text directory.
*Return [bool]:*
**True** if it succeeds, **False** otherwise.
"""
options = DicoOptions()
options.load(kwargs)
for dictionary in dictionaries:
if exists(dictionary) is False:
raise UnitexException("[DICO] Dictionary file '%s' doesn't exist" % dictionary)
if exists(text) is False:
raise UnitexException("[DICO] Text file '%s' doesn't exist" % text)
if exists(alphabet) is False:
raise UnitexException("[DICO] Alphabet file '%s' doesn't exist" % alphabet)
command = ["UnitexTool", "Dico"]
command.append("--text=%s" % text)
command.append("--alphabet=%s" % alphabet)
if options["morpho"] is not None:
command.append("--morpho=%s" % ",".join(options["morpho"]))
if options["korean"] is True:
command.append("--korean")
if options["semitic"] is True:
command.append("--semitic")
if options["arabic_rules"] is not None:
command.append("--arabic_rules=%s" % options["arabic_rules"])
if options["raw"] is not None:
command.append("--raw=%s" % raw)
command += dictionaries
command.append("-qutf8-no-bom")
command = " ".join(command)
_LOGGER.info("Applying dictionaries")
_LOGGER.debug("Command: %s", command)
return ret
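# Illustrative (hedged) usage sketch for dico -- dictionary and corpus paths
# are hypothetical; the text must already be tokenized (.snt format):
#
#   ok = dico(["dela-fr.bin", "compounds-fr.bin"],
#             "corpus.snt", "Alphabet.txt")
#   if ok is False:
#       _LOGGER.error("Dictionary application failed on 'corpus.snt'")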
def extract(text, output, index, **kwargs):
"""
This function extracts from the given text all sentences that
contain at least one occurrence from the concordance. The parameter
<text> represents the complete path of the text file, without
omitting the extension .snt.
*Arguments:*
- **text [str]** -- the text file (.snt format).
- **output [str]** -- the output text file.
- **index [str]** -- the index file path (produced by the 'locate'
function).
*Keyword arguments:*
- **non_matching_sentences [bool]** -- extracts all sentences that
don’t contain matching units (default: False).
*Return [bool]:*
**True** if it succeeds, **False** otherwise.
"""
options = ExtractOptions()
options.load(kwargs)
if exists(text) is False:
raise UnitexException("[EXTRACT] Text file '%s' doesn't exist" % text)
if exists(index) is False:
raise UnitexException("[EXTRACT] Index file '%s' doesn't exist" % index)
command = ["UnitexTool", "Extract"]
if options["non_matching_sentences"] is False:
command.append("--yes")
else:
command.append("--no")
command.append("--output=%s" % output)
command.append("--index=%s" % index)
command.append(text)
command.append("-qutf8-no-bom")
command = " ".join(command)
_LOGGER.info("Extracting sentences")
_LOGGER.debug("Command: %s", command)
return ret
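# Illustrative (hedged) usage sketch for extract -- it assumes a previous
# 'locate' run produced the index file (all paths are hypothetical):
#
#   ok = extract("corpus.snt", "matching-sentences.txt",
#                "corpus_snt/concord.ind",
#                non_matching_sentences=False)
#   if ok is False:
#       _LOGGER.error("Sentence extraction failed")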
def fst2txt(grammar, text, alphabet, **kwargs):
"""
This function applies a transducer to a text in longest match mode
at the preprocessing stage, when the text has not yet been cut into
lexical units.
**NOTE:** This function modifies the input text file.
*Arguments:*
- **grammar [str]** -- the fst2 to apply on the text.
- **text [str]** -- the (.snt) text file to be modified.
- **alphabet [str]** -- the alphabet file of the language of the
text.
*Keyword arguments:*
- **start_on_space [bool]** -- this parameter indicates that the
search will start at any position in the text, even before a
space. This parameter should only be used to carry out
morphological searches (default: False).
- **char_by_char [bool]** -- works in character by character
tokenization mode. This is useful for languages like Thai
(default: False).
- **merge [bool]** -- merge (instead of replace) transducer outputs
with text inputs (default: True).
*Return [bool]:*
**True** if it succeeds, **False** otherwise.
"""
options = Fst2TxtOptions()
options.load(kwargs)
if exists(grammar) is False:
raise UnitexException("[FST2TXT] Grammar file '%s' doesn't exist" % grammar)
if exists(text) is False:
raise UnitexException("[FST2TXT] Text file '%s' doesn't exist" % text)
if exists(alphabet) is False:
raise UnitexException("[FST2TXT] Alphabet file '%s' doesn't exist" % alphabet)
command = ["UnitexTool", "Fst2Txt"]
command.append("--text=%s" % text)
command.append("--alphabet=%s" % alphabet)
if options["start_on_space"] is False:
command.append("--dont_start_on_space")
else:
command.append("--start_on_space")
if options["char_by_char"] is False:
command.append("--word_by_word")
else:
command.append("--char_by_char")
if options["merge"] is True:
command.append("--merge")
else:
command.append("--replace")
command.append(grammar)
command.append("-qutf8-no-bom")
command = " ".join(command)
_LOGGER.info("Applying grammar '%s'..." % grammar)
_LOGGER.debug("Command: %s", command)
return ret
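# Illustrative (hedged) usage sketch for fst2txt -- remember that the call
# modifies 'corpus.snt' in place (grammar and paths are hypothetical):
#
#   ok = fst2txt("Replace.fst2", "corpus.snt", "Alphabet.txt",
#                merge=True)
#   if ok is False:
#       _LOGGER.error("Preprocessing grammar 'Replace.fst2' failed")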
def grf2fst2(grammar, alphabet, **kwargs):
"""
This function compiles a grammar into a .fst2 file (for more details
see section 6.2). The parameter <grf> denotes the complete path of
the main graph of the grammar, without omitting the extension .grf.
The result is a file with the same name as the graph passed to the
function as a parameter, but with extension .fst2. This file is
saved in the same directory as <grf>.
*Arguments:*
- **grammar [str]** -- the grf to compile.
- **alphabet [str]** -- specifies the alphabet file to be used for
tokenizing the content of the grammar boxes into lexical units.
*Keyword arguments:*
- **loop_check [bool]** -- enables error (loop) checking
(default: False).
- **char_by_char [bool]** -- tokenization will be done character by
character. If neither -c nor -a option is used, lexical units will
be sequences of any Unicode letters (default: False).
- **pkgdir [str]** -- specifies the repository directory to use (see
section 5.2.2, page 99).
- **no_empty_graph_warning [bool]** -- no warning will be emitted
when a graph matches the empty word. This option is used by
MultiFlex in order not to scare users with meaningless error
messages when they design an inflection grammar that matches the
empty word (default: False).
- **tfst_check [bool]** -- checks whether the given graph can be
considered as a valid sentence automaton or not (default: False).
- **silent_grf_name [bool]** -- does not print the graph names
(default: True).
- **named_repositories [list(str)]** -- declaration of named
repositories. This argument is made of one or more X=Y sequences,
separated by ‘;’, where X is the name of the repository denoted by
pathname Y.
- **debug [bool]** -- compile graphs in debug mode (default: False).
- **check_variables [bool]** -- check output validity to avoid
malformed variable expressions (default: True).
*Return [bool]:*
**True** if it succeeds, **False** otherwise.
"""
options = Grf2Fst2Options()
options.load(kwargs)
if exists(grammar) is False:
raise UnitexException("[GRF2FST2] Grammar file '%s' doesn't exist" % grammar)
if exists(alphabet) is False:
raise UnitexException("[GRF2FST2] Alphabet file '%s' doesn't exist" % alphabet)
command = ["UnitexTool", "Grf2Fst2"]
if options["loop_check"] is False:
command.append("--no_loop_check")
else:
command.append("--loop_check")
command.append("--alphabet=%s" % alphabet)
if options["char_by_char"] is True:
command.append("--char_by_char")
if options["pkgdir"] is not None:
command.append("--pkgdir=%s" % options["pkgdir"])
if options["no_empty_graph_warning"] is True:
command.append("--no_empty_graph_warning")
if options["tfst_check"] is True:
command.append("--tfst_check")
if options["silent_grf_name"] is True:
command.append("--silent_grf_name")
if options["named_repositories"] is not None:
command.append("--named_repositories=%s" % ";".join(options["named_repositories"]))
if options["debug"] is True:
command.append("--debug")
if options["check_variables"] is True:
command.append("--check_variables")
command.append(grammar)
command.append("-qutf8-no-bom")
command = " ".join(command)
_LOGGER.info("Compiling grammar '%s'..." % grammar)
_LOGGER.debug("Command: %s", command)
return ret
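# Illustrative (hedged) usage sketch for grf2fst2 -- compiling a hypothetical
# 'query.grf' should produce 'query.fst2' in the same directory:
#
#   ok = grf2fst2("query.grf", "Alphabet.txt", loop_check=True)
#   if ok is False:
#       _LOGGER.error("Grammar compilation failed for 'query.grf'")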
def locate(grammar, text, alphabet, **kwargs):
"""
This function applies a grammar to a text and constructs an index of
the occurrences found.
This function saves the references to the found occurrences in a
file called concord.ind. The number of occurrences, the number of
units belonging to those occurrences, as well as the percentage of
recognized units within the text are saved in a file called
concord.n. These two files are stored in the directory of the text.
*Arguments:*
- **grammar [str]** -- the fst2 to apply on the text.
- **text [str]** -- the text file, with extension .snt.
- **alphabet [str]** -- the alphabet file of the language of the
text.
*Keyword arguments:*
- *Generic options*:
- **start_on_space [bool]** -- this parameter indicates that the
search will start at any position in the text, even before a
space. This parameter should only be used to carry out
morphological searches (default: False).
- **char_by_char [bool]** -- works in character by character
tokenization mode. This is useful for languages like Thai
(default: False).
- **morpho [list(str)]** -- this optional argument indicates which
morphological mode dictionaries are to be used, if needed by
some .fst2 dictionaries. The argument is a list of dictionary
paths (bin format).
- **korean [bool]** -- specifies that the dictionary is in Korean
(default: False).
- **arabic_rules [str]** -- specifies the Arabic typographic rule
configuration file path.
- **sntdir [str]** -- puts produced files in 'sntdir' instead of
the text directory. Note that 'sntdir' must end with a file
separator (\ or /).
- **negation_operator [str]** -- specifies the negation operator
to be used in Locate patterns. The two legal values are
'minus' and 'tilde' (default). Using 'minus' provides backward
compatibility with previous versions of Unitex.
- *Search limit options:*
- **number_of_matches [int]** -- stops after the first N matches
(default: all matches).
- *Maximum iterations per token options:*
- **stop_token_count [list(int_1, int_2)]** -- emits a warning
after 'int_1' iterations on a token and stops after 'int_2'
iterations.
- *Matching mode options:*
- **match_mode [str]** -- Possible values are:
- UnitexConstants.MATCH_MODE_SHORTEST
- UnitexConstants.MATCH_MODE_LONGEST (default)
- UnitexConstants.MATCH_MODE_ALL
- *Output options:*
- **output_mode [str]** -- Possible values are:
- UnitexConstants.OUTPUT_MODE_IGNORE (default)
- UnitexConstants.OUTPUT_MODE_MERGE
- UnitexConstants.OUTPUT_MODE_REPLACE
- **protect_dic_chars [bool]** -- when 'merge' or 'replace' mode
is used, this option protects some input characters with a
backslash. This is useful when Locate is called by 'dico' in
order to avoid producing bad lines like: 3,14,.PI.NUM
(default: True).
- **variable [list(str_1, str_2)]** -- sets an output variable
named str_1 with content str_2. Note that str_2 must be ASCII.
- *Ambiguous output options:*
- **ambiguous_outputs [bool]** -- allows the production of several
matches with same input but different outputs. If False, in case
of ambiguous outputs, one will be arbitrarily chosen and kept,
depending on the internal state of the function (default: True).
- **variable_error [str]** -- Possible values are:
- UnitexConstants.ON_ERROR_EXIT
- UnitexConstants.ON_ERROR_IGNORE (default)
- UnitexConstants.ON_ERROR_BACKTRACK
*Return [bool]:*
**True** if it succeeds, **False** otherwise.
"""
options = LocateOptions()
options.load(kwargs)
if exists(grammar) is False:
raise UnitexException("[LOCATE] Grammar file '%s' doesn't exist" % grammar)
if exists(text) is False:
raise UnitexException("[LOCATE] Text file '%s' doesn't exist" % text)
if exists(alphabet) is False:
raise UnitexException("[LOCATE] Alphabet file '%s' doesn't exist" % alphabet)
command = ["UnitexTool", "Locate"]
command.append("--text=%s" % text)
command.append("--alphabet=%s" % alphabet)
if options["morpho"] is not None:
command.append("--morpho=%s" % ",".join(options["morpho"]))
if options["start_on_space"] is False:
command.append("--dont_start_on_space")
else:
command.append("--start_on_space")
if options["char_by_char"] is False:
command.append("--word_by_word")
else:
command.append("--char_by_char")
if options["sntdir"] is not None:
command.append("--sntdir=%s" % options["sntdir"])
if options["korean"] is True:
command.append("--korean")
if options["arabic_rules"] is not None:
command.append("--arabic_rules=%s" % options["arabic_rules"])
if options["negation_operator"] is not None:
command.append("--negation_operator=%s" % options["negation_operator"])
if options["number_of_matches"] is None:
command.append("--all")
else:
command.append("--number_of_matches=%s" % options["number_of_matches"])
if options["stop_token_count"] is not None:
if options["stop_token_count[0]"] is None:
command.append("--stop_token_count=%s" % stop_token_count[1])
else:
command.append("--stop_token_count=%s,%s" % (stop_token_count[0], stop_token_count[1]))
if options["match_mode"] == UnitexConstants.MATCH_MODE_LONGEST:
command.append("--longest_matches")
elif options["match_mode"] == UnitexConstants.MATCH_MODE_SHORTEST:
command.append("--shortest_matches")
elif options["match_mode"] == UnitexConstants.MATCH_MODE_ALL:
command.append("--all_matches")
if options["output_mode"] == UnitexConstants.OUTPUT_MODE_IGNORE:
command.append("--ignore")
elif options["output_mode"] == UnitexConstants.OUTPUT_MODE_MERGE:
command.append("--merge")
elif options["output_mode"] == UnitexConstants.OUTPUT_MODE_RELACE:
command.append("--replace")
if options["protect_dic_chars"] is True:
command.append("--protect_dic_chars")
if options["variable"] is not None:
command.append("--variable=%s=%s" % (options["variable"][0], options["variable"][1]))
if options["ambiguous_outputs"] is True:
command.append("--ambiguous_outputs")
else:
command.append("--no_ambiguous_outputs")
if options["variable_error"] == UnitexConstants.ON_ERROR_IGNORE:
command.append("--ignore_variable_error")
elif options["variable_error"] == UnitexConstants.ON_ERROR_EXIT:
command.append("--exit_on_variable_error")
elif options["variable_error"] == UnitexConstants.ON_ERROR_BACKTRACK:
command.append("--backtrack_on_variable_error")
command.append(grammar)
command.append("-qutf8-no-bom")
command = " ".join(command)
_LOGGER.info("Locating pattern '%s'..." % grammar)
_LOGGER.debug("Command: %s", command)
return ret
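# Illustrative (hedged) usage sketch for locate -- it should write
# 'concord.ind' and 'concord.n' in the text directory (paths hypothetical):
#
#   ok = locate("query.fst2", "corpus.snt", "Alphabet.txt",
#               match_mode=UnitexConstants.MATCH_MODE_LONGEST,
#               output_mode=UnitexConstants.OUTPUT_MODE_MERGE,
#               number_of_matches=None)
#   if ok is False:
#       _LOGGER.error("Locate failed for 'query.fst2'")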
def normalize(text, **kwargs):
"""
This function carries out a normalization of text separators. The
separators are space, tab, and newline. Every sequence of separators
that contains at least one newline is replaced by a unique newline.
All other sequences of separators are replaced by a single space.
This function also checks the syntax of lexical tags found in the
text. All sequences in curly brackets should be either the sentence
delimiter {S}, the stop marker {STOP}, or valid entries in the DELAF
format ({aujourd’hui,.ADV}).
**NOTE:** the function creates a modified version of the text that is
saved in a file with extension .snt.
**WARNING:** if you specify a normalization rule file, its rules
will be applied prior to anything else. So, you have to be very
careful if you manipulate separators in such rules.
*Arguments:*
- **text [str]** -- the text file to normalize.
*Keyword arguments:*
- **no_carriage_return [bool]** -- every separator sequence will be
turned into a single space (default: False).
- **input_offsets [str]** -- base offset file to be used.
- **output_offsets [str]** -- offset file to be produced.
- **replacement_rules [str]** -- specifies the normalization rule
file to be used. See section 14.13.6 for details about the format
of this file. By default, the function only replaces { and } by
[ and ].
- **no_separator_normalization [bool]** -- only applies replacement
rules specified with the 'replacement_rules' option
(default: False).
*Return [bool]:*
**True** if it succeeds, **False** otherwise.
"""
options = NormalizeOptions()
options.load(kwargs)
if exists(text) is False:
raise UnitexException("[NORMALIZE] Text file '%s' doesn't exists" % text)
command = ["UnitexTool", "Normalize"]
if options["no_carriage_return"] is True:
command.append("--no_carriage_return")
if options["input_offsets"] is not None:
command.append("--input_offsets=%s" % options["input_offsets"])
if options["output_offsets"] is not None:
command.append("--output_offsets=%s" % options["output_offsets"])
if options["replacement_rules"] is not None:
command.append("--replacement_rules=%s" % options["replacement_rules"])
if options["no_separator_normalization"] is True: