Skip to content
Extraits de code Groupes Projets
Valider 59d7ab92 rédigé par Patrick Watrin's avatar Patrick Watrin
Parcourir les fichiers

Adaptation of the tools function arguments in the unit tests

parent cfcd9ff7
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
global:
debug: 1
verbose: 1
tempdir: "/tmp"
persistence: True
virtualization: True
resources:
language: fr
alphabet: /home/resources/media/fr/unitex/preprocessing/Alphabet.txt
alphabet-sorted: /home/resources/media/fr/unitex/preprocessing/Alphabet_sort.txt
sentence: /home/resources/media/fr/unitex/preprocessing/sentence/Sentence.fst2
replace: /home/resources/media/fr/unitex/preprocessing/replace/Replace.fst2
dictionaries:
- /home/resources/media/fr/unitex/dictionary/delaf-short.bin
- /home/resources/media/fr/unitex/dictionary/delacf-light.bin
- /home/resources/media/fr/unitex/dictionary/toponyms.bin
# The 'options' section can contain any of the arguments used by the unitex tools
# functions. Note that some arguments will be overridden to fit the 'tag' and 'extract'
# behaviour. For instance, there is no point in defining a font or a context for
# 'concord'.
options:
locate:
match_mode: longest
global:
debug: 1
verbose: 1
tempdir: "/tmp"
persistence: True
virtualization: True
resources:
language: "fr"
alphabet: "/home/resources/media/fr/unitex/preprocessing/Alphabet.txt"
alphabet-sorted: "/home/resources/media/fr/unitex/preprocessing/Alphabet_sort.txt"
sentence: "/home/resources/media/fr/unitex/preprocessing/sentence/Sentence.fst2"
replace: "/home/resources/media/fr/unitex/preprocessing/replace/Replace.fst2"
dictionaries:
- "/home/resources/media/fr/unitex/dictionary/delaf-short.bin"
- "/home/resources/media/fr/unitex/dictionary/delacf-light.bin"
- "/home/resources/media/fr/unitex/dictionary/toponyms.bin"
# The 'options' section can contain any of the arguments used by the unitex tools
# functions. Note that, if you use the 'Processor' high-level class, some arguments
# could be overridden to fit the behaviour of the 'tag', 'extract' and 'search'
# functions. For instance, there is no point in defining a font or a context for
# 'concord'.
options:
check_dic:
strict: False
no_space_warning: False
compress:
output: null
flip: False
semitic: False
version: "v2"
concord:
font: null
fontsize: null
only_ambiguous: False
only_matches: False
left: 0
right: 0
sort: "TO"
format: "text"
script: null
offsets: null
unxmlize: null
directory: null
thai: False
dico:
morpho: null
korean: False
semitic: False
arabic_rules: null
raw: null
extract:
non_matching_sentences: False
fst2txt:
start_on_space: False
word_by_word: False
merge: True
grf2fst2:
loop_check: False
char_by_char: False
pkgdir: null
no_empty_graph_warning: False
tfst_check: False
silent_grf_name: True
named_repository: null
debug: False
check_variables: True
locate:
start_on_space: False
char_by_char: False
morpho: null
korean: False
arabic_rules: null
sntdir: null
negation_operator: "tilde"
number_of_matches: null
stop_token_count: null
match_mode: "longest"
output_mode: "merge"
protect_dic_chars: True
variable: null
ambiguous_outputs: True
variable_error: "ignore"
normalize:
no_carriage_return: False
input_offsets: null
output_offsets: null
no_separator_normalization: False
replacement_rules: null
sort_txt:
duplicates: False
revers: False
sort_order: null
line_info: null
thai: False
factorize_inflectional_codes: False
tokenize:
char_by_char: False
tokens: null
input_offsets: null
output_offsets: null
txt2fst:
clean: False
normalization_grammar: null
tagset: null
korean: False
...@@ -87,46 +87,39 @@ class CustomClean(clean): ...@@ -87,46 +87,39 @@ class CustomClean(clean):
setup( setup(name = "unitex",
name = "unitex", version = "1.0",
version = "1.0", description = "Python 3 binding for the Unitex library",
description = "Python 3 binding for the Unitex library", long_description = open('README.md').read(),
long_description = open('README.md').read(),
author = "Patrick Watrin",
author = "Patrick Watrin", author_email = "patrick.watrin@gmail.com",
author_email = "patrick.watrin@gmail.com",
# https://pypi.python.org/pypi?%3Aaction=list_classifiers
# https://pypi.python.org/pypi?%3Aaction=list_classifiers classifiers = ["License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
classifiers = [ "Programming Language :: Python",
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Development Status :: 4 - Beta",
"Programming Language :: Python", "Intended Audience :: Developers",
"Development Status :: 4 - Beta", "Topic :: Scientific/Engineering :: Information Analysis"],
"Intended Audience :: Developers",
"Topic :: Scientific/Engineering :: Information Analysis", keywords = "Unitex, Finite-States Transducers, Natural Language Processing",
],
keywords = "Unitex, Finite-States Transducers, Natural Language Processing", license = "GPLv3",
install_requires = [],
license = "GPLv3",
install_requires = [ package_dir = {"unitex":"unitex"},
# TO FILL packages = ["unitex"],
],
data_files = [],
packages = ["unitex"],
package_dir = {'unitex': 'unitex'}, ext_modules=[Extension("_unitex",
include_dirs = [UNITEX_INC, get_python_inc()],
data_files = [ libraries=["unitex"],
], library_dirs=['/usr/local/lib'],
sources = ["extensions/_unitex.cpp"])],
ext_modules=[
Extension("_unitex", #cmdclass = {
include_dirs = [UNITEX_INC, get_python_inc()], # "build": CustomBuild,
libraries=["unitex"], # "clean": CustomClean
library_dirs=['/usr/local/lib'], #}
sources = ["extensions/_unitex.cpp"])
],
# cmdclass = {
# "build": CustomBuild,
# "clean": CustomClean
# }
) )
...@@ -5,17 +5,17 @@ import os ...@@ -5,17 +5,17 @@ import os
import shutil import shutil
import unittest import unittest
from unitex import *
from unitex.tools import * from unitex.tools import *
class Arguments: class Arguments:
def __init__(self, language=None): def __init__(self, language=None):
self.__arguments = {} self.__arguments = {}
self.__arguments["dic"] = "data/dictionary.dic" self.__arguments["dic"] = "data/dictionary.dic"
self.__arguments["dic_type"] = "delaf" self.__arguments["dic_type"] = UnitexConstants.DELAF
self.__arguments["dic_check"] = "data/CHECK_DIC.TXT" self.__arguments["dic_check"] = "data/CHECK_DIC.TXT"
self.__arguments["bin"] = "data/dictionary.bin" self.__arguments["bin"] = "data/dictionary.bin"
...@@ -29,6 +29,7 @@ class Arguments: ...@@ -29,6 +29,7 @@ class Arguments:
self.__arguments["txt"] = "data/corpus.txt" self.__arguments["txt"] = "data/corpus.txt"
self.__arguments["snt"] = "data/corpus.snt" self.__arguments["snt"] = "data/corpus.snt"
self.__arguments["dir"] = "data/corpus_snt" self.__arguments["dir"] = "data/corpus_snt"
self.__arguments["xtr"] = "data/corpus.xtr"
self.__arguments["text.cod"] = os.path.join(self.__arguments["dir"], "text.cod") self.__arguments["text.cod"] = os.path.join(self.__arguments["dir"], "text.cod")
self.__arguments["tok_by_freq.txt"] = os.path.join(self.__arguments["dir"], "tok_by_freq.txt") self.__arguments["tok_by_freq.txt"] = os.path.join(self.__arguments["dir"], "tok_by_freq.txt")
...@@ -90,37 +91,42 @@ class TestUnitexTools(unittest.TestCase): ...@@ -90,37 +91,42 @@ class TestUnitexTools(unittest.TestCase):
if os.path.exists(self._arguments["fst"]): if os.path.exists(self._arguments["fst"]):
os.remove(self._arguments["fst"]) os.remove(self._arguments["fst"])
# Removing output file from the 'extract' command.
if os.path.exists(self._arguments["xtr"]):
os.remove(self._arguments["xtr"])
def test_01_check_dic(self): def test_01_check_dic(self):
args = [self._arguments["dic"]] dictionary = self._arguments["dic"]
dtype = self._arguments["dic_type"]
alphabet = self._arguments["alphabet"]
kwargs = {} kwargs = {}
kwargs["type"] = self._arguments["dic_type"]
kwargs["alphabet"] = self._arguments["alphabet"]
kwargs["strict"] = False kwargs["strict"] = False
kwargs["no_space_warning"] = True kwargs["no_space_warning"] = True
ret = check_dic(*args, **kwargs) ret = check_dic(dictionary, dtype, alphabet, **kwargs)
ok = os.path.exists(self._arguments["dic_check"]) and ret ok = os.path.exists(self._arguments["dic_check"]) and ret
self.assertTrue(ok, "Dictionary checking failed!") self.assertTrue(ok, "Dictionary checking failed!")
def test_02_compress(self): def test_02_compress(self):
args = [self._arguments["dic"]] dictionary = self._arguments["dic"]
kwargs = {} kwargs = {}
kwargs["output"] = None
kwargs["flip"] = False kwargs["flip"] = False
kwargs["semitic"] = False kwargs["semitic"] = False
kwargs["version"] = "v2" kwargs["version"] = UnitexConstants.DICTIONARY_VERSION_1
ret = compress(*args, **kwargs) ret = compress(dictionary, **kwargs)
ok = os.path.exists(self._arguments["bin"]) and os.path.exists(self._arguments["inf"]) and ret ok = os.path.exists(self._arguments["bin"]) and os.path.exists(self._arguments["inf"]) and ret
self.assertTrue(ok, "Compression failed!") self.assertTrue(ok, "Compression failed!")
def test_03_normalize(self): def test_03_normalize(self):
args = [self._arguments["txt"]] text = self._arguments["txt"]
kwargs = {} kwargs = {}
kwargs["no_carriage_return"] = False kwargs["no_carriage_return"] = False
...@@ -129,23 +135,23 @@ class TestUnitexTools(unittest.TestCase): ...@@ -129,23 +135,23 @@ class TestUnitexTools(unittest.TestCase):
kwargs["replacement_rules"] = None kwargs["replacement_rules"] = None
kwargs["no_separator_normalization"] = False kwargs["no_separator_normalization"] = False
ret = normalize(*args, **kwargs) ret = normalize(text, **kwargs)
ok = os.path.exists(self._arguments["snt"]) and ret ok = os.path.exists(self._arguments["snt"]) and ret
self.assertTrue(ok, "Normalisation failed!") self.assertTrue(ok, "Normalisation failed!")
def test_04_fst2txt(self): def test_04_fst2txt(self):
args = [self._arguments["sentence"]] grammar = self._arguments["sentence"]
text = self._arguments["snt"]
alphabet = self._arguments["alphabet"]
kwargs = {} kwargs = {}
kwargs["text"] = self._arguments["snt"]
kwargs["alphabet"] = self._arguments["alphabet"]
kwargs["start_on_space"] = False kwargs["start_on_space"] = False
kwargs["char_by_char"] = False kwargs["char_by_char"] = False
kwargs["merge"] = True kwargs["merge"] = True
ret = fst2txt(*args, **kwargs) ret = fst2txt(grammar, text, alphabet, **kwargs)
ok = ret ok = ret
...@@ -155,16 +161,16 @@ class TestUnitexTools(unittest.TestCase): ...@@ -155,16 +161,16 @@ class TestUnitexTools(unittest.TestCase):
if not os.path.exists(self._arguments["dir"]): if not os.path.exists(self._arguments["dir"]):
os.mkdir(self._arguments["dir"]) os.mkdir(self._arguments["dir"])
args = [self._arguments["snt"]] text = self._arguments["snt"]
alphabet = self._arguments["alphabet"]
kwargs = {} kwargs = {}
kwargs["alphabet"] = self._arguments["alphabet"]
kwargs["char_by_char"] = False kwargs["char_by_char"] = False
kwargs["tokens"] = None kwargs["tokens"] = None
kwargs["input_offsets"] = None kwargs["input_offsets"] = None
kwargs["output_offsets"] = None kwargs["output_offsets"] = None
ret = tokenize(*args, **kwargs) ret = tokenize(text, alphabet, **kwargs)
ok = ret ok = ret
ok = ok and os.path.exists(self._arguments["text.cod"]) ok = ok and os.path.exists(self._arguments["text.cod"])
...@@ -176,18 +182,18 @@ class TestUnitexTools(unittest.TestCase): ...@@ -176,18 +182,18 @@ class TestUnitexTools(unittest.TestCase):
self.assertTrue(ok, "Tokenisation failed!") self.assertTrue(ok, "Tokenisation failed!")
def test_06_dico(self): def test_06_dico(self):
args = [self._arguments["bin"]] dictionaries = [self._arguments["bin"]]
text = self._arguments["snt"]
alphabet = self._arguments["alphabet"]
kwargs = {} kwargs = {}
kwargs["text"] = self._arguments["snt"]
kwargs["alphabet"] = self._arguments["alphabet"]
kwargs["morpho"] = None kwargs["morpho"] = None
kwargs["korean"] = False kwargs["korean"] = False
kwargs["semitic"] = False kwargs["semitic"] = False
kwargs["arabic_rules"] = None kwargs["arabic_rules"] = None
kwargs["raw"] = None kwargs["raw"] = None
ret = dico(*args, **kwargs) ret = dico(dictionaries, text, alphabet, **kwargs)
ok = ret ok = ret
ok = ok and os.path.exists(self._arguments["dlf"]) ok = ok and os.path.exists(self._arguments["dlf"])
...@@ -216,21 +222,19 @@ class TestUnitexTools(unittest.TestCase): ...@@ -216,21 +222,19 @@ class TestUnitexTools(unittest.TestCase):
ok = True ok = True
for f in files: for text in files:
args = [f] ret = sort_txt(text, **kwargs)
ret = sort_txt(*args, **kwargs)
ok = ok and ret ok = ok and ret
self.assertTrue(ok, "Sorting failed!") self.assertTrue(ok, "Sorting failed!")
def test_08_grf2fst2(self): def test_08_grf2fst2(self):
args = [self._arguments["grf"]] grammar = self._arguments["grf"]
alphabet = self._arguments["alphabet"]
kwargs = {} kwargs = {}
kwargs["loop_check"] = False kwargs["loop_check"] = False
kwargs["alphabet"] = self._arguments["alphabet"]
kwargs["char_by_char"] = False kwargs["char_by_char"] = False
kwargs["pkgdir"] = None kwargs["pkgdir"] = None
kwargs["no_empty_graph_warning"] = False kwargs["no_empty_graph_warning"] = False
...@@ -240,47 +244,48 @@ class TestUnitexTools(unittest.TestCase): ...@@ -240,47 +244,48 @@ class TestUnitexTools(unittest.TestCase):
kwargs["debug"] = False kwargs["debug"] = False
kwargs["check_variables"] = False kwargs["check_variables"] = False
ret = grf2fst2(*args, **kwargs) ret = grf2fst2(grammar, alphabet, **kwargs)
ok = os.path.exists(self._arguments["fst"]) and ret ok = os.path.exists(self._arguments["fst"]) and ret
self.assertTrue(ok, "Grammar compilation failed!") self.assertTrue(ok, "Grammar compilation failed!")
def test_09_locate(self): def test_09_locate(self):
args = [self._arguments["fst"]] grammar = self._arguments["fst"]
text = self._arguments["snt"]
alphabet = self._arguments["alphabet"]
kwargs = {} kwargs = {}
kwargs["text"] = self._arguments["snt"]
kwargs["alphabet"] = self._arguments["alphabet"]
kwargs["start_on_space"] = False kwargs["start_on_space"] = False
kwargs["char_by_char"] = False kwargs["char_by_char"] = False
kwargs["morpho"] = None kwargs["morpho"] = None
kwargs["korean"] = False kwargs["korean"] = False
kwargs["arabic_rules"] = None kwargs["arabic_rules"] = None
kwargs["sntdir"] = None kwargs["sntdir"] = None
kwargs["negation_operator"] = None kwargs["negation_operator"] = UnitexConstants.NEGATION_OPERATOR
kwargs["number_of_matches"] = None kwargs["number_of_matches"] = None
kwargs["stop_token_count"] = None kwargs["stop_token_count"] = None
kwargs["match_mode"] = "longest" kwargs["match_mode"] = UnitexConstants.MATCH_MODE_LONGEST
kwargs["output_mode"] = "merge" kwargs["output_mode"] = UnitexConstants.OUTPUT_MODE_MERGE
kwargs["protect_dic_chars"] = True kwargs["protect_dic_chars"] = True
kwargs["variable"] = None kwargs["variable"] = None
kwargs["ambiguous_outputs"] = True kwargs["ambiguous_outputs"] = True
kwargs["variable_error"] = "ignore" kwargs["variable_error"] = UnitexConstants.ON_ERROR_IGNORE
ret = locate(*args, **kwargs) ret = locate(grammar, text, alphabet, **kwargs)
ok = os.path.exists(self._arguments["ind"]) and os.path.exists(self._arguments["concord.n"]) and ret ok = os.path.exists(self._arguments["ind"]) and os.path.exists(self._arguments["concord.n"]) and ret
self.assertTrue(ok, "Locate failed!") self.assertTrue(ok, "Locate failed!")
def test_10_concord(self): def test_10_concord(self):
args = [self._arguments["ind"]] index = self._arguments["ind"]
alphabet = self._arguments["alphabet"]
kwargs = {} kwargs = {}
kwargs["font"] = None kwargs["font"] = None
...@@ -290,35 +295,34 @@ class TestUnitexTools(unittest.TestCase): ...@@ -290,35 +295,34 @@ class TestUnitexTools(unittest.TestCase):
kwargs["left"] = "1000s" kwargs["left"] = "1000s"
kwargs["right"] = "1000s" kwargs["right"] = "1000s"
kwargs["sort"] = "CR" kwargs["sort"] = UnitexConstants.SORT_CENTER_RIGHT
kwargs["format"] = "text" kwargs["format"] = UnitexConstants.FORMAT_TEXT
kwargs["script"] = None kwargs["script"] = None
kwargs["offsets"] = None kwargs["offsets"] = None
kwargs["unxmlize"] = None kwargs["unxmlize"] = None
kwargs["output"] = None kwargs["output"] = None
kwargs["directory"] = None kwargs["directory"] = None
kwargs["alphabet"] = self._arguments["alphabet"]
kwargs["thai"] = False kwargs["thai"] = False
ret = concord(*args, **kwargs) ret = concord(index, alphabet, **kwargs)
ok = os.path.exists(self._arguments["concordances"]) and ret ok = os.path.exists(self._arguments["concordances"]) and ret
self.assertTrue(ok, "Concord failed!") self.assertTrue(ok, "Concord failed!")
def test_11_txt2tfst(self): def test_11_txt2tfst(self):
args = [self._arguments["snt"]] text = self._arguments["snt"]
alphabet = self._arguments["alphabet"]
kwargs = {} kwargs = {}
kwargs["alphabet"] = self._arguments["alphabet"]
kwargs["clean"] = False kwargs["clean"] = False
kwargs["normalization_grammar"] = None kwargs["normalization_grammar"] = None
kwargs["tagset"] = None kwargs["tagset"] = None
kwargs["korean"] = False kwargs["korean"] = False
ret = txt2tfst(*args, **kwargs) ret = txt2tfst(text, alphabet, **kwargs)
ok = ret ok = ret
ok = ok and os.path.exists(self._arguments["text.tfst"]) ok = ok and os.path.exists(self._arguments["text.tfst"])
...@@ -326,6 +330,20 @@ class TestUnitexTools(unittest.TestCase): ...@@ -326,6 +330,20 @@ class TestUnitexTools(unittest.TestCase):
self.assertTrue(ok, "Txt2Tfst failed!") self.assertTrue(ok, "Txt2Tfst failed!")
def test_12_extract(self):
text = self._arguments["snt"]
output = self._arguments["xtr"]
index = self._arguments["ind"]
kwargs = {}
kwargs["non_matching_sentences"] = False
ret = extract(text, output, index, **kwargs)
ok = ret
ok = ok and os.path.exists(self._arguments["xtr"])
self.assertTrue(ok, "Extract failed!")
if __name__ == '__main__': if __name__ == '__main__':
......
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
__all__ = ["io", "tools", "processor"]
import logging import logging
import os import os
import sys import sys
...@@ -93,7 +91,7 @@ else: ...@@ -93,7 +91,7 @@ else:
if DEBUG not in (0, 1): if DEBUG not in (0, 1):
raise UnitexException( "Wrong $UNITEX_DEBUG value..." ) raise UnitexException( "Wrong $UNITEX_DEBUG value..." )
# If a log file is specified, the log will be duplicated # If a log file is specified, the log will be redirected
# to this file # to this file
LOG = os.path.expandvars('$UNITEX_LOG') LOG = os.path.expandvars('$UNITEX_LOG')
if LOG != '$UNITEX_LOG': if LOG != '$UNITEX_LOG':
...@@ -101,37 +99,22 @@ if LOG != '$UNITEX_LOG': ...@@ -101,37 +99,22 @@ if LOG != '$UNITEX_LOG':
else: else:
LOG = None LOG = None
LOGGER = logging.getLogger("unitex") kwargs = {}
ch = logging.StreamHandler()
if DEBUG == 1: if DEBUG == 1:
ch.setLevel(logging.DEBUG) kwargs["level"] = logging.DEBUG
elif VERBOSE == 1: elif VERBOSE == 1:
ch.setLevel(logging.WARNING) kwargs["level"] = logging.WARNING
elif VERBOSE == 2: elif VERBOSE == 2:
ch.setLevel(logging.INFO) kwargs["level"] = logging.INFO
else: else:
ch.setLevel(logging.ERROR) kwargs["level"] = logging.ERROR
cf = logging.Formatter("%(name)-12s: %(levelname)-8s %(message)s")
ch.setFormatter(cf)
LOGGER.addHandler(ch)
if LOG is not None: if LOG is not None:
fh = logging.FileHandler(LOG) kwargs["format"] = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
kwargs["filename"] = LOG
if DEBUG == 1: kwargs["filemode"] = "a"
fh.setLevel(logging.DEBUG) else:
elif VERBOSE == 1: kwargs["format"] = "%(name)-12s: %(levelname)-8s %(message)s"
fh.setLevel(logging.WARNING)
elif VERBOSE == 2:
fh.setLevel(logging.INFO)
else:
fh.setLevel(logging.ERROR)
ff = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
fh.setFormatter(ff)
LOGGER.addHandler(fh) logging.basicConfig(**kwargs)
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import logging
import os import os
import tempfile import tempfile
from unitex import * from unitex import *
from unitex.io import exists from unitex.io import exists
LOGGER = logging.getLogger(__name__)
class Options(object): class Options(object):
def __init__(self): def __init__(self, options=None):
self.__options = {} self.__options = {}
if options is not None:
self.load(options)
def __contains__(self, key): def __contains__(self, key):
return key in self.__options return key in self.__options
...@@ -180,10 +186,11 @@ class ConcordOptions(Options): ...@@ -180,10 +186,11 @@ class ConcordOptions(Options):
self["output"] = output self["output"] = output
directory = options.get("directory", None) directory = options.get("directory", None)
if directory is not None and isinstance(directory, str) is False: if directory is not None:
raise UnitexException("[CONCORD] Wrong value for the 'directory' option. String required.") if isinstance(directory, str) is False:
if exists(directory) is False: raise UnitexException("[CONCORD] Wrong value for the 'directory' option. String required.")
raise UnitexException("[CONCORD] The text 'directory' doesn't exist.") if exists(directory) is False:
raise UnitexException("[CONCORD] The text 'directory' doesn't exist.")
self["directory"] = directory self["directory"] = directory
thai = options.get("thai", False) thai = options.get("thai", False)
...@@ -218,7 +225,7 @@ class DicoOptions(Options): ...@@ -218,7 +225,7 @@ class DicoOptions(Options):
raise UnitexException("[DICO] Wrong value for the 'semitic' option. Boolean required.") raise UnitexException("[DICO] Wrong value for the 'semitic' option. Boolean required.")
self["semitic"] = semitic self["semitic"] = semitic
arabic_rules = options.get("arabic_rules", False) arabic_rules = options.get("arabic_rules", None)
if arabic_rules is not None: if arabic_rules is not None:
if isinstance(arabic_rules, str) is False: if isinstance(arabic_rules, str) is False:
raise UnitexException("[DICO] Wrong value for the 'arabic_rules' option. String required.") raise UnitexException("[DICO] Wrong value for the 'arabic_rules' option. String required.")
...@@ -226,7 +233,7 @@ class DicoOptions(Options): ...@@ -226,7 +233,7 @@ class DicoOptions(Options):
raise UnitexException("[DICO] Rules file '%s' doesn't exist." % arabic_rules) raise UnitexException("[DICO] Rules file '%s' doesn't exist." % arabic_rules)
self["arabic_rules"] = arabic_rules self["arabic_rules"] = arabic_rules
raw = options.get("raw", False) raw = options.get("raw", None)
if raw is not None and isinstance(raw, str) is False: if raw is not None and isinstance(raw, str) is False:
raise UnitexException("[DICO] Wrong value for the 'raw' option. String required.") raise UnitexException("[DICO] Wrong value for the 'raw' option. String required.")
self["raw"] = raw self["raw"] = raw
...@@ -257,10 +264,10 @@ class Fst2TxtOptions(Options): ...@@ -257,10 +264,10 @@ class Fst2TxtOptions(Options):
raise UnitexException("[FST2TXT] Wrong value for the 'start_on_space' option. Boolean required.") raise UnitexException("[FST2TXT] Wrong value for the 'start_on_space' option. Boolean required.")
self["start_on_space"] = start_on_space self["start_on_space"] = start_on_space
word_by_word = options.get("word_by_word", False) char_by_char = options.get("char_by_char", False)
if isinstance(word_by_word, bool) is False: if isinstance(char_by_char, bool) is False:
raise UnitexException("[FST2TXT] Wrong value for the 'word_by_word' option. Boolean required.") raise UnitexException("[FST2TXT] Wrong value for the 'char_by_char' option. Boolean required.")
self["word_by_word"] = word_by_word self["char_by_char"] = char_by_char
merge = options.get("merge", True) merge = options.get("merge", True)
if isinstance(merge, bool) is False: if isinstance(merge, bool) is False:
...@@ -306,7 +313,7 @@ class Grf2Fst2Options(Options): ...@@ -306,7 +313,7 @@ class Grf2Fst2Options(Options):
self["silent_grf_name"] = silent_grf_name self["silent_grf_name"] = silent_grf_name
named_repositories = options.get("named_repositories", None) named_repositories = options.get("named_repositories", None)
if isinstance(named_repositories, str) is False: if named_repositories is not None and isinstance(named_repositories, str) is False:
raise UnitexException("[GRF2FST2] Wrong value for the 'named_repositories' option. String required.") raise UnitexException("[GRF2FST2] Wrong value for the 'named_repositories' option. String required.")
self["named_repositories"] = named_repositories self["named_repositories"] = named_repositories
...@@ -315,7 +322,7 @@ class Grf2Fst2Options(Options): ...@@ -315,7 +322,7 @@ class Grf2Fst2Options(Options):
raise UnitexException("[GRF2FST2] Wrong value for the 'debug' option. Boolean required.") raise UnitexException("[GRF2FST2] Wrong value for the 'debug' option. Boolean required.")
self["debug"] = debug self["debug"] = debug
check_variables = options.get("check_variables", False) check_variables = options.get("check_variables", True)
if isinstance(check_variables, bool) is False: if isinstance(check_variables, bool) is False:
raise UnitexException("[GRF2FST2] Wrong value for the 'check_variables' option. Boolean required.") raise UnitexException("[GRF2FST2] Wrong value for the 'check_variables' option. Boolean required.")
self["check_variables"] = check_variables self["check_variables"] = check_variables
...@@ -368,8 +375,8 @@ class LocateOptions(Options): ...@@ -368,8 +375,8 @@ class LocateOptions(Options):
raise UnitexException("[LOCATE] Directory '%s' doesn't exist." % sntdir) raise UnitexException("[LOCATE] Directory '%s' doesn't exist." % sntdir)
self["sntdir"] = sntdir self["sntdir"] = sntdir
negation_operator = options.get("negation_operator", None) negation_operator = options.get("negation_operator", UnitexConstants.NEGATION_OPERATOR)
if negation_operator is not None and negation_operator not in (UnitexConstants.NEGATION_OPERATOR, UnitexConstants.NEGATION_OPERATOR_OLD): if negation_operator not in (UnitexConstants.NEGATION_OPERATOR, UnitexConstants.NEGATION_OPERATOR_OLD):
raise UnitexException("[LOCATE] Wrong value for the 'negation_operator' option. UnitexConstants.NEGATION_OPERATOR(_OLD) required.") raise UnitexException("[LOCATE] Wrong value for the 'negation_operator' option. UnitexConstants.NEGATION_OPERATOR(_OLD) required.")
self["negation_operator"] = negation_operator self["negation_operator"] = negation_operator
...@@ -469,7 +476,7 @@ class NormalizeOptions(Options): ...@@ -469,7 +476,7 @@ class NormalizeOptions(Options):
raise UnitexException("[NORMALIZE] Wrong value for the 'no_separator_normalization' option. Boolean required.") raise UnitexException("[NORMALIZE] Wrong value for the 'no_separator_normalization' option. Boolean required.")
self["no_separator_normalization"] = no_separator_normalization self["no_separator_normalization"] = no_separator_normalization
replacement_rules = options.get("replacement_rules", False) replacement_rules = options.get("replacement_rules", None)
if replacement_rules is not None: if replacement_rules is not None:
if isinstance(replacement_rules, str) is False: if isinstance(replacement_rules, str) is False:
raise UnitexException("[NORMALIZE] Wrong value for the 'replacement_rules' option. String required.") raise UnitexException("[NORMALIZE] Wrong value for the 'replacement_rules' option. String required.")
...@@ -536,7 +543,7 @@ class TokenizeOptions(Options): ...@@ -536,7 +543,7 @@ class TokenizeOptions(Options):
if isinstance(tokens, str) is False: if isinstance(tokens, str) is False:
raise UnitexException("[TOKENIZE] Wrong value for the 'tokens' option. String required.") raise UnitexException("[TOKENIZE] Wrong value for the 'tokens' option. String required.")
if exists(tokens) is False: if exists(tokens) is False:
raise UnitexException("[TOKENIZE] Offsets file '%s' doesn't exist." % tokens) raise UnitexException("[TOKENIZE] Tokens file '%s' doesn't exist." % tokens)
self["tokens"] = tokens self["tokens"] = tokens
input_offsets = options.get("input_offsets", None) input_offsets = options.get("input_offsets", None)
...@@ -664,6 +671,8 @@ class UnitexConfig(Options): ...@@ -664,6 +671,8 @@ class UnitexConfig(Options):
super(UnitexConfig, self).__init__() super(UnitexConfig, self).__init__()
def load(self, settings): def load(self, settings):
options = settings.get("global", {})
verbose = options.get("verbose", VERBOSE) verbose = options.get("verbose", VERBOSE)
if verbose not in (0, 1, 2): if verbose not in (0, 1, 2):
raise UnitexException("Wrong value for the 'verbose' global option.") raise UnitexException("Wrong value for the 'verbose' global option.")
...@@ -689,25 +698,19 @@ class UnitexConfig(Options): ...@@ -689,25 +698,19 @@ class UnitexConfig(Options):
raise UnitexException("Wrong value for the 'virtualization' global option.") raise UnitexException("Wrong value for the 'virtualization' global option.")
self["virtualization"] = bool(virtualization) self["virtualization"] = bool(virtualization)
resources = ResourcesOptions() self["resources"] = ResourcesOptions(settings.get("resources", {}))
self["resources"] = resources.load(settings.get("resources", {}))
options = settings.get("options", {}) options = settings.get("options", {})
normalize = NormalizeOptions() self["check_dic"] = CheckDicOptions(options.get("normalize", {}))
self["normalize"] = normalize.load(options.get("normalize", {})) self["compress"] = CheckDicOptions(options.get("normalize", {}))
self["concord"] = ConcordOptions(options.get("concord", {}))
tokenize = TokenizeOptions() self["dico"] = DicoOptions(options.get("dico", {}))
self["tokenize"] = current.load(options.get("tokenize", {})) self["extract"] = ExtractOptions(options.get("extract", {}))
self["fst2txt"] = Fst2TxtOptions(options.get("extract", {}))
dico = DicoOptions() self["Grf2Fst2"] = Grf2Fst2Options(options.get("extract", {}))
self["dico"] = current.load(options.get("dico", {})) self["locate"] = LocateOptions(options.get("locate", {}))
self["normalize"] = NormalizeOptions(options.get("normalize", {}))
locate = LocateOptions() self["sort_txt"] = SortTxtOptions(options.get("normalize", {}))
self["locate"] = current.load(options.get("locate", {})) self["tokenize"] = TokenizeOptions(options.get("tokenize", {}))
self["txt2tfst"] = Txt2TFstOptions(options.get("tokenize", {}))
concord = ConcordOptions()
self["concord"] = current.load(options.get("concord", {}))
extract = ExtractOptions()
self["extract"] = current.load(options.get("extract", {}))
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import logging
import os import os
from _unitex import * from _unitex import *
from unitex import UnitexException, UnitexConstants, LOGGER from unitex import UnitexException, UnitexConstants
LOGGER = logging.getLogger(__name__)
...@@ -168,6 +171,14 @@ def ls(path): ...@@ -168,6 +171,14 @@ def ls(path):
return unitex_ls(path) return unitex_ls(path)
def exists(path): def exists(path):
"""This function verify if a file exists (on disk or virtual filesystem).
Argument:
path [str] -- directory path
Return [bool]:
The function returns 'True' if it succeeds and 'False' otherwise.
"""
if path.startswith(UnitexConstants.VFS_PREFIX) is False: if path.startswith(UnitexConstants.VFS_PREFIX) is False:
return os.path.exists(path) return os.path.exists(path)
return path in ls(path) return path in ls(path)
......
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import logging
from unitex import * from unitex import *
from unitex.resources import * from unitex.resources import *
from unitex.tools import * from unitex.tools import *
LOGGER = logging.getLogger(__name__)
class UnitexProcessor(object): class UnitexProcessor(object):
......
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import logging
from _unitex import * from _unitex import *
from unitex import LOGGER
LOGGER = logging.getLogger(__name__)
......
Ce diff est replié.
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous pour commenter