Skip to content
Extraits de code Groupes Projets
Valider b4a1d6a9 rédigé par Patrick Watrin's avatar Patrick Watrin
Parcourir les fichiers

First commit: only the basic processing functions

parent c522cc42
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
# Python
*.pyc
*.egg
*.egg-info
dist
build
# VIM
*.swp
# Dependencies
dependencies/
Unitex binding for Python 3
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os, subprocess, sys
from distutils.core import setup
from distutils.command.build import build
from distutils.command.clean import clean
from distutils.command.install import install
# Directory holding the Unitex C++ sources. It must be supplied through the
# UNITEX_INC environment variable because the custom build/clean/install
# commands run inside its "build" and "bin" sub-directories.
UNITEX_INC = os.environ.get("UNITEX_INC")
if UNITEX_INC is None:
    # The message previously misspelled the variable name ("UNTIEX_INC"),
    # which sent users looking for a variable that does not exist.
    sys.stderr.write("You need to specify the UNITEX_INC variable (i.e. Unitex C++ src directory)!\n")
    sys.stderr.write(" -> e.g.: UNITEX_INC=/path/to/unitex/Src/C++ python setup.py cmd\n")
    sys.exit(1)
# Normalise to an absolute path so later commands do not depend on the
# directory setup.py happens to be launched from.
UNITEX_INC = os.path.abspath(UNITEX_INC)
class CustomBuild(build):
    """Build command that compiles the Unitex shared library first.

    ``make 64BITS=yes LIBRARY=yes`` is run inside ``UNITEX_INC/build`` and,
    when it succeeds, the regular distutils ``build`` command is executed.
    """

    def run(self):
        """Compile the Unitex library, then run the standard build step.

        Raises:
            OSError: if ``make`` cannot be started or exits with a non-zero
                status (the message is what ``make`` wrote to stderr).
        """
        build_dir = os.path.join(UNITEX_INC, "build")
        # Shell-style rendering kept only for the diagnostic message.
        command = "cd %s && make 64BITS=yes LIBRARY=yes" % build_dir
        try:
            # An argument list plus cwd= avoids the shell, so a source path
            # containing spaces or shell metacharacters works as-is.
            process = subprocess.Popen(
                ["make", "64BITS=yes", "LIBRARY=yes"],
                cwd=build_dir,
                stderr=subprocess.PIPE,
            )
        except Exception:
            sys.stderr.write("Error in command: %s\n" % command)
            raise
        # communicate() drains stderr while waiting; wait() alone can block
        # forever once the child fills the pipe buffer.
        _, errors = process.communicate()
        if process.returncode != 0:
            raise OSError(errors.decode(errors="replace"))
        build.run(self)
class CustomClean(clean):
    """Clean command that also cleans the Unitex library build tree.

    ``make clean`` is run inside ``UNITEX_INC/build`` and, when it succeeds,
    the regular distutils ``clean`` command is executed.
    """

    def run(self):
        """Clean the Unitex build directory, then run the standard clean step.

        Raises:
            OSError: if ``make`` cannot be started or exits with a non-zero
                status (the message is what ``make`` wrote to stderr).
        """
        build_dir = os.path.join(UNITEX_INC, "build")
        # Shell-style rendering kept only for the diagnostic message.
        command = "cd %s && make clean" % build_dir
        try:
            # An argument list plus cwd= avoids the shell, so a source path
            # containing spaces or shell metacharacters works as-is.
            process = subprocess.Popen(
                ["make", "clean"],
                cwd=build_dir,
                stderr=subprocess.PIPE,
            )
        except Exception:
            sys.stderr.write("Error in command: %s\n" % command)
            raise
        # communicate() drains stderr while waiting; wait() alone can block
        # forever once the child fills the pipe buffer.
        _, errors = process.communicate()
        if process.returncode != 0:
            raise OSError(errors.decode(errors="replace"))
        clean.run(self)
class CustomInstall(install):
    """Install command that first copies the Unitex shared library.

    The platform-specific library found in ``UNITEX_INC/bin`` is copied to
    ``/usr/local/lib`` before the regular distutils ``install`` command runs.
    """

    def run(self):
        """Copy the shared library, then run the standard install step.

        Exits with status 1 on platforms other than macOS and Linux.

        Raises:
            OSError: if ``cp`` cannot be started or exits with a non-zero
                status (the message is what ``cp`` wrote to stderr).
        """
        if sys.platform == "darwin":
            library = "libunitex.dylib"
        elif sys.platform.startswith("linux"):
            # Python 3.3+ reports "linux"; only older interpreters report
            # "linux2", so an exact comparison rejects every current Linux.
            library = "libunitex.so"
        else:
            sys.stderr.write("Plateform '%s' not supported...\n" % sys.platform)
            sys.exit(1)

        bin_dir = os.path.join(UNITEX_INC, "bin")
        # Shell-style rendering kept only for the diagnostic message.
        command = "cd %s && cp %s /usr/local/lib" % (bin_dir, library)
        try:
            # An argument list plus cwd= avoids the shell, so a source path
            # containing spaces or shell metacharacters works as-is.
            process = subprocess.Popen(
                ["cp", library, "/usr/local/lib"],
                cwd=bin_dir,
                stderr=subprocess.PIPE,
            )
        except Exception:
            sys.stderr.write("Error in command: %s\n" % command)
            raise
        # communicate() drains stderr while waiting; wait() alone can block
        # forever once the child fills the pipe buffer.
        _, errors = process.communicate()
        if process.returncode != 0:
            raise OSError(errors.decode(errors="replace"))
        install.run(self)
# Read the long description up front so the README handle is closed as soon
# as the text is loaded instead of being left open until garbage collection.
with open('README') as readme:
    LONG_DESCRIPTION = readme.read()

# Package metadata; the three custom commands wrap the stock distutils ones
# so that the Unitex library is built, cleaned and installed alongside.
setup(
    name="unitex",
    version="1.0",
    description="Python 3 binding for the Unitex library",
    long_description=LONG_DESCRIPTION,

    author="Patrick Watrin",
    author_email="patrick.watrin@gmail.com",

    # https://pypi.python.org/pypi?%3Aaction=list_classifiers
    classifiers=[
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3 :: Only",
        "Development Status :: 4 - Beta",
        "Intended Audience :: Developers",
        "Topic :: Scientific/Engineering :: Information Analysis",
    ],

    keywords="Unitex, Finite-States Transducers, Natural Language Processing",

    license="MIT",

    install_requires=[
        # TO FILL
    ],

    packages=["unitex"],
    package_dir={'unitex': 'unitex'},

    data_files=[
    ],

    cmdclass={
        "build": CustomBuild,
        "clean": CustomClean,
        "install": CustomInstall,
    },
)
Le fichier a été supprimé par une entrée .gitattributes, ou son encodage n'est pas pris en charge.
Le fichier a été supprimé par une entrée .gitattributes, ou son encodage n'est pas pris en charge.
Le fichier a été supprimé par une entrée .gitattributes, ou son encodage n'est pas pris en charge.
Le fichier a été supprimé par une entrée .gitattributes, ou son encodage n'est pas pris en charge.
Le fichier a été supprimé par une entrée .gitattributes, ou son encodage n'est pas pris en charge.
Le fichier a été supprimé par une entrée .gitattributes, ou son encodage n'est pas pris en charge.
#Unigraph
SIZE 1188 840
FONT Times New Roman: 10
OFONT Arial Unicode MS:B 12
BCOLOR 16777215
FCOLOR 0
ACOLOR 13487565
SCOLOR 16711680
CCOLOR 255
DBOXES y
DFRAME n
DDATE n
DFILE n
DDIR n
DRIG n
DRST n
FITS 100
PORIENT L
#
6
"<E>/<ITEM>" 70 200 1 2
"" 450 200 0
"<être>" 162 200 1 3
"en" 277 200 1 4
"<N>" 353 200 1 5
"<E>/</ITEM>" 411 200 1 1
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import shutil
import unittest
from unitex.tools import *
class Arguments:
    """Read-only table of the paths and option values shared by the tests.

    Entries are looked up with ``arguments[key]``; an unknown key raises
    ``KeyError`` with an explanatory message.
    """

    def __init__(self, language=None):
        """Fill the table. ``language`` is accepted but not used."""
        text_dir = "data/80jours_snt"

        table = {
            "dic": "data/delaf-short.dic",
            "dic_type": "delaf",
            "dic_check": "data/CHECK_DIC.TXT",
            "bin": "data/delaf-short.bin",
            "inf": "data/delaf-short.inf",
            "alphabet": "data/Alphabet.txt",
            "alphabet_sort": "data/Alphabet_sort.txt",
            "sentence": "data/Sentence.fst2",
            "txt": "data/80jours.txt",
            "snt": "data/80jours.snt",
            "dir": text_dir,
        }

        # Files inside the text directory whose key is simply the file name.
        same_name_files = (
            "text.cod",
            "tok_by_freq.txt",
            "tok_by_alph.txt",
            "stats.n",
            "enter.pos",
            "dlf",
            "dlc",
            "err",
            "tags_err",
            "tags.ind",
            "stat_dic.n",
        )
        for file_name in same_name_files:
            table[file_name] = os.path.join(text_dir, file_name)

        table["grf"] = "data/test.txt"
        table["fst"] = "data/test.fst2"

        # Files inside the text directory stored under a different key.
        table["ind"] = os.path.join(text_dir, "concord.ind")
        table["concord.n"] = os.path.join(text_dir, "concord.n")
        table["concordances"] = os.path.join(text_dir, "concord.txt")

        self.__arguments = table

    def __getitem__(self, key):
        """Return the value stored under ``key`` or raise ``KeyError``."""
        if key in self.__arguments:
            return self.__arguments[key]
        raise KeyError("Argument '%s' not found ..." % key)
class TestUnitexTools(unittest.TestCase):
    """Exercise the Unitex tool wrappers on the sample data, step by step.

    The test names are numbered: unittest runs methods in name order, and
    several steps read files that an earlier step is expected to have
    written (for instance the ``.snt`` text and the compressed dictionary).
    """

    @classmethod
    def setUpClass(cls):
        # One shared table of paths/options for the whole class.
        cls._arguments = Arguments()

    @classmethod
    def tearDownClass(cls):
        paths = cls._arguments

        # Single-file outputs of 'check_dic', 'compress' (bin + inf) and
        # 'normalize'/'fst2txt', removed in that order.
        for key in ("dic_check", "bin", "inf", "snt"):
            if os.path.exists(paths[key]):
                os.remove(paths[key])

        # Removing (recursively) the text directory.
        if os.path.exists(paths["dir"]):
            shutil.rmtree(paths["dir"])

        # Removing output file from the 'grf2fst2' command.
        if os.path.exists(paths["fst"]):
            os.remove(paths["fst"])

    def test_01_check_dic(self):
        options = {
            "type": self._arguments["dic_type"],
            "alphabet": self._arguments["alphabet"],
            "strict": False,
            "no_space_warning": True,
        }

        ret = check_dic(self._arguments["dic"], **options)

        ok = os.path.exists(self._arguments["dic_check"]) and (ret == 0)
        self.assertTrue(ok, "Dictionary checking failed! Return code is '%s'" % ret)

    def test_02_compress(self):
        options = {
            "flip": False,
            "semitic": False,
            "version": "v2",
        }

        ret = compress(self._arguments["dic"], **options)

        produced = (
            os.path.exists(self._arguments["bin"])
            and os.path.exists(self._arguments["inf"])
        )
        ok = produced and (ret == 0)
        self.assertTrue(ok, "Compression failed! Return code is '%s'" % ret)

    def test_03_normalize(self):
        options = {
            "no_carriage_return": False,
            "input_offsets": None,
            "output_offsets": None,
            "replacement_rules": None,
            "no_separator_normalization": False,
        }

        ret = normalize(self._arguments["txt"], **options)

        ok = os.path.exists(self._arguments["snt"]) and (ret == 0)
        self.assertTrue(ok, "Normalisation failed! Return code is '%s'" % ret)

    def test_04_fst2txt(self):
        options = {
            "text": self._arguments["snt"],
            "alphabet": self._arguments["alphabet"],
            "start_on_space": False,
            "char_by_char": False,
            "merge": True,
        }

        ret = fst2txt(self._arguments["sentence"], **options)

        self.assertTrue(ret == 0, "FST application failed! Return code is '%s'" % ret)

    def test_05_tokenize(self):
        # The tool writes into the text directory, so make sure it exists.
        if not os.path.exists(self._arguments["dir"]):
            os.mkdir(self._arguments["dir"])

        options = {
            "alphabet": self._arguments["alphabet"],
            "char_by_char": False,
            "tokens": None,
            "input_offsets": None,
            "output_offsets": None,
        }

        ret = tokenize(self._arguments["snt"], **options)

        expected = ("text.cod", "tok_by_freq.txt", "tok_by_alph.txt", "stats.n", "enter.pos")
        ok = (ret == 0) and all(
            os.path.exists(self._arguments[name]) for name in expected
        )
        self.assertTrue(ok, "Tokenisation failed! Return code is '%s'" % ret)

    def test_06_dico(self):
        options = {
            "text": self._arguments["snt"],
            "alphabet": self._arguments["alphabet"],
            "morpho": None,
            "korean": False,
            "semitic": False,
            "arabic_rules": None,
            "raw": None,
        }

        ret = dico(self._arguments["bin"], **options)

        expected = ("dlf", "dlc", "err", "tags_err", "tags.ind", "stat_dic.n")
        ok = (ret == 0) and all(
            os.path.exists(self._arguments[name]) for name in expected
        )
        self.assertTrue(ok, "Dictionary application failed! Return code is '%s'" % ret)

    def test_07_sort_txt(self):
        targets = [
            self._arguments[name] for name in ("dlf", "dlc", "err", "tags_err")
        ]

        options = {
            "duplicates": False,
            "reverse": False,
            "sort_order": self._arguments["alphabet_sort"],
            "line_info": None,
            "thai": False,
            "factorize_inflectional_codes": False,
        }

        # Every file is sorted even after a failure; the message below
        # reports the return code of the last call only.
        ok = True
        for path in targets:
            ret = sort_txt(path, **options)
            if ret != 0:
                ok = False

        self.assertTrue(ok, "Sorting failed! Return code is '%s'" % ret)

    def test_08_grf2fst2(self):
        options = {
            "loop_check": False,
            "alphabet": self._arguments["alphabet"],
            "char_by_char": False,
            "pkgdir": None,
            "no_empty_graph_warning": False,
            "tfst_check": False,
            "silent_grf_name": False,
            "named_repositories": None,
            "debug": False,
            "check_variables": False,
        }

        ret = grf2fst2(self._arguments["grf"], **options)

        ok = os.path.exists(self._arguments["fst"]) and (ret == 0)
        self.assertTrue(ok, "Grammar compilation failed! Return code is '%s'" % ret)

    def test_09_locate(self):
        options = {
            "text": self._arguments["snt"],
            "alphabet": self._arguments["alphabet"],
            "start_on_space": False,
            "char_by_char": False,
            "morpho": None,
            "korean": False,
            "arabic_rules": None,
            "sntdir": None,
            "negation_operator": None,
            "number_of_matches": None,
            "stop_token_count": None,
            "match_mode": "longest",
            "output_mode": "merge",
            "protect_dic_chars": True,
            "variable": None,
            "ambiguous_outputs": True,
            "variable_error": "ignore",
        }

        ret = locate(self._arguments["fst"], **options)

        produced = (
            os.path.exists(self._arguments["ind"])
            and os.path.exists(self._arguments["concord.n"])
        )
        ok = produced and (ret == 0)
        self.assertTrue(ok, "Locate failed! Return code is '%s'" % ret)

    def test_10_concord(self):
        options = {
            "font": None,
            "fontsize": None,
            "only_ambiguous": False,
            "only_matches": False,
            "left": "1000s",
            "right": "1000s",
            "sort": "CR",
            "format": "text",
            "script": None,
            "offsets": None,
            "unxmlize": None,
            "output": None,
            "directory": None,
            "alphabet": self._arguments["alphabet"],
            "thai": False,
        }

        ret = concord(self._arguments["ind"], **options)

        ok = os.path.exists(self._arguments["concordances"]) and (ret == 0)
        self.assertTrue(ok, "Concord failed! Return code is '%s'" % ret)
# Run the whole suite when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__all__ = ["io", "tools", "processor"]
import ctypes
import logging
import os
import sys
class UnitexException(Exception):
    """Exception raised by the binding; messages carry a fixed banner."""

    def __init__(self, message):
        """Store ``message`` prefixed with the ``## UNITEX EXCEPTION ##`` banner."""
        super().__init__("## UNITEX EXCEPTION ## %s" % message)
# Handle on the Unitex shared library, loaded once at import time. The file
# name depends on the platform; anything other than macOS or Linux is
# rejected with a UnitexException.
LIBUNITEX = None
if sys.platform == "darwin":
    LIBUNITEX = ctypes.cdll.LoadLibrary("libunitex.dylib")
elif sys.platform.startswith("linux"):
    # Python 3.3+ reports "linux"; only older interpreters report "linux2",
    # so an exact comparison would reject every current Linux system.
    LIBUNITEX = ctypes.cdll.LoadLibrary("libunitex.so")
else:
    raise UnitexException("Plateform '%s' not supported..." % sys.platform)
# VERBOSE = 0: ERROR logging level
# VERBOSE = 1: WARNING logging level
# VERBOSE = 2: INFO logging level
# Read from the UNITEX_VERBOSE environment variable; defaults to 0.
VERBOSE = int(os.environ.get("UNITEX_VERBOSE", 0))
if VERBOSE not in (0, 1, 2):
    raise UnitexException("Wrong $UNITEX_VERBOSE value...")

# DEBUG = 0: --
# DEBUG = 1: DEBUG logging level
#   -> if set to 1, it overrides the VERBOSE variable
# Read from the UNITEX_DEBUG environment variable; defaults to 0.
DEBUG = int(os.environ.get("UNITEX_DEBUG", 0))
if DEBUG not in (0, 1):
    raise UnitexException("Wrong $UNITEX_DEBUG value...")

# If a log file is specified (UNITEX_LOG), the log will be duplicated
# to this file; None means "console only".
LOG = os.environ.get("UNITEX_LOG")

# Resolve the level once so the logger and both handlers always agree.
if DEBUG == 1:
    _LOG_LEVEL = logging.DEBUG
elif VERBOSE == 1:
    _LOG_LEVEL = logging.WARNING
elif VERBOSE == 2:
    _LOG_LEVEL = logging.INFO
else:
    _LOG_LEVEL = logging.ERROR

LOGGER = logging.getLogger("unitex")
# Without an explicit level the logger inherits WARNING from the root
# logger and drops INFO/DEBUG records before any handler sees them, which
# made VERBOSE=2 and DEBUG=1 ineffective.
LOGGER.setLevel(_LOG_LEVEL)

# Console handler.
ch = logging.StreamHandler()
ch.setLevel(_LOG_LEVEL)
cf = logging.Formatter("%(name)-12s: %(levelname)-8s %(message)s")
ch.setFormatter(cf)
LOGGER.addHandler(ch)

# Optional file handler, with timestamps added to each record.
if LOG is not None:
    fh = logging.FileHandler(LOG)
    fh.setLevel(_LOG_LEVEL)
    ff = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    fh.setFormatter(ff)
    LOGGER.addHandler(fh)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import ctypes
from unitex import UnitexException, LOGGER, LIBUNITEX
class UnitexFile:
    """Interface sketch for a file-like object; nothing is implemented yet.

    Every method, the constructor included, raises ``NotImplementedError``.
    """

    def __init__(self):
        raise NotImplementedError()

    def open(self, path, mode=None, encoding=None):
        """Not implemented."""
        raise NotImplementedError()

    def close(self):
        """Not implemented."""
        raise NotImplementedError()

    def flush(self):
        """Not implemented."""
        raise NotImplementedError()

    def seek(self, offset):
        """Not implemented."""
        raise NotImplementedError()

    def tell(self):
        """Not implemented."""
        raise NotImplementedError()

    def write(self, data):
        """Not implemented."""
        raise NotImplementedError()

    def writelines(self, lines):
        """Not implemented."""
        raise NotImplementedError()

    def read(self, size=None):
        """Not implemented."""
        raise NotImplementedError()

    def readline(self):
        """Not implemented."""
        raise NotImplementedError()

    def readlines(self):
        """Not implemented."""
        raise NotImplementedError()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import ctypes
from unitex import UnitexException, LOGGER, LIBUNITEX
class UnitexSettings:
    """Interface sketch for a key/value settings holder; not implemented yet.

    Every method, the constructor included, raises ``NotImplementedError``.
    """

    def __init__(self):
        raise NotImplementedError()

    def get(self, key, default=None):
        """Not implemented."""
        raise NotImplementedError()

    def set(self, key, value):
        """Not implemented."""
        raise NotImplementedError()
class UnitexProcessor:
    """Interface sketch for the processor object; not implemented yet.

    Every method, the constructor included, raises ``NotImplementedError``.
    """

    def __init__(self):
        raise NotImplementedError()

    def open(self, path, mode=None, encoding=None):
        """Not implemented."""
        raise NotImplementedError()

    def close(self):
        """Not implemented."""
        raise NotImplementedError()
Ce diff est replié.
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous pour commenter