Skip to content
Extraits de code Groupes Projets
Valider adf344c8 rédigé par Patrick Watrin's avatar Patrick Watrin
Parcourir les fichiers

(1) add txt2tfst command. (2) add disclaimer to the _unitex.cpp extension. (3)...

(1) add txt2tfst command. (2) add disclaimer to the _unitex.cpp extension. (3) unit test to the IO module (work in progress)
parent 445dfd92
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
/*
* NOTE: some parts of this file are an adaptation of the 'fr_umlv_unitex_jni_UnitexJni.cpp' file
* which is included in the Unitex source distribution.
*/
#include <Python.h> #include <Python.h>
#include "AbstractFilePlugCallback.h" #include "AbstractFilePlugCallback.h"
......
...@@ -43,6 +43,9 @@ class Arguments: ...@@ -43,6 +43,9 @@ class Arguments:
self.__arguments["tags.ind"] = os.path.join(self.__arguments["dir"], "tags.ind") self.__arguments["tags.ind"] = os.path.join(self.__arguments["dir"], "tags.ind")
self.__arguments["stat_dic.n"] = os.path.join(self.__arguments["dir"], "stat_dic.n") self.__arguments["stat_dic.n"] = os.path.join(self.__arguments["dir"], "stat_dic.n")
self.__arguments["text.tfst"] = os.path.join(self.__arguments["dir"], "text.tfst")
self.__arguments["text.tind"] = os.path.join(self.__arguments["dir"], "text.tind")
self.__arguments["grf"] = "data/grammar.grf" self.__arguments["grf"] = "data/grammar.grf"
self.__arguments["fst"] = "data/grammar.fst2" self.__arguments["fst"] = "data/grammar.fst2"
...@@ -305,6 +308,24 @@ class TestUnitexTools(unittest.TestCase): ...@@ -305,6 +308,24 @@ class TestUnitexTools(unittest.TestCase):
self.assertTrue(ok, "Concord failed!") self.assertTrue(ok, "Concord failed!")
def test_11_txt2tfst(self):
    """Run txt2tfst on the snt file and check that both output files exist."""
    snt = self._arguments["snt"]
    options = {
        "alphabet": self._arguments["alphabet"],
        "clean": False,
        "normalization_grammar": None,
        "tagset": None,
        "korean": False,
    }

    succeeded = txt2tfst(snt, **options)

    # Short-circuit on purpose: the output files are only looked up while
    # every previous check (starting with the call itself) was truthy.
    for produced in ("text.tfst", "text.tind"):
        succeeded = succeeded and os.path.exists(self._arguments[produced])

    self.assertTrue(succeeded, "Txt2Tfst failed!")
if __name__ == '__main__': if __name__ == '__main__':
......
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import unittest import os, shutil, unittest
from unitex.io import * from unitex.io import *
class Arguments:
    """Fixed table of named paths, readable with ``instance[name]``.

    The table is filled once in the constructor and only exposed through
    ``__getitem__``; unknown names raise ``KeyError``.
    """

    def __init__(self, language=None):
        # 'language' is accepted but not used by this class.
        vfs = UnitexIOConstants.VFS_PREFIX

        self.__arguments = {
            "file_source": "data/corpus.txt",

            "file_target_hdd_01": "data/corpus-hdd-01.txt",
            "file_target_hdd_02": "data/corpus-hdd-02.txt",

            # Same relative paths as above, prefixed with the VFS marker.
            "file_target_vfs_01": "%sdata/corpus-vfs-01.txt" % vfs,
            "file_target_vfs_02": "%sdata/corpus-vfs-02.txt" % vfs,

            "virtual_file_hdd_01": "data/corpus-virtual-file-01.txt",
            "virtual_file_hdd_02": "data/corpus-virtual-file-02.txt",

            "virtual_file_vfs_01": "%sdata/corpus-virtual-file-01.txt" % vfs,
            "virtual_file_vfs_02": "%sdata/corpus-virtual-file-02.txt" % vfs,

            "directory": "data/biniou/",
        }

    def __getitem__(self, key):
        """Return the path stored under *key*; raise ``KeyError`` if absent."""
        if key in self.__arguments:
            return self.__arguments[key]
        raise KeyError("Argument '%s' not found ..." % key)
class TestUnitexIO(unittest.TestCase):
    """Unit tests for the helpers imported from ``unitex.io`` (work in progress).

    Only the stdout/stderr toggles are exercised so far; the remaining tests
    are placeholders that raise ``NotImplementedError``.
    """

    @classmethod
    def setUpClass(cls):
        # One shared path table for every test of the class.
        cls._arguments = Arguments()

    @classmethod
    def tearDownClass(cls):
        """Remove the on-disk target files and the test directory, if present."""
        for key in ("file_target_hdd_01", "file_target_hdd_02"):
            path = cls._arguments[key]
            if os.path.exists(path):
                os.remove(path)

        directory = cls._arguments["directory"]
        if os.path.exists(directory):
            shutil.rmtree(directory)

    # NOTE(review): the four tests below assume each toggle returns a truthy
    # value on success -- confirm against unitex.io. They previously asserted
    # on an undefined name ('ok') and therefore always died with NameError.

    def test_01_enable_stdout(self):
        ret = enable_stdout()
        self.assertTrue(ret, "STDOUT enabling failed!")

    def test_02_disable_stdout(self):
        ret = disable_stdout()
        self.assertTrue(ret, "STDOUT disabling failed!")

    def test_03_enable_stderr(self):
        ret = enable_stderr()
        self.assertTrue(ret, "STDERR enabling failed!")

    def test_04_disable_stderr(self):
        ret = disable_stderr()
        self.assertTrue(ret, "STDERR disabling failed!")

    # Placeholders: not implemented yet, each one fails with NotImplementedError.

    def test_05_01_cp_hdd(self):
        raise NotImplementedError

    def test_05_02_cp_vfs(self):
        raise NotImplementedError

    def test_06_01_rm_hdd(self):
        raise NotImplementedError

    def test_06_02_rm_vfs(self):
        raise NotImplementedError

    def test_07_01_mv_hdd(self):
        raise NotImplementedError

    def test_07_02_mv_vfs(self):
        raise NotImplementedError

    def test_08_mkdir(self):
        raise NotImplementedError

    def test_09_rmdir(self):
        raise NotImplementedError

    def test_10_ls_hdd(self):
        raise NotImplementedError

    def test_10_ls_vfs(self):
        raise NotImplementedError

    def test_11_01_01_virtual_file_read_hdd(self):
        raise NotImplementedError

    def test_11_01_02_virtual_file_read_vfs(self):
        raise NotImplementedError

    def test_11_02_01_virtual_file_write_hdd(self):
        raise NotImplementedError

    def test_11_02_02_virtual_file_write_vfs(self):
        raise NotImplementedError

    def test_11_03_01_virtual_file_append_hdd(self):
        raise NotImplementedError

    def test_11_03_02_virtual_file_append_vfs(self):
        raise NotImplementedError
......
...@@ -1057,3 +1057,68 @@ def tokenize(*args, **kwargs): ...@@ -1057,3 +1057,68 @@ def tokenize(*args, **kwargs):
ret = unitex_tool(command) ret = unitex_tool(command)
return ret return ret
def txt2tfst(*args, **kwargs):
    """Construct the automaton of a text.

    If the text is separated into sentences, the function constructs an
    automaton for each sentence. If this is not the case, the program
    arbitrarily cuts the text into sequences of 2000 tokens and produces an
    automaton for each of these sequences.

    The result is a file called text.tfst which is saved in the directory of
    the text. Another file named text.tind is also produced.

    Arguments (length: 1):
        0 -- the full path to the text file in snt format.

    Keyword arguments:
        alphabet [str]              -- the alphabet file of the language of
                                       the text (required)
        clean [bool]                -- indicates whether the rule of
                                       conservation of the best paths (see
                                       section 7.2.4) should be applied
                                       (default: False)
        normalization_grammar [str] -- name of a normalization grammar that
                                       is to be applied to the text automaton
        tagset [str]                -- Elag tagset file to use to normalize
                                       dictionary entries
        korean [bool]               -- tells the function that it works on
                                       Korean (default: False)

    Return [bool]:
        The function returns 'True' if it succeeds and 'False' otherwise.

    Raises:
        UnitexException -- if the number of positional arguments is not
                           exactly one, or if 'alphabet' is missing or None.
    """
    if len(args) != 1:
        raise UnitexException("You must specify one and only one text to build the automaton from...")
    text = args[0]

    alphabet = kwargs.get("alphabet", None)
    if alphabet is None:
        raise UnitexException("You must specify the alphabet file path...")

    clean = kwargs.get("clean", False)
    normalization_grammar = kwargs.get("normalization_grammar", None)
    tagset = kwargs.get("tagset", None)
    korean = kwargs.get("korean", False)

    command = ["UnitexTool", "Txt2Tfst"]

    command.append("--alphabet=%s" % alphabet)

    # Flags are switched on by any value other than the literal False.
    if clean is not False:
        command.append("--clean")
    if normalization_grammar is not None:
        command.append("--normalization_grammar=%s" % normalization_grammar)
    if tagset is not None:
        command.append("--tagset=%s" % tagset)
    if korean is not False:
        command.append("--korean")

    # The text path goes last, after all the options.
    command.append(text)

    # unitex_tool is handed a single space-separated command string.
    command = " ".join(command)

    LOGGER.info("Building text automaton for '%s'...", text)
    LOGGER.debug("Command: %s", command)
    ret = unitex_tool(command)

    return ret
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous pour commenter