(1) add txt2tfst command. (2) add disclaimer to the _unitex.cpp extension. (3)...

(1) add txt2tfst command. (2) add disclaimer to the _unitex.cpp extension. (3) add unit tests for the IO module (work in progress)

(1) add txt2tfst command. (2) add disclaimer to the _unitex.cpp extension. (3)...
adf344c8 · Patrick Watrin · 445dfd92 · adf344c8 · adf344c8 · adf344c8
--- a/extensions/_unitex.cpp
+++ b/extensions/_unitex.cpp
+/*
+ * NOTE: some parts of this file are an adaptation of the 'fr_umlv_unitex_jni_UnitexJni.cpp' file
+ *       which is included in the Unitex source distribution.
+ */
 #include <Python.h>
 #include "AbstractFilePlugCallback.h"

--- a/tests/01_test_tools.py
+++ b/tests/01_test_tools.py
@@ -43,6 +43,9 @@ class Arguments:
        self.__arguments["tags.ind"] = os.path.join(self.__arguments["dir"], "tags.ind")
        self.__arguments["stat_dic.n"] = os.path.join(self.__arguments["dir"], "stat_dic.n")
+        self.__arguments["text.tfst"] = os.path.join(self.__arguments["dir"], "text.tfst")
+        self.__arguments["text.tind"] = os.path.join(self.__arguments["dir"], "text.tind")
        self.__arguments["grf"] = "data/grammar.grf" 
        self.__arguments["fst"] = "data/grammar.fst2" 
@@ -305,6 +308,24 @@ class TestUnitexTools(unittest.TestCase):
        self.assertTrue(ok, "Concord failed!")
+    def test_11_txt2tfst(self):
+        # Build the text automaton from the .snt file with 'clean' and
+        # 'korean' switched off and no normalization grammar or tagset,
+        # then check that both expected output files exist.
+        snt = self._arguments["snt"]
+        options = {
+            "alphabet": self._arguments["alphabet"],
+            "clean": False,
+            "normalization_grammar": None,
+            "tagset": None,
+            "korean": False,
+        }
+        succeeded = txt2tfst(snt, **options)
+        # Short-circuit: the output files are only looked up when the
+        # previous step was truthy.
+        for produced in ("text.tfst", "text.tind"):
+            succeeded = succeeded and os.path.exists(self._arguments[produced])
+        self.assertTrue(succeeded, "Txt2Tfst failed!")
 if __name__ == '__main__':

--- a/tests/03_test_io.py
+++ b/tests/03_test_io.py
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-import unittest
+import os, shutil, unittest
 from unitex.io import *
+class Arguments:
+    """Read-only lookup table of the file and directory paths used by the IO tests."""
+    def __init__(self, language=None):
+        # 'language' is accepted but not used when building the table.
+        vfs = UnitexIOConstants.VFS_PREFIX
+        self.__arguments = {
+            "file_source": "data/corpus.txt",
+            "file_target_hdd_01": "data/corpus-hdd-01.txt",
+            "file_target_hdd_02": "data/corpus-hdd-02.txt",
+            "file_target_vfs_01": "%sdata/corpus-vfs-01.txt" % vfs,
+            "file_target_vfs_02": "%sdata/corpus-vfs-02.txt" % vfs,
+            "virtual_file_hdd_01": "data/corpus-virtual-file-01.txt",
+            "virtual_file_hdd_02": "data/corpus-virtual-file-02.txt",
+            "virtual_file_vfs_01": "%sdata/corpus-virtual-file-01.txt" % vfs,
+            "virtual_file_vfs_02": "%sdata/corpus-virtual-file-02.txt" % vfs,
+            "directory": "data/biniou/",
+        }
+    def __getitem__(self, key):
+        # Known keys return their path; anything else raises a KeyError
+        # carrying an explicit message.
+        if key in self.__arguments:
+            return self.__arguments[key]
+        raise KeyError("Argument '%s' not found ..." % key)
 class TestUnitexIO(unittest.TestCase):
-    def test_01_(self):
+    @classmethod
-        pass
+    def setUpClass(cls):
+        # Shared table of the paths the tests work on.
+        cls._arguments = Arguments()
+    @classmethod
+    def tearDownClass(cls):
+        # Remove whatever the tests may have left on disk.
+        for key in ("file_target_hdd_01", "file_target_hdd_02"):
+            path = cls._arguments[key]
+            if os.path.exists(path):
+                os.remove(path)
+        directory = cls._arguments["directory"]
+        if os.path.exists(directory):
+            shutil.rmtree(directory)
+    def test_01_enable_stdout(self):
+        # Assert on the value returned by the call (presumably truthy on
+        # success -- confirm against unitex.io).
+        ret = enable_stdout()
+        self.assertTrue(ret, "STDOUT enabling failed!")
+    def test_02_disable_stdout(self):
+        ret = disable_stdout()
+        self.assertTrue(ret, "STDOUT disabling failed!")
+    def test_03_enable_stderr(self):
+        ret = enable_stderr()
+        self.assertTrue(ret, "STDERR enabling failed!")
+    def test_04_disable_stderr(self):
+        ret = disable_stderr()
+        self.assertTrue(ret, "STDERR disabling failed!")
+    # The remaining tests are placeholders (work in progress): each one
+    # raises NotImplementedError until it is written.
+    def test_05_01_cp_hdd(self):
+        raise NotImplementedError
+    def test_05_02_cp_vfs(self):
+        raise NotImplementedError
+    def test_06_01_rm_hdd(self):
+        raise NotImplementedError
+    def test_06_02_rm_vfs(self):
+        raise NotImplementedError
+    def test_07_01_mv_hdd(self):
+        raise NotImplementedError
+    def test_07_02_mv_vfs(self):
+        raise NotImplementedError
+    def test_08_mkdir(self):
+        raise NotImplementedError
+    def test_09_rmdir(self):
+        raise NotImplementedError
+    def test_10_ls_hdd(self):
+        raise NotImplementedError
+    def test_10_ls_vfs(self):
+        raise NotImplementedError
+    def test_11_01_01_virtual_file_read_hdd(self):
+        raise NotImplementedError
+    def test_11_01_02_virtual_file_read_vfs(self):
+        raise NotImplementedError
+    def test_11_02_01_virtual_file_write_hdd(self):
+        raise NotImplementedError
+    def test_11_02_02_virtual_file_write_vfs(self):
+        raise NotImplementedError
+    def test_11_03_01_virtual_file_append_hdd(self):
+        raise NotImplementedError
+    def test_11_03_02_virtual_file_append_vfs(self):
+        raise NotImplementedError

--- a/unitex/tools.py
+++ b/unitex/tools.py
@@ -1057,3 +1057,68 @@ def tokenize(*args, **kwargs):
    ret = unitex_tool(command)
    return ret
+def txt2tfst(*args, **kwargs):
+    """This function constructs an automaton of a text.
+    If the text is separated into sentences, the function constructs an automaton for each
+    sentence. If this is not the case, the program arbitrarily cuts the text into sequences
+    of 2000 tokens and produces an automaton for each of these sequences.
+    The result is a file called text.tfst which is saved in the directory of the text.
+    Another file named text.tind is also produced.
+    Arguments (length: 1):
+        0 -- the full path to the text file in snt format.
+    Keyword arguments:
+        alphabet [str]              -- the alphabet file of the language of the text
+                                       (required)
+        clean [bool]                -- indicates whether the rule of conservation of the best
+                                       paths (see section 7.2.4) should be applied
+                                       (default: False)
+        normalization_grammar [str] -- name of a normalization grammar that is to be applied
+                                       to the text automaton
+        tagset [str]                -- Elag tagset file to use to normalize dictionary entries
+        korean [bool]               -- tells the function that it works on Korean
+                                       (default: False)
+    Return [bool]:
+        The value returned by 'unitex_tool': 'True' if it succeeds and 'False' otherwise.
+    Raise [UnitexException]:
+        If the number of positional arguments is not exactly one or if the 'alphabet'
+        keyword argument is missing.
+    """
+    if len(args) != 1:
+        raise UnitexException("You must specify one and only one text...")
+    text = args[0]
+    alphabet = kwargs.get("alphabet", None)
+    if alphabet is None:
+        raise UnitexException("You must specify the alphabet file path...")
+    clean = kwargs.get("clean", False)
+    normalization_grammar = kwargs.get("normalization_grammar", None)
+    tagset = kwargs.get("tagset", None)
+    korean = kwargs.get("korean", False)
+    # NOTE(review): the options are joined with single spaces and no quoting;
+    # paths containing spaces may need care -- confirm how 'unitex_tool' splits
+    # the command string.
+    command = ["UnitexTool", "Txt2Tfst"]
+    command.append("--alphabet=%s" % alphabet)
+    # Boolean switches are compared to 'False' by identity, so any other value
+    # turns them on.
+    if clean is not False:
+        command.append("--clean")
+    if normalization_grammar is not None:
+        command.append("--normalization_grammar=%s" % normalization_grammar)
+    if tagset is not None:
+        command.append("--tagset=%s" % tagset)
+    if korean is not False:
+        command.append("--korean")
+    command.append(text)
+    command = " ".join(command)
+    LOGGER.info("Building text automaton for '%s'...", text)
+    LOGGER.debug("Command: %s", command)
+    ret = unitex_tool(command)
+    return ret