Skip to content
Extraits de code Groupes Projets
Valider 6505095f rédigé par Patrick Watrin's avatar Patrick Watrin
Parcourir les fichiers

add example and configuration building scripts

parent 712c860b
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
......@@ -42,7 +42,12 @@ UNITEX_INC=/path/to/unitex/Src/C++ python setup.py install
**NOTE: The texts must be encoded in UTF-8. There is so far no support for UTF-16-(LE|BE) or any other encoding.**
There are three ways to use the Unitex Python library:
In the [`examples`](https://github.com/patwat/python-unitex/blob/master/examples/) directory, there are two scripts you can use as... examples obviously but also to achieve two simple tasks.
* `build-config-file.py`: this script builds, for a given language, a default YAML config file adapted to your Unitex installation.
* `do-concord.py`: this script builds a concordance file for a corpus and a grammar.
For the binding itself, there are three ways to use it (from low to high-level):
1. The `_unitex` C++ extension.
2. The Unitex basic commands and features.
......@@ -50,7 +55,7 @@ There are three ways to use the Unitex Python library:
The following sections give some sample codes for each of these ways.
### The `_unitex` C++ extension.
### The `_unitex` C++ extension
```python
from _unitex import unitex_load_persistent_alphabet,\
......
global:
debug: 1
verbose: 2
debug: 0
verbose: 0
log: null
persistence: True
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import getopt
import os
import sys
import yaml
from io import open
def load_dictionaries(directory):
    """Collect the compiled dictionary (.bin) paths for a language directory.

    If a 'system_dic.def' file exists in 'directory', only the dictionaries
    it lists (one basename per line, relative to the 'Dela' directory) are
    kept; otherwise every '.bin' file that has a matching '.inf' file in the
    'Dela' directory tree is returned.

    Arguments:
        directory [str] -- the Unitex language directory path.

    Return a list of dictionary paths (possibly empty).
    """
    dictionaries = []

    dela_directory = os.path.join(directory, "Dela")
    if os.path.exists(dela_directory) is False:
        # BUG FIX: the original message had a '%s' placeholder but no
        # argument, so it printed a literal '%s'.
        sys.stdout.write("'Dela' directory '%s' doesn't exist.\n" % dela_directory)
        return dictionaries

    system_dic_file = os.path.join(directory, "system_dic.def")
    if os.path.exists(system_dic_file) is False:
        sys.stdout.write("'system_dic.def' file not found. Load the entire 'Dela' directory.\n")

        for root, dirs, files in os.walk(dela_directory):
            for f in files:
                # BUG FIX: join with 'root' (not 'dela_directory') so that
                # files found in sub-directories keep a correct path.
                f = os.path.join(root, f)

                filename, extension = os.path.splitext(f)
                if extension != ".bin":
                    continue
                elif os.path.exists("%s.inf" % filename) is False:
                    # BUG FIX: supply the missing format argument.
                    sys.stdout.write("'inf' file doesn't exist for '%s'. Skipping...\n" % f)
                    continue

                dictionaries.append(f)
    else:
        with open(system_dic_file, "r") as f:
            for line in f:
                line = line.rstrip()
                if not line:
                    # Skip blank lines.
                    continue

                dictionary = os.path.join(dela_directory, line)
                if os.path.exists(dictionary) is False:
                    sys.stdout.write("Dictionary '%s' doesn't exist. Skipping...\n" % dictionary)
                    continue

                dictionaries.append(dictionary)

    return dictionaries
def load_preprocessing_fsts(directory):
    """Locate the compiled preprocessing grammars of a language directory.

    Arguments:
        directory [str] -- the Unitex language directory path.

    Return a (sentence, replace) tuple of '.fst2' paths; each element is
    None when the corresponding grammar is missing.
    """
    preprocessing_directory = os.path.join(directory, "Graphs/Preprocessing")

    sentence = os.path.join(preprocessing_directory, "Sentence/Sentence.fst2")
    if not os.path.exists(sentence):
        sys.stdout.write("'Sentence.fst2' doesn't exist.\n")
        sentence = None

    replace = os.path.join(preprocessing_directory, "Replace/Replace.fst2")
    if not os.path.exists(replace):
        sys.stdout.write("'Replace.fst2' doesn't exist.\n")
        replace = None

    return sentence, replace
def load_alphabets(directory):
    """Locate the alphabet files of a language directory.

    Arguments:
        directory [str] -- the Unitex language directory path.

    Return an (alphabet, alphabet_sorted) tuple of paths; each element is
    None when the corresponding file is missing.
    """
    found = []

    for basename, message in (("Alphabet.txt", "'Alphabet.txt' doesn't exist.\n"),
                              ("Alphabet_sort.txt", "'Alphabet_sort.txt' doesn't exist.\n")):
        candidate = os.path.join(directory, basename)
        if not os.path.exists(candidate):
            sys.stdout.write(message)
            candidate = None
        found.append(candidate)

    return tuple(found)
if __name__ == "__main__":
    def usage():
        """Print the usage message on stderr and exit with an error status."""
        sys.stderr.write("Build Config File -- build the (default) config file for a given language\n\n")
        sys.stderr.write(" $ build-config-file [OPTIONS] <Unitex YAML config template>\n\n")
        sys.stderr.write("Options:\n")
        sys.stderr.write(" [ -h, --help = this help message ]\n")
        sys.stderr.write(" -o, --output = the resulting config filename\n")
        sys.stderr.write(" -l, --language = the language name\n")
        sys.stderr.write(" -d, --directory = the original resources directory for the language\n")
        sys.stderr.write(" (i.e. the language directory from Unitex distribution)\n\n")
        sys.stderr.write("Example:\n")
        sys.stderr.write(" $ build-config-file -l fr -d /path/to/French -o unitex-fr.yaml unitex.yaml\n")
        sys.exit(1)

    try:
        opts, args = getopt.getopt(sys.argv[1:], "ho:l:d:", ["help", "output=", "language=", "directory="])
    except getopt.GetoptError:
        usage()

    if len(opts) == 0 and len(args) == 0:
        usage()

    output = None
    language = None
    directory = None

    for o, a in opts:
        if o == "-h" or o == "--help":
            usage()
        elif o == "-o" or o == "--output":
            output = a
        elif o == "-l" or o == "--language":
            language = a
        elif o == "-d" or o == "--directory":
            directory = a
        else:
            sys.stderr.write("Wrong option '%s'.\n" % o)
            usage()

    # All three options are mandatory.
    if output is None:
        sys.stderr.write("You must provide the resulting config filename.\n")
        usage()
    if language is None:
        sys.stderr.write("You must provide the language name.\n")
        usage()
    if directory is None:
        sys.stderr.write("You must provide the language directory.\n")
        usage()
    directory = os.path.abspath(directory)
    # NOTE(review): 'language' is required but never written into the
    # resulting config -- presumably the template already carries it. Confirm.

    if len(args) != 1:
        sys.stderr.write("You must provide one and only one config template.\n")
        usage()
    [template] = args

    options = None
    with open(template, "r") as f:
        # BUG FIX: use 'safe_load' instead of 'yaml.load' (deprecated without
        # an explicit Loader, and able to instantiate arbitrary Python
        # objects). The template is plain data.
        options = yaml.safe_load(f)

    # Fill the 'resources' section from the language directory content.
    dictionaries = load_dictionaries(directory)
    sentence, replace = load_preprocessing_fsts(directory)
    alphabet, alphabet_sorted = load_alphabets(directory)

    options["resources"]["dictionaries"] = dictionaries
    options["resources"]["sentence"] = sentence
    options["resources"]["replace"] = replace
    options["resources"]["alphabet"] = alphabet
    options["resources"]["alphabet-sorted"] = alphabet_sorted

    with open(output, 'w') as f:
        f.write(yaml.dump(options, default_flow_style=False))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import getopt
import os
import sys
import yaml
from unitex import init_log_system
from unitex.config import UnitexConfig
from unitex.tools import *
from unitex.resources import *
from unitex.io import *
def load_resources(options):
    """Load every configured resource into the Unitex persistent space.

    The paths stored under options["resources"] are replaced, in place, by
    their persistent counterparts; entries set to None are left untouched.
    """
    resources = options["resources"]

    if resources["alphabet"] is not None:
        resources["alphabet"] = load_persistent_alphabet(resources["alphabet"])

    if resources["alphabet-sorted"] is not None:
        resources["alphabet-sorted"] = load_persistent_alphabet(resources["alphabet-sorted"])

    if resources["sentence"] is not None:
        resources["sentence"] = load_persistent_fst2(resources["sentence"])

    if resources["replace"] is not None:
        resources["replace"] = load_persistent_fst2(resources["replace"])

    if resources["dictionaries"] is not None:
        resources["dictionaries"] = [load_persistent_dictionary(dictionary)
                                     for dictionary in resources["dictionaries"]]
def free_resources(options):
    """Release every resource previously loaded into the persistent space.

    Mirror of load_resources(): entries set to None are skipped.
    """
    resources = options["resources"]

    if resources["alphabet"] is not None:
        free_persistent_alphabet(resources["alphabet"])

    if resources["alphabet-sorted"] is not None:
        free_persistent_alphabet(resources["alphabet-sorted"])

    if resources["sentence"] is not None:
        free_persistent_fst2(resources["sentence"])

    if resources["replace"] is not None:
        free_persistent_fst2(resources["replace"])

    if resources["dictionaries"] is not None:
        for dictionary in resources["dictionaries"]:
            free_persistent_dictionary(dictionary)
def execute(path, grammar, options):
    """Run the whole Unitex processing chain on one corpus file.

    Steps: normalization, sentence segmentation (Sentence.fst2), replacement
    (Replace.fst2), tokenization, dictionary application, pattern location
    and concordance building.

    Arguments:
        path [str]    -- the corpus file to process (UTF-8 text).
        grammar [str] -- the fst2 grammar given to 'locate' (possibly a
                         persistent path).
        options       -- the UnitexConfig-like option mapping.

    Return the path of the resulting concordance file.
    Exits the process (status 1) on any tool failure.
    """
    directory, filename = os.path.split(path)
    name, extension = os.path.splitext(filename)

    txt = path
    snt = os.path.join(directory, "%s.snt" % name)
    # Unitex stores the intermediate files of a text in a '<name>_snt'
    # directory ('dir' renamed to avoid shadowing the builtin).
    snt_dir = os.path.join(directory, "%s_snt" % name)

    # Set up the virtual filesystem
    if options["virtualization"] is True:
        _txt = "%s%s" % (UnitexConstants.VFS_PREFIX, txt)
        cp(txt, _txt)
        txt = _txt
        snt = "%s%s" % (UnitexConstants.VFS_PREFIX, snt)
    else:
        if os.path.exists(snt_dir) is False:
            mkdir(snt_dir)

    # Some ad-hoc check
    alphabet = options["resources"]["alphabet"]
    if alphabet is None:
        sys.stderr.write("[ERROR] You must provide the alphabet. Fix the configuration file.\n")
        sys.exit(1)

    alphabet_sorted = options["resources"]["alphabet-sorted"]
    if alphabet_sorted is None:
        sys.stderr.write("[ERROR] You must provide the sorted alphabet. Fix the configuration file.\n")
        sys.exit(1)

    # Normalize the text
    kwargs = options["tools"]["normalize"]
    ret = normalize(txt, **kwargs)
    if ret is False:
        sys.stderr.write("[ERROR] Text normalization failed!\n")
        sys.exit(1)

    # Apply Sentence.fst2 (merge mode: sentence delimiters are inserted)
    sentence = options["resources"]["sentence"]
    if sentence is not None:
        kwargs = {}
        kwargs["start_on_space"] = options["tools"]["fst2txt"]["start_on_space"]
        kwargs["char_by_char"] = options["tools"]["fst2txt"]["char_by_char"]
        kwargs["merge"] = True

        ret = fst2txt(sentence, snt, alphabet, **kwargs)
        if ret is False:
            sys.stderr.write("Text segmentation failed!\n")
            sys.exit(1)

    # Apply Replace.fst2 (replace mode: matched sequences are rewritten)
    replace = options["resources"]["replace"]
    if replace is not None:
        kwargs = {}
        kwargs["start_on_space"] = options["tools"]["fst2txt"]["start_on_space"]
        kwargs["char_by_char"] = options["tools"]["fst2txt"]["char_by_char"]
        kwargs["merge"] = False

        ret = fst2txt(replace, snt, alphabet, **kwargs)
        if ret is False:
            sys.stderr.write("Replace grammar application failed!\n")
            sys.exit(1)

    # Tokenize the text
    kwargs = options["tools"]["tokenize"]
    ret = tokenize(snt, alphabet, **kwargs)
    if ret is False:
        sys.stderr.write("[ERROR] Text tokenization failed!\n")
        sys.exit(1)

    # Apply dictionaries
    if options["resources"]["dictionaries"] is not None:
        dictionaries = options["resources"]["dictionaries"]
        kwargs = options["tools"]["dico"]

        ret = dico(dictionaries, snt, alphabet, **kwargs)
        if ret is False:
            sys.stderr.write("[ERROR] Dictionaries application failed!\n")
            sys.exit(1)

    # Locate pattern
    kwargs = options["tools"]["locate"]
    ret = locate(grammar, snt, alphabet, **kwargs)
    if ret is False:
        sys.stderr.write("[ERROR] Locate failed!\n")
        sys.exit(1)

    index = os.path.join(snt_dir, "concord.ind")
    if options["virtualization"] is True:
        index = "%s%s" % (UnitexConstants.VFS_PREFIX, index)

    if exists(index) is False:
        sys.stderr.write("[ERROR] Locate failed! No index produced.\n")
        sys.exit(1)

    # Build concordance
    kwargs = options["tools"]["concord"]

    # 'format' renamed to avoid shadowing the builtin.
    fmt = kwargs["format"]
    if fmt not in (UnitexConstants.FORMAT_HTML,
                   UnitexConstants.FORMAT_TEXT,
                   UnitexConstants.FORMAT_GLOSSANET,
                   UnitexConstants.FORMAT_SCRIPT,
                   UnitexConstants.FORMAT_XML,
                   UnitexConstants.FORMAT_XML_WITH_HEADERS):
        sys.stderr.write("[ERROR] This little script supports a limited list of concordance format:\n")
        sys.stderr.write("[ERROR] - TEXT ('text' option)\n")
        sys.stderr.write("[ERROR] - HTML ('html', 'glossanet' and 'script' options)\n")
        # BUG FIX: the option is 'xml-with-headers', not 'wml-with-headers'.
        sys.stderr.write("[ERROR] - XML ('xml' and 'xml-with-headers' options)\n")
        sys.exit(1)

    ret = concord(index, alphabet_sorted, **kwargs)
    if ret is False:
        sys.stderr.write("[ERROR] Concord failed!\n")
        sys.exit(1)

    # Every supported format falls into exactly one of these three cases.
    concordances = None
    output = None

    if fmt == UnitexConstants.FORMAT_TEXT:
        concordances = os.path.join(snt_dir, "concord.txt")
        output = os.path.join(directory, "%s-concordances.txt" % name)
    elif fmt in (UnitexConstants.FORMAT_HTML, UnitexConstants.FORMAT_GLOSSANET, UnitexConstants.FORMAT_SCRIPT):
        concordances = os.path.join(snt_dir, "concord.html")
        output = os.path.join(directory, "%s-concordances.html" % name)
    elif fmt in (UnitexConstants.FORMAT_XML, UnitexConstants.FORMAT_XML_WITH_HEADERS):
        concordances = os.path.join(snt_dir, "concord.xml")
        output = os.path.join(directory, "%s-concordances.xml" % name)

    if options["virtualization"] is True:
        concordances = "%s%s" % (UnitexConstants.VFS_PREFIX, concordances)

    # Move the concordance next to the source text, outside the work area.
    mv(concordances, output)

    # Clean the Unitex files (kept when debugging for post-mortem inspection)
    if options["debug"] is False:
        if options["virtualization"] is True:
            for vf in ls("%s%s" % (UnitexConstants.VFS_PREFIX, directory)):
                rm(vf)
            rm(snt)
            rm(txt)
        else:
            rmdir(snt_dir)
            rm(snt)

    return output
if __name__ == "__main__":
    def usage():
        """Print the usage message on stderr and exit with an error status."""
        sys.stderr.write("Do Concord -- A simple script to illustrate the Unitex Python binding\n\n")
        sys.stderr.write(" $ do-concord [OPTIONS] <file1(, file2, ...)>\n\n")
        sys.stderr.write("Options:\n")
        sys.stderr.write(" [ -h, --help = this help message ]\n")
        sys.stderr.write(" -c, --config = the Unitex config file\n")
        sys.stderr.write(" -g, --grammar = the fst2 grammar to use\n\n")
        sys.stderr.write("Example:\n")
        sys.stderr.write(" $ do-concord -c unitex.yaml *.txt\n")
        sys.exit(1)

    try:
        opts, args = getopt.getopt(sys.argv[1:], "hc:g:", ["help", "config=", "grammar="])
    except getopt.GetoptError:
        usage()

    if len(opts) == 0 and len(args) == 0:
        usage()

    config_file = None
    grammar = None

    for o, a in opts:
        if o == "-h" or o == "--help":
            usage()
        elif o == "-c" or o == "--config":
            config_file = a
        elif o == "-g" or o == "--grammar":
            grammar = a
        else:
            sys.stderr.write("Wrong option '%s'.\n" % o)
            usage()

    if config_file is None:
        sys.stderr.write("You must provide the config file.\n")
        usage()
    if grammar is None:
        sys.stderr.write("You must provide the grammar.\n")
        usage()
    if not args:
        sys.stderr.write("You must provide at least one file to process.\n")
        usage()

    # 'glob' is bad! Do not use 'glob'.
    # Directories are walked recursively; plain files are kept as-is.
    files = []
    for arg in args:
        if os.path.isdir(arg) is True:
            for root, dirs, _files in os.walk(arg):
                files += [os.path.join(root, f) for f in _files]
        elif os.path.isfile(arg) is True:
            files.append(arg)
        else:
            sys.stderr.write("The arguments must be files or directories.\n")
            usage()

    # Configuration file loading
    config = None
    with open(config_file, "r") as f:
        # BUG FIX: use 'safe_load' instead of 'yaml.load' (deprecated without
        # an explicit Loader, and able to instantiate arbitrary Python
        # objects). The config file is plain data.
        config = yaml.safe_load(f)

    options = UnitexConfig(config)

    # Initialization of the basic logging system
    init_log_system(options["verbose"], options["debug"], options["log"])

    # Load resources in the persistent space
    if options["persistence"] is True:
        grammar = load_persistent_fst2(grammar)
        load_resources(options)

    results = []

    for f in files:
        sys.stdout.write("Processing '%s'...\n" % f)

        # This function illustrates the whole Unitex process used in order
        # to produce a concordance file.
        result = execute(f, grammar, options)

        sys.stdout.write(" -> %s\n" % result)
        results.append(result)

    # Free resources from the persistent space
    if options["persistence"] is True:
        free_persistent_fst2(grammar)
        free_resources(options)
global:
debug: 0
log: null
persistence: true
verbose: 0
virtualization: true
resources:
alphabet: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Alphabet.txt
alphabet-sorted: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Alphabet_sort.txt
dictionaries:
- /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/dela-fr-public.bin
- /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/ajouts80jours.bin
- /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/motsGramf-.bin
language: fr
replace: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Graphs/Preprocessing/Replace/Replace.fst2
sentence: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Graphs/Preprocessing/Sentence/Sentence.fst2
tools:
check_dic:
no_space_warning: false
strict: false
compress:
flip: false
output: null
semitic: false
version: v2
concord:
directory: null
font: null
fontsize: null
format: text
left: '0'
offsets: null
only_ambiguous: false
only_matches: false
right: '0'
script: null
sort: TO
thai: false
unxmlize: null
dico:
arabic_rules: null
korean: false
morpho: null
raw: null
semitic: false
extract:
non_matching_sentences: false
fst2txt:
merge: true
start_on_space: false
word_by_word: false
grf2fst2:
char_by_char: false
check_variables: true
debug: false
loop_check: false
named_repository: null
no_empty_graph_warning: false
pkgdir: null
silent_grf_name: true
tfst_check: false
locate:
ambiguous_outputs: true
arabic_rules: null
char_by_char: false
korean: false
match_mode: longest
morpho: null
negation_operator: tilde
number_of_matches: null
output_mode: merge
protect_dic_chars: true
sntdir: null
start_on_space: false
stop_token_count: null
variable: null
variable_error: ignore
normalize:
input_offsets: null
no_carriage_return: false
no_separator_normalization: false
output_offsets: null
replacement_rules: null
sort_txt:
duplicates: false
factorize_inflectional_codes: false
line_info: null
revers: false
sort_order: null
thai: false
tokenize:
char_by_char: false
input_offsets: null
output_offsets: null
tokens: null
txt2fst:
clean: false
korean: false
normalization_grammar: null
tagset: null
global:
debug: 1
verbose: 2
debug: 0
verbose: 0
log: null
persistence: True
......
......@@ -185,6 +185,8 @@ class UnitexProcessor(object):
kwargs = self.__options["tools"]["tokenize"]
ret = tokenize(self.__snt, alphabet, **kwargs)
if ret is False:
raise UnitexException("Text tokenization failed!")
def _lexicalize(self):
dictionaries = self.__options["resources"]["dictionaries"]
......@@ -281,7 +283,7 @@ class UnitexProcessor(object):
result = os.path.join(self.__dir, "concord.txt")
if self.__options["virtualization"] is True:
index = "%s%s" % (UnitexConstants.VFS_PREFIX, result)
result = "%s%s" % (UnitexConstants.VFS_PREFIX, result)
ret = concord(index, alphabet, **kwargs)
if ret is False:
......
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous pour commenter