#!/usr/bin/env python # -*- coding: utf-8 -*- import getopt import os import sys import yaml from unitex import init_log_system from unitex.config import UnitexConfig from unitex.tools import * from unitex.resources import * from unitex.io import * def load_resources(options): if options["resources"]["alphabet"] is not None: alphabet = load_persistent_alphabet(options["resources"]["alphabet"]) options["resources"]["alphabet"] = alphabet if options["resources"]["alphabet-sorted"] is not None: alphabet_sorted = load_persistent_alphabet(options["resources"]["alphabet-sorted"]) options["resources"]["alphabet-sorted"] = alphabet_sorted if options["resources"]["sentence"] is not None: sentence = load_persistent_fst2(options["resources"]["sentence"]) options["resources"]["sentence"] = sentence if options["resources"]["replace"] is not None: replace = load_persistent_fst2(options["resources"]["replace"]) options["resources"]["replace"] = replace if options["resources"]["dictionaries"] is not None: dictionaries = [] for dictionary in options["resources"]["dictionaries"]: dictionary = load_persistent_dictionary(dictionary) dictionaries.append(dictionary) options["resources"]["dictionaries"] = dictionaries def free_resources(options): if options["resources"]["alphabet"] is not None: free_persistent_alphabet(options["resources"]["alphabet"]) if options["resources"]["alphabet-sorted"] is not None: free_persistent_alphabet(options["resources"]["alphabet-sorted"]) if options["resources"]["sentence"] is not None: free_persistent_fst2(options["resources"]["sentence"]) if options["resources"]["replace"] is not None: free_persistent_fst2(options["resources"]["replace"]) if options["resources"]["dictionaries"] is not None: for dictionary in options["resources"]["dictionaries"]: free_persistent_dictionary(dictionary) def execute(path, grammar, options): directory, filename = os.path.split(path) name, extension = os.path.splitext(filename) txt = path snt = os.path.join(directory, "%s.snt" % name) dir = os.path.join(directory, "%s_snt" % name) # Set up the virtual filesystem if options["virtualization"] is True: _txt = "%s%s" % (UnitexConstants.VFS_PREFIX, txt) cp(txt, _txt) txt = _txt snt = "%s%s" % (UnitexConstants.VFS_PREFIX, snt) else: if os.path.exists(dir) is False: mkdir(dir) # Some ad-hoc check alphabet = options["resources"]["alphabet"] if alphabet is None: sys.stderr.write("[ERROR] You must provide the alphabet. Fix the configuration file.\n") sys.exit(1) alphabet_sorted = options["resources"]["alphabet-sorted"] if alphabet_sorted is None: sys.stderr.write("[ERROR] You must provide the sorted alphabet. Fix the configuration file.\n") sys.exit(1) # Normalize the text kwargs = options["tools"]["normalize"] ret = normalize(txt, **kwargs) if ret is False: sys.stderr.write("[ERROR] Text normalization failed!\n") sys.exit(1) # Apply Sentence.fst2 sentence = options["resources"]["sentence"] if sentence is not None: kwargs = {} kwargs["start_on_space"] = options["tools"]["fst2txt"]["start_on_space"] kwargs["char_by_char"] = options["tools"]["fst2txt"]["char_by_char"] kwargs["merge"] = True ret = fst2txt(sentence, snt, alphabet, **kwargs) if ret is False: sys.stderr.write("Text segmentation failed!\n") sys.exit(1) # Apply Replace.fst2 replace = options["resources"]["replace"] if replace is not None: kwargs = {} kwargs["start_on_space"] = options["tools"]["fst2txt"]["start_on_space"] kwargs["char_by_char"] = options["tools"]["fst2txt"]["char_by_char"] kwargs["merge"] = False ret = fst2txt(replace, snt, alphabet, **kwargs) if ret is False: sys.stderr.write("Replace grammar application failed!\n") sys.exit(1) # Tokenize the text kwargs = options["tools"]["tokenize"] ret = tokenize(snt, alphabet, **kwargs) if ret is False: sys.stderr.write("[ERROR] Text tokenization failed!\n") sys.exit(1) # Apply dictionaries if options["resources"]["dictionaries"] is not None: dictionaries = options["resources"]["dictionaries"] kwargs = options["tools"]["dico"] ret = dico(dictionaries, snt, alphabet, **kwargs) if ret is False: sys.stderr.write("[ERROR] Dictionaries application failed!\n") sys.exit(1) # Locate pattern kwargs = options["tools"]["locate"] ret = locate(grammar, snt, alphabet, **kwargs) if ret is False: sys.stderr.write("[ERROR] Locate failed!\n") sys.exit(1) index = os.path.join(dir, "concord.ind") if options["virtualization"] is True: index = "%s%s" % (UnitexConstants.VFS_PREFIX, index) if exists(index) is False: sys.stderr.write("[ERROR] Locate failed! No index produced.\n") sys.exit(1) # Build concordance kwargs = options["tools"]["concord"] format = kwargs["format"] if format not in (UnitexConstants.FORMAT_HTML, UnitexConstants.FORMAT_TEXT, UnitexConstants.FORMAT_GLOSSANET, UnitexConstants.FORMAT_SCRIPT, UnitexConstants.FORMAT_XML, UnitexConstants.FORMAT_XML_WITH_HEADERS): sys.stderr.write("[ERROR] This little script supports a limited list of concordance format:\n") sys.stderr.write("[ERROR] - TEXT ('text' option)\n") sys.stderr.write("[ERROR] - HTML ('html', 'glossanet' and 'script' options)\n") sys.stderr.write("[ERROR] - XML ('xml' and 'wml-with-headers' options)\n") sys.exit(1) ret = concord(index, alphabet_sorted, **kwargs) if ret is False: sys.stderr.write("[ERROR] Concord failed!\n") sys.exit(1) concordances = None output = None if format == UnitexConstants.FORMAT_TEXT: concordances = os.path.join(dir, "concord.txt") output = os.path.join(directory, "%s-concordances.txt" % name) elif format in (UnitexConstants.FORMAT_HTML, UnitexConstants.FORMAT_GLOSSANET, UnitexConstants.FORMAT_SCRIPT): concordances = os.path.join(dir, "concord.html") output = os.path.join(directory, "%s-concordances.html" % name) elif format in (UnitexConstants.FORMAT_XML, UnitexConstants.FORMAT_XML_WITH_HEADERS): concordances = os.path.join(dir, "concord.xml") output = os.path.join(directory, "%s-concordances.xml" % name) if options["virtualization"] is True: concordances = "%s%s" % (UnitexConstants.VFS_PREFIX, concordances) mv(concordances, output) # Clean the Unitex files if options["debug"] is False: if options["virtualization"] is True: for vf in ls("%s%s" % (UnitexConstants.VFS_PREFIX, directory)): rm(vf) rm(snt) rm(txt) else: rmdir(dir) rm(snt) return output if __name__ == "__main__": def usage(): sys.stderr.write("Do Concord -- A simple script to illustrate the Unitex Python bindings\n\n") sys.stderr.write(" $ do-concord [OPTIONS] <file1(, file2, ...)>\n\n") sys.stderr.write("Options:\n") sys.stderr.write(" [ -h, --help = this help message ]\n") sys.stderr.write(" -c, --config = the Unitex config file\n") sys.stderr.write(" -g, --grammar = the fst2 grammar to use\n\n") sys.stderr.write("Example:\n") sys.stderr.write(" $ do-concord -c unitex.yaml -g grammar.fst2 *.txt\n") sys.exit(1) try: opts, args = getopt.getopt(sys.argv[1:], "hc:g:", ["help", "config=", "grammar="]) except getopt.GetoptError: usage() if len(opts) == 0 and len(args) == 0: usage() config_file = None grammar = None for o, a in opts : if o == "-h" or o == "--help": usage() elif o == "-c" or o == "--config": config_file = a elif o == "-g" or o == "--grammar": grammar = a else: sys.stderr.write("Wrong option '%s'.\n" % o) usage() if config_file is None: sys.stderr.write("You must provide the config file.\n") usage() if grammar is None: sys.stderr.write("You must provide the grammar.\n") usage() if not args: sys.stderr.write("You must provide at least one file to process.\n") usage() # 'glob' is bad! Do not use 'glob'. files = [] for arg in args: if os.path.isdir(arg) is True: for root, dir, _files in os.walk(arg): files += [os.path.join(root, f) for f in _files] elif os.path.isfile(arg) is True: files.append(arg) else: sys.stderr.write("The arguments must be files or directories.\n") usage() # Configuration file loading config = None with open(config_file, "r") as f: config = yaml.load(f) options = UnitexConfig(config) # Intialization of the basic logging system init_log_system(options["verbose"], options["debug"], options["log"]) # Load resources in the persistent space if options["persistence"] is True: grammar = load_persistent_fst2(grammar) load_resources(options) results = [] for f in files: sys.stdout.write("Processing '%s'...\n" % f) # This function illustrate the whole Unitex process used in order # to produce a concordance file. result = execute(f, grammar, options) sys.stdout.write(" -> %s\n" % result) results.append(result) # Free resources from the persistent space if options["persistence"] is True: free_persistent_fst2(grammar) free_resources(options)