Skip to content
Extraits de code Groupes Projets
Valider 6505095f rédigé par Patrick Watrin's avatar Patrick Watrin
Parcourir les fichiers

add example and configuration building scripts

parent 712c860b
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
......@@ -42,7 +42,12 @@ UNITEX_INC=/path/to/unitex/Src/C++ python setup.py install
**NOTE: The texts must be encoded in UTF-8. There is so far no support for UTF-16-(LE|BE) or any other encoding.**
There are three ways to use the Unitex Python library:
In the [`examples`](https://github.com/patwat/python-unitex/blob/master/examples/) directory, there are two scripts you can use as... examples obviously but also to achieve two simple tasks.
* `build-config-file.py`: this script builds, for a given language, a default YAML config file adapted to your Unitex installation.
* `do-concord.py`: this script builds a concordance file for a corpus and a grammar.
For the binding itself, there are three ways to use it (from low to high-level):
1. The `_unitex` C++ extension.
2. The Unitex basic commands and features.
......@@ -50,7 +55,7 @@ There are three ways to use the Unitex Python library:
The following sections give some sample codes for each of these ways.
### The `_unitex` C++ extension.
### The `_unitex` C++ extension
```python
from _unitex import unitex_load_persistent_alphabet,\
......
global:
debug: 1
verbose: 2
debug: 0
verbose: 0
log: null
persistence: True
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import getopt
import os
import sys
import yaml
from io import open
def load_dictionaries(directory):
    """Collect the compiled dictionary (.bin) paths for a language directory.

    If a 'system_dic.def' file exists in 'directory', only the dictionaries
    it lists (one basename per line, relative to the 'Dela' directory) are
    kept; otherwise every '.bin' file that has a matching '.inf' file in the
    'Dela' directory tree is returned.

    Arguments:
        directory [str] -- the Unitex language directory path.

    Return a list of dictionary paths (possibly empty).
    """
    dictionaries = []

    dela_directory = os.path.join(directory, "Dela")
    if os.path.exists(dela_directory) is False:
        # BUG FIX: the original message had a '%s' placeholder but no
        # argument, so it printed a literal '%s'.
        sys.stdout.write("'Dela' directory '%s' doesn't exist.\n" % dela_directory)
        return dictionaries

    system_dic_file = os.path.join(directory, "system_dic.def")
    if os.path.exists(system_dic_file) is False:
        sys.stdout.write("'system_dic.def' file not found. Load the entire 'Dela' directory.\n")

        for root, dirs, files in os.walk(dela_directory):
            for f in files:
                # BUG FIX: join with 'root' (not 'dela_directory') so that
                # files found in sub-directories keep a correct path.
                f = os.path.join(root, f)

                filename, extension = os.path.splitext(f)
                if extension != ".bin":
                    continue
                elif os.path.exists("%s.inf" % filename) is False:
                    # BUG FIX: supply the missing format argument.
                    sys.stdout.write("'inf' file doesn't exist for '%s'. Skipping...\n" % f)
                    continue

                dictionaries.append(f)
    else:
        with open(system_dic_file, "r") as f:
            for line in f:
                line = line.rstrip()
                if not line:
                    # Skip blank lines.
                    continue

                dictionary = os.path.join(dela_directory, line)
                if os.path.exists(dictionary) is False:
                    sys.stdout.write("Dictionary '%s' doesn't exist. Skipping...\n" % dictionary)
                    continue

                dictionaries.append(dictionary)

    return dictionaries
def load_preprocessing_fsts(directory):
    """Locate the compiled preprocessing grammars of a language directory.

    Arguments:
        directory [str] -- the Unitex language directory path.

    Return a (sentence, replace) tuple of '.fst2' paths; each element is
    None when the corresponding grammar is missing.
    """
    preprocessing_directory = os.path.join(directory, "Graphs/Preprocessing")

    sentence = os.path.join(preprocessing_directory, "Sentence/Sentence.fst2")
    if not os.path.exists(sentence):
        sys.stdout.write("'Sentence.fst2' doesn't exist.\n")
        sentence = None

    replace = os.path.join(preprocessing_directory, "Replace/Replace.fst2")
    if not os.path.exists(replace):
        sys.stdout.write("'Replace.fst2' doesn't exist.\n")
        replace = None

    return sentence, replace
def load_alphabets(directory):
    """Locate the alphabet files of a language directory.

    Arguments:
        directory [str] -- the Unitex language directory path.

    Return an (alphabet, alphabet_sorted) tuple of paths; each element is
    None when the corresponding file is missing.
    """
    found = []

    for basename, message in (("Alphabet.txt", "'Alphabet.txt' doesn't exist.\n"),
                              ("Alphabet_sort.txt", "'Alphabet_sort.txt' doesn't exist.\n")):
        candidate = os.path.join(directory, basename)
        if not os.path.exists(candidate):
            sys.stdout.write(message)
            candidate = None
        found.append(candidate)

    return tuple(found)
if __name__ == "__main__":
    def usage():
        """Print the usage message on stderr and exit with an error status."""
        sys.stderr.write("Build Config File -- build the (default) config file for a given language\n\n")
        sys.stderr.write(" $ build-config-file [OPTIONS] <Unitex YAML config template>\n\n")
        sys.stderr.write("Options:\n")
        sys.stderr.write(" [ -h, --help = this help message ]\n")
        sys.stderr.write(" -o, --output = the resulting config filename\n")
        sys.stderr.write(" -l, --language = the language name\n")
        sys.stderr.write(" -d, --directory = the original resources directory for the language\n")
        sys.stderr.write(" (i.e. the language directory from Unitex distribution)\n\n")
        sys.stderr.write("Example:\n")
        sys.stderr.write(" $ build-config-file -l fr -d /path/to/French -o unitex-fr.yaml unitex.yaml\n")
        sys.exit(1)

    try:
        opts, args = getopt.getopt(sys.argv[1:], "ho:l:d:", ["help", "output=", "language=", "directory="])
    except getopt.GetoptError:
        usage()

    if len(opts) == 0 and len(args) == 0:
        usage()

    output = None
    language = None
    directory = None

    for o, a in opts:
        if o == "-h" or o == "--help":
            usage()
        elif o == "-o" or o == "--output":
            output = a
        elif o == "-l" or o == "--language":
            language = a
        elif o == "-d" or o == "--directory":
            directory = a
        else:
            sys.stderr.write("Wrong option '%s'.\n" % o)
            usage()

    # All three options are mandatory.
    if output is None:
        sys.stderr.write("You must provide the resulting config filename.\n")
        usage()
    if language is None:
        sys.stderr.write("You must provide the language name.\n")
        usage()
    if directory is None:
        sys.stderr.write("You must provide the language directory.\n")
        usage()
    directory = os.path.abspath(directory)
    # NOTE(review): 'language' is required but never written into the
    # resulting config -- presumably the template already carries it. Confirm.

    if len(args) != 1:
        sys.stderr.write("You must provide one and only one config template.\n")
        usage()
    [template] = args

    options = None
    with open(template, "r") as f:
        # BUG FIX: use 'safe_load' instead of 'yaml.load' (deprecated without
        # an explicit Loader, and able to instantiate arbitrary Python
        # objects). The template is plain data.
        options = yaml.safe_load(f)

    # Fill the 'resources' section from the language directory content.
    dictionaries = load_dictionaries(directory)
    sentence, replace = load_preprocessing_fsts(directory)
    alphabet, alphabet_sorted = load_alphabets(directory)

    options["resources"]["dictionaries"] = dictionaries
    options["resources"]["sentence"] = sentence
    options["resources"]["replace"] = replace
    options["resources"]["alphabet"] = alphabet
    options["resources"]["alphabet-sorted"] = alphabet_sorted

    with open(output, 'w') as f:
        f.write(yaml.dump(options, default_flow_style=False))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import getopt
import os
import sys
import yaml
from unitex import init_log_system
from unitex.config import UnitexConfig
from unitex.tools import *
from unitex.resources import *
from unitex.io import *
def load_resources(options):
    """Load every configured resource into the Unitex persistent space.

    The paths stored under options["resources"] are replaced, in place, by
    their persistent counterparts; entries set to None are left untouched.
    """
    resources = options["resources"]

    if resources["alphabet"] is not None:
        resources["alphabet"] = load_persistent_alphabet(resources["alphabet"])

    if resources["alphabet-sorted"] is not None:
        resources["alphabet-sorted"] = load_persistent_alphabet(resources["alphabet-sorted"])

    if resources["sentence"] is not None:
        resources["sentence"] = load_persistent_fst2(resources["sentence"])

    if resources["replace"] is not None:
        resources["replace"] = load_persistent_fst2(resources["replace"])

    if resources["dictionaries"] is not None:
        resources["dictionaries"] = [load_persistent_dictionary(dictionary)
                                     for dictionary in resources["dictionaries"]]
def free_resources(options):
    """Release every resource previously loaded into the persistent space.

    Mirror of load_resources(): entries set to None are skipped.
    """
    resources = options["resources"]

    if resources["alphabet"] is not None:
        free_persistent_alphabet(resources["alphabet"])

    if resources["alphabet-sorted"] is not None:
        free_persistent_alphabet(resources["alphabet-sorted"])

    if resources["sentence"] is not None:
        free_persistent_fst2(resources["sentence"])

    if resources["replace"] is not None:
        free_persistent_fst2(resources["replace"])

    if resources["dictionaries"] is not None:
        for dictionary in resources["dictionaries"]:
            free_persistent_dictionary(dictionary)
def execute(path, grammar, options):
    """Run the whole Unitex processing chain on one corpus file.

    Steps: normalization, sentence segmentation (Sentence.fst2), replacement
    (Replace.fst2), tokenization, dictionary application, pattern location
    and concordance building.

    Arguments:
        path [str]    -- the corpus file to process (UTF-8 text).
        grammar [str] -- the fst2 grammar given to 'locate' (possibly a
                         persistent path).
        options       -- the UnitexConfig-like option mapping.

    Return the path of the resulting concordance file.
    Exits the process (status 1) on any tool failure.
    """
    directory, filename = os.path.split(path)
    name, extension = os.path.splitext(filename)

    txt = path
    snt = os.path.join(directory, "%s.snt" % name)
    # Unitex stores the intermediate files of a text in a '<name>_snt'
    # directory ('dir' renamed to avoid shadowing the builtin).
    snt_dir = os.path.join(directory, "%s_snt" % name)

    # Set up the virtual filesystem
    if options["virtualization"] is True:
        _txt = "%s%s" % (UnitexConstants.VFS_PREFIX, txt)
        cp(txt, _txt)
        txt = _txt
        snt = "%s%s" % (UnitexConstants.VFS_PREFIX, snt)
    else:
        if os.path.exists(snt_dir) is False:
            mkdir(snt_dir)

    # Some ad-hoc check
    alphabet = options["resources"]["alphabet"]
    if alphabet is None:
        sys.stderr.write("[ERROR] You must provide the alphabet. Fix the configuration file.\n")
        sys.exit(1)

    alphabet_sorted = options["resources"]["alphabet-sorted"]
    if alphabet_sorted is None:
        sys.stderr.write("[ERROR] You must provide the sorted alphabet. Fix the configuration file.\n")
        sys.exit(1)

    # Normalize the text
    kwargs = options["tools"]["normalize"]
    ret = normalize(txt, **kwargs)
    if ret is False:
        sys.stderr.write("[ERROR] Text normalization failed!\n")
        sys.exit(1)

    # Apply Sentence.fst2 (merge mode: sentence delimiters are inserted)
    sentence = options["resources"]["sentence"]
    if sentence is not None:
        kwargs = {}
        kwargs["start_on_space"] = options["tools"]["fst2txt"]["start_on_space"]
        kwargs["char_by_char"] = options["tools"]["fst2txt"]["char_by_char"]
        kwargs["merge"] = True

        ret = fst2txt(sentence, snt, alphabet, **kwargs)
        if ret is False:
            sys.stderr.write("Text segmentation failed!\n")
            sys.exit(1)

    # Apply Replace.fst2 (replace mode: matched sequences are rewritten)
    replace = options["resources"]["replace"]
    if replace is not None:
        kwargs = {}
        kwargs["start_on_space"] = options["tools"]["fst2txt"]["start_on_space"]
        kwargs["char_by_char"] = options["tools"]["fst2txt"]["char_by_char"]
        kwargs["merge"] = False

        ret = fst2txt(replace, snt, alphabet, **kwargs)
        if ret is False:
            sys.stderr.write("Replace grammar application failed!\n")
            sys.exit(1)

    # Tokenize the text
    kwargs = options["tools"]["tokenize"]
    ret = tokenize(snt, alphabet, **kwargs)
    if ret is False:
        sys.stderr.write("[ERROR] Text tokenization failed!\n")
        sys.exit(1)

    # Apply dictionaries
    if options["resources"]["dictionaries"] is not None:
        dictionaries = options["resources"]["dictionaries"]
        kwargs = options["tools"]["dico"]

        ret = dico(dictionaries, snt, alphabet, **kwargs)
        if ret is False:
            sys.stderr.write("[ERROR] Dictionaries application failed!\n")
            sys.exit(1)

    # Locate pattern
    kwargs = options["tools"]["locate"]
    ret = locate(grammar, snt, alphabet, **kwargs)
    if ret is False:
        sys.stderr.write("[ERROR] Locate failed!\n")
        sys.exit(1)

    index = os.path.join(snt_dir, "concord.ind")
    if options["virtualization"] is True:
        index = "%s%s" % (UnitexConstants.VFS_PREFIX, index)

    if exists(index) is False:
        sys.stderr.write("[ERROR] Locate failed! No index produced.\n")
        sys.exit(1)

    # Build concordance
    kwargs = options["tools"]["concord"]

    # 'format' renamed to avoid shadowing the builtin.
    fmt = kwargs["format"]
    if fmt not in (UnitexConstants.FORMAT_HTML,
                   UnitexConstants.FORMAT_TEXT,
                   UnitexConstants.FORMAT_GLOSSANET,
                   UnitexConstants.FORMAT_SCRIPT,
                   UnitexConstants.FORMAT_XML,
                   UnitexConstants.FORMAT_XML_WITH_HEADERS):
        sys.stderr.write("[ERROR] This little script supports a limited list of concordance format:\n")
        sys.stderr.write("[ERROR] - TEXT ('text' option)\n")
        sys.stderr.write("[ERROR] - HTML ('html', 'glossanet' and 'script' options)\n")
        # BUG FIX: the option is 'xml-with-headers', not 'wml-with-headers'.
        sys.stderr.write("[ERROR] - XML ('xml' and 'xml-with-headers' options)\n")
        sys.exit(1)

    ret = concord(index, alphabet_sorted, **kwargs)
    if ret is False:
        sys.stderr.write("[ERROR] Concord failed!\n")
        sys.exit(1)

    # Every supported format falls into exactly one of these three cases.
    concordances = None
    output = None

    if fmt == UnitexConstants.FORMAT_TEXT:
        concordances = os.path.join(snt_dir, "concord.txt")
        output = os.path.join(directory, "%s-concordances.txt" % name)
    elif fmt in (UnitexConstants.FORMAT_HTML, UnitexConstants.FORMAT_GLOSSANET, UnitexConstants.FORMAT_SCRIPT):
        concordances = os.path.join(snt_dir, "concord.html")
        output = os.path.join(directory, "%s-concordances.html" % name)
    elif fmt in (UnitexConstants.FORMAT_XML, UnitexConstants.FORMAT_XML_WITH_HEADERS):
        concordances = os.path.join(snt_dir, "concord.xml")
        output = os.path.join(directory, "%s-concordances.xml" % name)

    if options["virtualization"] is True:
        concordances = "%s%s" % (UnitexConstants.VFS_PREFIX, concordances)

    # Move the concordance next to the source text, outside the work area.
    mv(concordances, output)

    # Clean the Unitex files (kept when debugging for post-mortem inspection)
    if options["debug"] is False:
        if options["virtualization"] is True:
            for vf in ls("%s%s" % (UnitexConstants.VFS_PREFIX, directory)):
                rm(vf)
            rm(snt)
            rm(txt)
        else:
            rmdir(snt_dir)
            rm(snt)

    return output
if __name__ == "__main__":
    def usage():
        """Print the usage message on stderr and exit with an error status."""
        sys.stderr.write("Do Concord -- A simple script to illustrate the Unitex Python binding\n\n")
        sys.stderr.write(" $ do-concord [OPTIONS] <file1(, file2, ...)>\n\n")
        sys.stderr.write("Options:\n")
        sys.stderr.write(" [ -h, --help = this help message ]\n")
        sys.stderr.write(" -c, --config = the Unitex config file\n")
        sys.stderr.write(" -g, --grammar = the fst2 grammar to use\n\n")
        sys.stderr.write("Example:\n")
        sys.stderr.write(" $ do-concord -c unitex.yaml *.txt\n")
        sys.exit(1)

    try:
        opts, args = getopt.getopt(sys.argv[1:], "hc:g:", ["help", "config=", "grammar="])
    except getopt.GetoptError:
        usage()

    if len(opts) == 0 and len(args) == 0:
        usage()

    config_file = None
    grammar = None

    for o, a in opts:
        if o == "-h" or o == "--help":
            usage()
        elif o == "-c" or o == "--config":
            config_file = a
        elif o == "-g" or o == "--grammar":
            grammar = a
        else:
            sys.stderr.write("Wrong option '%s'.\n" % o)
            usage()

    if config_file is None:
        sys.stderr.write("You must provide the config file.\n")
        usage()
    if grammar is None:
        sys.stderr.write("You must provide the grammar.\n")
        usage()
    if not args:
        sys.stderr.write("You must provide at least one file to process.\n")
        usage()

    # 'glob' is bad! Do not use 'glob'.
    # Directories are walked recursively; plain files are kept as-is.
    files = []
    for arg in args:
        if os.path.isdir(arg) is True:
            for root, dirs, _files in os.walk(arg):
                files += [os.path.join(root, f) for f in _files]
        elif os.path.isfile(arg) is True:
            files.append(arg)
        else:
            sys.stderr.write("The arguments must be files or directories.\n")
            usage()

    # Configuration file loading
    config = None
    with open(config_file, "r") as f:
        # BUG FIX: use 'safe_load' instead of 'yaml.load' (deprecated without
        # an explicit Loader, and able to instantiate arbitrary Python
        # objects). The config file is plain data.
        config = yaml.safe_load(f)

    options = UnitexConfig(config)

    # Initialization of the basic logging system
    init_log_system(options["verbose"], options["debug"], options["log"])

    # Load resources in the persistent space
    if options["persistence"] is True:
        grammar = load_persistent_fst2(grammar)
        load_resources(options)

    results = []

    for f in files:
        sys.stdout.write("Processing '%s'...\n" % f)

        # This function illustrates the whole Unitex process used in order
        # to produce a concordance file.
        result = execute(f, grammar, options)

        sys.stdout.write(" -> %s\n" % result)
        results.append(result)

    # Free resources from the persistent space
    if options["persistence"] is True:
        free_persistent_fst2(grammar)
        free_resources(options)
global:
debug: 0
log: null
persistence: true
verbose: 0
virtualization: true
resources:
alphabet: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Alphabet.txt
alphabet-sorted: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Alphabet_sort.txt
dictionaries:
- /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/dela-fr-public.bin
- /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/ajouts80jours.bin
- /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Dela/motsGramf-.bin
language: fr
replace: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Graphs/Preprocessing/Replace/Replace.fst2
sentence: /home/dev/projects/python-unitex/dependencies/Unitex-GramLab-3.1rc/French/Graphs/Preprocessing/Sentence/Sentence.fst2
tools:
check_dic:
no_space_warning: false
strict: false
compress:
flip: false
output: null
semitic: false
version: v2
concord:
directory: null
font: null
fontsize: null
format: text
left: '0'
offsets: null
only_ambiguous: false
only_matches: false
right: '0'
script: null
sort: TO
thai: false
unxmlize: null
dico:
arabic_rules: null
korean: false
morpho: null
raw: null
semitic: false
extract:
non_matching_sentences: false
fst2txt:
merge: true
start_on_space: false
word_by_word: false
grf2fst2:
char_by_char: false
check_variables: true
debug: false
loop_check: false
named_repository: null
no_empty_graph_warning: false
pkgdir: null
silent_grf_name: true
tfst_check: false
locate:
ambiguous_outputs: true
arabic_rules: null
char_by_char: false
korean: false
match_mode: longest
morpho: null
negation_operator: tilde
number_of_matches: null
output_mode: merge
protect_dic_chars: true
sntdir: null
start_on_space: false
stop_token_count: null
variable: null
variable_error: ignore
normalize:
input_offsets: null
no_carriage_return: false
no_separator_normalization: false
output_offsets: null
replacement_rules: null
sort_txt:
duplicates: false
factorize_inflectional_codes: false
line_info: null
revers: false
sort_order: null
thai: false
tokenize:
char_by_char: false
input_offsets: null
output_offsets: null
tokens: null
txt2fst:
clean: false
korean: false
normalization_grammar: null
tagset: null
global:
debug: 1
verbose: 2
debug: 0
verbose: 0
log: null
persistence: True
......
......@@ -185,6 +185,8 @@ class UnitexProcessor(object):
kwargs = self.__options["tools"]["tokenize"]
ret = tokenize(self.__snt, alphabet, **kwargs)
if ret is False:
raise UnitexException("Text tokenization failed!")
def _lexicalize(self):
dictionaries = self.__options["resources"]["dictionaries"]
......@@ -281,7 +283,7 @@ class UnitexProcessor(object):
result = os.path.join(self.__dir, "concord.txt")
if self.__options["virtualization"] is True:
index = "%s%s" % (UnitexConstants.VFS_PREFIX, result)
result = "%s%s" % (UnitexConstants.VFS_PREFIX, result)
ret = concord(index, alphabet, **kwargs)
if ret is False:
......
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous pour commenter