diff --git a/.gitignore b/.gitignore index 090e6e47c19764d06c3db62727c9ee52749c6980..5006fdec3e39dcac20e8ce3176de9c8c2df8818b 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,9 @@ dist build +# Documentation +documentation/_build/ + # VIM *.swp diff --git a/README.md b/README.md index 5217c74f2d4e47c75b23a1ca62166a5083f7af42..a976b20bf9ff2e045d3047e867d34116ab1495de 100644 --- a/README.md +++ b/README.md @@ -98,6 +98,28 @@ unitex_tool(...) True if the command succeeds, False otherwise. ``` +If you want to generate HTML documentation from the source files, you can also use the [Sphinx Documentation Generator](http://www.sphinx-doc.org/). + +```bash +# Run as root +# On MacOSX (MacPorts) +# Python 2.7 +port install py27-sphinx + +# Python 3.5 +port install py35-sphinx + +# On Linux (distributions based on Debian) +# Python 2.7 +apt-get install python-sphinx +``` + +Once the tool installed, just open a terminal, move in the [documentation]() directory and run (as user): + +```bash +make +``` + ## Getting started **NOTE: The texts must be encoded in UTF-8. There is so far no support for UTF-16-(LE|BE) or any other encoding.** diff --git a/documentation/Makefile b/documentation/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..3061aaeb743eadfe4b37ac5fcfaf95aadc8dd8e1 --- /dev/null +++ b/documentation/Makefile @@ -0,0 +1,216 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# User-friendly check for sphinx-build +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. 
If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . + +.PHONY: help +help: + @echo "Please use \`make <target>' where <target> is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " applehelp to make an Apple Help Book" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + @echo " coverage to run coverage check of the documentation (if enabled)" + +.PHONY: clean +clean: + rm -rf $(BUILDDIR)/* + +.PHONY: html +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + 
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +.PHONY: dirhtml +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +.PHONY: singlehtml +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +.PHONY: pickle +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +.PHONY: json +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +.PHONY: htmlhelp +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +.PHONY: qthelp +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/PythonbindingsforUnitexGramLab.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/PythonbindingsforUnitexGramLab.qhc" + +.PHONY: applehelp +applehelp: + $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp + @echo + @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." + @echo "N.B. You won't be able to view it unless you put it in" \ + "~/Library/Documentation/Help or install it in your application" \ + "bundle." + +.PHONY: devhelp +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." 
+ @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/PythonbindingsforUnitexGramLab" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/PythonbindingsforUnitexGramLab" + @echo "# devhelp" + +.PHONY: epub +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +.PHONY: latex +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +.PHONY: latexpdf +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +.PHONY: latexpdfja +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +.PHONY: text +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +.PHONY: man +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +.PHONY: texinfo +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +.PHONY: info +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." 
+ make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +.PHONY: gettext +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +.PHONY: changes +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +.PHONY: linkcheck +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +.PHONY: doctest +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +.PHONY: coverage +coverage: + $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage + @echo "Testing of coverage in the sources finished, look at the " \ + "results in $(BUILDDIR)/coverage/python.txt." + +.PHONY: xml +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +.PHONY: pseudoxml +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/documentation/_unitex.rst b/documentation/_unitex.rst new file mode 100644 index 0000000000000000000000000000000000000000..dbf38295dc71502858a82ae68ddf8ab530ad64fb --- /dev/null +++ b/documentation/_unitex.rst @@ -0,0 +1,11 @@ +.. __unitex + +The `_unitex` C++ extension +=========================== + +.. currentmodule:: _unitex +.. autosummary:: + :toctree: + +.. 
automodule:: _unitex + :members: diff --git a/documentation/conf.py b/documentation/conf.py new file mode 100644 index 0000000000000000000000000000000000000000..9a78d1f279e3e2da0604b69feed6f12d071c6bef --- /dev/null +++ b/documentation/conf.py @@ -0,0 +1,290 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Python bindings for Unitex/GramLab documentation build configuration file, created by +# sphinx-quickstart on Sun Feb 28 11:29:29 2016. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys +import os + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +#sys.path.insert(0, os.path.abspath('.')) +sys.path.insert(0, os.path.abspath('../unitex/')) +sys.path.insert(1, os.path.abspath('../extensions/')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', +] +autoclass_content = "both" + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. 
+master_doc = 'index' + +# General information about the project. +project = 'Python bindings for Unitex/GramLab' +copyright = '2016, Patrick Watrin' +author = 'Patrick Watrin' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '1.0b' +# The full version, including alpha/beta/rc tags. +release = '1.0b' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. 
+todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'alabaster' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# "<project> v<release> documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +#html_extra_path = [] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. 
+#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a <link> tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Language to be used for generating the HTML full-text search index. +# Sphinx supports the following languages: +# 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' +# 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr' +#html_search_language = 'en' + +# A dictionary with options for the search language support, empty by default. +# Now only 'ja' uses this config value +#html_search_options = {'type': 'default'} + +# The name of a javascript file (relative to the configuration directory) that +# implements a search results scorer. If empty, the default will be used. +#html_search_scorer = 'scorer.js' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'PythonbindingsforUnitexGramLabdoc' + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). 
+#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', + +# Latex figure (float) alignment +#'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'PythonbindingsforUnitexGramLab.tex', 'Python bindings for Unitex/GramLab Documentation', + 'Patrick Watrin', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'pythonbindingsforunitexgramlab', 'Python bindings for Unitex/GramLab Documentation', + [author], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'PythonbindingsforUnitexGramLab', 'Python bindings for Unitex/GramLab Documentation', + author, 'PythonbindingsforUnitexGramLab', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. 
+#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +#texinfo_no_detailmenu = False diff --git a/documentation/index.rst b/documentation/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..2aaa9132281e76dd5284d2c98e6c6bbf834f53a3 --- /dev/null +++ b/documentation/index.rst @@ -0,0 +1,21 @@ +.. Python bindings for Unitex/GramLab documentation master file, created by + sphinx-quickstart on Sun Feb 28 11:29:29 2016. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to Python bindings for Unitex/GramLab's documentation! +============================================================== + +Contents: + +.. toctree:: + :maxdepth: 1 + + unitex + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/documentation/modules.rst b/documentation/modules.rst new file mode 100644 index 0000000000000000000000000000000000000000..404b584145c6da47cfa32a8717c4c54bb75922bb --- /dev/null +++ b/documentation/modules.rst @@ -0,0 +1,7 @@ +unitex +====== + +.. toctree:: + :maxdepth: 4 + + unitex diff --git a/documentation/unitex.rst b/documentation/unitex.rst new file mode 100644 index 0000000000000000000000000000000000000000..4bb710f4bd54b109026e7f060f1379486ed5352b --- /dev/null +++ b/documentation/unitex.rst @@ -0,0 +1,9 @@ +unitex module +============= + +Module contents +--------------- + +.. 
automodule:: unitex + :members: + :undoc-members: diff --git a/extensions/_unitex.cpp b/extensions/_unitex.cpp index c2e3cd2f423d9895c27b08292349cc7aec5022e3..1a39017eba808f04e82ce259653787ec7c40518e 100644 --- a/extensions/_unitex.cpp +++ b/extensions/_unitex.cpp @@ -17,8 +17,10 @@ using namespace unitex; #endif -static char unitex_docstring[] = - "This module provides some usefull C function to work with the Unitex library."; +static char unitex_docstring[] = "\ +This module provides some usefull C++ functions to work with the Unitex\n\ +library.\ +"; @@ -29,10 +31,10 @@ static char unitex_docstring[] = /* 'unitex_tool' function */ static char unitex_tool_docstring[] = "\ This function launches an Unitex command.\n\n\ -Positional arguments (length: 1):\n\ - 0 [str] -- the Unitex command.\n\n\ -Return [bool]:\n\ - True if the command succeeds, False otherwise.\ +*Positional arguments (length: 1):*\n\n\ +- **0 [str]** -- the Unitex command.\n\n\ +*Return [bool]:*\n\n\ + **True** if the command succeeds, **False** otherwise.\ "; static PyObject *unitex_tool(PyObject *self, PyObject *args); @@ -56,13 +58,12 @@ PyObject *unitex_tool(PyObject *self, PyObject *args) { /* 'unitex_load_persistent_dictionary' function */ static char unitex_load_persistent_dictionary_docstring[] = "\ This function loads a dictionary in the persistent space.\n\n\ -Positional arguments (length: 1):\n\ - 0 [str] -- the dictionary path.\n\n\ -Return [str]:\n\ - The persistent file path [str] (derived from filename but not\n\ - strictly identical, depending of implementation). This path must\n\ - be used by the unitex tools and the 'free_persistent_dictionary'\n\ - function.\n\ +*Positional arguments (length: 1):*\n\n\ +- **0 [str]** -- the dictionary path.\n\n\ +*Return [str]:*\n\n\ + The persistent file path (derived from filename but not strictly\n\ + identical, depending of implementation). 
This path must be used by\n\ + the unitex tools and the 'free_persistent_dictionary' function.\ "; static PyObject *unitex_load_persistent_dictionary(PyObject *self, PyObject *args); @@ -90,12 +91,12 @@ PyObject *unitex_load_persistent_dictionary(PyObject *self, PyObject *args) { /* 'unitex_load_persistent_fst2' function */ static char unitex_load_persistent_fst2_docstring[] = "\ This function loads a grammar in the persistent space.\n\n\ -Positional arguments (length: 1):\n\ - 0 [str] -- the fst2 path.\n\n\ -Return [str]:\n\ - The persistent file path [str] (derived from filename but not\n\ - strictly identical, depending of implementation). This path must\n\ - be used by the unitex tools and the 'free_persistent_fst2' function.\n\ +*Positional arguments (length: 1):*\n\n\ +- **0 [str]** -- the fst2 path.\n\n\ +*Return [str]:*\n\n\ + The persistent file path (derived from filename but not strictly\n\ + identical, depending of implementation). This path must be used by\n\ + the unitex tools and the 'free_persistent_fst2' function.\ "; static PyObject *unitex_load_persistent_fst2(PyObject *self, PyObject *args); @@ -123,13 +124,12 @@ PyObject *unitex_load_persistent_fst2(PyObject *self, PyObject *args) { /* 'unitex_load_persistent_alphabet' function */ static char unitex_load_persistent_alphabet_docstring[] = "\ This function loads an alphabet in the persistent space.\n\n\ -Positional arguments (length: 1):\n\ - 0 [str] -- the alphabet path.\n\n\ -Return [str]:\n\ - The persistent file path [str] (derived from filename but not\n\ - strictly identical, depending of implementation). This path must\n\ - be used by the unitex tools and the 'free_persistent_alphabet'\n\ - function.\n\ +*Positional arguments (length: 1):*\n\n\ +- **0 [str]** -- the alphabet path.\n\n\ +*Return [str]:*\n\n\ + The persistent file path (derived from filename but not strictly\n\ + identical, depending of implementation). 
This path must be used by\n\ + the unitex tools and the 'free_persistent_alphabet' function.\ "; static PyObject *unitex_load_persistent_alphabet(PyObject *self, PyObject *args); @@ -159,10 +159,10 @@ PyObject *unitex_load_persistent_alphabet(PyObject *self, PyObject *args) { /* 'unitex_free_persistent_dictionary' function */ static char unitex_free_persistent_dictionary_docstring[] = "\ This function unloads a dictionary from persistent space.\n\n\ -Positional arguments (length: 1):\n\ - 0 [str] -- the persistent file path returned by the\n\ - 'load_persistent_dictionary' function.\n\n\ -Return [None].\ +*Positional arguments (length: 1):*\n\n\ +- **0 [str]** -- the persistent file path returned by the\n\ + 'load_persistent_dictionary' function.\n\n\ +*No return.*\ "; static PyObject *unitex_free_persistent_dictionary(PyObject *self, PyObject *args); @@ -179,10 +179,10 @@ PyObject *unitex_free_persistent_dictionary(PyObject *self, PyObject *args) { /* 'unitex_free_persistent_fst2' function */ static char unitex_free_persistent_fst2_docstring[] = "\ This function unloads a grammar from persistent space.\n\n\ -Positional arguments (length: 1):\n\ - 0 [str] -- the persistent file path returned by the\n\ - 'load_persistent_fst2' function.\n\n\ -Return [None].\ +*Positional arguments (length: 1):*\n\n\ +- **0 [str]** -- the persistent file path returned by the\n\ + 'load_persistent_fst2' function.\n\n\ +*No return.*\ "; static PyObject *unitex_free_persistent_fst2(PyObject *self, PyObject *args); @@ -199,10 +199,10 @@ PyObject *unitex_free_persistent_fst2(PyObject *self, PyObject *args) { /* 'unitex_free_persistent_alphabet' function */ static char unitex_free_persistent_alphabet_docstring[] = "\ This function unloads an alphabet from persistent space.\n\n\ -Positional arguments (length: 1):\n\ - 0 [str] -- the persistent file path returned by the\n\ - 'load_persistent_alphabet' function.\n\n\ -Return [None].\ +*Positional arguments (length: 1):*\n\n\ +- **0 [str]** -- 
the persistent file path returned by the\n\ + 'load_persistent_alphabet' function.\n\n\ +*No return.*\ "; static PyObject *unitex_free_persistent_alphabet(PyObject *self, PyObject *args); @@ -222,10 +222,10 @@ PyObject *unitex_free_persistent_alphabet(PyObject *self, PyObject *args) { static char unitex_is_persistent_dictionary_docstring[] = "\ This function checks if a dictionary path points to the persistent\n\ space.\n\n\ -Positional arguments (length: 1):\n\ - 0 [str] -- the file path to check.\n\n\ -Return [bool]:\n\ - True if the dictionary is persistent, False otherwise.\ +*Positional arguments (length: 1):*\n\n\ +- **0 [str]** -- the file path to check.\n\n\ +*Return [bool]:*\n\n\ + **True** if the dictionary is persistent, **False** otherwise.\ "; static PyObject *unitex_is_persistent_dictionary(PyObject *self, PyObject *args); @@ -244,10 +244,10 @@ PyObject *unitex_is_persistent_dictionary(PyObject *self, PyObject *args) { static char unitex_is_persistent_fst2_docstring[] = "\ This function checks if a grammar path points to the persistent\n\ space.\n\n\ -Positional arguments (length: 1):\n\ - 0 [str] -- the file path to check.\n\n\ -Return [bool]:\n\ - True if the grammar is persistent, False otherwise.\ +*Positional arguments (length: 1):*\n\n\ +- **0 [str]** -- the file path to check.\n\n\ +*Return [bool]:*\n\n\ + **True** if the dictionary is persistent, **False** otherwise.\ "; static PyObject *unitex_is_persistent_fst2(PyObject *self, PyObject *args); @@ -266,10 +266,10 @@ PyObject *unitex_is_persistent_fst2(PyObject *self, PyObject *args) { static char unitex_is_persistent_alphabet_docstring[] = "\ This function checks if an alphabet path points to the persistent\n\ space.\n\n\ -Positional arguments (length: 1):\n\ - 0 [str] -- the file path to check.\n\n\ -Return [bool]:\n\ - True if the alphabet is persistent, False otherwise.\ +*Positional arguments (length: 1):*\n\n\ +- **0 [str]** -- the file path to check.\n\n\ +*Return [bool]:*\n\n\ + 
**True** if the dictionary is persistent, **False** otherwise.\ "; static PyObject *unitex_is_persistent_alphabet(PyObject *self, PyObject *args); @@ -294,9 +294,9 @@ PyObject *unitex_is_persistent_alphabet(PyObject *self, PyObject *args) { static char unitex_enable_stdout_docstring[] = "\ This function enables Unitex standard output. This is the default\n\ but should be used for debug purposes only.\n\n\ -No argument.\n\n\ -Return [bool]:\n\ - True if it succeeds, False otherwise.\ +*No argument.*\n\n\ +*Return [bool]:*\n\n\ + **True** if it succeeds, **False** otherwise.\ "; static PyObject *unitex_enable_stdout(PyObject *self, PyObject *noarg); @@ -313,9 +313,9 @@ PyObject *unitex_enable_stdout(PyObject *self, PyObject *noarg) { static char unitex_enable_stderr_docstring[] = "\ This function enables Unitex error output. This is the default\n\ but should be used for debug purposes only.\n\n\ -No argument.\n\n\ -Return [bool]:\n\ - True if it succeeds, False otherwise.\ +*No argument.*\n\n\ +*Return [bool]:*\n\n\ + **True** if it succeeds, **False** otherwise.\ "; static PyObject *unitex_enable_stderr(PyObject *self, PyObject *noarg); @@ -333,9 +333,9 @@ static char unitex_disable_stdout_docstring[] = "\ This function disables Unitex standard output to ensure multithread\n\ output consistency (i.e. avoid output mixing between threads) and to\n\ improve performances.\n\n\ -No argument.\n\n\ -Return [bool]:\n\ - True if it succeeds, False otherwise.\ +*No argument.*\n\n\ +*Return [bool]:*\n\n\ + **True** if it succeeds, **False** otherwise.\ "; static PyObject *unitex_disable_stdout(PyObject *self, PyObject *noarg); @@ -353,9 +353,9 @@ static char unitex_disable_stderr_docstring[] = "\ This function disables Unitex error output to ensure multithread\n\ output consistency (i.e. 
avoid output mixing between threads) and to\n\ improve performances.\n\n\ -No argument.\n\n\ -Return [bool]:\n\ - True if it succeeds, False otherwise.\ +*No argument.*\n\n\ +*Return [bool]:*\n\n\ + **True** if it succeeds, **False** otherwise.\ "; static PyObject *unitex_disable_stderr(PyObject *self, PyObject *noarg); @@ -373,11 +373,11 @@ static char unitex_cp_docstring[] = "\ This function copies a file. Both pathes can be on the virtual\n\ filesystem or the disk filesystem. Therefore, this function can be\n\ used to virtualize a file or to dump a virtual file.\n\n\ -Positional arguments (length: 2):\n\ - 0 [str] -- source file path\n\ - 1 [str] -- target file path\n\n\ -Return [bool]:\n\ - True if it succeeds, False otherwise.\ +*Positional arguments (length: 2):*\n\n\ +- **0 [str]** -- the source file path.\n\ +- **1 [str]** -- the target file path.\n\n\ +*Return [bool]:*\n\n\ + **True** if it succeeds, **False** otherwise.\ "; static PyObject *unitex_cp(PyObject *self, PyObject *args); @@ -397,10 +397,10 @@ PyObject *unitex_cp(PyObject *self, PyObject *args) { static char unitex_rm_docstring[] = "\ This function removes a file. The path can be on the virtual\n\ filesystem or the disk filesystem.\n\n\ -Positional arguments (length: 1):\n\ - 0 [str] -- file path\n\n\ -Return [bool]:\n\ - True if it succeeds, False otherwise.\ +*Positional arguments (length: 1):*\n\n\ +- **0 [str]** -- the file path.\n\n\ +*Return [bool]:*\n\n\ + **True** if it succeeds, **False** otherwise.\ "; static PyObject *unitex_rm(PyObject *self, PyObject *args); @@ -419,11 +419,11 @@ PyObject *unitex_rm(PyObject *self, PyObject *args) { static char unitex_mv_docstring[] = "\ This function moves/renames a file. 
Both pathes can be on the\n\ virtual filesystem or the disk filesystem.\n\n\ -Positional arguments (length: 2):\n\ - 0 [str] -- old file path\n\ - 1 [str] -- new file path\n\n\ -Return [bool]:\n\ - True if it succeeds, False otherwise.\ +*Positional arguments (length: 2):*\n\n\ +- **0 [str]** -- the current file path.\n\ +- **1 [str]** -- the new file path.\n\n\ +*Return [bool]:*\n\n\ + **True** if it succeeds, **False** otherwise.\ "; static PyObject *unitex_mv(PyObject *self, PyObject *args); @@ -442,10 +442,10 @@ PyObject *unitex_mv(PyObject *self, PyObject *args) { /* 'unitex_mkdir' function */ static char unitex_mkdir_docstring[] = "\ This function creates a directory on the disk.\n\n\ -Positional arguments (length: 1):\n\ - 0 [str] -- directory path\n\n\ -Return [bool]:\n\ - True if it succeeds, False otherwise.\ +*Positional arguments (length: 1):*\n\n\ +- **0 [str]** -- the directory path.\n\n\ +*Return [bool]:*\n\n\ + **True** if it succeeds, **False** otherwise.\ "; static PyObject *unitex_mkdir(PyObject *self, PyObject *args); @@ -463,10 +463,10 @@ PyObject *unitex_mkdir(PyObject *self, PyObject *args) { /* 'unitex_rmdir' function */ static char unitex_rmdir_docstring[] = "\ This function removes a directory from the disk.\n\n\ -Positional arguments (length: 1):\n\ - 0 [str] -- directory path\n\n\ -Return [bool]:\n\ - True if it succeeds, False otherwise.\ +*Positional arguments (length: 1):*\n\n\ +- **0 [str]** -- the directory path.\n\n\ +*Return [bool]:*\n\n\ + **True** if it succeeds, **False** otherwise.\ "; static PyObject *unitex_rmdir(PyObject *self, PyObject *args); @@ -484,11 +484,11 @@ PyObject *unitex_rmdir(PyObject *self, PyObject *args) { /* 'unitex_ls' function */ static char unitex_ls_docstring[] = "\ This function lists (disk or virtual) directory contents.\n\n\ -Positional arguments (length: 1):\n\ - 0 [str] -- directory path\n\n\ -Return [list(str)]:\n\ - The function returns a list of files (not directories) if the\n\ - directory is 
not empty and an empty list otherwise.\ +*Positional arguments (length: 1):*\n\n\ +- **0 [str]** -- the directory path.\n\n\ +*Return [list(str)]:*\n\n\ + The function returns a list of files (not directories) if the\n\ + directory is not empty and an empty list otherwise.\ "; static PyObject *unitex_ls(PyObject *self, PyObject *args); @@ -524,11 +524,11 @@ PyObject *unitex_ls(PyObject *self, PyObject *args) { /* 'unitex_read_file' function (UTF-8 encoding only)*/ static char unitex_read_file_docstring[] = "\ This function read a file from the disk or from the virtual filesystem.\n\ -The file **must** be encoded in UTF-8.\n\n\ -Positional arguments (length: 1):\n\ - 0 [str] -- the file path\n\n\ -Return [str]:\n\ - The function returns an unicode string.\ +**WARNING: The file must be encoded in UTF-8.**\n\n\ +*Positional arguments (length: 1):*\n\n\ +- **0 [str]** -- the file path.\n\n\ +*Return [str]:*\n\n\ + The function returns an unicode string.\ "; static PyObject *unitex_read_file(PyObject *self, PyObject *args); @@ -568,13 +568,13 @@ PyObject *unitex_read_file(PyObject *self, PyObject *args) { /* 'unitex_write_file' function (UTF-8 encoding only)*/ static char unitex_write_file_docstring[] = "\ This function writes a file on the disk or on the virtual filesystem.\n\ -The file will be encoded in UTF-8.\n\n\ -Positional arguments (length: 3):\n\ - 0 [str] -- the file path\n\ - 1 [unicode] -- the file content\n\ - 2 [int] -- 1 to writes the UTF-8 bom, 0 otherwise\n\n\ -Return [bool]:\n\ - True if the function succeeds, False otherwise.\ +**WARNING: The file will be encoded in UTF-8.**\n\n\ +*Positional arguments (length: 3):*\n\n\ +- **0 [str]** -- the file path.\n\ +- **1 [unicode]** -- the file content.\n\ +- **2 [int]** -- 1 to writes the UTF-8 bom, 0 otherwise.\n\n\ +*Return [bool]:*\n\n\ + **True** if the function succeeds, **False** otherwise.\ "; static PyObject *unitex_write_file(PyObject *self, PyObject *args); @@ -603,12 +603,12 @@ PyObject 
*unitex_write_file(PyObject *self, PyObject *args) { /* 'unitex_append_to_file' function */ static char unitex_append_to_file_docstring[] = "\ This function writes at the end of an existing file (virtual or not).\n\ -The file **must** be encoded in UTF-8.\n\n\ -Positional arguments (length: 2):\n\ - 0 [str] -- the file path\n\ - 1 [unicode] -- the file content\n\n\ -Return [bool]:\n\ - True if the function succeeds, False otherwise.\ +**WARNING: The file must be encoded in UTF-8.**\n\n\ +*Positional arguments (length: 2):*\n\n\ +- **0 [str]** -- the file path.\n\ +- **1 [unicode]** -- the file content.\n\n\ +*Return [bool]:*\n\n\ + **True** if the function succeeds, **False** otherwise.\ "; static PyObject *unitex_append_to_file(PyObject *self, PyObject *args); diff --git a/unitex/__init__.py b/unitex/__init__.py index 7c38fce7b8f7604071b1537cbaa8a8581e660dfd..07a65b9bcd834356e15c571cbd874f5c99e9e414 100644 --- a/unitex/__init__.py +++ b/unitex/__init__.py @@ -23,9 +23,9 @@ class UnitexConstants(object): VFS_PREFIX = "$:" - GRAMMAR = "grammar" - DICTIONARY = "dictionary" - ALPHABET = "alphabet" + RESOURCE_GRAMMAR = "grammar" + RESOURCE_DICTIONARY = "dictionary" + RESOURCE_ALPHABET = "alphabet" DELAF = "delaf" DELAS = "delas" @@ -111,10 +111,11 @@ def enable_stdout(): This function enables Unitex standard output. This is the default but should be used for debug purposes only. - No argument. + *No argument.* - Return [bool]: - True if it succeeds, False otherwise. + *Return [bool]:* + + **True** if it succeeds, **False** otherwise. """ _LOGGER.info("Enabling standard output...") ret = _unitex.unitex_enable_stdout() @@ -129,10 +130,11 @@ def disable_stdout(): output consistency (i.e. avoid output mixing between threads) and to improve performances. - No argument. + *No argument.* + + *Return [bool]:* - Return [bool]: - True if it succeeds, False otherwise. + **True** if it succeeds, **False** otherwise. 
""" _LOGGER.info("Disabling standard output...") ret = _unitex.unitex_disable_stdout() @@ -146,10 +148,11 @@ def enable_stderr(): This function enables Unitex error output. This is the default but should be used for debug purposes only. - No argument. + *No argument.* + + *Return [bool]:* - Return [bool]: - True if it succeeds, False otherwise. + **True** if it succeeds, **False** otherwise. """ _LOGGER.info("Enabling error output...") ret = _unitex.unitex_enable_stderr() @@ -164,10 +167,11 @@ def disable_stderr(): output consistency (i.e. avoid output mixing between threads) and to improve performances. - No argument. + *No argument.* - Return [bool]: - True if it succeeds, False otherwise. + *Return [bool]:* + + **True** if it succeeds, **False** otherwise. """ _LOGGER.info("Disabling error output...") ret = _unitex.unitex_disable_stderr() @@ -182,31 +186,33 @@ def init_log_system(verbose, debug, log=None): """ This function enables/disables the logging system. - Arguments: - verbose [int] -- enables/disables the standard output. Possible - values are: - - 0: the standard output is disabled; - - 1: the standard output shows 'warnings' emitted by the - bindings logging system. - - 2: the standard output shows 'warnings' and various - processing informations emitted by the bindings logging - system; - - 3: the full standard output is activated for both the - bindings and the Unitex processor. - - debug [int] -- enables/disables the error output. Possible - values are: - - 0: the error output is disabled; - - 1: the error output is limited to the logging system - implemented in the bindings; - - 2: the error output is activated for both the bindings - and the Unitex processor. - - log [str] -- if not None, the error and standard outputs are - redirected to the file specified by this argument. Be sure - to have write access to this file. - - Return [None]. + *Arguments:* + + - **verbose [int]** -- enables/disables the standard output. 
+ Possible values are: + + - 0: the standard output is disabled; + - 1: the standard output shows 'warnings' emitted by the bindings + logging system; + - 2: the standard output shows 'warnings' and various + processing informations emitted by the bindings logging system; + - 3: the full standard output is activated for both the bindings + and the Unitex processor. + + - **debug [int]** -- enables/disables the error output. Possible + values are: + + - 0: the error output is disabled; + - 1: the error output is limited to the logging system implemented + in the bindings; + - 2: the error output is activated for both the bindings and the + Unitex processor. + + - **log [str]** -- if not None, the error and standard outputs are + redirected to the file specified by this argument. Be sure to have + write access to this file. + + *No return.* """ for handler in logging.root.handlers[:]: logging.root.removeHandler(handler) diff --git a/unitex/io.py b/unitex/io.py index 2aa6cdc28ab9bc03ae77ed97f1639bf6b6b23ef8..d3872ddf824eac57db237d8019c4e16720d16450 100644 --- a/unitex/io.py +++ b/unitex/io.py @@ -18,12 +18,14 @@ def cp(source_path, target_path): filesystem or the disk filesystem. Therefore, this function can be used to virtualize a file or to dump a virtual file. - Arguments: - source_path [str] -- source file path - target_path [str] -- target file path + *Arguments:* - Return [bool]: - True if it succeeds, False otherwise. + - **source_path [str]** -- source file path + - **target_path [str]** -- target file path + + *Return [bool]:* + + **True** if it succeeds, **False** otherwise. """ _LOGGER.info("Copying file '%s' to '%s'..." % (source_path, target_path)) ret = _unitex.unitex_cp(source_path, target_path) @@ -37,11 +39,13 @@ def rm(path): This function removes a file. The path can be on the virtual filesystem or the disk filesystem. 
- Argument: - path [str] -- file path + *Argument:* + + - **path [str]** -- file path + + *Return [bool]:* - Return [bool]: - True if it succeeds, False otherwise. + **True** if it succeeds, **False** otherwise. """ _LOGGER.info("Removing file '%s'..." % path) ret = _unitex.unitex_rm(path) @@ -55,12 +59,14 @@ def mv(old_path, new_path): This function moves/renames a file. Both pathes can be on the virtual filesystem or the disk filesystem. - Arguments: - old_path [str] -- old file path - new_path [str] -- new file path + *Arguments:* - Return [bool]: - True if it succeeds, False otherwise. + - **old_path [str]** -- old file path + - **new_path [str]** -- new file path + + *Return [bool]:* + + **True** if it succeeds, **False** otherwise. """ _LOGGER.info("Moving file '%s' to '%s'..." % (old_path, new_path)) ret = _unitex.unitex_mv(old_path, new_path) @@ -73,11 +79,13 @@ def mkdir(path): """ This function creates a directory on the disk. - Argument: - path [str] -- directory path + *Argument:* + + - **path [str]** -- directory path - Return [bool]: - True if it succeeds, False otherwise. + *Return [bool]:* + + **True** if it succeeds, **False** otherwise. """ _LOGGER.info("Creating directory '%s'..." % path) ret = _unitex.unitex_mkdir(path) @@ -90,11 +98,13 @@ def rmdir(path): """ This function removes a directory from the disk. - Argument: - path [str] -- directory path + *Argument:* + + - **path [str]** -- directory path - Return [bool]: - True if it succeeds, False otherwise. + *Return [bool]:* + + **True** if it succeeds, **False** otherwise. """ _LOGGER.info("Removing directory '%s'..." % path) ret = _unitex.unitex_rmdir(path) @@ -107,12 +117,14 @@ def ls(path): """ This function lists (disk or virtual) directory contents. - Argument: - path [str] -- directory path + *Argument:* + + - **path [str]** -- directory path - Return [list(str)]: - The function returns a list of files (not directories) if the - directory is not empty and an empty list otherwise. 
+ *Return [list(str)]:* + + The function returns a list of files (not directories) if the + directory is not empty and an empty list otherwise. """ _LOGGER.info("Listing directory '%s'..." % path) return _unitex.unitex_ls(path) @@ -122,11 +134,13 @@ def exists(path): This function verify if a file exists (on disk or virtual filesystem). - Argument: - path [str] -- directory path + *Argument:* + + - **path [str]** -- directory path + + *Return [bool]:* - Return [bool]: - True if the path exists, False otherwise. + **True** if the path exists, **False** otherwise. """ if path.startswith(UnitexConstants.VFS_PREFIX) is False: return os.path.exists(path) @@ -141,7 +155,8 @@ class UnitexFile(object): mainly useful to read files from virtual filesystem whithout having to copy them to the disk. - *WARNING: the encoding must be UTF-8 and the data Unicode strings.* + **WARNING: the encoding must be UTF-8 and the data Unicode + strings.** """ def __init__(self): @@ -151,20 +166,60 @@ class UnitexFile(object): self.__mode = None def open(self, file, mode=None, use_bom=False): + """ + This function opens a file from the disk or from the virtual + filesystem. + **WARNING: the I/O encoding is limited to UTF-8.** + + *Arguments:* + + - **file [str]** -- the file path + + - **mode [str]** -- specifies the mode in which the file is + open. Possible values are: + + - 'r': open for reading (default); + - 'w': open for writing; + - 'a': open for writing (append to the end of file if it + exists). + + - **use_bom [int]** -- 1 to writes the UTF-8 bom ('w' mode only, + 0 otherwise. + + *No return.* + """ + if self.__file is not None: raise UnitexException("You must close the current file (%s) before open another one..." % self.__file) self.__use_bom = use_bom self.__file = file + + if mode is None: + mode = "r" self.__mode = mode def close(self): + """ + This function close the opened file and reset all the internal + parameters. 
+ """ if self.__file is None: raise UnitexException("There is no file to close...") self.__file = None self.__mode = None def write(self, data): + """ + This function writes/append data to the opened file. The file + must be opened in 'w' or 'a' mode. + + *Arguments:* + + - **data [unicode]** -- the content to write. + + *No return.* + """ if self.__file is None: raise UnitexException("You must open a file before writing...") if self.__mode not in ("w", "a"): @@ -177,6 +232,16 @@ class UnitexFile(object): _unitex.unitex_append_to_file(self.__file, data) def read(self): + """ + This function reads data from the opened file. The file must be + opened in 'r' mode. + + *No arguments.* + + *Return [unicode]:* + + The data read are returned as a unicode string. + """ if self.__file is None: raise UnitexException("You must open a file before reading...") if self.__mode != "r": diff --git a/unitex/processor.py b/unitex/processor.py index 5a5f5001ec115bd0dbbd602b2da504a57df551ce..73ac8a7f8541c3bb646fca718762e5bf8a828b96 100644 --- a/unitex/processor.py +++ b/unitex/processor.py @@ -35,7 +35,7 @@ def escape(sequence): class UnitexProcessor(object): """ - This class hides mots of the Unitex (pre-)processing in order to + This class hides most of the Unitex (pre-)processing in order to facilitate his usage. 
""" @@ -72,28 +72,28 @@ class UnitexProcessor(object): self.__persisted_objects = [] if self.__options["resources"]["alphabet"] is not None: - _type = UnitexConstants.ALPHABET + _type = UnitexConstants.RESOURCE_ALPHABET _object = load_persistent_alphabet(self.__options["resources"]["alphabet"]) self.__persisted_objects.append((_type, _object)) self.__options["resources"]["alphabet"] = _object if self.__options["resources"]["alphabet-sorted"] is not None: - _type = UnitexConstants.ALPHABET + _type = UnitexConstants.RESOURCE_ALPHABET _object = load_persistent_alphabet(self.__options["resources"]["alphabet-sorted"]) self.__persisted_objects.append((_type, _object)) self.__options["resources"]["alphabet-sorted"] = _object if self.__options["resources"]["sentence"] is not None: - _type = UnitexConstants.GRAMMAR + _type = UnitexConstants.RESOURCE_GRAMMAR _object = load_persistent_fst2(self.__options["resources"]["sentence"]) self.__persisted_objects.append((_type, _object)) self.__options["resources"]["sentence"] = _object if self.__options["resources"]["replace"] is not None: - _type = UnitexConstants.GRAMMAR + _type = UnitexConstants.RESOURCE_GRAMMAR _object = load_persistent_fst2(self.__options["resources"]["replace"]) self.__persisted_objects.append((_type, _object)) @@ -102,7 +102,7 @@ class UnitexProcessor(object): if self.__options["resources"]["dictionaries"] is not None: _objects = [] - _type = UnitexConstants.DICTIONARY + _type = UnitexConstants.RESOURCE_DICTIONARY for dictionary in self.__options["resources"]["dictionaries"]: _object = load_persistent_dictionary(dictionary) @@ -116,11 +116,11 @@ class UnitexProcessor(object): return for _type, _object in self.__persisted_objects: - if _type == UnitexConstants.GRAMMAR: + if _type == UnitexConstants.RESOURCE_GRAMMAR: free_persistent_fst2(_object) - elif _type == UnitexConstants.DICTIONARY: + elif _type == UnitexConstants.RESOURCE_DICTIONARY: free_persistent_dictionary(_object) - elif _type == 
UnitexConstants.ALPHABET: + elif _type == UnitexConstants.RESOURCE_ALPHABET: free_persistent_alphabet(_object) def _clean(self): @@ -305,20 +305,23 @@ class UnitexProcessor(object): non-ambiguous forms, tokenization and application of dictionaries. - Arguments: - path [str] -- the input corpus file path. + *Arguments:* - mode [str] -- this parameter (de)activates all the - pre-processing operations. Possible values are: 's' for - sentence segmentation, 'r' to apply Replace.fst2, 't' - to tokenize and 'l' to lexicalize (apply the - dictionaries). For instance, if you want to segment, - tokenize and lexicalize, the mode will be 'stl'. + - **path [str]** -- the input corpus file path. - tagged [bool] -- this parameter specifies if the input text - is tagged or not. Tf True, this parameter deactivate two - preprocessing options: sentence segmentation and - Replace.fst2 application. + - **mode [str]** -- this parameter (de)activates all the + pre-processing operations. Possible values are: **'s'** for + sentence segmentation, **'r'** to apply Replace.fst2, **'t'** + to tokenize and **'l'** to lexicalize (apply the + dictionaries). For instance, if you want to segment, tokenize + and lexicalize, the mode will be 'stl'. + + - **tagged [bool]** -- this parameter specifies if the input text + is tagged or not. Tf True, this parameter deactivate two + preprocessing options: sentence segmentation and Replace.fst2 + application. + + *No return.* """ directory, filename = os.path.split(path) name, extension = os.path.splitext(filename) @@ -357,17 +360,20 @@ class UnitexProcessor(object): Unitex processor such as the working directory (*_snt) and the normalized text file (*.snt). - Arguments: - clean [bool] -- if set to False, all the files created by - the Unitex processor will be kept on the disk or the - virtual filesystem if the virtualization is activated. - This option must be activated for debugging purposes - only (default: True). 
- - free [bool] -- if persistence is activated, by setting this - option to True, all the persisted resources will be - freed from memory. You should use this option when all - your corpus are processed (default: False). + *Arguments:* + + - **clean [bool]** -- if set to False, all the files created by + the Unitex processor will be kept on the disk or the virtual + filesystem if the virtualization is activated. This option + must be activated for debugging purposes only. + (default: **True**) + + - **free [bool]** -- if persistence is activated, by setting this + option to True, all the persisted resources will be freed + from memory. You should use this option when all your corpus + are processed. (default: **False**) + + *No return.* """ if clean is True: self._clean() @@ -383,18 +389,22 @@ class UnitexProcessor(object): """ This function tags the current opened corpus. - Arguments: - grammar [str] -- fst2 transducer used to tag the corpus. + *Arguments:* + + - **grammar [str]** -- fst2 transducer used to tag the corpus. + + - **output [str]** -- the output file path. + + *Keyword arguments:* - output [str] -- the output file path. + - **xml [bool]** -- if set to True, the resulting file will + contain the XML headers. - Keyword arguments: - xml [bool] -- if set to True, the resulting file will - contain the XML headers. 
+ - **match_mode [str]** -- Possible values are: + - UnitexConstants.MATCH_MODE_SHORTEST + - UnitexConstants.MATCH_MODE_LONGEST (default) - match_mode [str] -- Possible values are: - - UnitexConstants.MATCH_MODE_SHORTEST - - UnitexConstants.MATCH_MODE_LONGEST (default) + *No return.* """ xml = kwargs.get("xml", False) match_mode = kwargs.get("match_mode", UnitexConstants.MATCH_MODE_LONGEST) diff --git a/unitex/resources.py b/unitex/resources.py index 8901930d0f6e4ec775ff67b214dc700a3763c64d..52160474bca5e3fed53b17f3d5a4599395cb5fea 100644 --- a/unitex/resources.py +++ b/unitex/resources.py @@ -15,15 +15,16 @@ def load_persistent_dictionary(path): """ This function loads a dictionary in persistent space. - Argument: - path [str] -- the exisent file path in filespace (hard disk or - virtual file system). + *Argument:* - Return [str]: - The persistent file path [str] (derived from filename but not - strictly identical, depending of implementation). This path must - be used by the unitex tools and the 'free_persistent_dictionary' - function. + - **path [str]** -- the exisent file path in filespace (hard disk or + virtual file system). + + *Return [str]:* + + The persistent file path (derived from filename but not strictly + identical, depending of implementation). This path must be used + the unitex tools and the 'free_persistent_dictionary' function. """ _LOGGER.info("Load persistent dictionary '%s'..." % path) return _unitex.unitex_load_persistent_dictionary(path) @@ -33,11 +34,13 @@ def is_persistent_dictionary(path): This function checks if a dictionary path points to the persistent space. - Argument: - path [str] -- the file path to check. + *Argument:* + + - **path [str]** -- the file path to check. - Return [bool]: - True if the dictionary is persistent, False otherwise. + *Return [bool]:* + + **True** if the dictionary is persistent, **False** otherwise. 
""" return _unitex.unitex_is_persistent_dictionary(path) @@ -45,9 +48,12 @@ def free_persistent_dictionary(path): """ This function unloads a dictionary from persistent space. - Argument: - path [str] -- the persistent file path returned by the - 'load_persistent_dictionary' function. + *Argument:* + + - **path [str]** -- the persistent file path returned by the + 'load_persistent_dictionary' function. + + *No return.* """ _LOGGER.info("Free persistent dictionary '%s'..." % path) _unitex.unitex_free_persistent_dictionary(path) @@ -58,15 +64,16 @@ def load_persistent_fst2(path): """ This function loads a fst2 in persistent space. - Argument: - path [str] -- the exisent file path in filespace (hard disk or - virtual file system). + *Argument:* + + - **path [str]** -- the exisent file path in filespace (hard disk or + virtual file system). - Return [str]: - The persistent file path [str] (derived from filename but not - strictly identical, depending of implementation). This path must - be used by the unitex tools and the 'free_persistent_fst2' - function. + *Return [str]:* + + The persistent file path (derived from filename but not strictly + identical, depending of implementation). This path must be used + the unitex tools and the 'free_persistent_fst2' function. """ _LOGGER.info("Load persistent fst2 '%s'..." % path) return _unitex.unitex_load_persistent_fst2(path) @@ -75,11 +82,13 @@ def is_persistent_fst2(path): """ This function checks if a fst2 path points to the persistent space. - Argument: - path [str] -- the file path to check. + *Argument:* + + - **path [str]** -- the file path to check. - Return [bool]: - True if the fst2 is persistent, False otherwise. + *Return [bool]:* + + **True** if the dictionary is persistent, **False** otherwise. """ return _unitex.unitex_is_persistent_fst2(path) @@ -87,9 +96,12 @@ def free_persistent_fst2(path): """ This function unloads a fst2 from persistent space. 
- Argument: - path [str] -- the persistent file path returned by the - 'load_persistent_fst2' function. + *Argument:* + + - **path [str]** -- the persistent file path returned by the + 'load_persistent_fst2' function. + + *No return.* """ _LOGGER.info("Free persistent fst2 '%s'..." % path) _unitex.unitex_free_persistent_fst2(path) @@ -100,15 +112,16 @@ def load_persistent_alphabet(path): """ This function loads a alphabet in persistent space. - Argument: - path [str] -- the exisent file path in filespace (hard disk or - virtual file system). + *Argument:* + + - **path [str]** -- the exisent file path in filespace (hard disk or + virtual file system). + + *Return [str]:* - Return [str]: - The persistent file path [str] (derived from filename but not - strictly identical, depending of implementation). This path must - be used by the unitex tools and the 'free_persistent_alphabet' - function. + The persistent file path (derived from filename but not strictly + identical, depending of implementation). This path must be used + the unitex tools and the 'free_persistent_alphabet' function. """ _LOGGER.info("Load persistent alphabet '%s'..." % path) return _unitex.unitex_load_persistent_alphabet(path) @@ -118,11 +131,13 @@ def is_persistent_alphabet(path): This function checks if a alphabet path points to the persistent space. - Argument: - path [str] -- the file path to check. + *Argument:* - Return [bool]: - True if the alphabet is persistent, False otherwise. + - **path [str]** -- the file path to check. + + *Return [bool]:* + + **True** if the dictionary is persistent, **False** otherwise. """ return _unitex.unitex_is_persistent_alphabet(path) @@ -130,9 +145,12 @@ def free_persistent_alphabet(path): """ This function unloads a alphabet from persistent space. - Argument: - path [str] -- the persistent file path returned by the - 'load_persistent_alphabet' function. 
+ *Argument:* + + - **path [str]** -- the persistent file path returned by the + 'load_persistent_alphabet' function. + + *No return.* """ _LOGGER.info("Free persistent alphabet '%s'..." % path) _unitex.unitex_free_persistent_alphabet(path) diff --git a/unitex/tools.py b/unitex/tools.py index 6b7862c0e4e89ae7bdec042b4d1e7ffba2f85700..a2abe5795fecc12303bee2202919d964ae342e1f 100644 --- a/unitex/tools.py +++ b/unitex/tools.py @@ -32,24 +32,27 @@ def check_dic(dictionary, dtype, alphabet, **kwargs): CHECK_DIC.TXT that contains check result informations. This file is stored in the <dela> directory. - Arguments: - dictionary [str] -- the dictionary file path. + *Arguments:* - dtype [str] -- the dictionary type: - - UnitexConstants.DELAF (inflected); - - UnitexConstants.DELAS (non inflected). + - **dictionary [str]** -- the dictionary file path. - alphabet [str] -- the alphabet file path. + - **dtype [str]** -- the dictionary type: + - UnitexConstants.DELAF (inflected); + - UnitexConstants.DELAS (non inflected). - Keyword arguments: - strict [bool] -- strict syntax checking against unprotected dot - and comma (default: False). + - **alphabet [str]** -- the alphabet file path. + + *Keyword arguments:* + + - **strict [bool]** -- strict syntax checking against unprotected + dot and comma (default: False). + + - **no_space_warning [bool]** -- tolerates spaces in grammatical, + semantic and inflectional codes (default: True). - no_space_warning [bool] -- tolerates spaces in grammatical, - semantic and inflectional codes (default: True). + *Return [bool]:* - Return [bool]: - True if it succeeds and False otherwise. + **True** if it succeeds, **False** otherwise. """ options = CheckDicOptions() options.load(kwargs) @@ -89,37 +92,42 @@ def compress(dictionary, **kwargs): This function takes a DELAF dictionary as a parameter and compresses it. 
The compression of a dictionary dico.dic produces two files: - - dico.bin: a binary file containing the minimum automaton of - the inflected forms of the dictionary; + - dico.bin: a binary file containing the minimum automaton of + the inflected forms of the dictionary; - - dico.inf: a text file containing the compressed forms required - for the reconstruction of the dictionary lines from the - inflected forms contained in the automaton. + - dico.inf: a text file containing the compressed forms required + for the reconstruction of the dictionary lines from the inflected + forms contained in the automaton. - Arguments: - dictionary [str] -- the dictionary file path. + *Arguments:* - Keyword arguments: - output [str] -- sets the output file. By default, a file xxx.dic - will produce a file xxx.bin. - - flip [bool] -- indicates that the inflected and canonical forms - should be swapped in the compressed dictionary. This option - is used to construct an inverse dictionary which is - necessary for the program 'Reconstrucao' (default: False). - - semitic [bool] -- indicates that the semitic compression - algorithm should be used. Setting this option with semitic - languages like Arabic significantly reduces the size of the - output dictionary (default: False). - - version [str] -- 'v1': produces an old style .bin file; - 'v2': produces a new style .bin file, with no - file size limitation to 16 Mb and a - smaller size (default). - - Return [bool]: - True if it succeeds and False otherwise. + - **dictionary [str]** -- the dictionary file path. + + *Keyword arguments:* + + - **output [str]** -- sets the output file. By default, a file + xxx.dic will produce a file xxx.bin. + + - **flip [bool]** -- indicates that the inflected and canonical + forms should be swapped in the compressed dictionary. This option + is used to construct an inverse dictionary which is necessary for + the program 'Reconstrucao' (default: False). 
+ + - **semitic [bool]** -- indicates that the semitic compression + algorithm should be used. Setting this option with semitic + languages like Arabic significantly reduces the size of the output + dictionary (default: False). + + - **version [str]** -- Possible values are: + - UnitexConstants.DICTIONARY_VERSION_1: produces an old style .bin + UnitexConstants.DICTIONARY_VERfile; + - UnitexConstants.DICTIONARY_VERSION_2: produces a new style .bin + file, with no file size limitation to 16 Mb and a smaller size + (default). + + *Return [bool]:* + + **True** if it succeeds, **False** otherwise. """ options = CompressOptions() options.load(kwargs) @@ -174,154 +182,153 @@ def concord(index, alphabet, **kwargs): occurrence in characters in the file text_name.snt. Z represents the number of the sentence in which the occurrence was found. - Arguments: - index [str] -- the index file path (produced by the 'locate' - function). - - alphabet [str] -- alphabet file used for sorting. - - Keyword arguments: - - - Generic options: - font [str] -- the name of the font to use if the output is - an HTML file. - - fontsize [int] -- the font size to use if the output is an - HTML file. - - only_ambiguous [bool] -- Only displays identical occurrences - with ambiguous outputs, in text order (default: False). - - only_matches [bool] -- this option will force empty right - and left contexts. Moreover, if used with - UnitexConstants.FORMAT_TEXT, the function will not - surround matches with tabulations (default: False). - - left [str] -- number of characters on the left of the - occurrences (default=0). In Thai mode, this means the - number of non-diacritic characters. - - right [str] -- number of characters (non-diacritic ones in - Thai mode) on the right of the occurrences (default=0). - If the occurrence is shorter than this value, the - concordance line is completed up to right. If the - occurrence is longer than the length defined by right, - it is nevertheless saved as whole. 
- - NOTE: For both --left and --right, you can add the 's' - character to stop at the first {S} tag. For instance, if you - set '40s' for the left value, the left context will end at - 40 characters at most, less if the {S} tag is found before. - - - Sort options: - sort [str] -- specifies the sort order. Possible values: - - - 'UnitexConstants.SORT_TEXT_ORDER': order in which the - occurrences appear in the text (default); - - - 'UnitexConstants.SORT_LEFT_CENTER': left context for - primary sort, then occurrence for secondary sort; - - - 'UnitexConstants.SORT_LEFT_RIGHT': left context, then - right context; - - - 'UnitexConstants.SORT_CENTER_LEFT': occurrence, then - left context; - - - 'UnitexConstants.SORT_CENTER_RIGHT': occurrence, then - right context; - - - 'UnitexConstants.SORT_RIGHT_LEFT': right context, then - left context; - - - 'UnitexConstants.SORT_RIGHT_CENTER': left context, - then occurrence. - - - Output options: - format [str] -- specifies the output fomat. Possible values: - - - UnitexConstants.FORMAT_HTML: produces a concordance in - HTML format encoded in UTF-8 (default); - - - UnitexConstants.FORMAT_TEXT: produces a concordance in - Unicode text format; - - - UnitexConstants.FORMAT_GLOSSANET: produces a - concordance for GlossaNet in HTML format where - occurrences are links described by the 'script' - argument (cf. Unitex manual p. 268). The HTML file - is encoded in UTF-8; - - - UnitexConstants.FORMAT_SCRIPT: produces a HTML - concordance file where occurrences are links - described by the 'script' argument; - - - UnitexConstants.FORMAT_INDEX: produces an index of the - concordance, made of the content of the occurrences - (with the grammar outputs, if any), preceded by the - positions of the occurrences in the text file given - in characters; - - - UnitexConstants.FORMAT_UIMA: produces an index of the - concordance relative to the original text file, - before any Unitex operation. 
The 'offsets' argument - must be provided; - - - UnitexConstants.FORMAT_PRLG: produces a concordance - for PRLG corpora where each line is prefixed by - information extracted with Unxmlize’s 'prlg' option. - You must provide both the 'offsets' and the - 'unxmlize' argument; - - - UnitexConstants.FORMAT_XML: produces an xml index of - the concordance; - - - UnitexConstants.FORMAT_XML_WITH_HEADER: produces an - xml index of the concordance with full xml header; - - - UnitexConstants.FORMAT_AXIS: quite the same as - 'index', but the numbers represent the median - character of each occurrence; - - - UnitexConstants.FORMAT_XALIGN: another index file, - used by the text alignment module. Each line is made - of 3 integers X Y Z followed by the content of the - occurrence. X is the sentence number, starting from - 1. Y and Z are the starting and ending positions of - the occurrence in the sentence, given in characters; - - - UnitexConstants.FORMAT_MERGE: indicates to the - function that it is supposed to produce a modified - version of the text and save it in a file. - The filename must be provided with the 'output' - argument. - - script [str] -- string describing the links format for - 'glossanet' and 'script' output. For instance, if you - use 'http://www.google.com/search?q=', you will obtain a - HTML concordance file where occurrences are hyperlinks - to Google queries. - - offsets [str] -- the file produced by Tokenize’s - output_offsets option (needed by the 'uima' and the - 'prlg' format). - - unxmlize [str] -- file produced by Unxmlize’s 'prlg' option - (needed by the 'prlg' format). - - output [str] -- the output filename (needed by the 'merge' - format). - - - Other options: - directory [str] -- indicates to the function that it must - not work in the same directory than <index> but in - 'directory'. - - thai [bool] -- option to use for Thai concordances - (default: False). - - Return [bool]: - True if it succeeds and False otherwise. 
+ *Arguments:* + + - **index [str]** -- the index file path (produced by the 'locate' + function). + + - **alphabet [str]** -- alphabet file used for sorting. + + *Keyword arguments:* + + - *Generic options:* + + - **font [str]** -- the name of the font to use if the output is + an HTML file. + + - **fontsize [int]** -- the font size to use if the output is an + HTML file. + + - **only_ambiguous [bool]** -- Only displays identical occurrences + with ambiguous outputs, in text order (default: False). + + - **only_matches [bool]** -- this option will force empty right + and left contexts. Moreover, if used with + UnitexConstants.FORMAT_TEXT, the function will not surround + matches with tabulations (default: False). + + - **left [str]** -- number of characters on the left of the + occurrences (default=0). In Thai mode, this means the number of + non-diacritic characters. + + - **right [str]** -- number of characters (non-diacritic ones in + Thai mode) on the right of the occurrences (default=0). If the + occurrence is shorter than this value, the concordance line is + completed up to right. If the occurrence is longer than the + length defined by right, it is nevertheless saved as whole. + + **NOTE:** For both 'left' and 'right', you can add the 's' + character to stop at the first {S} tag. For instance, if you set + '40s' for the left value, the left context will end at 40 + characters at most, less if the {S} tag is found before. + + - *Sort options:* + + - **sort [str]** -- specifies the sort order. 
Possible values: + + - UnitexConstants.SORT_TEXT_ORDER: order in which the + occurrences appear in the text (default); + + - UnitexConstants.SORT_LEFT_CENTER: left context for primary + sort, then occurrence for secondary sort; + + - UnitexConstants.SORT_LEFT_RIGHT: left context, then right + context; + + - UnitexConstants.SORT_CENTER_LEFT: occurrence, then left + context; + + - UnitexConstants.SORT_CENTER_RIGHT: occurrence, then right + context; + + - UnitexConstants.SORT_RIGHT_LEFT: right context, then left + context; + + - UnitexConstants.SORT_RIGHT_CENTER: right context, then + occurrence. + + - *Output options:* + + - **format [str]** -- specifies the output format. Possible values: + + - UnitexConstants.FORMAT_HTML: produces a concordance in HTML + format encoded in UTF-8 (default); + + - UnitexConstants.FORMAT_TEXT: produces a concordance in Unicode + text format; + + - UnitexConstants.FORMAT_GLOSSANET: produces a concordance for + GlossaNet in HTML format where occurrences are links described + by the 'script' argument (cf. Unitex manual p. 268). The HTML + file is encoded in UTF-8; + + - UnitexConstants.FORMAT_SCRIPT: produces a HTML concordance + file where occurrences are links described by the 'script' + argument; + + - UnitexConstants.FORMAT_INDEX: produces an index of the + concordance, made of the content of the occurrences (with the + grammar outputs, if any), preceded by the positions of the + occurrences in the text file given in characters; + + - UnitexConstants.FORMAT_UIMA: produces an index of the + concordance relative to the original text file, before any + Unitex operation. The 'offsets' argument must be provided; + + - UnitexConstants.FORMAT_PRLG: produces a concordance for PRLG + corpora where each line is prefixed by information extracted + with Unxmlize’s 'prlg' option. 
You must provide both the + 'offsets' and the 'unxmlize' argument; + + - UnitexConstants.FORMAT_XML: produces an xml index of the + concordance; + + - UnitexConstants.FORMAT_XML_WITH_HEADER: produces an xml index + of the concordance with full xml header; + + - UnitexConstants.FORMAT_AXIS: quite the same as 'index', but + the numbers represent the median character of each occurrence; + + - UnitexConstants.FORMAT_XALIGN: another index file, used by the + text alignment module. Each line is made of 3 integers X Y Z + followed by the content of the occurrence. X is the sentence + number, starting from 1. Y and Z are the starting and ending + positions of the occurrence in the sentence, given in + characters; + + - UnitexConstants.FORMAT_MERGE: indicates to the function that + it is supposed to produce a modified version of the text and + save it in a file. The filename must be provided with the + 'output' argument. + + - **script [str]** -- string describing the links format for + 'glossanet' and 'script' output. For instance, if you use + 'http://www.google.com/search?q=', you will obtain a HTML + concordance file where occurrences are hyperlinks to Google + queries. + + - **offsets [str]** -- the file produced by Tokenize’s + output_offsets option (needed by the 'uima' and the 'prlg' + format). + + - **unxmlize [str]** -- file produced by Unxmlize’s 'prlg' option + (needed by the 'prlg' format). + + - **output [str]** -- the output filename (needed by the 'merge' + format). + + - *Other options:* + + - **directory [str]** -- indicates to the function that it must + not work in the same directory than <index> but in + 'directory'. + + - **thai [bool]** -- option to use for Thai concordances + (default: False). + + *Return [bool]:* + + **True** if it succeeds, **False** otherwise. 
""" options = ConcordOptions() options.load(kwargs) @@ -414,48 +421,50 @@ def dico(dictionaries, text, alphabet, **kwargs): The function 'dico' produces the following files, and saves them in the directory of the text: - - dlf: dictionary of simple words in the text; - - dlc: dictionary of compound words in the text; - - err: list of unknown words in the text; - - tags_err: unrecognized simple words that are not matched by - the tags.ind file; - - tags.ind: sequences to be inserted in the text automaton (see - section 3.8.3, page 69); - - stat_dic.n: file containing the number of simple words, the - number of compound words, and the number of - unknown words in the text. - - NOTE: Files dlf, dlc, err and tags_err are not sorted. Use the + - dlf: dictionary of simple words in the text; + - dlc: dictionary of compound words in the text; + - err: list of unknown words in the text; + - tags_err: unrecognized simple words that are not matched by the + tags.ind file; + - tags.ind: sequences to be inserted in the text automaton (see + section 3.8.3, page 69); + - stat_dic.n: file containing the number of simple words, the number + of compound words, and the number of unknown words in the text. + + **NOTE:** Files dlf, dlc, err and tags_err are not sorted. Use the function 'sort_txt' to sort them. - Arguments: - dictionaries [list(str)] -- list of dictionary pathes ('bin' or - 'fst2' formats). - - text [str] -- text (snt format) file path. - - alphabet [str] -- alphabet file path. - - Keyword arguments: - morpho [list(str)] -- this optional argument indicates which - morphological mode dictionaries are to be used, if needed by - some .fst2 dictionaries. The argument is a list of - dictionary path (bin format). - - korean [bool] -- specify the dictionary is in korean - (default: False). - - semitic [bool] -- specify the dictionary is in a semitic - language (default: False). - - arabic_rules [str] -- specifies the Arabic typographic rule - configuration file path. 
- - raw [str] -- alternative output file path containing both simple - and compound words, without requiring a text directory. - - Return [bool]: - True if it succeeds and False otherwise. + *Arguments:* + + - **dictionaries [list(str)]** -- list of dictionary paths ('bin' + or 'fst2' formats). + + - **text [str]** -- text (snt format) file path. + + - **alphabet [str]** -- alphabet file path. + + *Keyword arguments:* + + - **morpho [list(str)]** -- this optional argument indicates which + morphological mode dictionaries are to be used, if needed by some + .fst2 dictionaries. The argument is a list of dictionary paths (bin + format). + + - **korean [bool]** -- specify the dictionary is in Korean + (default: False). + + - **semitic [bool]** -- specify the dictionary is in a semitic + language (default: False). + + - **arabic_rules [str]** -- specifies the Arabic typographic rule + configuration file path. + + - **raw [str]** -- alternative output file path containing both + simple and compound words, without requiring a text directory. + + *Return [bool]:* + + **True** if it succeeds, **False** otherwise. """ options = DicoOptions() options.load(kwargs) @@ -504,20 +513,23 @@ def extract(text, output, index, **kwargs): <text> represents the complete path of the text file, without omitting the extension .snt. - Arguments: - text [str] -- the text file (.snt format). + *Arguments:* - output [str] -- the output text file. + - **text [str]** -- the text file (.snt format). + + - **output [str]** -- the output text file. + + - **index [str]** -- the index file path (produced by the 'locate' + function). - index [str] -- the index file path (produced by the 'locate' - function). + *Keyword arguments:* - Keyword arguments: - non_matching_sentences [bool] -- extracts all sentences that - don’t contain matching units (default: False). + - **non_matching_sentences [bool]** -- extracts all sentences that + don’t contain matching units (default: False). 
+ + *Return [bool]:* - Return [bool]: - True if it succeeds and False otherwise. + **True** if it succeeds, **False** otherwise. """ options = ExtractOptions() options.load(kwargs) @@ -556,30 +568,34 @@ def fst2txt(grammar, text, alphabet, **kwargs): at the preprocessing stage, when the text has not been cut into lexical units yet. This function modifies the input text file. - NOTE: This function modifies the input text file. - - Arguments: - grammar [str] -- the fst2 to apply on the text. + **NOTE:** This function modifies the input text file. - text [str] -- the (.snt) text file to be modified. + *Arguments:* - alphabet [str] -- the alphabet file of the language of the text. + - **grammar [str]** -- the fst2 to apply on the text. + + - **text [str]** -- the (.snt) text file to be modified. + + - **alphabet [str]** -- the alphabet file of the language of the + text. - Keyword arguments: - start_on_space [bool] -- this parameter indicates that the - search will start at any position in the text, even before a - space. This parameter should only be used to carry out - morphological searches (default: False). + *Keyword arguments:* - char_by_char [bool] -- works in character by character - tokenization mode. This is useful for languages like Thai - (default: False). + - **start_on_space [bool]** -- this parameter indicates that the + search will start at any position in the text, even before a + space. This parameter should only be used to carry out + morphological searches (default: False). + + - **char_by_char [bool]** -- works in character by character + tokenization mode. This is useful for languages like Thai + (default: False). + + - **merge [bool]** -- merge (instead of replace) transducer outputs + with text inputs (default: True). - merge [bool] -- merge (instead of replace) transducer outputs - with text inputs (default: True). + *Return [bool]:* - Return [bool]: - True if it succeeds and False otherwise. + **True** if it succeeds, **False** otherwise. 
""" options = Fst2TxtOptions() options.load(kwargs) @@ -634,50 +650,50 @@ def grf2fst2(grammar, alphabet, **kwargs): function as a parameter, but with extension .fst2. This file is saved in the same directory as <grf>. - Arguments: - grammar [str] -- the grf to compile. - - alphabet [str] -- specifies the alphabet file to be used for - tokenizing the content of the grammar boxes into lexical - units. - - Keyword arguments: - loop_check [bool] -- enables error (loop) checking - (default: False). - - char_by_char [bool] -- tokenization will be done character by - character. If neither -c nor -a option is used, lexical - units will be sequences of any Unicode letters - (default: False). - - pkgdir [str] -- specifies the repository directory to use (see - section 5.2.2, page 99). - - no_empty_graph_warning [bool] -- no warning will be emitted when - a graph matches the empty word. This option is used by - MultiFlex in order not to scare users with meaningless error - messages when they design an inflection grammar that matches - the empty word (default: False). - - tfst_check [bool] -- checks wether the given graph can be - considered as a valid sentence automaton or not - (default: False). - - silent_grf_name [bool] -- does not print the graph names - (default: True). - - named_repositories [list(str)] -- declaration of named - repositories. This argument is made of one or more X=Y - sequences, separated by ‘;’, where X is the name of the - repository denoted by pathname Y. - - debug [bool] -- compile graphs in debug mode (default: False). - - check_variables [bool] -- check output validity to avoid - malformed variable expressions (default: True). - - Return [bool]: - True if it succeeds and False otherwise. + *Arguments:* + + - **grammar [str]** -- the grf to compile. + + - **alphabet [str]** -- specifies the alphabet file to be used for + tokenizing the content of the grammar boxes into lexical units. 
+ + *Keyword arguments:* + + - **loop_check [bool]** -- enables error (loop) checking + (default: False). + + - **char_by_char [bool]** -- tokenization will be done character by + character. If neither -c nor -a option is used, lexical units will + be sequences of any Unicode letters (default: False). + + - **pkgdir [str]** -- specifies the repository directory to use (see + section 5.2.2, page 99). + + - **no_empty_graph_warning [bool]** -- no warning will be emitted + when a graph matches the empty word. This option is used by + MultiFlex in order not to scare users with meaningless error + messages when they design an inflection grammar that matches the + empty word (default: False). + + - **tfst_check [bool]** -- checks whether the given graph can be + considered as a valid sentence automaton or not (default: False). + + - **silent_grf_name [bool]** -- does not print the graph names + (default: True). + + - **named_repositories [list(str)]** -- declaration of named + repositories. This argument is made of one or more X=Y sequences, + separated by ‘;’, where X is the name of the repository denoted by + pathname Y. + + - **debug [bool]** -- compile graphs in debug mode (default: False). + + - **check_variables [bool]** -- check output validity to avoid + malformed variable expressions (default: True). + + *Return [bool]:* + + **True** if it succeeds, **False** otherwise. """ options = Grf2Fst2Options() options.load(kwargs) @@ -818,8 +834,9 @@ def locate(grammar, text, alphabet, **kwargs): - UnitexConstants.ON_ERROR_IGNORE (default) - UnitexConstants.ON_ERROR_BACKTRACK - Return [bool]: - True if it succeeds and False otherwise. + *Return [bool]:* + + **True** if it succeeds, **False** otherwise. """ options = LocateOptions() options.load(kwargs) @@ -953,8 +970,9 @@ def normalize(text, **kwargs): rules specified with the 'replacement_rules' option (default: False). - Return [bool]: - True if it succeeds and False otherwise. 
+ *Return [bool]:* + + **True** if it succeeds, **False** otherwise. """ options = NormalizeOptions() options.load(kwargs) @@ -1022,8 +1040,9 @@ def sort_txt(text, **kwargs): and X,Y.Z:B become a single entry X,Y.Z:A:B (default: False). - Return [bool]: - True if it succeeds and False otherwise. + *Return [bool]:* + + **True** if it succeeds, **False** otherwise. """ options = SortTxtOptions() options.load(kwargs) @@ -1118,8 +1137,9 @@ def tokenize(text, alphabet, **kwargs): output_offsets [str] -- offset file to be produced. - Return [bool]: - True if it succeeds and False otherwise. + *Return [bool]:* + + **True** if it succeeds, **False** otherwise. """ options = TokenizeOptions() options.load(kwargs) @@ -1190,8 +1210,9 @@ def txt2tfst(text, alphabet, **kwargs): korean [bool] -- tells the function that it works on Korean (default: False). - Return [bool]: - True if it succeeds and False otherwise. + *Return [bool]:* + + **True** if it succeeds, **False** otherwise. """ options = Txt2TFstOptions() options.load(kwargs)