From b299098c66f8ede8b61c4dda4bd7b7309a636f8d Mon Sep 17 00:00:00 2001
From: Patrick Watrin <pat@lucy.local>
Date: Sun, 21 Feb 2016 19:10:15 +0100
Subject: [PATCH] first attempt for processor high-level class

---
 config/unitex.yaml         |  18 +--
 tests/01_test_tools.py     |   6 +-
 tests/02_test_resources.py |   8 +-
 tests/04_test_processor.py | 115 ++++++++++++++++
 tests/data/Replace.fst2    | Bin 0 -> 1782 bytes
 tests/data/Sentence.fst2   | Bin 19797 -> 18740 bytes
 tests/data/unitex.yaml     | 119 ++++++++++++++++
 unitex/config.py           |  55 +++-----
 unitex/processor.py        | 269 ++++++++++++++++++++++++++++++++++++-
 9 files changed, 531 insertions(+), 59 deletions(-)
 create mode 100644 tests/04_test_processor.py
 create mode 100644 tests/data/Replace.fst2
 create mode 100644 tests/data/unitex.yaml

diff --git a/config/unitex.yaml b/config/unitex.yaml
index 209a36e..bf58486 100644
--- a/config/unitex.yaml
+++ b/config/unitex.yaml
@@ -3,30 +3,26 @@ global:
     verbose: 2
     log: null
 
-    tempdir: "/tmp"
-
     persistence: True
     virtualization: True
 
 resources:
     language: "fr"
 
-    alphabet: "/home/resources/media/fr/unitex/preprocessing/Alphabet.txt"
-    alphabet-sorted: "/home/resources/media/fr/unitex/preprocessing/Alphabet_sort.txt"
-    sentence: "/home/resources/media/fr/unitex/preprocessing/sentence/Sentence.fst2"
-    replace: "/home/resources/media/fr/unitex/preprocessing/replace/Replace.fst2"
+    alphabet: "/full/path/to/Alphabet.txt"
+    alphabet-sorted: "/full/path/to/Alphabet_sort.txt"
+    sentence: "/full/path/to/Sentence.fst2"
+    replace: "/full/path/to/Replace.fst2"
 
     dictionaries:
-        - "/home/resources/media/fr/unitex/dictionary/delaf-short.bin"
-        - "/home/resources/media/fr/unitex/dictionary/delacf-light.bin"
-        - "/home/resources/media/fr/unitex/dictionary/toponyms.bin"
+        - "/full/path/to/dictionary.bin"
 
-# The 'options' section can contain any of the argument used by the unitex tools
+# The 'tools' section can contain any of the arguments used by the unitex tools
 # functions. Note that, if you use the 'Processor' high-level class some argument
 # could be overriden to fit the 'tag', 'extract' and 'search' functions
 # behaviour. For intance, there is no point to define a font or a context for
 # 'concord'.
-options:
+tools:
     check_dic:
         strict: False
         no_space_warning: False
diff --git a/tests/01_test_tools.py b/tests/01_test_tools.py
index 0dc2371..29e6115 100644
--- a/tests/01_test_tools.py
+++ b/tests/01_test_tools.py
@@ -22,7 +22,7 @@ class Arguments:
         self.__arguments["inf"] = "data/dictionary.inf" 
 
         self.__arguments["alphabet"] = "data/Alphabet.txt" 
-        self.__arguments["alphabet_sort"] = "data/Alphabet_sort.txt" 
+        self.__arguments["alphabet-sorted"] = "data/Alphabet_sort.txt" 
 
         self.__arguments["sentence"] = "data/Sentence.fst2" 
 
@@ -215,7 +215,7 @@ class TestUnitexTools(unittest.TestCase):
         kwargs = {}
         kwargs["duplicates"] = False
         kwargs["reverse"] = False
-        kwargs["sort_order"] = self._arguments["alphabet_sort"]
+        kwargs["sort_order"] = self._arguments["alphabet-sorted"]
         kwargs["line_info"] = self._arguments["stat_dic.n"]
         kwargs["thai"] = False
         kwargs["factorize_inflectional_codes"] = False
@@ -285,7 +285,7 @@ class TestUnitexTools(unittest.TestCase):
 
     def test_10_concord(self):
         index = self._arguments["ind"]
-        alphabet = self._arguments["alphabet"]
+        alphabet = self._arguments["alphabet-sorted"]
 
         kwargs = {}
         kwargs["font"] = None
diff --git a/tests/02_test_resources.py b/tests/02_test_resources.py
index e2eea95..48b7451 100644
--- a/tests/02_test_resources.py
+++ b/tests/02_test_resources.py
@@ -3,6 +3,7 @@
 
 import os, unittest
 
+from unitex import UnitexConstants
 from unitex.resources import *
 from unitex.tools import compress
 
@@ -48,14 +49,13 @@ class TestUnitexResources(unittest.TestCase):
             os.remove(self._arguments["inf"])
 
     def test_01_load_dictionary(self):
-        args = [self._arguments["dic"]]
-
         kwargs = {}
+        kwargs["output"] = None
         kwargs["flip"] = False
         kwargs["semitic"] = False
-        kwargs["version"] = "v2"
+        kwargs["version"] = UnitexConstants.DICTIONARY_VERSION_1
 
-        ret = compress(*args, **kwargs)
+        ret = compress(self._arguments["dic"], **kwargs)
 
         path = self._arguments["bin"]
 
diff --git a/tests/04_test_processor.py b/tests/04_test_processor.py
new file mode 100644
index 0000000..a66a2c1
--- /dev/null
+++ b/tests/04_test_processor.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os, shutil, unittest
+
+from unitex import UnitexConstants
+from unitex.tools import compress, grf2fst2
+from unitex.processor import UnitexProcessor
+
+
+
+class Arguments:
+
+    def __init__(self, language=None):
+        self.__arguments = {}
+
+        self.__arguments["config"] = "data/unitex.yaml"
+
+        self.__arguments["alphabet"] = "data/Alphabet.txt" 
+
+        self.__arguments["dic"] = "data/dictionary.dic"
+        self.__arguments["bin"] = "data/dictionary.bin"
+        self.__arguments["inf"] = "data/dictionary.inf"
+
+        self.__arguments["grf"] = "data/grammar.grf"
+        self.__arguments["fst2"] = "data/grammar.fst2"
+
+        self.__arguments["txt"] = "data/corpus.txt"
+        self.__arguments["tag"] = "data/corpus.tag"
+        self.__arguments["xml"] = "data/corpus.xml"
+
+    def __getitem__(self, key):
+        if key not in self.__arguments:
+            raise KeyError("Argument '%s' not found ..." % key)
+        return self.__arguments[key]
+
+
+
+class TestUnitexIO(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(self):
+        self._arguments = Arguments()
+
+        dictionary = self._arguments["dic"]
+
+        kwargs = {}
+        kwargs["output"] = None
+        kwargs["flip"] = False
+        kwargs["semitic"] = False
+        kwargs["version"] = UnitexConstants.DICTIONARY_VERSION_1
+
+        ret = compress(dictionary, **kwargs)
+
+        grammar = self._arguments["grf"]
+        alphabet = self._arguments["alphabet"]
+
+        kwargs = {}
+        kwargs["loop_check"] = False
+        kwargs["char_by_char"] = False
+        kwargs["pkgdir"] = None
+        kwargs["no_empty_graph_warning"] = False
+        kwargs["tfst_check"] = False
+        kwargs["silent_grf_name"] = False
+        kwargs["named_repositories"] = None
+        kwargs["debug"] = False
+        kwargs["check_variables"] = False
+
+        ret = grf2fst2(grammar, alphabet, **kwargs)
+
+    @classmethod
+    def tearDownClass(self):
+        if os.path.exists(self._arguments["bin"]):
+            os.remove(self._arguments["bin"])
+
+        if os.path.exists(self._arguments["inf"]):
+            os.remove(self._arguments["inf"])
+
+        if os.path.exists(self._arguments["fst2"]):
+            os.remove(self._arguments["fst2"])
+
+        if os.path.exists(self._arguments["tag"]):
+            os.remove(self._arguments["tag"])
+
+        if os.path.exists(self._arguments["xml"]):
+            os.remove(self._arguments["xml"])
+
+    def test_01_processor_txt(self):
+        processor = UnitexProcessor(self._arguments["config"])
+        processor.open(self._arguments["txt"], mode="srtlf", tagged=False)
+
+        kwargs = {}
+        kwargs["xml"] = False
+
+        ret = processor.tag(self._arguments["fst2"], self._arguments["tag"], **kwargs)
+
+        processor.close(clean=True, free=True)
+        self.assertTrue(ret, "Tagging process failed (txt format)!")
+
+    def test_02_processor_xml(self):
+        processor = UnitexProcessor(self._arguments["config"])
+        processor.open(self._arguments["txt"], mode="srtlf", tagged=False)
+
+        kwargs = {}
+        kwargs["xml"] = True
+
+        ret = processor.tag(self._arguments["fst2"], self._arguments["tag"], **kwargs)
+
+        processor.close(clean=True, free=True)
+        self.assertTrue(ret, "Tagging process failed (xml format)!")
+
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/data/Replace.fst2 b/tests/data/Replace.fst2
new file mode 100644
index 0000000000000000000000000000000000000000..af5e70b0bd1bd67bee35481ceaf7aacb8ea2c214
GIT binary patch
literal 1782
zcmb_d+iKfD5S_jj`VT3wNb?{R$%%Wx6iUcjLW=A6#?*FO2vt%`(9i9cBsphiXLl^S
zkcSeaojp63b7n`*<HLyz|0eQEUP>vo9LP*oawoU4kZ-Vve*juagE!Ah87J#BS!c<b
zM2nFfY_99LYL?IP8<{Uq<EL!!wDJq9RbmZR;KQ0$!_EzR&VIK(vugeGP`3jYD(!Z7
zP!YYUhwIz8zKiSoxYp>Vdeosm+9$r%G97hLq8M|lW!<Vnerh*ewXl+^XDwPhGfl%a
z_n4lS(DNP2I_X2VbAOw_mzA2p*J@h6oQEvu$iv&}60asOSAAq?@tl$-uvEo-^YNV3
zhk41-ls2KqL^Hx2dy^6NpHD{E74jgGSu^SrCb-2+@pfC-KinjbgZ_W<e?IGRQ2u{d
z7c#~7@G&SU@Ljavy^(v|UA47aMYV+-;l#(>!Ixi?#`s;1;aLkW_3^vxvw#mzR;XLa
zU(~yWZ3sJEV5)lgldz5R)B23nc^mpN*M5IJ(frFT4ok%BV?o^4t0a4R+;Vzv<FPNo
zGf|~W^sF23Hq3WKv7y3@dL|aX&DzjiPtEH2?&j^$0W$SD74a;2t3JPaE7FHwV$K}v
z9pE19m$?YGvhb%wH1^Aj3e%<kF$$Sscb#@6IDL(NQ@O&)JWcylUJsDN=vl^fgo!(L
X_M4Q6-%KSuA4QDc#Z9uwY#nw_pC8;i

literal 0
HcmV?d00001

diff --git a/tests/data/Sentence.fst2 b/tests/data/Sentence.fst2
index 6e955340e44645ff673f76071baaf4a3cf697f70..e3641eba7c57da4d4b01e991ab603c1617536210 100644
GIT binary patch
literal 18740
zcmc(nTW?)Q7035865pX*EI<$%-(tH&P$zBLlq3yF3Izm}<1~qj6Q}V>+7$7f_y`E`
zz(s;*1P`cB2}tDyAt4a3e)FIIUNifg*x@D=9nYFIYu0_%ti64lfBgOA(EnQ*wuX-n
z9~({#%foZS)#2^o?l3DqcjG53F9iO?uogcz3bZ~#rz*5uR+udZ_vZ#EHsiw{;MEEv
z0hn>6et`K{5?t6jokPDh!K07qSeNu3Gv&QpBttW6b{}V-ZC)Pzuv{!b191f6)9Nmr
zu(;O=zw{XR8#CpyJlM}u--th0?-d3P#^AGdzFfR*%*{%9qIl)ruru7R8E`ws!}2jF
zENi{FT;d^#=nrJtmEs*RD+Moz-Gayxpq2+Z35P|3%vV|q=e<~lZ%u<&p#aC|4>~#4
zX)jc}VeLWu#MW6@YweXs&tfZl|2MXx9~NQ-o@e~`EX6Oo5!u7wVGB=`oPghn@uZ$8
z@{mg{u$Na$%v)ceNuP5_a7M`^V0;q5I0o!(PqE^E+__~}o?F!;a7Z~(;tjM~bY8Ar
zLW)l7%4p$A(GpsnDIk0xI^o61I7!AQtp%<1%Z0o6rFZAGqs}H*e1oLaXr=g6ICPRW
zI&;WNjZ@oa(cN*uYqG|10Rv|w9i@4$rPkiJQv4@f`PuT4v^LtRGSFnyj*bpIR!Us0
z#XRO;=L)Z^1b9*BW3};vDvoqj?m@@NARCNlWWHLm0Nqwgj!8>=-<}+H!==q1;~b}1
ztfL<zT-s<&=c|R2@=7_9k%YECS3V!A!pK8&<&`sBXG>c4Cr)|Z(ybO-WPSQUb9kcx
zGilcDly2;CR24&Llo>5G`#sO7fmPJj8ZK!Ke#J(l0OLpT&pQXH)VXPl(wc<WtB>t+
zlvaxjtv#K})aa62iJkT&ZZVIv9IFn8z2Vld-`&s7B*k|=ACQ3V_IYcEssj{!Oks}X
ztm+exS62!xauSpGKyt+ME5*n7+O+142gKWzVpcHdVcuywyza!EJq!D94%?+yy<Kkc
zDPO!ud~M}g$yR1-MQ=e+3F4<9`42kZfvR%TEWsCOX(_Ge;B=g?p!ccQxs#XDBk~-4
zfRr~twpH}g-Y8eugC<~R%|2BEamUBU$#Lb7kI-9HFeM4c&%`|o=hg!6v_DmajI|DU
zOqti;x#*@-&JL_qRz0h2jzs1facIxkTf@ED9UnvvX1C}}Zu3JL+V?sOtm7fD<ee=O
z?a|p{8M5A-%%NSi^^9qYZUVa9y+-qmq6geIidKU7Daf_)RP9|jk@%C<Qzdr=5otkK
zAxIek9kvRm7=dFhoMo(!GS){K>!XbI*rOfm<y^6My?9fQWvq{Otd0EEMt*A}zqOH{
zeYjrqLBsX&+&|5i$h=sp(BW`4#ygQ22jN$qn6`(9G496t&hX9{V{%$LB7vFwBai3=
z`{&7`xjcEY=q}omMSH{Opr~#Yda`w^P!pl8LQA%66-x!dQxLk~(XC=N_^o0!+&4!#
zn}r72&7!X$B%}A{=%<a?t>|nN4WQU4Tm(6G8)NJ?#@O}#&w9m~*=&5aStE;N@7b*U
z&?|AuP%N3^u+zA$CIYHLcxKV6ar3<b{#X8Dq3OUXu02+^sVjkY*6mS670qV#CtQ@5
z!YebUK9cWSzich=71<?S@sD*~#1T40Ojew%UiM5G=o!AZ)(OMEtv*oay?u3f6tnAb
zX68E>xQn;yyYk!}X=?78hfK~B8UyQzN>Jj6JUH7u-OW0mmJ={JAzvtN(5Nqvo~$1y
z3(rH{<`)(^4zqw$FZCM8VIO;d7p)toT0#~WGu_!n0&QKpt_~04K6qH}gqNafU5L*t
z{_zf|uV46Ikw7nJ4RF?3V4XeT(OrA8+Cn4SM1x;UgJy=60isE{^1SA&W6j~@AXmKf
z{AHNf8CLAzsgB)v<Qhh<VdO|B^&W!t9B&x8hMfzX24@~Ha!mto*_LmOpaExm+M5-J
zh`PoC+s}@;^SRFzXV2nQohf!~&hhIpv!Fpzn@3o3soZ_nOTJ`3)Sp%MDnsQ1<Y6iN
zSQE7|^PLiMY~_5Xq}_Pi{8H9bV<aDy;dwi8?vSfUWS0b9QDSYQt_yY_L)FSiWxoVI
zPsFKORoHu~IT5ECx!E~ElXA0r0-tiTgTgD$5a5)XeH3^+A{(=p0<TE1mi?4#is*e?
zLF3y(aPmCa*m82@>3f%s!#4T8%Nrlat66!n?qnr!+^5kdC${5`xj$TvEZr+liqJU2
z@EdXBj4>L<Z-()kVfY!wM#JQZVf<zo?U}<LpBYB3{F{9nXWMdtEf?5wfh`wU`Kf5w
zlgb74bm#${?WAUa6JF<AIifC4IiijVnb4!B;;CpzZ}okWug*_+^?ruoXBd8hbv6{)
zJP#Qc<g(gh*xO>zyPfNP?J?-xW6+|<prbK<zMSVhkHLrDhqkKI#-y)|p;izF*Lk9*
zo--*n$1E{U6uKvjBP!0I7Keva`M}O%maALE_tu#;xDY?*0{VEJ^U>>TPO`N1tFlG6
zT~F0|7H-+?`23-7>sfSi&X||=bo|@<2iocRz8L3xe>hi8yLsS0<+CE7x0dRKSyLW+
zO<C%h^3*e(HP0B$#qajDCcE$_@kTaw0wc>Xl2`+e9K%S#Hn8v$%t=ld9>y~g6S4&!
zh8e|Er^9f{rJ}pLRc&DpK8IF*QQopgUX({fmpS(T@bQ1cPp+xhllh_wgbxyjUy#^i
z@J5Rv7VTWOq%nEZ<6C3XqdlgVpHHa&NnR|n*LP~KJGPPusi*5>TPM)HuYG9e%u9Rp
z9MxAm?d?67&s*%BHPSh0ZRR4EdCCCO`W0`ZYwO3Bi+4KTP1n{K?B6Lp#9PrnTpy+J
zWSD1APao_@C;SNPeo1FVJ(A9pEJ1tS$oL?;9qT1*Z{gJUsz2?HXUWn&D|t4MUdMY+
zm7K0H^<sv-eAJZzStp`kk0j0=V^1khS&Lc21z$+Ec?i?yC%(r&(_X-xqWQM*g`T*+
zRikX~vamhx%6_DphHQgETDLV*c{k0@)>ioR=S}wlylpolGNA2w^|-W0CsB8z@^!ns
zWob2*w&ILOumCAYXzfL+t+ht2Bxo-m`8J&0oD;^LGjgXE<-MhwCw1;mu+I2#zN5$R
zGcPf=9zH+zEt+x1+VA4t>h?4)s%2Qh-qthNdpW*QaxFe-MLtteAMF!he8UG=J%u8o
zToz28izeS-qboUO-!6(0mRyQ=Id|h{>XMx%RScPA1+^EM=mwniO>c6IN2X!+X|CsG
z!^q_WA85@VUGW)s<ccPr3FwxPEB>k+rh&Kk=y};R;>R96FOwU<S{E$e2}Z8;)bq0V
z>v>uH^}H;;s%@sxI(+oJj0~N8B&er$Jk&?4lC*V-3Jo9UVoR6V*kc#*?4Og&wXf}o
z#TsV-H5m`M%K6;_TCs*SulYHjH9w^TJ8uOmZ-FhNr8Uk^72=)PR-44Ol8yBVX(hpx
zd2*dqJ=#-={g;|kz_$~mHCd?cnrH$OEusTVHZcdrXUu_-Xc*qCfhSW~1BM@K!1&bs
zC09H2?m@h{V~~|IB;+h(24&jJ3^}^_n4v|^kSontQ*@zVCS9Q58Adh7GwGrgX(Cv9
z$Ra&A*gn6NfGhl@hiBre75f7!)>V0wHOQqhF(Mb3(J=D)fX54bfbjtzVA&_wa*2rS
z%{+BSs7U)g3X<HB0XzSk)h&*l@_vi;`Dk%OQr@3p70VXu-+Z+=be*Rp7Z+h$t$N9=
zm9{KT5fWSJi!5pVu*grnS6gn2s9Sv&t;0<@hi;4X!3RmvdxYH?pI)=0B+qKx+bV*e
zr@y;ep8TeS%0k8wY2eIs^=-}o=>(nbLgDkB7%`Jg(6sM}m8+t+XP~j2f_3&dSw6?M
zHY+*aY)^!3_U(heEyw)GnARug*uPchU90no_bavnSec4v^er5?1DiW=dQTm;X{_mL
zP^(h0S5IG}(bJdVl<RjsqPOSSn{xGiHSc9KoO1o12R>98)6mZ(oO0<C5}$JQ>?;15
zEp&9Y9oRZRVOhYI#ZA@1mJ8l;fh`x<_5zzfux-q2Ngl+~nbAfb*?Qx)A8*_aBI@}&
zAl??4C*IF^<!>gyTYq5d4{ZH`tv|5!2ez+(Z3}S9&EHLuSN0*W<vLGw+raT@*0P(w
zi{Sm@&hUPd;eNMER%9;=oD%a~gHKEIT!T+b^LIbs(^B6cc_(p;=N;%95M8D;*sM%K
zzAfk3i0JSUFSHgN`q&>Wo%haD^(&S`_O$!w_hF%_+Z;nH{Z+!X;(PPAXFccSlLPR{
znyWs|QKhOKM<*|&)$q4IOT+2m<>AZ4Bf#Gs;V)L)SL*nQI(})SW#99|_v7~X!HByY
z{pi_|$7d`4b9MZ79ltZuzY))smqUXq<DS!zi|1p0HJ&z8uZw|uvzcES&Wy7!RBkVn
z@x}1SRI7`_>kYThRQeNjT&m;qb^KHvPlsk#I`3SJxAU)!a!*I4I3M;(%NOIfaCjCk
zetN{o>r2CX<)7EhcV18NpU2DkDIPgJj_*21%7c1iEtT)1ShD=q@3~laT#R=gqI+qC
zE{2W5@n+;8@+N13E(DfuT&PYiMf7h+ExF%(3udXjJK(n%dy)6|BImX9)$$a7Kd_pA
zy^e3z@wHgr8LfCZEZZwP?v5PJlnTwaW_RM-Y36%jcvA4<dZz3-7(N<3^Gay<B))0P
zZ&3MGi0ICTq`QHG4}UEt?DlXYXe3vdv$0p-@IfDS1z5cw_&eR+OR>Hi9QQ*zd*_Yd
zc0J~~bGvYRzgUVa>vOKi*{f9<AEJeHxE%Vy7l}_staLaC4}Tcz$!WV-fG55GYV3X#
zp9jMSqn&))Z$JJWRE**X27la&pU#e}q3bO6-XE>M8M^I<F4E33+q@fVc;GN}6-A4&
zPAzWHPQHhz-SAONrgKxAsg&OuHA?M&*WtW_oaGyj*5<|tU90qX#+gd>rZX`0cBKJJ
zR!FZ`>UbEwg0uaHr-{FFOxeplj8C<9zm8aqw9IaO6O;JM2a70e)jX|N&Q|_kt>f3~
zc%_c%5ATruo}~wj!Afh-x392=nA=CUVio_~s1n|&<0tD#mdSSAje^fLtMK9*c4VYh
z$+TNVt}(NaA&Q;Ai)t3Ro{@bryc;W2BJV*n&O%~{5T`-=`HKv6-wm4{2jqR6ZL$EF
zMALZc!?YHlv%iHVuCh`!O_;+X^Fh$5^Az@Bk-ir{Tg&HlavbKh@=C9lv8`stwYnKM
z)z0{-e#Y}H^Y`QPIAUhc?8f|V^?!>eBGeQzy2XRxtiY?qlL@jj&nZ=PH~#IEYC|qQ
zDLmyxt=jX%@J`{G(IkHN3-)f|VV%z$$ycl5tKS_=`CfYp-yhZN`f(k9QpcC-_;MY8
zTF0N&@#l5?MIB$Q<M}%NvW~y1<7;)iP{&Jke7%mB>v*+}zpmqN>iF9_exr`xtmE$3
z3-Gt5J8^pCg}28O{$zyg%UhL(*tnC}tvJ^OS5NbMx30e%c_Rtr&|yS6s||lUqcu(m
zCsH$N(#`m1%C{pU`FjVg-iQo%IPOb`9z_&bCWu`JBQ?EM?o|~M9rvK?M)KkQyC0QG
zG<RZ#{+it^wL&oZCSTnf_eOrYGtk4n(i!09)c&NU+E@Z7LhJZ2ZDQsq0K5JYOZzC}
zx)au`&WV?(9)~5G^EU$R`gY-x>wBZtdl93Y(XWUD)xnXYtF-=(j3pt(I{5i~smQ(?
zzI3iZrMdU1ir_8Nv=~!yeu?3e*iE076=Knfcb$p;OgdeuZ{(Na*-e<$QCrR3Cp`=s
z9T_rHtK75nTbl7(H$nd0+fqFFI6CJ>h;GeTVC^|O@~3+iPBG%P&3=&T$j!V{`aIE*
zCp*zKCl&FzA5kGk%?)pIcX|&n9WFoZWn_`5`yE#IFf}(R)j77kE6)tKLXW<)MkgfT
zDNCUv0tZIl{+3R3s@F#<R=CUQXBYXy@$5ZfZ0nQ6wO&cA{F3wDGl_o`e&J-R_cwm(
zGvj-&8Qc2JxT)ui?|o;iz2tf8C*zbk#N%<}cmCqxgAxhn0CB-n)F9!2+l+Xp#!9Qy
z!&#iSzMth;6T8T|ws)rPw3x%_?x^@}f1uve(wWiQ+YU6Qk8}p)+!Sb!75sGveNWrL
z%&&OY?XZE)iT(|lw!O!idcXX=$i0w+w;nY+7OmtiqQHsB4CZ!8TvBqAaJK9RGQt>B
zaTa(IbVbd%Db&kz0?ATaHN~3xc5u_PPEVn~B(q13qA{mwwi&<Z|F>8gc$c6IGo5qH
z^J$hDKg~1aiKH{lxIWj6b++Z)y*kezuIoP>^Fdztu+Fd7@kt#Y*YWpt{6igmQ-4(F
zf2^ba9&c&*UY+~-{ZDoN=Q{d!a5lJR|D~?~wT^$Q<9l_q{*UV1Zw!1Jw$Ca(&i+U&
zU66a1Dc02V2Hd|b-ySzl+44<VVNIR#vYPZbQ~C{%-qJYZr&v?dMSc*u(w>eUXUc0)
vAAH|a?-J$>yt3k2<Z;5#xC_z_&qU`L8JLv$E`KR1sNd@^4WBPElt=#r);x(7

literal 19797
zcmeI4U5{i(Qiks>`9Bn}79d!*tE#(aX0c#ry!P70_747l1q4z}xn{bS>Z+dV>aoWN
z2_Y`{kl-4@1==eQNQMi9gy4$)V?56rajM%hhV`ylB*a)g`DWxtWMpJSWac^5b8yiA
zjbD7>>*LXTi}h}?o-bZ};hUrBjf2t6>1cX<tKjD7<^<|S!EwP+!C}F)U{Wv+Ob$jj
zjz`n+tr1<P<C~)!@EqS5-GJHncywb792Fdn7;-wEju>(}o{VTb9gnLYve5DXSp;Me
zkVQZi`W+yP02jI+ObaHX8*n)o7a+@x(d5>x0%SRcLKXp8jsRp4;BpASMVKhW<W^*v
z+=9z=G`R_v2>_Qd02cu+2f&eYnA|*c4imI^EEE?+KDOV%=or?>dQ3lrJVpql3w2xo
z7i2+#09@$DnCS->gkUfW8O(l+5`c?^aA5@crVuVRfeQms9||rO!i9m3;6lG6xX|wi
zE{t#l7YmEaQE@qf3)4J;ivSk^F2EsN1h@!r5#S;mjTmHdcvvtEOmB`Dax%S9a6Dq<
z$&{7?Ed^Q%lY()<L11#L04_{yLU*Pzp`SoMgrCq)pdS;%A_XimKtCx)XSP^mfYIqN
zhKo>KFvU;@Bjz^26azO4ZWJ6B92FcEObaHXLzrQT0k~k{m?B)px58zNDTWG{F{X%S
z$Cx6T9b<!Nc8m?CNrV_<gHqTadLCnZ=y{Csq31DHhn~k+odaQRhtts*bCbf{0*og`
zKY>ng!Q2A2=6@^rA3+4Tki*;#5eaj%P`^ep%qmoI!Gb~+7c3}LalsO#u!KNyVSPgt
z7Zxv6abZb96&I9(b&OGpfKmc@xfHSpL>%F$;IM$eV<CY@C@#!HiU|bZf;gCB04`E+
z5#S=gMSzO{7X+We1;7*oaFK$G02cu+0$i9Ph8ZxTLX|8Srb1wtftnE$XXG)4$;e|u
zWuO#-MIy%-Q-tl9sEDDDiHZR_pl+g~sv{}}7#8JYgNTalO>T}bJp{$_kdqi_isO)`
z2LTYlu|0ZYdJIGKgFS}F6bO3e=|!y9iQFk@iOn$=0v%Jq@JO*cLq8fLMV^FSQ^X@>
zEK>RsJ{c?9f~iJkBsHZkDSb)N87X~9=}Vfvni+X5)Nn(pjgh9W7DhgOwJ-ANt96l2
zUmcD}Kwn%Vef2N=BYpKQ^7O^U=u1jpQu>n8mz2J3DOvDd=qv^bgbxZtz3LU#Y#a81
zF;R&EoPC7jj#yb9^hi1$xgWF39bRQn`_VuhCqv1NA#;<=4%0etNv9|wz+y|Wl0uey
ziXuX>uog}WCI#by;(|JCf<6Mp1&v@9Q&bXwixgZ0xCn3&-~!-i0k}xTs;2lrsN#aT
zhbk@@c&Orn^-5{ILUGZ4rHYI8Dpg#t(ok?=Rj^R4Tgfojl5E20*<5~%9S7Ev8vV;}
zE{W?bE~`a%gLWWNk6GeSk$Q{=h03Ij&r&!`e6yP1E-9>*IE;08jD@&NfPpMzAOS8y
z4TRsMNpZn*Le+Sh4P0=ZK#8l_SXi>)K%t5YE+mCbI|93=6sI#O;!vm(p3@nHgdWrI
zAjFU%teB=Pp@bfU5>njKG_%u?62|D#x{pk??iHc%C|H%(5ZcU4v~6OYLJDvcU<PR0
z0s;w4K!B|tiZnvx9TTKNg^Sx_sBnqejTYhJwrHJd7mAB&mntsAt5C&7Z>3JP3&llm
zl`1Z(U8=a$jK(#isGU014ybmaIuQOtrA~#zJY%bH4$~=D)f|Q*OzA~c8%od~eM+G@
zCL9<bT~!fV4ic(@F1qDBqTxHJIMRkd(U>O^PJ_+QP#(JZ8O}q~Ry3pso1bAlbn`Q`
zhi-m`_t4D`^9+rmf@rF+`?*!x9vc<WR-k&LFDbV<p$gqJpxiY8Hvk}7BUizD!gYpq
zval3L&nQronJ70=cA|{HC{T35q7x7mTvTvTNyX;pZWy}xxg$zPfnxKE0;QW@6e!*N
zqCjaL`$vIdj}rx2*K$OG(p4V?Vp^8u;1H;~C=kt*gi#>J)snN#&ZA2)8#FqN8q<f6
zZR6Sru~8e_ES0q_W~p*p)hC`}TA{&Hm4_(ej)y4QaH2-dpKr|$-Bn`_US>13oPOAQ
zt`(#of)y{VVa)nlS_-Uu%gf}@WV+M@EU0PNOW~}X{$JW^Jt?x}0^!6>l7O?{E5Fj#
zhp8BYg^Z9GEQPBA&u)Dd5)1``p+GDXii<H=DsqmE!BAgi>!Z6d7%&D4H7uqXDs={6
zMof{Bn2nA|Oq#Y|y!CPOi*(O!dQTZo*<!^Rd^&MP)aOLqNh}d7&Oj&5h{grcI3w}|
zbq{IHRBV2UGt$j3aRxeZMr?kGGtxZvPn;2(U*e2(%a=F<oj3#9Cr`_fIAdAK(axAS
zgTEW7fvQWK;h0wvCT4JsH*{Rn748~c1?d;m0WGIzP*;XlS3&YYbQM%y>ya_Y+=9(d
zU4?Fb>PlK&1)HC`3f=tFRp{oYu0l6IbtSE?MD-Nxe(EZ8)vGH=bp@y^q3YBXl)4g1
zKy?L0m^1t*l6iG=nE%)2l{@;EG&1%S?)6XkY8|SswgT?WpKrmHB?VqVD>LTKJ>+`h
z$a{sMtoI9!C%xZYWqcViG`nKgM?9FcG+vBt7Bl9`)m?a>FyvYSgrn(~c%#;ro}v3*
zZ0>p~!ya2V?svWQW?$xZW{b+V>A7F?_GMq*TV%DVIkf+$Nn2?09i%E>7HqzkFvU5?
z^r#)9Qa%`Zv*<umRmG{fTswX2gA-0=I4a199iXmChPIzKC)*A&?Rr~04~Ex=dOEH?
z9Uq?&!B;_S)9lT@4Wk+_ces?Ztd`eGJ^vQ){9C~DZvh-ko_`Cdvk=D(4%|cyPeL9J
zJTy{B<bM7wpkDMp{}#Y`^7*%bAK_a-J_GRjldYX^E2R0hf(rYP#CGi2ZkJ8k?RtNh
z%h~Yx3x6)q(=YMD|9|df1jp9@@?`{d&%TVXkhcN?H+|tJ`!a$NuDy(ay7n?cikA@x
zz)g^J|1#o7c@O{W%LoqR9KJnAayuCFN?l7G4(p7`yY^JNd3e>=&^^X_rRKJ2o$tSF
zM=clKr%U>7;&zkcIbjC06*sJb{gv0iij#4MBv&^GKf3x_0_YJPe7ODi7AWn#WpiWq
zn?a#sO1t5Kh*&5qX?H*ozF^+Kcw6x15R{*^{6zVQ@-r_#QGTNQMEQwaGlI>}H!z`_
zU-GtVe!f+(EK?PmpEre+H$UINgl>MBFYU9Vi(T$@-*(=Rg5A$IFwn7Qprr8zMxnPE
zp$fCm`I2K{4i4NrxES#k6}y3d;$O7SKzN7!`nu?76#yyRUs3rP$6#Fy<t2-OudYV@
zvPX_rZ?fpWp9^C;zyG#l6j4TW{o04|xJKcPPLYEd4bo82+3O-?*uHQ(A8l@j#Uy(*
zE!4%$EQ~b{3`J)mYmxjgq15t;up6d9dyBvJ7#c7)Ue%<wJ%`U{amXFcm!GKkp>ZhD
z#`?<dJ&@mqI1ZBW**-be3Hzul_QW^+2$Iz^$CaLS<W~x&*}=T7^8<i%VHOMqJ48%B
zr-xFK?>CLf_nK527<3zD!le4nil3mVw5c(!3cAI$I<wdAv^YN!lI(ZaYq^RiE(>k!
zIQ2IbgqdrfkQ04$ZKIIW5OvugW)-AN+()Y}_i}Ezt^*Q+12u>iOrh%xL~aCW{gZ3^
zAveTQeq&MkR6lHkgk=P;DR%#M7)HQ^Q-%nbg4skW-Ph>^5i4kO%UQRe?4gZRLF5iH
z?I42&nOd;<8L2`yKb*hl=4Yg`9*-$(enzU4H$Nj)=;mjnl2*OJF4xVZja0$zr@Emn
z5~{cAhB8tKRcNGwat#0@6%b#AYrP2booTy={PH2l7hQYh_GV*WcLm_e^ZRou<@=9?
z{x&SDq4aq)5c6~mA|co>h%yY_do{@oi^+g2*ib*%_XK5F#y5ruTvCml49-lU@^Xsg
zOPD|ONczQYtl83!pfc$XX3k?D%O-pu!|5lzxyB}=06&9fT;e(oZg!g3Ce8oM({3#>
zwva;tvq2=Ry*fucyE01aKfRnC(mx}21z1*<wfv%5ziwA;F)a^{Yzb=5_Hm_Kfe9i{
zZ(S8|;e$K*%R&!{h3ZV&$%_0INW`5AIW2|ehgCsGvR<u-%@21fy7}QwMK?d(DKzd>
z*!*y(qMIM?RCM#pnlcV!rJOYiVlIW<4}&ba>hTOojU2$80#(PNN#Ra`5)gNiV&fN7
zH9v&AvWy|V`5Vou^c8bf$E~dQvO-1)%`id62{MAHW`k%V$f!Yv4mLkE8@l;<j0@fT
zI8udHvzDvbVDnS6p_`wY4c+|IthBQZcDb6B#y1PQ9}Xt1W`*jlW})2wgep|CP#CwM
zW`Wi$Ic+97<S^^+VLi+i+E3aZpy%zNGm}<QQ3(k=(L8>lO}iOt(?R)3%TJUa7|o0F
z6Xhq$Pjtp&^V6n7H$QDUbn}bmX&=pt%`cjlZhq0cbo0x6p`&@R%cFVdXkKhNqIo{a
z78%XwXA24w%>yN3G|#UWO2BBoB9vds2=`HYMbl(yakXDe2pVP4!<QX3hOlYs+$+Mk
z?+<6Win{3qqqN;IQJko}!-XfJD_wgP>(6m3EWhIV%j1h<wRWS&K7ar4{QX0I;(%eI
z2^|h6`d@$lfaCR97_XRc_Rtyk=j6%o5mR^IdXjJ$#CL*rqTFy8q<av*3KG7+gu@_l
zB3S&qm=#_85)Q3590rSD!eQv<XE+Sq{0xVoo1ft@bjj!2iqM8b%MFLY=4Utz-TZpK
zzH&*q;m~@+p=dY^c0a>m=&CmyLK+SMHzJ|x42MvLL!ksT96}{*B<@|m<+DcqO~=(`
z-(^C8FE~x@SPB{&O*>n0lOF!bNGkg*#uVMAnPK3|{#A-g=^GO^wm`VJj-kTERg_{8
z1Nqw)moQZS*u1!~ou#n2aEu65T*ykXbb;c+!i6d>EL*7JQZsVREM&<7#hL|^V*3hI
z2X+W37RxlfSSs?Ylxa#>C-N*4=@0!a`-J^#zbq5~<RRzl>_MIgSmvAvf|x@PdkA6>
zK^8^S1%jAt(B-q7l@B&QPXwWxUtX!%uK6K%>U9Cz=>oy#=ZPS6^YcUyy7}n>(z-yf
z%XI;1PXxj4=ZOGX7Z9qqE&!zq2vw*HK<NSi{|l&3h{}B5(p$XT`o70Cj~fn7*q_@u
z_STgokdBt%FiO@3WpZ`Ic-D7t3jdbqYN`D3v)=eT+La!DQqj)y_V5CL^{GguX>^1l
zS9k2aOA+;kW!kYkypPA2G0*@9e<^HE3xEC_b>6o`Gq@QVDZSuX<n<A)o*xmpmtEZ@
zhc6F0P9FmFSk}a4;n8COU6r2gcm@V29T%`KP%S-FK;<PwRp`iGUrmc?wkivFECK8c
z*cmr&Z*~|chuQmfz0{}%t~xJTk~#{1_K>uA_MH7w2HB&Il<?iz*PL#rc?o#k)Z8tv
zn)fxO-V&A-e(uOre~ILvaEw*{&;fotmc5sUJL9-|I0Lex3=7?%c}K?vZHkYOx&+=i
zO)&wqh<}jsEsK<+Cn;arNYj@+#eDj*pO{Zy_7d{!B&O-her!H{*^SMouf9n>eLdRd
z=tRn{VVag4n#|jh-GU^BB6Ic$5=o6Ww(I`DI@^7i<b<KJod(f*P~BVZB8wQ2=yn=3
zMo42$K~_G9;RIcD>zkk3Y3SzXb}H?58f<=^C_^_tPn4mXpL<H^=I3@Q?TJ!!I}LU}
zPn4mn-t839?G$i36{^nd6iQPSN<g<#D7RB!FqV$7GGlx@7;ASiAI|zR+P_X>hJ&Wv
z{MB?J-WOUgx4dpIYC1HUF68)QJ(?~gc!y>K4>mtEU3Bxq)QWC?$PpUJ3Y#CI7v21r
zX3@<L;|uLrdAb};hsFpCyB~%qjiv+D8%;~0=|B~tX(==<plJb33xmr1Gw~Q_kcA4O
z;2^3FvPh!F8N`@^va_7k3t|*O7u53Rr*VdEe&j=IoR)jm3pPK`dZC+N48(TL&$FKO
znU2`y8mHx+^@1&jXFX_*Q>fk=CzQr1RH4QRrEvlpC!lcxs?4BfXn#5$ceth2gIn&V
zPX$`cCDwf{h<`z+!N}R+Si>uiK!abRH*<(L+^6wNFTeULroU(UOFX~I^HV&(%tPHb
z{@}eoG<Wx%S8VZfJU`F#J3PP3^YRCG@4otOIK1}8yYIc9+}m$_FZj+K;TL(n`Qi)T
z8V#)8{%)H6EahM4d5Py6JU_$p^4)h!oVRbk*{t3HzeWF-UjCXnMgHickA}8;p0wZR
z8GgurZOAQlbh%iQtoRo$UV0hXy3N~=Z*NJyZSuC{-R<UnfaI;s#qu0{<z%*9EzI3L
zS+166XR>*HvEDAh-{AQk&zp;xL%h0MF4k)s+}W&`_h<g28ZW(d`DA0|oyB@{`I+K;
zXM6eN!DhL&_Ko#=Gv{C6Abb16E#1kya`}W2cgy5gmrqWs{+;D~vzjT%2itqI)ZSh#
zKAx>V45!!VwfvacyGt40-EHi8`9$IsvYWHZAMDcd_I5dol<#dGZ!SKe{`T2swc6P5
zWVTu@)(;ly<h|MMVyh7Mm#fnTho2+G`Q?+VoI`ZswfZQHcFc<U%2<7e=eK$8XOPRk
ztE@FAv(u>eLp!|Bv)ZWTv)TIGlG{A9_0FR2@SI<qg~Qoq8U3N}-Qq&QS3HaJv(o>g
z#eBCxhO0t{+Luba!t-l9zs~b6Pvo$I@r4Lr<~-~lEq9CaNc3Qf_9_2>=cjqHd}lLd
zcrE1l;`Ck&@BZ?^MU=mrQ82rpcK0}wy?pZUVwv27?S?5)bGF$i?R>UnN|&Fe-+Z$^
z-!XgZS?Rr1tQNakRffg<s6F4TE_Tb<+@l#wT>@M&b0T|g5_@(M#m`(ZG&gWGKd?;s
z^5f;@r>71&U#^)(6Av$DyX`>iwkUBR&KHcF<-IqX&u6RI1->xT`k_hBwu@8RmZlcl
z)8%>=_1$@@H8id7UTgjSwbqw^MCIiloBRpMw|QQb_*0WVGx>9qzaV*?=M9O!H2Eu%
zH+kOTd57n_Ja>8C<GK86lfN<fTaw@8`7NHg5o3NbI|CNl!sQcFb@G6R>7@Z#i(}5<
zLm!jn`h2drJld#)*}Ze&{Nnz73l|@s2_G&WS(oI@apor^XWNY>7wgnwUl7Zcc9S-%
z+18TN*}0m!kG%lr7io;=oY@e+*=`hzMVY6k$zo@Znc0UAGt|R}&f(!hW5FyUWxBIR
z*kcu4eyTm(?t<HA+bURQ&i4YFNWQegTN_7EdV3{w;^N7!D`3smW-QWl=COg!S$N7s
zFQ2Su6+^F4R^D;N-gWu-2aaBv^2~Ug{-e0n!XUZYtX0olHTk~Doj9$@D<_qjRSB<V
zj{YM_37YqC`PqZm{K@R{vy{!3+xbPNHeYS@?fK?nd#@axe_IKMj796}i*2%m)N<R^
z^1-YF{F>;s3BMiP*NaC9KD&#14)$odtx=5uNa9Re6th^MvLsu=B#Sh`7AN_~tEF3p
z`9U`GgK*|cI&*_~=373q3MH0Q<_ydaE<Y_5Z9Yl(nQu4qlkC7JOwCz*A3=cA*)IEn
z>fPMGznm{ClF!a6bN=|8xR<~}s8#>F#p#*D+}oT+9YG7`C)>^G=6q98Xir#ma{1(J
zwj)?y&s(v2e0p}WnLkd=6_Kjh`a$V=Rk;){A8j*@RWcTBXOA<*papCSGs0CZRb?$c
zA;@pLZj(LZsS+FQbtlnEGyA$)Go#wXz1}1S#hE{g=FAPMGq3J~wdZK}|M5>G+WX}X
zNbZ?=V)CiU-<$jciSB;+naMwre4pp{c|0Io{<F!ykhqSnjOKp%uO|Oy^6w-b(;VzE
zkH-rgv^J#9DO9qvtr23#T6}NcwsZT!YMWBBdls>ra9Bh~o~Ae*xe!CvwjVA(>C(<w
l_&!@7fH4MgMtHwciVRX#iIj#ccx~qN?kD|)7hgbK{|N<x_q6~3

diff --git a/tests/data/unitex.yaml b/tests/data/unitex.yaml
new file mode 100644
index 0000000..dd9e7ae
--- /dev/null
+++ b/tests/data/unitex.yaml
@@ -0,0 +1,119 @@
+global:
+    debug: 1
+    verbose: 2
+    log: null
+
+    persistence: True
+    virtualization: True
+
+resources:
+    language: "fr"
+
+    alphabet: "data/Alphabet.txt"
+    alphabet-sorted: "data/Alphabet_sort.txt"
+    sentence: "data/Sentence.fst2"
+    replace: "data/Replace.fst2"
+
+    dictionaries:
+        - "data/dictionary.bin"
+
+# The 'tools' section can contain any of the arguments used by the unitex tools
+# functions. Note that, if you use the 'Processor' high-level class, some arguments
+# could be overridden to fit the behaviour of the 'tag', 'extract' and 'search'
+# functions. For instance, there is no point in defining a font or a context for
+# 'concord'.
+tools:
+    check_dic:
+        strict: False
+        no_space_warning: False
+
+    compress:
+        output: null
+        flip: False
+        semitic: False
+        version: "v2"
+
+    concord:
+        font: null
+        fontsize: null
+        only_ambiguous: False
+        only_matches: False
+        left: "0"
+        right: "0"
+        sort: "TO"
+        format: "text"
+        script: null
+        offsets: null
+        unxmlize: null
+        directory: null
+        thai: False
+
+    dico:
+        morpho: null
+        korean: False
+        semitic: False
+        arabic_rules: null
+        raw: null
+
+    extract:
+        non_matching_sentences: False
+
+    fst2txt:
+        start_on_space: False
+        word_by_word: False
+        merge: True
+
+    grf2fst2:
+        loop_check: False
+        char_by_char: False
+        pkgdir: null
+        no_empty_graph_warning: False
+        tfst_check: False
+        silent_grf_name: True
+        named_repository: null
+        debug: False
+        check_variables: True
+
+    locate:
+        start_on_space: False
+        char_by_char: False
+        morpho: null
+        korean: False
+        arabic_rules: null
+        sntdir: null
+        negation_operator: "tilde"
+        number_of_matches: null
+        stop_token_count: null
+        match_mode: "longest"
+        output_mode: "merge"
+        protect_dic_chars: True
+        variable: null
+        ambiguous_outputs: True
+        variable_error: "ignore"
+
+    normalize:
+        no_carriage_return: False
+        input_offsets: null
+        output_offsets: null
+        no_separator_normalization: False
+        replacement_rules: null
+
+    sort_txt:
+        duplicates: False
+        revers: False
+        sort_order: null
+        line_info: null
+        thai: False
+        factorize_inflectional_codes: False
+
+    tokenize:
+        char_by_char: False
+        tokens: null
+        input_offsets: null
+        output_offsets: null
+
+    txt2fst:
+        clean: False
+        normalization_grammar: null
+        tagset: null
+        korean: False
diff --git a/unitex/config.py b/unitex/config.py
index d6bdcc8..d4112ec 100644
--- a/unitex/config.py
+++ b/unitex/config.py
@@ -12,25 +12,12 @@ _LOGGER = logging.getLogger(__name__)
 
 
 
-class Options(object):
+class Options(dict):
 
     def __init__(self, options=None):
-        self.__options = {}
-
         if options is not None:
             self.load(options)
 
-    def __contains__(self, key):
-        return key in self.__options
-
-    def __getitem__(self, key):
-        if key not in self.__options:
-            raise UnitexException("Key '%s' not found!" % key)
-        return self.__options[key]
-
-    def __setitem__(self, key, value):
-        self.__options[key] = value
-
     def load(self, options):
         raise NotImplementedError
 
@@ -359,7 +346,7 @@ class LocateOptions(Options):
             raise UnitexException("[LOCATE] Wrong value for the 'korean' option. Boolean required.")
         self["korean"] = korean
 
-        arabic_rules = options.get("arabic_rules", False)
+        arabic_rules = options.get("arabic_rules", None)
         if arabic_rules is not None:
             if isinstance(arabic_rules, str) is False:
                 raise UnitexException("[LOCATE] Wrong value for the 'arabic_rules' option. String required.")
@@ -405,8 +392,8 @@ class LocateOptions(Options):
 
         output_mode = options.get("output_mode", UnitexConstants.OUTPUT_MODE_IGNORE)
         if output_mode not in (UnitexConstants.OUTPUT_MODE_IGNORE,
-                              UnitexConstants.OUTPUT_MODE_MERGE,
-                              UnitexConstants.OUTPUT_MODE_RELACE):
+                               UnitexConstants.OUTPUT_MODE_MERGE,
+                               UnitexConstants.OUTPUT_MODE_RELACE):
             raise UnitexException("[LOCATE] Wrong value for the 'output_mode' option. UnitexConstants.OUTPUT_MODE_X required.")
         self["output_mode"] = output_mode
 
@@ -688,11 +675,6 @@ class UnitexConfig(Options):
             raise UnitexException("Wrong value for the 'log' global option. String required.")
         self["log"] = log
 
-        tempdir = options.get("tempdir", tempfile.gettempdir())
-        if not exists(tempdir):
-            raise UnitexException("Temporary directory '%s' doesn't exist." % tempdir)
-        self["tempdir"] = tempdir
-
         persistence = options.get("persistence", False)
         if isinstance(persistence, bool) is False:
             raise UnitexException("Wrong value for the 'persistence' global option. Boolean required.")
@@ -705,17 +687,18 @@ class UnitexConfig(Options):
 
         self["resources"] = ResourcesOptions(settings.get("resources", {}))
 
-        options = settings.get("options", {})
-
-        self["check_dic"] = CheckDicOptions(options.get("check_dic", {}))
-        self["compress"] = CheckDicOptions(options.get("compress", {}))
-        self["concord"] = ConcordOptions(options.get("concord", {}))
-        self["dico"] = DicoOptions(options.get("dico", {}))
-        self["extract"] = ExtractOptions(options.get("extract", {}))
-        self["fst2txt"] = Fst2TxtOptions(options.get("fst2txt", {}))
-        self["grf2fst2"] = Grf2Fst2Options(options.get("grf2fst2", {}))
-        self["locate"] = LocateOptions(options.get("locate", {}))
-        self["normalize"] = NormalizeOptions(options.get("normalize", {}))
-        self["sort_txt"] = SortTxtOptions(options.get("sort_txt", {}))
-        self["tokenize"] = TokenizeOptions(options.get("tokenize", {}))
-        self["txt2tfst"] = Txt2TFstOptions(options.get("txt2tfst", {}))
+        tools = settings.get("tools", {})
+
+        self["tools"] = {}
+        self["tools"]["check_dic"] = CheckDicOptions(tools.get("check_dic", {}))
+        self["tools"]["compress"] = CheckDicOptions(tools.get("compress", {}))
+        self["tools"]["concord"] = ConcordOptions(tools.get("concord", {}))
+        self["tools"]["dico"] = DicoOptions(tools.get("dico", {}))
+        self["tools"]["extract"] = ExtractOptions(tools.get("extract", {}))
+        self["tools"]["fst2txt"] = Fst2TxtOptions(tools.get("fst2txt", {}))
+        self["tools"]["grf2fst2"] = Grf2Fst2Options(tools.get("grf2fst2", {}))
+        self["tools"]["locate"] = LocateOptions(tools.get("locate", {}))
+        self["tools"]["normalize"] = NormalizeOptions(tools.get("normalize", {}))
+        self["tools"]["sort_txt"] = SortTxtOptions(tools.get("sort_txt", {}))
+        self["tools"]["tokenize"] = TokenizeOptions(tools.get("tokenize", {}))
+        self["tools"]["txt2tfst"] = Txt2TFstOptions(tools.get("txt2tfst", {}))
diff --git a/unitex/processor.py b/unitex/processor.py
index 1f0a612..8344d85 100644
--- a/unitex/processor.py
+++ b/unitex/processor.py
@@ -2,8 +2,15 @@
 # -*- coding: utf-8 -*-
 
 import logging
+import os
+import re
 import yaml
 
+# Python 2/3 compatibility: use io.open so that open() behaves the same on both
+from io import open
+
+from xml.sax.saxutils import escape
+
 from unitex import *
 from unitex.config import UnitexConfig
 from unitex.io import *
@@ -14,13 +21,26 @@ _LOGGER = logging.getLogger(__name__)
 
 
 
+RULES = []
+RULES.append((re.compile(r"&"), "&amp;"))
+
+def escape(sequence):
+    for pattern, substitute in RULES:
+        sequence = pattern.sub(substitute, sequence)
+    return sequence
+
+
+
 class UnitexProcessor(object):
 
     def __init__(self, config):
         self.__options = None
 
         self.__persisted_objects = None
-        self.__working_directory = None
+
+        self.__txt = None
+        self.__snt = None
+        self.__dir = None
 
         self.init(config)
 
@@ -98,12 +118,211 @@ class UnitexProcessor(object):
                 free_persistent_alphabet(_object)
 
     def clean(self):
-        if self.__working_directory is None:
+        if self.__txt is None:
+            _LOGGER.error("Unable to clean processor. No file opened!")
             return
-        rmdir(self.__working_directory)
+
+        if self.__options["virtualization"] is True:
+            if self.__dir is not None:
+                for vf in ls("%s%s" % (UnitexConstants.VFS_PREFIX, self.__dir)):
+                    rm(vf)
+            rm(self.__snt)
+            rm(self.__txt)
+        else:
+            rmdir(self.__dir)
+            rm(self.__snt)
+
+    def _normalize(self):
+        kwargs = self.__options["tools"]["normalize"]
+
+        ret = normalize(self.__txt, **kwargs)
+        if ret is False:
+            raise UnitexException("Text normalization failed!")
+
+    def _segment(self):
+        grammar = self.__options["resources"]["sentence"]
+        if grammar is None:
+            raise UnitexException("Unable to segment text. No sentence grammar provided.")
+
+        alphabet = self.__options["resources"]["alphabet"]
+        if alphabet is None:
+            raise UnitexException("Unable to segment text. No alphabet file provided.")
+
+        kwargs = {}
+        kwargs["start_on_space"] = self.__options["tools"]["fst2txt"]["start_on_space"]
+        kwargs["char_by_char"] = self.__options["tools"]["fst2txt"]["char_by_char"]
+        kwargs["merge"] = True
+
+        ret = fst2txt(grammar, self.__snt, alphabet, **kwargs)
+        if ret is False:
+            raise UnitexException("Text segmentation failed!")
+
+    def _replace(self):
+        grammar = self.__options["resources"]["replace"]
+        if grammar is None:
+            raise UnitexException("Unable to normalize text. No replace grammar provided.")
+
+        alphabet = self.__options["resources"]["alphabet"]
+        if alphabet is None:
+            raise UnitexException("Unable to normalize text. No alphabet file provided.")
+
+        kwargs = {}
+        kwargs["start_on_space"] = self.__options["tools"]["fst2txt"]["start_on_space"]
+        kwargs["char_by_char"] = self.__options["tools"]["fst2txt"]["char_by_char"]
+        kwargs["merge"] = False
+
+        ret = fst2txt(grammar, self.__snt, alphabet, **kwargs)
+        if ret is False:
+            raise UnitexException("Text normalization failed!")
+
+    def _tokenize(self):
+        alphabet = self.__options["resources"]["alphabet"]
+        if alphabet is None:
+            raise UnitexException("Unable to tokenize text. No alphabet file provided.")
+
+        kwargs = self.__options["tools"]["tokenize"]
+        if tokenize(self.__snt, alphabet, **kwargs) is False:
+            raise UnitexException("Text tokenization failed!")
+
+    def _lexicalize(self):
+        dictionaries = self.__options["resources"]["dictionaries"]
+        if not dictionaries:
+            raise UnitexException("Unable to lexicalize text. No dictionaries provided.")
+
+        alphabet = self.__options["resources"]["alphabet"]
+        if alphabet is None:
+            raise UnitexException("Unable to lexicalize text. No alphabet file provided.")
+
+        kwargs = self.__options["tools"]["dico"]
+
+        ret = dico(dictionaries, self.__snt, alphabet, **kwargs)
+        if ret is False:
+            raise UnitexException("Text lexicalization failed!")
+
+    def _locate(self, grammar, match_mode, output_mode):
+        alphabet = self.__options["resources"]["alphabet"]
+        if alphabet is None:
+            raise UnitexException("Unable to locate pattern. No alphabet file provided.")
+
+        kwargs = {}
+        kwargs["morpho"] = self.__options["tools"]["locate"]["morpho"]
+        kwargs["start_on_space"] = self.__options["tools"]["locate"]["start_on_space"]
+        kwargs["char_by_char"] = self.__options["tools"]["locate"]["char_by_char"]
+        kwargs["korean"] = self.__options["tools"]["locate"]["korean"]
+        kwargs["arabic_rules"] = self.__options["tools"]["locate"]["arabic_rules"]
+        kwargs["negation_operator"] = self.__options["tools"]["locate"]["negation_operator"]
+        kwargs["stop_token_count"] = self.__options["tools"]["locate"]["stop_token_count"]
+        kwargs["protect_dic_chars"] = self.__options["tools"]["locate"]["protect_dic_chars"]
+        kwargs["variable"] = self.__options["tools"]["locate"]["variable"]
+        kwargs["variable_error"] = self.__options["tools"]["locate"]["variable_error"]
+
+        kwargs["sntdir"] = None
+        kwargs["number_of_matches"] = None
+        kwargs["ambiguous_outputs"] = False
+
+        if match_mode not in (UnitexConstants.MATCH_MODE_LONGEST,
+                              UnitexConstants.MATCH_MODE_SHORTEST):
+            raise UnitexException("Wrong value for the 'match_mode' option. UnitexConstants.MATCH_MODE_X required.")
+        kwargs["match_mode"] = match_mode
+
+        if output_mode not in (UnitexConstants.OUTPUT_MODE_IGNORE,
+                               UnitexConstants.OUTPUT_MODE_MERGE,
+                               UnitexConstants.OUTPUT_MODE_RELACE):
+            raise UnitexException("Wrong value for the 'output_mode' option. UnitexConstants.OUTPUT_MODE_X required.")
+        kwargs["output_mode"] = output_mode
+
+        ret = locate(grammar, self.__snt, alphabet, **kwargs)
+        if ret is False:
+            raise UnitexException("Locate failed!")
+
+        index = os.path.join(self.__dir, "concord.ind")
+        if self.__options["virtualization"] is True:
+            index = "%s%s" % (UnitexConstants.VFS_PREFIX, index)
+
+        if exists(index) is False:
+            raise UnitexException("Locate failed! No index produced.")
+        return index
+
+    def _concord(self, index, merge=False, output=None):
+        alphabet = self.__options["resources"]["alphabet"]
+        if alphabet is None:
+            raise UnitexException("Unable to build concordance. No alphabet file provided.")
+
+        kwargs = {}
+        kwargs["font"] = None
+        kwargs["fontsize"] = None
+        kwargs["only_ambiguous"] = False
+        kwargs["left"] = "0"
+        kwargs["right"] = "0"
+        kwargs["sort"] = UnitexConstants.SORT_TEXT_ORDER
+        kwargs["script"] = None
+        kwargs["offsets"] = None
+        kwargs["unxmlize"] = None
+        kwargs["directory"] = None
+        kwargs["thai"] = self.__options["tools"]["concord"]["thai"]
+
+        result = None
+
+        if merge is True:
+            kwargs["format"] = UnitexConstants.FORMAT_MERGE
+            if output is None:
+                raise UnitexException("You must provide the output file path to use the merge option.")
+            kwargs["output"] = output
+            kwargs["only_matches"] = False
+
+            result = output
+
+        else:
+            kwargs["format"] = UnitexConstants.FORMAT_TEXT
+            kwargs["output"] = None
+            kwargs["only_matches"] = False
+
+            result = os.path.join(self.__dir, "concord.txt")
+            if self.__options["virtualization"] is True:
+                result = "%s%s" % (UnitexConstants.VFS_PREFIX, result)
+
+        ret = concord(index, alphabet, **kwargs)
+        if ret is False:
+            raise UnitexException("Concord failed!")
+
+        if exists(result) is False:
+            raise UnitexException("Concord failed! No concordances produced.")
+        return result
 
     def open(self, path, mode="srtlf", tagged=False):
-        pass
+        directory, filename = os.path.split(path)
+        name, extension = os.path.splitext(filename)
+
+        self.__txt = path
+        self.__snt = os.path.join(directory, "%s.snt" % name)
+        self.__dir = os.path.join(directory, "%s_snt" % name)
+
+        if self.__options["virtualization"] is True:
+            txt = "%s%s" % (UnitexConstants.VFS_PREFIX, self.__txt)
+            cp(self.__txt, txt)
+
+            self.__txt = txt
+            self.__snt = "%s%s" % (UnitexConstants.VFS_PREFIX, self.__snt)
+
+        else:
+            if os.path.exists(self.__dir) is False:
+                mkdir(self.__dir)
+            elif "f" in mode:
+                rmdir(self.__dir)
+                mkdir(self.__dir)
+
+        self._normalize()
+
+        if tagged is False:
+            if "s" in mode:
+                self._segment()
+            if "r" in mode:
+                self._replace()
+
+        if "t" in mode:
+            self._tokenize()
+        if "l" in mode:
+            self._lexicalize()
 
     def close(self, clean=True, free=False):
         if clean is True:
@@ -112,8 +331,48 @@ class UnitexProcessor(object):
         if free is True:
             self.free()
 
+        self.__txt = None
+        self.__snt = None
+        self.__dir = None
+
     def tag(self, grammar, output, **kwargs):
-        raise NotImplementedError
+        xml = kwargs.get("xml", False)
+        match_mode = kwargs.get("match_mode", UnitexConstants.MATCH_MODE_LONGEST)
+        output_mode = UnitexConstants.OUTPUT_MODE_MERGE
+
+        index = self._locate(grammar, match_mode, output_mode)
+
+        if xml is False:
+            self._concord(index, merge=True, output=output)
+            if exists(output) is False:
+                raise UnitexException("No tagged file produced!")
+            return True
+
+        _output = os.path.join(self.__dir, "concord-merge-temp.txt")
+        if self.__options["virtualization"] is True:
+            _output = "%s%s" % (UnitexConstants.VFS_PREFIX, _output)
+
+        self._concord(index, merge=True, output=_output)
+        if exists(_output) is False:
+            raise UnitexException("No (temporary) tagged file produced!")
+
+        tagged = open(output, "w", encoding="utf-8")
+        tagged.write("<?xml version='1.0' encoding='UTF-8'?>\n")
+        tagged.write("<TAGFILE query='%s'>\n" % grammar)
+
+        merged = UnitexFile()
+        merged.open(_output, "r")
+        content = merged.read()
+        merged.close()
+
+        content = escape(content)
+        tagged.write(content)
+
+        tagged.write("</TAGFILE>\n")
+        tagged.close()
+        rm(_output)
+
+        return True
 
     def search(self, grammar, output, **kwargs):
         raise NotImplementedError
-- 
GitLab