From b299098c66f8ede8b61c4dda4bd7b7309a636f8d Mon Sep 17 00:00:00 2001 From: Patrick Watrin <pat@lucy.local> Date: Sun, 21 Feb 2016 19:10:15 +0100 Subject: [PATCH] first attempt for processor high-level class --- config/unitex.yaml | 18 +-- tests/01_test_tools.py | 6 +- tests/02_test_resources.py | 8 +- tests/04_test_processor.py | 115 ++++++++++++++++ tests/data/Replace.fst2 | Bin 0 -> 1782 bytes tests/data/Sentence.fst2 | Bin 19797 -> 18740 bytes tests/data/unitex.yaml | 119 ++++++++++++++++ unitex/config.py | 55 +++----- unitex/processor.py | 269 ++++++++++++++++++++++++++++++++++++- 9 files changed, 531 insertions(+), 59 deletions(-) create mode 100644 tests/04_test_processor.py create mode 100644 tests/data/Replace.fst2 create mode 100644 tests/data/unitex.yaml diff --git a/config/unitex.yaml b/config/unitex.yaml index 209a36e..bf58486 100644 --- a/config/unitex.yaml +++ b/config/unitex.yaml @@ -3,30 +3,26 @@ global: verbose: 2 log: null - tempdir: "/tmp" - persistence: True virtualization: True resources: language: "fr" - alphabet: "/home/resources/media/fr/unitex/preprocessing/Alphabet.txt" - alphabet-sorted: "/home/resources/media/fr/unitex/preprocessing/Alphabet_sort.txt" - sentence: "/home/resources/media/fr/unitex/preprocessing/sentence/Sentence.fst2" - replace: "/home/resources/media/fr/unitex/preprocessing/replace/Replace.fst2" + alphabet: "/full/path/to/Alphabet.txt" + alphabet-sorted: "/full/path/to/Alphabet_sort.txt" + sentence: "/full/path/to/Sentence.fst2" + replace: "/full/path/to/Replace.fst2" dictionaries: - - "/home/resources/media/fr/unitex/dictionary/delaf-short.bin" - - "/home/resources/media/fr/unitex/dictionary/delacf-light.bin" - - "/home/resources/media/fr/unitex/dictionary/toponyms.bin" + - "/full/path/to/dictionary.bin" -# The 'options' section can contain any of the argument used by the unitex tools +# The 'tools' section can contain any of the argument used by the unitex tools # functions. 
Note that, if you use the 'Processor' high-level class some argument # could be overriden to fit the 'tag', 'extract' and 'search' functions # behaviour. For intance, there is no point to define a font or a context for # 'concord'. -options: +tools: check_dic: strict: False no_space_warning: False diff --git a/tests/01_test_tools.py b/tests/01_test_tools.py index 0dc2371..29e6115 100644 --- a/tests/01_test_tools.py +++ b/tests/01_test_tools.py @@ -22,7 +22,7 @@ class Arguments: self.__arguments["inf"] = "data/dictionary.inf" self.__arguments["alphabet"] = "data/Alphabet.txt" - self.__arguments["alphabet_sort"] = "data/Alphabet_sort.txt" + self.__arguments["alphabet-sorted"] = "data/Alphabet_sort.txt" self.__arguments["sentence"] = "data/Sentence.fst2" @@ -215,7 +215,7 @@ class TestUnitexTools(unittest.TestCase): kwargs = {} kwargs["duplicates"] = False kwargs["reverse"] = False - kwargs["sort_order"] = self._arguments["alphabet_sort"] + kwargs["sort_order"] = self._arguments["alphabet-sorted"] kwargs["line_info"] = self._arguments["stat_dic.n"] kwargs["thai"] = False kwargs["factorize_inflectional_codes"] = False @@ -285,7 +285,7 @@ class TestUnitexTools(unittest.TestCase): def test_10_concord(self): index = self._arguments["ind"] - alphabet = self._arguments["alphabet"] + alphabet = self._arguments["alphabet-sorted"] kwargs = {} kwargs["font"] = None diff --git a/tests/02_test_resources.py b/tests/02_test_resources.py index e2eea95..48b7451 100644 --- a/tests/02_test_resources.py +++ b/tests/02_test_resources.py @@ -3,6 +3,7 @@ import os, unittest +from unitex import UnitexConstants from unitex.resources import * from unitex.tools import compress @@ -48,14 +49,13 @@ class TestUnitexResources(unittest.TestCase): os.remove(self._arguments["inf"]) def test_01_load_dictionary(self): - args = [self._arguments["dic"]] - kwargs = {} + kwargs["output"] = None kwargs["flip"] = False kwargs["semitic"] = False - kwargs["version"] = "v2" + kwargs["version"] = 
UnitexConstants.DICTIONARY_VERSION_1 - ret = compress(*args, **kwargs) + ret = compress(self._arguments["dic"], **kwargs) path = self._arguments["bin"] diff --git a/tests/04_test_processor.py b/tests/04_test_processor.py new file mode 100644 index 0000000..a66a2c1 --- /dev/null +++ b/tests/04_test_processor.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os, shutil, unittest + +from unitex import UnitexConstants +from unitex.tools import compress, grf2fst2 +from unitex.processor import UnitexProcessor + + + +class Arguments: + + def __init__(self, language=None): + self.__arguments = {} + + self.__arguments["config"] = "data/unitex.yaml" + + self.__arguments["alphabet"] = "data/Alphabet.txt" + + self.__arguments["dic"] = "data/dictionary.dic" + self.__arguments["bin"] = "data/dictionary.bin" + self.__arguments["inf"] = "data/dictionary.inf" + + self.__arguments["grf"] = "data/grammar.grf" + self.__arguments["fst2"] = "data/grammar.fst2" + + self.__arguments["txt"] = "data/corpus.txt" + self.__arguments["tag"] = "data/corpus.tag" + self.__arguments["xml"] = "data/corpus.xml" + + def __getitem__(self, key): + if key not in self.__arguments: + raise KeyError("Argument '%s' not found ..." 
% key) + return self.__arguments[key] + + + +class TestUnitexIO(unittest.TestCase): + + @classmethod + def setUpClass(self): + self._arguments = Arguments() + + dictionary = self._arguments["dic"] + + kwargs = {} + kwargs["output"] = None + kwargs["flip"] = False + kwargs["semitic"] = False + kwargs["version"] = UnitexConstants.DICTIONARY_VERSION_1 + + ret = compress(dictionary, **kwargs) + + grammar = self._arguments["grf"] + alphabet = self._arguments["alphabet"] + + kwargs = {} + kwargs["loop_check"] = False + kwargs["char_by_char"] = False + kwargs["pkgdir"] = None + kwargs["no_empty_graph_warning"] = False + kwargs["tfst_check"] = False + kwargs["silent_grf_name"] = False + kwargs["named_repositories"] = None + kwargs["debug"] = False + kwargs["check_variables"] = False + + ret = grf2fst2(grammar, alphabet, **kwargs) + + @classmethod + def tearDownClass(self): + if os.path.exists(self._arguments["bin"]): + os.remove(self._arguments["bin"]) + + if os.path.exists(self._arguments["inf"]): + os.remove(self._arguments["inf"]) + + if os.path.exists(self._arguments["fst2"]): + os.remove(self._arguments["fst2"]) + + if os.path.exists(self._arguments["tag"]): + os.remove(self._arguments["tag"]) + + if os.path.exists(self._arguments["xml"]): + os.remove(self._arguments["xml"]) + + def test_01_processor_txt(self): + processor = UnitexProcessor(self._arguments["config"]) + processor.open(self._arguments["txt"], mode="srtlf", tagged=False) + + kwargs = {} + kwargs["xml"] = False + + ret = processor.tag(self._arguments["fst2"], self._arguments["tag"], **kwargs) + + processor.close(clean=True, free=True) + self.assertTrue(ret, "Tagging process failed (txt format)!") + + def test_02_processor_xml(self): + processor = UnitexProcessor(self._arguments["config"]) + processor.open(self._arguments["txt"], mode="srtlf", tagged=False) + + kwargs = {} + kwargs["xml"] = True + + ret = processor.tag(self._arguments["fst2"], self._arguments["tag"], **kwargs) + + 
processor.close(clean=True, free=True) + self.assertTrue(ret, "Tagging process failed (xml format)!") + + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/data/Replace.fst2 b/tests/data/Replace.fst2 new file mode 100644 index 0000000000000000000000000000000000000000..af5e70b0bd1bd67bee35481ceaf7aacb8ea2c214 GIT binary patch literal 1782 zcmb_d+iKfD5S_jj`VT3wNb?{R$%%Wx6iUcjLW=A6#?*FO2vt%`(9i9cBsphiXLl^S zkcSeaojp63b7n`*<HLyz|0eQEUP>vo9LP*oawoU4kZ-Vve*juagE!Ah87J#BS!c<b zM2nFfY_99LYL?IP8<{Uq<EL!!wDJq9RbmZR;KQ0$!_EzR&VIK(vugeGP`3jYD(!Z7 zP!YYUhwIz8zKiSoxYp>Vdeosm+9$r%G97hLq8M|lW!<Vnerh*ewXl+^XDwPhGfl%a z_n4lS(DNP2I_X2VbAOw_mzA2p*J@h6oQEvu$iv&}60asOSAAq?@tl$-uvEo-^YNV3 zhk41-ls2KqL^Hx2dy^6NpHD{E74jgGSu^SrCb-2+@pfC-KinjbgZ_W<e?IGRQ2u{d z7c#~7@G&SU@Ljavy^(v|UA47aMYV+-;l#(>!Ixi?#`s;1;aLkW_3^vxvw#mzR;XLa zU(~yWZ3sJEV5)lgldz5R)B23nc^mpN*M5IJ(frFT4ok%BV?o^4t0a4R+;Vzv<FPNo zGf|~W^sF23Hq3WKv7y3@dL|aX&DzjiPtEH2?&j^$0W$SD74a;2t3JPaE7FHwV$K}v z9pE19m$?YGvhb%wH1^Aj3e%<kF$$Sscb#@6IDL(NQ@O&)JWcylUJsDN=vl^fgo!(L X_M4Q6-%KSuA4QDc#Z9uwY#nw_pC8;i literal 0 HcmV?d00001 diff --git a/tests/data/Sentence.fst2 b/tests/data/Sentence.fst2 index 6e955340e44645ff673f76071baaf4a3cf697f70..e3641eba7c57da4d4b01e991ab603c1617536210 100644 GIT binary patch literal 18740 zcmc(nTW?)Q7035865pX*EI<$%-(tH&P$zBLlq3yF3Izm}<1~qj6Q}V>+7$7f_y`E` zz(s;*1P`cB2}tDyAt4a3e)FIIUNifg*x@D=9nYFIYu0_%ti64lfBgOA(EnQ*wuX-n z9~({#%foZS)#2^o?l3DqcjG53F9iO?uogcz3bZ~#rz*5uR+udZ_vZ#EHsiw{;MEEv z0hn>6et`K{5?t6jokPDh!K07qSeNu3Gv&QpBttW6b{}V-ZC)Pzuv{!b191f6)9Nmr zu(;O=zw{XR8#CpyJlM}u--th0?-d3P#^AGdzFfR*%*{%9qIl)ruru7R8E`ws!}2jF zENi{FT;d^#=nrJtmEs*RD+Moz-Gayxpq2+Z35P|3%vV|q=e<~lZ%u<&p#aC|4>~#4 zX)jc}VeLWu#MW6@YweXs&tfZl|2MXx9~NQ-o@e~`EX6Oo5!u7wVGB=`oPghn@uZ$8 z@{mg{u$Na$%v)ceNuP5_a7M`^V0;q5I0o!(PqE^E+__~}o?F!;a7Z~(;tjM~bY8Ar zLW)l7%4p$A(GpsnDIk0xI^o61I7!AQtp%<1%Z0o6rFZAGqs}H*e1oLaXr=g6ICPRW zI&;WNjZ@oa(cN*uYqG|10Rv|w9i@4$rPkiJQv4@f`PuT4v^LtRGSFnyj*bpIR!Us0 
z#XRO;=L)Z^1b9*BW3};vDvoqj?m@@NARCNlWWHLm0Nqwgj!8>=-<}+H!==q1;~b}1 ztfL<zT-s<&=c|R2@=7_9k%YECS3V!A!pK8&<&`sBXG>c4Cr)|Z(ybO-WPSQUb9kcx zGilcDly2;CR24&Llo>5G`#sO7fmPJj8ZK!Ke#J(l0OLpT&pQXH)VXPl(wc<WtB>t+ zlvaxjtv#K})aa62iJkT&ZZVIv9IFn8z2Vld-`&s7B*k|=ACQ3V_IYcEssj{!Oks}X ztm+exS62!xauSpGKyt+ME5*n7+O+142gKWzVpcHdVcuywyza!EJq!D94%?+yy<Kkc zDPO!ud~M}g$yR1-MQ=e+3F4<9`42kZfvR%TEWsCOX(_Ge;B=g?p!ccQxs#XDBk~-4 zfRr~twpH}g-Y8eugC<~R%|2BEamUBU$#Lb7kI-9HFeM4c&%`|o=hg!6v_DmajI|DU zOqti;x#*@-&JL_qRz0h2jzs1facIxkTf@ED9UnvvX1C}}Zu3JL+V?sOtm7fD<ee=O z?a|p{8M5A-%%NSi^^9qYZUVa9y+-qmq6geIidKU7Daf_)RP9|jk@%C<Qzdr=5otkK zAxIek9kvRm7=dFhoMo(!GS){K>!XbI*rOfm<y^6My?9fQWvq{Otd0EEMt*A}zqOH{ zeYjrqLBsX&+&|5i$h=sp(BW`4#ygQ22jN$qn6`(9G496t&hX9{V{%$LB7vFwBai3= z`{&7`xjcEY=q}omMSH{Opr~#Yda`w^P!pl8LQA%66-x!dQxLk~(XC=N_^o0!+&4!# zn}r72&7!X$B%}A{=%<a?t>|nN4WQU4Tm(6G8)NJ?#@O}#&w9m~*=&5aStE;N@7b*U z&?|AuP%N3^u+zA$CIYHLcxKV6ar3<b{#X8Dq3OUXu02+^sVjkY*6mS670qV#CtQ@5 z!YebUK9cWSzich=71<?S@sD*~#1T40Ojew%UiM5G=o!AZ)(OMEtv*oay?u3f6tnAb zX68E>xQn;yyYk!}X=?78hfK~B8UyQzN>Jj6JUH7u-OW0mmJ={JAzvtN(5Nqvo~$1y z3(rH{<`)(^4zqw$FZCM8VIO;d7p)toT0#~WGu_!n0&QKpt_~04K6qH}gqNafU5L*t z{_zf|uV46Ikw7nJ4RF?3V4XeT(OrA8+Cn4SM1x;UgJy=60isE{^1SA&W6j~@AXmKf z{AHNf8CLAzsgB)v<Qhh<VdO|B^&W!t9B&x8hMfzX24@~Ha!mto*_LmOpaExm+M5-J zh`PoC+s}@;^SRFzXV2nQohf!~&hhIpv!Fpzn@3o3soZ_nOTJ`3)Sp%MDnsQ1<Y6iN zSQE7|^PLiMY~_5Xq}_Pi{8H9bV<aDy;dwi8?vSfUWS0b9QDSYQt_yY_L)FSiWxoVI zPsFKORoHu~IT5ECx!E~ElXA0r0-tiTgTgD$5a5)XeH3^+A{(=p0<TE1mi?4#is*e? zLF3y(aPmCa*m82@>3f%s!#4T8%Nrlat66!n?qnr!+^5kdC${5`xj$TvEZr+liqJU2 z@EdXBj4>L<Z-()kVfY!wM#JQZVf<zo?U}<LpBYB3{F{9nXWMdtEf?5wfh`wU`Kf5w zlgb74bm#${?WAUa6JF<AIifC4IiijVnb4!B;;CpzZ}okWug*_+^?ruoXBd8hbv6{) zJP#Qc<g(gh*xO>zyPfNP?J?-xW6+|<prbK<zMSVhkHLrDhqkKI#-y)|p;izF*Lk9* zo--*n$1E{U6uKvjBP!0I7Keva`M}O%maALE_tu#;xDY?*0{VEJ^U>>TPO`N1tFlG6 zT~F0|7H-+?`23-7>sfSi&X||=bo|@<2iocRz8L3xe>hi8yLsS0<+CE7x0dRKSyLW+ zO<C%h^3*e(HP0B$#qajDCcE$_@kTaw0wc>Xl2`+e9K%S#Hn8v$%t=ld9>y~g6S4&! 
zh8e|Er^9f{rJ}pLRc&DpK8IF*QQopgUX({fmpS(T@bQ1cPp+xhllh_wgbxyjUy#^i z@J5Rv7VTWOq%nEZ<6C3XqdlgVpHHa&NnR|n*LP~KJGPPusi*5>TPM)HuYG9e%u9Rp z9MxAm?d?67&s*%BHPSh0ZRR4EdCCCO`W0`ZYwO3Bi+4KTP1n{K?B6Lp#9PrnTpy+J zWSD1APao_@C;SNPeo1FVJ(A9pEJ1tS$oL?;9qT1*Z{gJUsz2?HXUWn&D|t4MUdMY+ zm7K0H^<sv-eAJZzStp`kk0j0=V^1khS&Lc21z$+Ec?i?yC%(r&(_X-xqWQM*g`T*+ zRikX~vamhx%6_DphHQgETDLV*c{k0@)>ioR=S}wlylpolGNA2w^|-W0CsB8z@^!ns zWob2*w&ILOumCAYXzfL+t+ht2Bxo-m`8J&0oD;^LGjgXE<-MhwCw1;mu+I2#zN5$R zGcPf=9zH+zEt+x1+VA4t>h?4)s%2Qh-qthNdpW*QaxFe-MLtteAMF!he8UG=J%u8o zToz28izeS-qboUO-!6(0mRyQ=Id|h{>XMx%RScPA1+^EM=mwniO>c6IN2X!+X|CsG z!^q_WA85@VUGW)s<ccPr3FwxPEB>k+rh&Kk=y};R;>R96FOwU<S{E$e2}Z8;)bq0V z>v>uH^}H;;s%@sxI(+oJj0~N8B&er$Jk&?4lC*V-3Jo9UVoR6V*kc#*?4Og&wXf}o z#TsV-H5m`M%K6;_TCs*SulYHjH9w^TJ8uOmZ-FhNr8Uk^72=)PR-44Ol8yBVX(hpx zd2*dqJ=#-={g;|kz_$~mHCd?cnrH$OEusTVHZcdrXUu_-Xc*qCfhSW~1BM@K!1&bs zC09H2?m@h{V~~|IB;+h(24&jJ3^}^_n4v|^kSontQ*@zVCS9Q58Adh7GwGrgX(Cv9 z$Ra&A*gn6NfGhl@hiBre75f7!)>V0wHOQqhF(Mb3(J=D)fX54bfbjtzVA&_wa*2rS z%{+BSs7U)g3X<HB0XzSk)h&*l@_vi;`Dk%OQr@3p70VXu-+Z+=be*Rp7Z+h$t$N9= zm9{KT5fWSJi!5pVu*grnS6gn2s9Sv&t;0<@hi;4X!3RmvdxYH?pI)=0B+qKx+bV*e zr@y;ep8TeS%0k8wY2eIs^=-}o=>(nbLgDkB7%`Jg(6sM}m8+t+XP~j2f_3&dSw6?M zHY+*aY)^!3_U(heEyw)GnARug*uPchU90no_bavnSec4v^er5?1DiW=dQTm;X{_mL zP^(h0S5IG}(bJdVl<RjsqPOSSn{xGiHSc9KoO1o12R>98)6mZ(oO0<C5}$JQ>?;15 zEp&9Y9oRZRVOhYI#ZA@1mJ8l;fh`x<_5zzfux-q2Ngl+~nbAfb*?Qx)A8*_aBI@}& zAl??4C*IF^<!>gyTYq5d4{ZH`tv|5!2ez+(Z3}S9&EHLuSN0*W<vLGw+raT@*0P(w zi{Sm@&hUPd;eNMER%9;=oD%a~gHKEIT!T+b^LIbs(^B6cc_(p;=N;%95M8D;*sM%K zzAfk3i0JSUFSHgN`q&>Wo%haD^(&S`_O$!w_hF%_+Z;nH{Z+!X;(PPAXFccSlLPR{ znyWs|QKhOKM<*|&)$q4IOT+2m<>AZ4Bf#Gs;V)L)SL*nQI(})SW#99|_v7~X!HByY z{pi_|$7d`4b9MZ79ltZuzY))smqUXq<DS!zi|1p0HJ&z8uZw|uvzcES&Wy7!RBkVn z@x}1SRI7`_>kYThRQeNjT&m;qb^KHvPlsk#I`3SJxAU)!a!*I4I3M;(%NOIfaCjCk zetN{o>r2CX<)7EhcV18NpU2DkDIPgJj_*21%7c1iEtT)1ShD=q@3~laT#R=gqI+qC zE{2W5@n+;8@+N13E(DfuT&PYiMf7h+ExF%(3udXjJK(n%dy)6|BImX9)$$a7Kd_pA 
zy^e3z@wHgr8LfCZEZZwP?v5PJlnTwaW_RM-Y36%jcvA4<dZz3-7(N<3^Gay<B))0P zZ&3MGi0ICTq`QHG4}UEt?DlXYXe3vdv$0p-@IfDS1z5cw_&eR+OR>Hi9QQ*zd*_Yd zc0J~~bGvYRzgUVa>vOKi*{f9<AEJeHxE%Vy7l}_staLaC4}Tcz$!WV-fG55GYV3X# zp9jMSqn&))Z$JJWRE**X27la&pU#e}q3bO6-XE>M8M^I<F4E33+q@fVc;GN}6-A4& zPAzWHPQHhz-SAONrgKxAsg&OuHA?M&*WtW_oaGyj*5<|tU90qX#+gd>rZX`0cBKJJ zR!FZ`>UbEwg0uaHr-{FFOxeplj8C<9zm8aqw9IaO6O;JM2a70e)jX|N&Q|_kt>f3~ zc%_c%5ATruo}~wj!Afh-x392=nA=CUVio_~s1n|&<0tD#mdSSAje^fLtMK9*c4VYh z$+TNVt}(NaA&Q;Ai)t3Ro{@bryc;W2BJV*n&O%~{5T`-=`HKv6-wm4{2jqR6ZL$EF zMALZc!?YHlv%iHVuCh`!O_;+X^Fh$5^Az@Bk-ir{Tg&HlavbKh@=C9lv8`stwYnKM z)z0{-e#Y}H^Y`QPIAUhc?8f|V^?!>eBGeQzy2XRxtiY?qlL@jj&nZ=PH~#IEYC|qQ zDLmyxt=jX%@J`{G(IkHN3-)f|VV%z$$ycl5tKS_=`CfYp-yhZN`f(k9QpcC-_;MY8 zTF0N&@#l5?MIB$Q<M}%NvW~y1<7;)iP{&Jke7%mB>v*+}zpmqN>iF9_exr`xtmE$3 z3-Gt5J8^pCg}28O{$zyg%UhL(*tnC}tvJ^OS5NbMx30e%c_Rtr&|yS6s||lUqcu(m zCsH$N(#`m1%C{pU`FjVg-iQo%IPOb`9z_&bCWu`JBQ?EM?o|~M9rvK?M)KkQyC0QG zG<RZ#{+it^wL&oZCSTnf_eOrYGtk4n(i!09)c&NU+E@Z7LhJZ2ZDQsq0K5JYOZzC} zx)au`&WV?(9)~5G^EU$R`gY-x>wBZtdl93Y(XWUD)xnXYtF-=(j3pt(I{5i~smQ(? 
zzI3iZrMdU1ir_8Nv=~!yeu?3e*iE076=Knfcb$p;OgdeuZ{(Na*-e<$QCrR3Cp`=s z9T_rHtK75nTbl7(H$nd0+fqFFI6CJ>h;GeTVC^|O@~3+iPBG%P&3=&T$j!V{`aIE* zCp*zKCl&FzA5kGk%?)pIcX|&n9WFoZWn_`5`yE#IFf}(R)j77kE6)tKLXW<)MkgfT zDNCUv0tZIl{+3R3s@F#<R=CUQXBYXy@$5ZfZ0nQ6wO&cA{F3wDGl_o`e&J-R_cwm( zGvj-&8Qc2JxT)ui?|o;iz2tf8C*zbk#N%<}cmCqxgAxhn0CB-n)F9!2+l+Xp#!9Qy z!&#iSzMth;6T8T|ws)rPw3x%_?x^@}f1uve(wWiQ+YU6Qk8}p)+!Sb!75sGveNWrL z%&&OY?XZE)iT(|lw!O!idcXX=$i0w+w;nY+7OmtiqQHsB4CZ!8TvBqAaJK9RGQt>B zaTa(IbVbd%Db&kz0?ATaHN~3xc5u_PPEVn~B(q13qA{mwwi&<Z|F>8gc$c6IGo5qH z^J$hDKg~1aiKH{lxIWj6b++Z)y*kezuIoP>^Fdztu+Fd7@kt#Y*YWpt{6igmQ-4(F zf2^ba9&c&*UY+~-{ZDoN=Q{d!a5lJR|D~?~wT^$Q<9l_q{*UV1Zw!1Jw$Ca(&i+U& zU66a1Dc02V2Hd|b-ySzl+44<VVNIR#vYPZbQ~C{%-qJYZr&v?dMSc*u(w>eUXUc0) vAAH|a?-J$>yt3k2<Z;5#xC_z_&qU`L8JLv$E`KR1sNd@^4WBPElt=#r);x(7 literal 19797 zcmeI4U5{i(Qiks>`9Bn}79d!*tE#(aX0c#ry!P70_747l1q4z}xn{bS>Z+dV>aoWN z2_Y`{kl-4@1==eQNQMi9gy4$)V?56rajM%hhV`ylB*a)g`DWxtWMpJSWac^5b8yiA zjbD7>>*LXTi}h}?o-bZ};hUrBjf2t6>1cX<tKjD7<^<|S!EwP+!C}F)U{Wv+Ob$jj zjz`n+tr1<P<C~)!@EqS5-GJHncywb792Fdn7;-wEju>(}o{VTb9gnLYve5DXSp;Me zkVQZi`W+yP02jI+ObaHX8*n)o7a+@x(d5>x0%SRcLKXp8jsRp4;BpASMVKhW<W^*v z+=9z=G`R_v2>_Qd02cu+2f&eYnA|*c4imI^EEE?+KDOV%=or?>dQ3lrJVpql3w2xo z7i2+#09@$DnCS->gkUfW8O(l+5`c?^aA5@crVuVRfeQms9||rO!i9m3;6lG6xX|wi zE{t#l7YmEaQE@qf3)4J;ivSk^F2EsN1h@!r5#S;mjTmHdcvvtEOmB`Dax%S9a6Dq< z$&{7?Ed^Q%lY()<L11#L04_{yLU*Pzp`SoMgrCq)pdS;%A_XimKtCx)XSP^mfYIqN zhKo>KFvU;@Bjz^26azO4ZWJ6B92FcEObaHXLzrQT0k~k{m?B)px58zNDTWG{F{X%S z$Cx6T9b<!Nc8m?CNrV_<gHqTadLCnZ=y{Csq31DHhn~k+odaQRhtts*bCbf{0*og` zKY>ng!Q2A2=6@^rA3+4Tki*;#5eaj%P`^ep%qmoI!Gb~+7c3}LalsO#u!KNyVSPgt z7Zxv6abZb96&I9(b&OGpfKmc@xfHSpL>%F$;IM$eV<CY@C@#!HiU|bZf;gCB04`E+ z5#S=gMSzO{7X+We1;7*oaFK$G02cu+0$i9Ph8ZxTLX|8Srb1wtftnE$XXG)4$;e|u zWuO#-MIy%-Q-tl9sEDDDiHZR_pl+g~sv{}}7#8JYgNTalO>T}bJp{$_kdqi_isO)` z2LTYlu|0ZYdJIGKgFS}F6bO3e=|!y9iQFk@iOn$=0v%Jq@JO*cLq8fLMV^FSQ^X@> zEK>RsJ{c?9f~iJkBsHZkDSb)N87X~9=}Vfvni+X5)Nn(pjgh9W7DhgOwJ-ANt96l2 
zUmcD}Kwn%Vef2N=BYpKQ^7O^U=u1jpQu>n8mz2J3DOvDd=qv^bgbxZtz3LU#Y#a81 zF;R&EoPC7jj#yb9^hi1$xgWF39bRQn`_VuhCqv1NA#;<=4%0etNv9|wz+y|Wl0uey ziXuX>uog}WCI#by;(|JCf<6Mp1&v@9Q&bXwixgZ0xCn3&-~!-i0k}xTs;2lrsN#aT zhbk@@c&Orn^-5{ILUGZ4rHYI8Dpg#t(ok?=Rj^R4Tgfojl5E20*<5~%9S7Ev8vV;} zE{W?bE~`a%gLWWNk6GeSk$Q{=h03Ij&r&!`e6yP1E-9>*IE;08jD@&NfPpMzAOS8y z4TRsMNpZn*Le+Sh4P0=ZK#8l_SXi>)K%t5YE+mCbI|93=6sI#O;!vm(p3@nHgdWrI zAjFU%teB=Pp@bfU5>njKG_%u?62|D#x{pk??iHc%C|H%(5ZcU4v~6OYLJDvcU<PR0 z0s;w4K!B|tiZnvx9TTKNg^Sx_sBnqejTYhJwrHJd7mAB&mntsAt5C&7Z>3JP3&llm zl`1Z(U8=a$jK(#isGU014ybmaIuQOtrA~#zJY%bH4$~=D)f|Q*OzA~c8%od~eM+G@ zCL9<bT~!fV4ic(@F1qDBqTxHJIMRkd(U>O^PJ_+QP#(JZ8O}q~Ry3pso1bAlbn`Q` zhi-m`_t4D`^9+rmf@rF+`?*!x9vc<WR-k&LFDbV<p$gqJpxiY8Hvk}7BUizD!gYpq zval3L&nQronJ70=cA|{HC{T35q7x7mTvTvTNyX;pZWy}xxg$zPfnxKE0;QW@6e!*N zqCjaL`$vIdj}rx2*K$OG(p4V?Vp^8u;1H;~C=kt*gi#>J)snN#&ZA2)8#FqN8q<f6 zZR6Sru~8e_ES0q_W~p*p)hC`}TA{&Hm4_(ej)y4QaH2-dpKr|$-Bn`_US>13oPOAQ zt`(#of)y{VVa)nlS_-Uu%gf}@WV+M@EU0PNOW~}X{$JW^Jt?x}0^!6>l7O?{E5Fj# zhp8BYg^Z9GEQPBA&u)Dd5)1``p+GDXii<H=DsqmE!BAgi>!Z6d7%&D4H7uqXDs={6 zMof{Bn2nA|Oq#Y|y!CPOi*(O!dQTZo*<!^Rd^&MP)aOLqNh}d7&Oj&5h{grcI3w}| zbq{IHRBV2UGt$j3aRxeZMr?kGGtxZvPn;2(U*e2(%a=F<oj3#9Cr`_fIAdAK(axAS zgTEW7fvQWK;h0wvCT4JsH*{Rn748~c1?d;m0WGIzP*;XlS3&YYbQM%y>ya_Y+=9(d zU4?Fb>PlK&1)HC`3f=tFRp{oYu0l6IbtSE?MD-Nxe(EZ8)vGH=bp@y^q3YBXl)4g1 zKy?L0m^1t*l6iG=nE%)2l{@;EG&1%S?)6XkY8|SswgT?WpKrmHB?VqVD>LTKJ>+`h z$a{sMtoI9!C%xZYWqcViG`nKgM?9FcG+vBt7Bl9`)m?a>FyvYSgrn(~c%#;ro}v3* zZ0>p~!ya2V?svWQW?$xZW{b+V>A7F?_GMq*TV%DVIkf+$Nn2?09i%E>7HqzkFvU5? z^r#)9Qa%`Zv*<umRmG{fTswX2gA-0=I4a199iXmChPIzKC)*A&?Rr~04~Ex=dOEH? 
z9Uq?&!B;_S)9lT@4Wk+_ces?Ztd`eGJ^vQ){9C~DZvh-ko_`Cdvk=D(4%|cyPeL9J zJTy{B<bM7wpkDMp{}#Y`^7*%bAK_a-J_GRjldYX^E2R0hf(rYP#CGi2ZkJ8k?RtNh z%h~Yx3x6)q(=YMD|9|df1jp9@@?`{d&%TVXkhcN?H+|tJ`!a$NuDy(ay7n?cikA@x zz)g^J|1#o7c@O{W%LoqR9KJnAayuCFN?l7G4(p7`yY^JNd3e>=&^^X_rRKJ2o$tSF zM=clKr%U>7;&zkcIbjC06*sJb{gv0iij#4MBv&^GKf3x_0_YJPe7ODi7AWn#WpiWq zn?a#sO1t5Kh*&5qX?H*ozF^+Kcw6x15R{*^{6zVQ@-r_#QGTNQMEQwaGlI>}H!z`_ zU-GtVe!f+(EK?PmpEre+H$UINgl>MBFYU9Vi(T$@-*(=Rg5A$IFwn7Qprr8zMxnPE zp$fCm`I2K{4i4NrxES#k6}y3d;$O7SKzN7!`nu?76#yyRUs3rP$6#Fy<t2-OudYV@ zvPX_rZ?fpWp9^C;zyG#l6j4TW{o04|xJKcPPLYEd4bo82+3O-?*uHQ(A8l@j#Uy(* zE!4%$EQ~b{3`J)mYmxjgq15t;up6d9dyBvJ7#c7)Ue%<wJ%`U{amXFcm!GKkp>ZhD z#`?<dJ&@mqI1ZBW**-be3Hzul_QW^+2$Iz^$CaLS<W~x&*}=T7^8<i%VHOMqJ48%B zr-xFK?>CLf_nK527<3zD!le4nil3mVw5c(!3cAI$I<wdAv^YN!lI(ZaYq^RiE(>k! zIQ2IbgqdrfkQ04$ZKIIW5OvugW)-AN+()Y}_i}Ezt^*Q+12u>iOrh%xL~aCW{gZ3^ zAveTQeq&MkR6lHkgk=P;DR%#M7)HQ^Q-%nbg4skW-Ph>^5i4kO%UQRe?4gZRLF5iH z?I42&nOd;<8L2`yKb*hl=4Yg`9*-$(enzU4H$Nj)=;mjnl2*OJF4xVZja0$zr@Emn z5~{cAhB8tKRcNGwat#0@6%b#AYrP2booTy={PH2l7hQYh_GV*WcLm_e^ZRou<@=9? 
z{x&SDq4aq)5c6~mA|co>h%yY_do{@oi^+g2*ib*%_XK5F#y5ruTvCml49-lU@^Xsg zOPD|ONczQYtl83!pfc$XX3k?D%O-pu!|5lzxyB}=06&9fT;e(oZg!g3Ce8oM({3#> zwva;tvq2=Ry*fucyE01aKfRnC(mx}21z1*<wfv%5ziwA;F)a^{Yzb=5_Hm_Kfe9i{ zZ(S8|;e$K*%R&!{h3ZV&$%_0INW`5AIW2|ehgCsGvR<u-%@21fy7}QwMK?d(DKzd> z*!*y(qMIM?RCM#pnlcV!rJOYiVlIW<4}&ba>hTOojU2$80#(PNN#Ra`5)gNiV&fN7 zH9v&AvWy|V`5Vou^c8bf$E~dQvO-1)%`id62{MAHW`k%V$f!Yv4mLkE8@l;<j0@fT zI8udHvzDvbVDnS6p_`wY4c+|IthBQZcDb6B#y1PQ9}Xt1W`*jlW})2wgep|CP#CwM zW`Wi$Ic+97<S^^+VLi+i+E3aZpy%zNGm}<QQ3(k=(L8>lO}iOt(?R)3%TJUa7|o0F z6Xhq$Pjtp&^V6n7H$QDUbn}bmX&=pt%`cjlZhq0cbo0x6p`&@R%cFVdXkKhNqIo{a z78%XwXA24w%>yN3G|#UWO2BBoB9vds2=`HYMbl(yakXDe2pVP4!<QX3hOlYs+$+Mk z?+<6Win{3qqqN;IQJko}!-XfJD_wgP>(6m3EWhIV%j1h<wRWS&K7ar4{QX0I;(%eI z2^|h6`d@$lfaCR97_XRc_Rtyk=j6%o5mR^IdXjJ$#CL*rqTFy8q<av*3KG7+gu@_l zB3S&qm=#_85)Q3590rSD!eQv<XE+Sq{0xVoo1ft@bjj!2iqM8b%MFLY=4Utz-TZpK zzH&*q;m~@+p=dY^c0a>m=&CmyLK+SMHzJ|x42MvLL!ksT96}{*B<@|m<+DcqO~=(` z-(^C8FE~x@SPB{&O*>n0lOF!bNGkg*#uVMAnPK3|{#A-g=^GO^wm`VJj-kTERg_{8 z1Nqw)moQZS*u1!~ou#n2aEu65T*ykXbb;c+!i6d>EL*7JQZsVREM&<7#hL|^V*3hI z2X+W37RxlfSSs?Ylxa#>C-N*4=@0!a`-J^#zbq5~<RRzl>_MIgSmvAvf|x@PdkA6> zK^8^S1%jAt(B-q7l@B&QPXwWxUtX!%uK6K%>U9Cz=>oy#=ZPS6^YcUyy7}n>(z-yf z%XI;1PXxj4=ZOGX7Z9qqE&!zq2vw*HK<NSi{|l&3h{}B5(p$XT`o70Cj~fn7*q_@u z_STgokdBt%FiO@3WpZ`Ic-D7t3jdbqYN`D3v)=eT+La!DQqj)y_V5CL^{GguX>^1l zS9k2aOA+;kW!kYkypPA2G0*@9e<^HE3xEC_b>6o`Gq@QVDZSuX<n<A)o*xmpmtEZ@ zhc6F0P9FmFSk}a4;n8COU6r2gcm@V29T%`KP%S-FK;<PwRp`iGUrmc?wkivFECK8c z*cmr&Z*~|chuQmfz0{}%t~xJTk~#{1_K>uA_MH7w2HB&Il<?iz*PL#rc?o#k)Z8tv zn)fxO-V&A-e(uOre~ILvaEw*{&;fotmc5sUJL9-|I0Lex3=7?%c}K?vZHkYOx&+=i zO)&wqh<}jsEsK<+Cn;arNYj@+#eDj*pO{Zy_7d{!B&O-her!H{*^SMouf9n>eLdRd z=tRn{VVag4n#|jh-GU^BB6Ic$5=o6Ww(I`DI@^7i<b<KJod(f*P~BVZB8wQ2=yn=3 zMo42$K~_G9;RIcD>zkk3Y3SzXb}H?58f<=^C_^_tPn4mXpL<H^=I3@Q?TJ!!I}LU} zPn4mn-t839?G$i36{^nd6iQPSN<g<#D7RB!FqV$7GGlx@7;ASiAI|zR+P_X>hJ&Wv z{MB?J-WOUgx4dpIYC1HUF68)QJ(?~gc!y>K4>mtEU3Bxq)QWC?$PpUJ3Y#CI7v21r 
zX3@<L;|uLrdAb};hsFpCyB~%qjiv+D8%;~0=|B~tX(==<plJb33xmr1Gw~Q_kcA4O z;2^3FvPh!F8N`@^va_7k3t|*O7u53Rr*VdEe&j=IoR)jm3pPK`dZC+N48(TL&$FKO znU2`y8mHx+^@1&jXFX_*Q>fk=CzQr1RH4QRrEvlpC!lcxs?4BfXn#5$ceth2gIn&V zPX$`cCDwf{h<`z+!N}R+Si>uiK!abRH*<(L+^6wNFTeULroU(UOFX~I^HV&(%tPHb z{@}eoG<Wx%S8VZfJU`F#J3PP3^YRCG@4otOIK1}8yYIc9+}m$_FZj+K;TL(n`Qi)T z8V#)8{%)H6EahM4d5Py6JU_$p^4)h!oVRbk*{t3HzeWF-UjCXnMgHickA}8;p0wZR z8GgurZOAQlbh%iQtoRo$UV0hXy3N~=Z*NJyZSuC{-R<UnfaI;s#qu0{<z%*9EzI3L zS+166XR>*HvEDAh-{AQk&zp;xL%h0MF4k)s+}W&`_h<g28ZW(d`DA0|oyB@{`I+K; zXM6eN!DhL&_Ko#=Gv{C6Abb16E#1kya`}W2cgy5gmrqWs{+;D~vzjT%2itqI)ZSh# zKAx>V45!!VwfvacyGt40-EHi8`9$IsvYWHZAMDcd_I5dol<#dGZ!SKe{`T2swc6P5 zWVTu@)(;ly<h|MMVyh7Mm#fnTho2+G`Q?+VoI`ZswfZQHcFc<U%2<7e=eK$8XOPRk ztE@FAv(u>eLp!|Bv)ZWTv)TIGlG{A9_0FR2@SI<qg~Qoq8U3N}-Qq&QS3HaJv(o>g z#eBCxhO0t{+Luba!t-l9zs~b6Pvo$I@r4Lr<~-~lEq9CaNc3Qf_9_2>=cjqHd}lLd zcrE1l;`Ck&@BZ?^MU=mrQ82rpcK0}wy?pZUVwv27?S?5)bGF$i?R>UnN|&Fe-+Z$^ z-!XgZS?Rr1tQNakRffg<s6F4TE_Tb<+@l#wT>@M&b0T|g5_@(M#m`(ZG&gWGKd?;s z^5f;@r>71&U#^)(6Av$DyX`>iwkUBR&KHcF<-IqX&u6RI1->xT`k_hBwu@8RmZlcl z)8%>=_1$@@H8id7UTgjSwbqw^MCIiloBRpMw|QQb_*0WVGx>9qzaV*?=M9O!H2Eu% zH+kOTd57n_Ja>8C<GK86lfN<fTaw@8`7NHg5o3NbI|CNl!sQcFb@G6R>7@Z#i(}5< zLm!jn`h2drJld#)*}Ze&{Nnz73l|@s2_G&WS(oI@apor^XWNY>7wgnwUl7Zcc9S-% z+18TN*}0m!kG%lr7io;=oY@e+*=`hzMVY6k$zo@Znc0UAGt|R}&f(!hW5FyUWxBIR z*kcu4eyTm(?t<HA+bURQ&i4YFNWQegTN_7EdV3{w;^N7!D`3smW-QWl=COg!S$N7s zFQ2Su6+^F4R^D;N-gWu-2aaBv^2~Ug{-e0n!XUZYtX0olHTk~Doj9$@D<_qjRSB<V zj{YM_37YqC`PqZm{K@R{vy{!3+xbPNHeYS@?fK?nd#@axe_IKMj796}i*2%m)N<R^ z^1-YF{F>;s3BMiP*NaC9KD)$odtx=5uNa9Re6th^MvLsu=B#Sh`7AN_~tEF3p z`9U`GgK*|cI&*_~=373q3MH0Q<_ydaE<Y_5Z9Yl(nQu4qlkC7JOwCz*A3=cA*)IEn z>fPMGznm{ClF!a6bN=|8xR<~}s8#>F#p#*D+}oT+9YG7`C)>^G=6q98Xir#ma{1(J zwj)?y&s(v2e0p}WnLkd=6_Kjh`a$V=Rk;){A8j*@RWcTBXOA<*papCSGs0CZRb?$c zA;@pLZj(LZsS+FQbtlnEGyA$)Go#wXz1}1S#hE{g=FAPMGq3J~wdZK}|M5>G+WX}X zNbZ?=V)CiU-<$jciSB;+naMwre4pp{c|0Io{<F!ykhqSnjOKp%uO|Oy^6w-b(;VzE 
zkH-rgv^J#9DO9qvtr23#T6}NcwsZT!YMWBBdls>ra9Bh~o~Ae*xe!CvwjVA(>C(<w l_&!@7fH4MgMtHwciVRX#iIj#ccx~qN?kD|)7hgbK{|N<x_q6~3 diff --git a/tests/data/unitex.yaml b/tests/data/unitex.yaml new file mode 100644 index 0000000..dd9e7ae --- /dev/null +++ b/tests/data/unitex.yaml @@ -0,0 +1,119 @@ +global: + debug: 1 + verbose: 2 + log: null + + persistence: True + virtualization: True + +resources: + language: "fr" + + alphabet: "data/Alphabet.txt" + alphabet-sorted: "data/Alphabet_sort.txt" + sentence: "data/Sentence.fst2" + replace: "data/Replace.fst2" + + dictionaries: + - "data/dictionary.bin" + +# The 'tools' section can contain any of the argument used by the unitex tools +# functions. Note that, if you use the 'Processor' high-level class some argument +# could be overriden to fit the 'tag', 'extract' and 'search' functions +# behaviour. For intance, there is no point to define a font or a context for +# 'concord'. +tools: + check_dic: + strict: False + no_space_warning: False + + compress: + output: null + flip: False + semitic: False + version: "v2" + + concord: + font: null + fontsize: null + only_ambiguous: False + only_matches: False + left: "0" + right: "0" + sort: "TO" + format: "text" + script: null + offsets: null + unxmlize: null + directory: null + thai: False + + dico: + morpho: null + korean: False + semitic: False + arabic_rules: null + raw: null + + extract: + non_matching_sentences: False + + fst2txt: + start_on_space: False + word_by_word: False + merge: True + + grf2fst2: + loop_check: False + char_by_char: False + pkgdir: null + no_empty_graph_warning: False + tfst_check: False + silent_grf_name: True + named_repository: null + debug: False + check_variables: True + + locate: + start_on_space: False + char_by_char: False + morpho: null + korean: False + arabic_rules: null + sntdir: null + negation_operator: "tilde" + number_of_matches: null + stop_token_count: null + match_mode: "longest" + output_mode: "merge" + protect_dic_chars: True + variable: 
null + ambiguous_outputs: True + variable_error: "ignore" + + normalize: + no_carriage_return: False + input_offsets: null + output_offsets: null + no_separator_normalization: False + replacement_rules: null + + sort_txt: + duplicates: False + revers: False + sort_order: null + line_info: null + thai: False + factorize_inflectional_codes: False + + tokenize: + char_by_char: False + tokens: null + input_offsets: null + output_offsets: null + + txt2fst: + clean: False + normalization_grammar: null + tagset: null + korean: False diff --git a/unitex/config.py b/unitex/config.py index d6bdcc8..d4112ec 100644 --- a/unitex/config.py +++ b/unitex/config.py @@ -12,25 +12,12 @@ _LOGGER = logging.getLogger(__name__) -class Options(object): +class Options(dict): def __init__(self, options=None): - self.__options = {} - if options is not None: self.load(options) - def __contains__(self, key): - return key in self.__options - - def __getitem__(self, key): - if key not in self.__options: - raise UnitexException("Key '%s' not found!" % key) - return self.__options[key] - - def __setitem__(self, key, value): - self.__options[key] = value - def load(self, options): raise NotImplementedError @@ -359,7 +346,7 @@ class LocateOptions(Options): raise UnitexException("[LOCATE] Wrong value for the 'korean' option. Boolean required.") self["korean"] = korean - arabic_rules = options.get("arabic_rules", False) + arabic_rules = options.get("arabic_rules", None) if arabic_rules is not None: if isinstance(arabic_rules, str) is False: raise UnitexException("[LOCATE] Wrong value for the 'arabic_rules' option. 
String required.") @@ -405,8 +392,8 @@ class LocateOptions(Options): output_mode = options.get("output_mode", UnitexConstants.OUTPUT_MODE_IGNORE) if output_mode not in (UnitexConstants.OUTPUT_MODE_IGNORE, - UnitexConstants.OUTPUT_MODE_MERGE, - UnitexConstants.OUTPUT_MODE_RELACE): + UnitexConstants.OUTPUT_MODE_MERGE, + UnitexConstants.OUTPUT_MODE_RELACE): raise UnitexException("[LOCATE] Wrong value for the 'output_mode' option. UnitexConstants.OUTPUT_MODE_X required.") self["output_mode"] = output_mode @@ -688,11 +675,6 @@ class UnitexConfig(Options): raise UnitexException("Wrong value for the 'log' global option. String required.") self["log"] = log - tempdir = options.get("tempdir", tempfile.gettempdir()) - if not exists(tempdir): - raise UnitexException("Temporary directory '%s' doesn't exist." % tempdir) - self["tempdir"] = tempdir - persistence = options.get("persistence", False) if isinstance(persistence, bool) is False: raise UnitexException("Wrong value for the 'persistence' global option. 
Boolean required.") @@ -705,17 +687,18 @@ class UnitexConfig(Options): self["resources"] = ResourcesOptions(settings.get("resources", {})) - options = settings.get("options", {}) - - self["check_dic"] = CheckDicOptions(options.get("check_dic", {})) - self["compress"] = CheckDicOptions(options.get("compress", {})) - self["concord"] = ConcordOptions(options.get("concord", {})) - self["dico"] = DicoOptions(options.get("dico", {})) - self["extract"] = ExtractOptions(options.get("extract", {})) - self["fst2txt"] = Fst2TxtOptions(options.get("fst2txt", {})) - self["grf2fst2"] = Grf2Fst2Options(options.get("grf2fst2", {})) - self["locate"] = LocateOptions(options.get("locate", {})) - self["normalize"] = NormalizeOptions(options.get("normalize", {})) - self["sort_txt"] = SortTxtOptions(options.get("sort_txt", {})) - self["tokenize"] = TokenizeOptions(options.get("tokenize", {})) - self["txt2tfst"] = Txt2TFstOptions(options.get("txt2tfst", {})) + tools = settings.get("tools", {}) + + self["tools"] = {} + self["tools"]["check_dic"] = CheckDicOptions(tools.get("check_dic", {})) + self["tools"]["compress"] = CheckDicOptions(tools.get("compress", {})) + self["tools"]["concord"] = ConcordOptions(tools.get("concord", {})) + self["tools"]["dico"] = DicoOptions(tools.get("dico", {})) + self["tools"]["extract"] = ExtractOptions(tools.get("extract", {})) + self["tools"]["fst2txt"] = Fst2TxtOptions(tools.get("fst2txt", {})) + self["tools"]["grf2fst2"] = Grf2Fst2Options(tools.get("grf2fst2", {})) + self["tools"]["locate"] = LocateOptions(tools.get("locate", {})) + self["tools"]["normalize"] = NormalizeOptions(tools.get("normalize", {})) + self["tools"]["sort_txt"] = SortTxtOptions(tools.get("sort_txt", {})) + self["tools"]["tokenize"] = TokenizeOptions(tools.get("tokenize", {})) + self["tools"]["txt2tfst"] = Txt2TFstOptions(tools.get("txt2tfst", {})) diff --git a/unitex/processor.py b/unitex/processor.py index 1f0a612..8344d85 100644 --- a/unitex/processor.py +++ b/unitex/processor.py 
@@ -2,8 +2,15 @@
 # -*- coding: utf-8 -*-
 
 import logging
+import os
+import re
 import yaml
 
+# Compatibility Python 2/3
+from io import open
+
+from xml.sax.saxutils import escape
+
 from unitex import *
 from unitex.config import UnitexConfig
 from unitex.io import *
@@ -14,13 +21,35 @@ _LOGGER = logging.getLogger(__name__)
 
 
 
+# Substitution rules applied by 'escape'. Only the ampersand is rewritten;
+# presumably '<' and '>' are left alone so that the tags merged into the
+# text by the grammar remain markup (TODO confirm).
+RULES = []
+RULES.append((re.compile(r"&"), "&amp;"))
+
+def escape(sequence):
+    """Return 'sequence' after applying every substitution of RULES.
+
+    NOTE(review): this definition shadows the 'escape' function imported
+    from xml.sax.saxutils at the top of the module.
+    """
+    for pattern, substitute in RULES:
+        sequence = pattern.sub(substitute, sequence)
+    return sequence
+
+
+
 class UnitexProcessor(object):
 
     def __init__(self, config):
         self.__options = None
 
         self.__persisted_objects = None
-        self.__working_directory = None
+
+        # Paths of the opened text: set by 'open', reset to None by 'close'.
+        self.__txt = None
+        self.__snt = None
+        self.__dir = None
 
         self.init(config)
 
@@ -98,12 +127,245 @@ class UnitexProcessor(object):
                 free_persistent_alphabet(_object)
 
     def clean(self):
-        if self.__working_directory is None:
+        """Delete the files produced for the opened text.
+
+        Logs an error and returns when no text is opened.
+        """
+        if self.__txt is None:
+            _LOGGER.error("Unable to clean processor. No file opened!")
             return
-        rmdir(self.__working_directory)
+
+        if self.__options["virtualization"] is True:
+            if self.__dir is not None:
+                for vf in ls("%s%s" % (UnitexConstants.VFS_PREFIX, self.__dir)):
+                    rm(vf)
+            rm(self.__snt)
+            rm(self.__txt)
+        else:
+            rmdir(self.__dir)
+            rm(self.__snt)
+
+    def _normalize(self):
+        """Run 'normalize' on the opened text; raise on failure."""
+        kwargs = self.__options["tools"]["normalize"]
+
+        ret = normalize(self.__txt, **kwargs)
+        if ret is False:
+            raise UnitexException("Text normalization failed!")
+
+    def _segment(self):
+        """Apply the sentence grammar to the .snt file in merge mode."""
+        grammar = self.__options["resources"]["sentence"]
+        if grammar is None:
+            raise UnitexException("Unable to segment text. No sentence grammar provided.")
+
+        alphabet = self.__options["resources"]["alphabet"]
+        if alphabet is None:
+            raise UnitexException("Unable to segment text. No alphabet file provided.")
+
+        kwargs = {}
+        kwargs["start_on_space"] = self.__options["tools"]["fst2txt"]["start_on_space"]
+        kwargs["char_by_char"] = self.__options["tools"]["fst2txt"]["char_by_char"]
+        kwargs["merge"] = True
+
+        ret = fst2txt(grammar, self.__snt, alphabet, **kwargs)
+        if ret is False:
+            raise UnitexException("Text segmentation failed!")
+
+    def _replace(self):
+        """Apply the replace grammar to the .snt file with merge disabled."""
+        grammar = self.__options["resources"]["replace"]
+        if grammar is None:
+            raise UnitexException("Unable to normalize text. No replace grammar provided.")
+
+        alphabet = self.__options["resources"]["alphabet"]
+        if alphabet is None:
+            raise UnitexException("Unable to normalize text. No alphabet file provided.")
+
+        kwargs = {}
+        kwargs["start_on_space"] = self.__options["tools"]["fst2txt"]["start_on_space"]
+        kwargs["char_by_char"] = self.__options["tools"]["fst2txt"]["char_by_char"]
+        kwargs["merge"] = False
+
+        ret = fst2txt(grammar, self.__snt, alphabet, **kwargs)
+        if ret is False:
+            raise UnitexException("Text normalization failed!")
+
+    def _tokenize(self):
+        """Tokenize the .snt file; raise on failure."""
+        alphabet = self.__options["resources"]["alphabet"]
+        if alphabet is None:
+            raise UnitexException("Unable to tokenize text. No alphabet file provided.")
+
+        kwargs = self.__options["tools"]["tokenize"]
+
+        ret = tokenize(self.__snt, alphabet, **kwargs)
+        # Fail loudly like the other steps instead of ignoring the result.
+        if ret is False:
+            raise UnitexException("Text tokenization failed!")
+
+    def _lexicalize(self):
+        """Apply the configured dictionaries to the .snt file."""
+        dictionaries = self.__options["resources"]["dictionaries"]
+        if not dictionaries:
+            raise UnitexException("Unable to lexicalize text. No dictionaries provided.")
+
+        alphabet = self.__options["resources"]["alphabet"]
+        if alphabet is None:
+            raise UnitexException("Unable to lexicalize text. No alphabet file provided.")
+
+        kwargs = self.__options["tools"]["dico"]
+
+        ret = dico(dictionaries, self.__snt, alphabet, **kwargs)
+        if ret is False:
+            raise UnitexException("Text lexicalization failed!")
+
+    def _locate(self, grammar, match_mode, output_mode):
+        """Run 'locate' with 'grammar' and return the path of the index.
+
+        Raises UnitexException on an invalid mode, when the tool fails or
+        when no 'concord.ind' file is found afterwards.
+        """
+        alphabet = self.__options["resources"]["alphabet"]
+        if alphabet is None:
+            raise UnitexException("Unable to locate pattern. No alphabet file provided.")
+
+        kwargs = {}
+        kwargs["morpho"] = self.__options["tools"]["locate"]["morpho"]
+        kwargs["start_on_space"] = self.__options["tools"]["locate"]["start_on_space"]
+        kwargs["char_by_char"] = self.__options["tools"]["locate"]["char_by_char"]
+        kwargs["korean"] = self.__options["tools"]["locate"]["korean"]
+        kwargs["arabic_rules"] = self.__options["tools"]["locate"]["arabic_rules"]
+        kwargs["negation_operator"] = self.__options["tools"]["locate"]["negation_operator"]
+        kwargs["stop_token_count"] = self.__options["tools"]["locate"]["stop_token_count"]
+        kwargs["protect_dic_chars"] = self.__options["tools"]["locate"]["protect_dic_chars"]
+        kwargs["variable"] = self.__options["tools"]["locate"]["variable"]
+        kwargs["variable_error"] = self.__options["tools"]["locate"]["variable_error"]
+
+        kwargs["sntdir"] = None
+        kwargs["number_of_matches"] = None
+        kwargs["ambiguous_outputs"] = False
+
+        if match_mode not in (UnitexConstants.MATCH_MODE_LONGEST,
+                              UnitexConstants.MATCH_MODE_SHORTEST):
+            raise UnitexException("Wrong value for the 'match_mode' option. UnitexConstants.MATCH_MODE_X required.")
+        kwargs["match_mode"] = match_mode
+
+        # NOTE(review): 'OUTPUT_MODE_RELACE' looks like a misspelling of
+        # REPLACE; it must match the name declared in UnitexConstants.
+        if output_mode not in (UnitexConstants.OUTPUT_MODE_IGNORE,
+                               UnitexConstants.OUTPUT_MODE_MERGE,
+                               UnitexConstants.OUTPUT_MODE_RELACE):
+            raise UnitexException("Wrong value for the 'output_mode' option. UnitexConstants.OUTPUT_MODE_X required.")
+        kwargs["output_mode"] = output_mode
+
+        ret = locate(grammar, self.__snt, alphabet, **kwargs)
+        if ret is False:
+            raise UnitexException("Locate failed!")
+
+        index = os.path.join(self.__dir, "concord.ind")
+        if self.__options["virtualization"] is True:
+            index = "%s%s" % (UnitexConstants.VFS_PREFIX, index)
+
+        if exists(index) is False:
+            raise UnitexException("Locate failed! No index produced.")
+        return index
+
+    def _concord(self, index, merge=False, output=None):
+        """Run 'concord' on 'index' and return the path of the result.
+
+        With merge=True a FORMAT_MERGE concordance is written to 'output'
+        (required); otherwise the FORMAT_TEXT result is expected in the
+        'concord.txt' file of the .snt directory.
+        """
+        alphabet = self.__options["resources"]["alphabet"]
+        if alphabet is None:
+            raise UnitexException("Unable to build concordance. No alphabet file provided.")
+
+        kwargs = {}
+        kwargs["font"] = None
+        kwargs["fontsize"] = None
+        kwargs["only_ambiguous"] = False
+        kwargs["left"] = "0"
+        kwargs["right"] = "0"
+        kwargs["sort"] = UnitexConstants.SORT_TEXT_ORDER
+        kwargs["script"] = None
+        kwargs["offsets"] = None
+        kwargs["unxmlize"] = None
+        kwargs["directory"] = None
+        kwargs["thai"] = self.__options["tools"]["concord"]["thai"]
+
+        result = None
+
+        if merge is True:
+            kwargs["format"] = UnitexConstants.FORMAT_MERGE
+            if output is None:
+                raise UnitexException("You must provide the output file path to use the merge option.")
+            kwargs["output"] = output
+            kwargs["only_matches"] = False
+
+            result = output
+
+        else:
+            kwargs["format"] = UnitexConstants.FORMAT_TEXT
+            kwargs["output"] = None
+            kwargs["only_matches"] = False
+
+            result = os.path.join(self.__dir, "concord.txt")
+            if self.__options["virtualization"] is True:
+                # The prefix applies to the result path; 'index' must keep
+                # pointing at the file produced by 'locate'.
+                result = "%s%s" % (UnitexConstants.VFS_PREFIX, result)
+
+        ret = concord(index, alphabet, **kwargs)
+        if ret is False:
+            raise UnitexException("Concord failed!")
+
+        if exists(result) is False:
+            raise UnitexException("Concord failed! No concordances produced.")
+        return result
 
     def open(self, path, mode="srtlf", tagged=False):
-        pass
+        """Open and preprocess the text file located at 'path'.
+
+        'mode' is a string of flags: 's' (segment), 'r' (replace),
+        't' (tokenize), 'l' (lexicalize) and 'f' (recreate an existing
+        .snt directory; only used without virtualization). Segmentation
+        and replacement are skipped when 'tagged' is True.
+        """
+        directory, filename = os.path.split(path)
+        name, extension = os.path.splitext(filename)
+
+        self.__txt = path
+        self.__snt = os.path.join(directory, "%s.snt" % name)
+        self.__dir = os.path.join(directory, "%s_snt" % name)
+
+        if self.__options["virtualization"] is True:
+            txt = "%s%s" % (UnitexConstants.VFS_PREFIX, self.__txt)
+            cp(self.__txt, txt)
+
+            self.__txt = txt
+            self.__snt = "%s%s" % (UnitexConstants.VFS_PREFIX, self.__snt)
+
+        else:
+            if os.path.exists(self.__dir) is False:
+                mkdir(self.__dir)
+            elif "f" in mode:
+                rmdir(self.__dir)
+                mkdir(self.__dir)
+
+        self._normalize()
+
+        if tagged is False:
+            if "s" in mode:
+                self._segment()
+            if "r" in mode:
+                self._replace()
+
+        if "t" in mode:
+            self._tokenize()
+        if "l" in mode:
+            self._lexicalize()
 
     def close(self, clean=True, free=False):
         if clean is True:
@@ -112,8 +374,52 @@ class UnitexProcessor(object):
         if free is True:
             self.free()
 
+        self.__txt = None
+        self.__snt = None
+        self.__dir = None
+
     def tag(self, grammar, output, **kwargs):
-        raise NotImplementedError
+        """Tag the opened text with 'grammar' and write it to 'output'.
+
+        Keyword arguments: 'xml' (default False) wraps the tagged text in a
+        <TAGFILE> XML document; 'match_mode' defaults to
+        UnitexConstants.MATCH_MODE_LONGEST. Returns True on success and
+        raises UnitexException otherwise.
+        """
+        xml = kwargs.get("xml", False)
+        match_mode = kwargs.get("match_mode", UnitexConstants.MATCH_MODE_LONGEST)
+        output_mode = UnitexConstants.OUTPUT_MODE_MERGE
+
+        index = self._locate(grammar, match_mode, output_mode)
+
+        if xml is False:
+            self._concord(index, merge=True, output=output)
+            if exists(output) is False:
+                raise UnitexException("No tagged file produced!")
+            return True
+
+        _output = os.path.join(self.__dir, "concord-merge-temp.txt")
+        if self.__options["virtualization"] is True:
+            _output = "%s%s" % (UnitexConstants.VFS_PREFIX, _output)
+
+        self._concord(index, merge=True, output=_output)
+        if exists(_output) is False:
+            raise UnitexException("No (temporary) tagged file produced!")
+
+        merged = UnitexFile()
+        merged.open(_output, "r")
+        content = merged.read()
+        merged.close()
+        rm(_output)
+
+        # 'with' closes the output file even when a write fails.
+        with open(output, "w", encoding="utf-8") as tagged:
+            tagged.write("<?xml version='1.0' encoding='UTF-8'?>\n")
+            tagged.write("<TAGFILE query='%s'>\n" % grammar)
+            tagged.write(escape(content))
+            tagged.write("</TAGFILE>\n")
+
+        return True
 
     def search(self, grammar, output, **kwargs):
         raise NotImplementedError
-- 
GitLab