Upload 39 files
- .gitattributes +4 -0
- LICENSE +674 -0
- MTUOC-server.py +308 -0
- MTUOC-stop-server.py +61 -0
- MTUOC_DeepL.py +49 -0
- MTUOC_GoogleTranslate.py +83 -0
- MTUOC_Lucy.py +40 -0
- MTUOC_Marian.py +66 -0
- MTUOC_Moses.py +77 -0
- MTUOC_OpenNMT.py +44 -0
- MTUOC_misc.py +35 -0
- MTUOC_preprocess.py +115 -0
- MTUOC_splitnumbers.py +44 -0
- MTUOC_tags.py +426 -0
- MTUOC_tokenizer_eng.py +360 -0
- MTUOC_tokenizer_spa.py +359 -0
- MTUOC_translate.py +436 -0
- MTUOC_truecaser.py +199 -0
- MTUOC_typeMTUOC.py +75 -0
- MTUOC_typeModernMT.py +69 -0
- MTUOC_typeMoses.py +45 -0
- MTUOC_typeNMTWizard.py +68 -0
- MTUOC_typeOpenNMT.py +78 -0
- README.md +1 -3
- config-server.yaml +115 -0
- config.py +114 -0
- marian-server-GPU +3 -0
- model.bin +3 -0
- model.npz +3 -0
- requirements.txt +27 -0
- segment.srx +0 -0
- segmentSC.srx +0 -0
- spmodel.model +3 -0
- spmodel.vocab +0 -0
- srx_segmenter.py +106 -0
- start +3 -0
- stop +3 -0
- tc.es +3 -0
- vocab-en.yml +0 -0
- vocab-es.yml +0 -0
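The upload packages a complete MTUOC translation server: MTUOC-server.py is the entry point, config-server.yaml holds its settings, and the remaining modules provide engine connectors (Marian, Moses, OpenNMT, ModernMT, Google Translate, DeepL, Lucy), tokenizers, truecasing, SRX segmentation and tag restoration, plus LFS-tracked model binaries. For orientation, a minimal sketch of launching the server from Python with the -c/-p/-t options defined further down in MTUOC-server.py; the port value and the python3 interpreter name are assumptions, and with no arguments the server simply reads config-server.yaml:

```python
# Hypothetical launch helper; the flags map to the argparse options in MTUOC-server.py.
import subprocess

server = subprocess.Popen([
    "python3", "MTUOC-server.py",
    "-c", "config-server.yaml",  # --config: the server configuration file to be used
    "-p", "8000",                # --port: the MTUOC server port (assumed value)
    "-t", "MTUOC",               # --type: the MTUOC server type (MTUOC, Moses, OpenNMT, NMTWizard, ModernMT)
])
print("MTUOC server started with PID", server.pid)
```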
.gitattributes
CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+marian-server-GPU filter=lfs diff=lfs merge=lfs -text
+start filter=lfs diff=lfs merge=lfs -text
+stop filter=lfs diff=lfs merge=lfs -text
+tc.es filter=lfs diff=lfs merge=lfs -text
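The four added patterns route the large binary artifacts in this upload (the marian-server-GPU executable, the start and stop helpers, and the tc.es truecasing model) through Git LFS instead of storing them directly in the repository. A minimal sketch of how such entries are normally generated, assuming the git-lfs command-line tool is installed; each `git lfs track` call appends one of the filter=lfs lines shown above to .gitattributes:

```python
# Sketch: register the new binaries with Git LFS (assumes git and git-lfs are on PATH).
import subprocess

for pattern in ("marian-server-GPU", "start", "stop", "tc.es"):
    subprocess.run(["git", "lfs", "track", pattern], check=True)  # appends a filter=lfs rule
subprocess.run(["git", "add", ".gitattributes"], check=True)      # stage the updated rules
```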
LICENSE
ADDED
@@ -0,0 +1,674 @@
                    GNU GENERAL PUBLIC LICENSE
                       Version 3, 29 June 2007

 Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.

                            Preamble

  The GNU General Public License is a free, copyleft license for
software and other kinds of works.

  The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast,
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users. We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors. You can apply it to
your programs, too.

  When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.

  To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights. Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.

  For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received. You must make sure that they, too, receive
or can get the source code. And you must show them these terms so they
know their rights.

  Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.

  For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software. For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.

  Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so. This is fundamentally incompatible with the aim of
protecting users' freedom to change the software. The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable. Therefore, we
have designed this version of the GPL to prohibit the practice for those
products. If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.

  Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary. To prevent this, the GPL assures that
patents cannot be used to render the program non-free.

  The precise terms and conditions for copying, distribution and
modification follow.

                       TERMS AND CONDITIONS

  0. Definitions.

  "This License" refers to version 3 of the GNU General Public License.

  "Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.

  "The Program" refers to any copyrightable work licensed under this
License. Each licensee is addressed as "you". "Licensees" and
"recipients" may be individuals or organizations.

  To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy. The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.

  A "covered work" means either the unmodified Program or a work based
on the Program.

  To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy. Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.

  To "convey" a work means any kind of propagation that enables other
parties to make or receive copies. Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.

  An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License. If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.

  1. Source Code.

  The "source code" for a work means the preferred form of the work
for making modifications to it. "Object code" means any non-source
form of a work.

  A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.

  The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form. A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.

  The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities. However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work. For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.

  The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.

  The Corresponding Source for a work in source code form is that
same work.

  2. Basic Permissions.

  All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met. This License explicitly affirms your unlimited
permission to run the unmodified Program. The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work. This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.

  You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force. You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright. Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.

  Conveying under any other circumstances is permitted solely under
the conditions stated below. Sublicensing is not allowed; section 10
makes it unnecessary.

  3. Protecting Users' Legal Rights From Anti-Circumvention Law.

  No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.

  When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.

  4. Conveying Verbatim Copies.

  You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.

  You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.

  5. Conveying Modified Source Versions.

  You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:

    a) The work must carry prominent notices stating that you modified
    it, and giving a relevant date.

    b) The work must carry prominent notices stating that it is
    released under this License and any conditions added under section
    7. This requirement modifies the requirement in section 4 to
    "keep intact all notices".

    c) You must license the entire work, as a whole, under this
    License to anyone who comes into possession of a copy. This
    License will therefore apply, along with any applicable section 7
    additional terms, to the whole of the work, and all its parts,
    regardless of how they are packaged. This License gives no
    permission to license the work in any other way, but it does not
    invalidate such permission if you have separately received it.

    d) If the work has interactive user interfaces, each must display
    Appropriate Legal Notices; however, if the Program has interactive
    interfaces that do not display Appropriate Legal Notices, your
    work need not make them do so.

  A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit. Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.

  6. Conveying Non-Source Forms.

  You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:

    a) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by the
    Corresponding Source fixed on a durable physical medium
    customarily used for software interchange.

    b) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by a
    written offer, valid for at least three years and valid for as
    long as you offer spare parts or customer support for that product
    model, to give anyone who possesses the object code either (1) a
    copy of the Corresponding Source for all the software in the
    product that is covered by this License, on a durable physical
    medium customarily used for software interchange, for a price no
    more than your reasonable cost of physically performing this
    conveying of source, or (2) access to copy the
    Corresponding Source from a network server at no charge.

    c) Convey individual copies of the object code with a copy of the
    written offer to provide the Corresponding Source. This
    alternative is allowed only occasionally and noncommercially, and
    only if you received the object code with such an offer, in accord
    with subsection 6b.

    d) Convey the object code by offering access from a designated
    place (gratis or for a charge), and offer equivalent access to the
    Corresponding Source in the same way through the same place at no
    further charge. You need not require recipients to copy the
    Corresponding Source along with the object code. If the place to
    copy the object code is a network server, the Corresponding Source
    may be on a different server (operated by you or a third party)
    that supports equivalent copying facilities, provided you maintain
    clear directions next to the object code saying where to find the
    Corresponding Source. Regardless of what server hosts the
    Corresponding Source, you remain obligated to ensure that it is
    available for as long as needed to satisfy these requirements.

    e) Convey the object code using peer-to-peer transmission, provided
    you inform other peers where the object code and Corresponding
    Source of the work are being offered to the general public at no
    charge under subsection 6d.

  A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.

  A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling. In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage. For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product. A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.

  "Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source. The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.

  If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information. But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).

  The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed. Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.

  Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.

  7. Additional Terms.

  "Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law. If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.

  When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it. (Additional permissions may be written to require their own
removal in certain cases when you modify the work.) You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.

  Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:

    a) Disclaiming warranty or limiting liability differently from the
    terms of sections 15 and 16 of this License; or

    b) Requiring preservation of specified reasonable legal notices or
    author attributions in that material or in the Appropriate Legal
    Notices displayed by works containing it; or

    c) Prohibiting misrepresentation of the origin of that material, or
    requiring that modified versions of such material be marked in
    reasonable ways as different from the original version; or

    d) Limiting the use for publicity purposes of names of licensors or
    authors of the material; or

    e) Declining to grant rights under trademark law for use of some
    trade names, trademarks, or service marks; or

    f) Requiring indemnification of licensors and authors of that
    material by anyone who conveys the material (or modified versions of
    it) with contractual assumptions of liability to the recipient, for
    any liability that these contractual assumptions directly impose on
    those licensors and authors.

  All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10. If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term. If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.

  If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.

  Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.

  8. Termination.

  You may not propagate or modify a covered work except as expressly
provided under this License. Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).

  However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.

  Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.

  Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License. If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.

  9. Acceptance Not Required for Having Copies.

  You are not required to accept this License in order to receive or
run a copy of the Program. Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance. However,
nothing other than this License grants you permission to propagate or
modify any covered work. These actions infringe copyright if you do
not accept this License. Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.

  10. Automatic Licensing of Downstream Recipients.

  Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License. You are not responsible
for enforcing compliance by third parties with this License.

  An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations. If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.

  You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License. For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.

  11. Patents.

  A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based. The
work thus licensed is called the contributor's "contributor version".

  A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version. For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.

  Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.

  In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement). To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.

  If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients. "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.

  If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.

  A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License. You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.

  Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.

  12. No Surrender of Others' Freedom.

  If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all. For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.

  13. Use with the GNU Affero General Public License.

  Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU Affero General Public License into a single
combined work, and to convey the resulting work. The terms of this
License will continue to apply to the part which is the covered work,
but the special requirements of the GNU Affero General Public License,
section 13, concerning interaction through a network will apply to the
combination as such.

  14. Revised Versions of this License.

  The Free Software Foundation may publish revised and/or new versions of
the GNU General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.

  Each version is given a distinguishing version number. If the
Program specifies that a certain numbered version of the GNU General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation. If the Program does not specify a version number of the
GNU General Public License, you may choose any version ever published
by the Free Software Foundation.

  If the Program specifies that a proxy can decide which future
versions of the GNU General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.

  Later license versions may give you additional or different
permissions. However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.

  15. Disclaimer of Warranty.

  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.

  16. Limitation of Liability.

  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.

  17. Interpretation of Sections 15 and 16.

  If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.

                     END OF TERMS AND CONDITIONS

            How to Apply These Terms to Your New Programs

  If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.

  To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.

    <one line to give the program's name and a brief idea of what it does.>
    Copyright (C) <year> <name of author>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see <https://www.gnu.org/licenses/>.

Also add information on how to contact you by electronic and paper mail.

  If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:

    <program> Copyright (C) <year> <name of author>
    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
    This is free software, and you are welcome to redistribute it
    under certain conditions; type `show c' for details.

The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, your program's commands
might be different; for a GUI interface, you would use an "about box".

  You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU GPL, see
<https://www.gnu.org/licenses/>.

  The GNU General Public License does not permit incorporating your program
into proprietary programs. If your program is a subroutine library, you
may consider it more useful to permit linking proprietary applications with
the library. If this is what you want to do, use the GNU Lesser General
Public License instead of this License. But first, please read
<https://www.gnu.org/licenses/why-not-lgpl.html>.
MTUOC-server.py
ADDED
@@ -0,0 +1,308 @@
# MTUOC-server v 6
# Description: an MTUOC server using Sentence Piece as preprocessing step
# Copyright (C) 2023 Antoni Oliver
# v. 11/09/2023
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import os
import sys
import asyncio
import websockets
import argparse
import codecs
import csv

import srx_segmenter

from websocket import create_connection
import socket
import time

import config

from MTUOC_misc import printLOG
from MTUOC_misc import get_IP_info

from MTUOC_Marian import connect_to_Marian
from MTUOC_Marian import translate_segment_Marian

from MTUOC_OpenNMT import connect_to_OpenNMT
from MTUOC_OpenNMT import translate_segment_OpenNMT

from MTUOC_Moses import connect_to_Moses
from MTUOC_Moses import translate_segment_Moses

from MTUOC_typeMTUOC import start_MTUOC_server
from MTUOC_typeMoses import start_Moses_server
from MTUOC_typeOpenNMT import start_OpenNMT_server
from MTUOC_typeNMTWizard import start_NMTWizard_server
from MTUOC_typeModernMT import start_ModernMT_server

from MTUOC_tags import TagRestorer

###YAML IMPORTS
import yaml
from yaml import load, dump
try:
    from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
    from yaml import Loader, Dumper

def startMTEngine():
    printLOG(3,"START MT ENGINE:", config.startMTEngineCommand)
    os.system(config.startMTEngineCommand)

#YAML

parser = argparse.ArgumentParser(description='MTUOC-server. With no arguments the config-server.yaml file will be used.')
parser.add_argument('-c','--config', action="store", dest="config", help='The server configuration file to be used.',required=False)
parser.add_argument('-p','--port', action="store", dest="port", type=int, help='The MTUOC server port.',required=False)
parser.add_argument('-t','--type', action="store", dest="type", help='The MTUOC server type.',required=False)

args = parser.parse_args()
if args.config:
    configfile=args.config
else:
    configfile="config-server.yaml"

stream = open(configfile, 'r',encoding="utf-8")
configYAML=yaml.load(stream, Loader=yaml.FullLoader)

config.MTUOCServer_MTengine=configYAML["MTEngine"]["MTengine"]
config.MTUOCServer_type=configYAML["MTUOCServer"]["type"]
config.MTUOCServer_port=configYAML["MTUOCServer"]["port"]
config.tag_restoration=configYAML["MTUOCServer"]["restore_tags"]
config.fix_xml=configYAML["MTUOCServer"]["fix_xml"]

config.MTUOCServer_URLs=configYAML["MTUOCServer"]["URLs"]
config.MTUOCServer_EMAILs=configYAML["MTUOCServer"]["EMAILs"]

config.code_URLs=configYAML["MTUOCServer"]["code_URLs"]
config.code_EMAILs=configYAML["MTUOCServer"]["code_EMAILs"]

config.min_chars_segment=configYAML["MTUOCServer"]["min_chars_segment"]
translation_selection_strategy=configYAML["MTUOCServer"]["translation_selection_strategy"]

config.segment_input=configYAML["Preprocess"]["segment_input"]
config.SRXfile=configYAML["Preprocess"]["SRXfile"]
config.SRXlang=configYAML["Preprocess"]["SRXlang"]

config.rules = srx_segmenter.parse(config.SRXfile)

config.sllang=configYAML["Preprocess"]["sl_lang"]
config.tllang=configYAML["Preprocess"]["tl_lang"]
config.tokenize_SL=configYAML["Preprocess"]["tokenize_SL"]
config.tokenizerSL=configYAML["Preprocess"]["sl_tokenizer"]
tokenizerSLfile=config.tokenizerSL
config.tokenizerTL=configYAML["Preprocess"]["tl_tokenizer"]
if config.tokenizerSL=="None": config.tokenizerSL=None
if config.tokenizerTL=="None": config.tokenizerTL=None

if config.tokenizerSL=="Moses":
    config.tokenizerSLType="Moses"
    import mosestokenizer
    config.tokenizerSL = mosestokenizer.MosesTokenizer(config.sllang)
elif not config.tokenizerSL==None:
    config.tokenizerSLType="MTUOC"
    import importlib.util
    if not config.tokenizerSL.endswith(".py"): config.tokenizerSL=config.tokenizerSL+".py"
    spec = importlib.util.spec_from_file_location('', config.tokenizerSL)
    tokenizerSLmod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(tokenizerSLmod)
    config.tokenizerSL=tokenizerSLmod.Tokenizer()
else:
    config.tokenizerSL=None

if config.tokenizerTL=="Moses":
    import mosestokenizer
    config.tokenizerTL = mosestokenizer.MosesTokenizer(config.tllang)
    config.tokenizerTLType="Moses"
elif not config.tokenizerTL==None:
    import importlib.util
    if not config.tokenizerTL.endswith(".py"): config.tokenizerTL=config.tokenizerTL+".py"
    spec = importlib.util.spec_from_file_location('', config.tokenizerTL)
    tokenizerTLmod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(tokenizerTLmod)
    config.tokenizerTL=tokenizerTLmod.Tokenizer()
else:
    config.tokenizerTL=None

config.tcmodel=configYAML["Preprocess"]["tcmodel"]
config.truecase=configYAML["Preprocess"]["truecase"]

if config.tcmodel=="None": config.tcmodel=None
if config.tcmodel:
    from MTUOC_truecaser import Truecaser
    config.truecaser=Truecaser(tokenizer=tokenizerSLfile,tc_model=config.tcmodel)
else:
    config.truecaser=None

config.unescape_html_input=configYAML["unescape_html_input"]
config.escape_html_input=configYAML["escape_html_input"]

config.checkistranslatable=configYAML["MTUOCServer"]["checkistranslatable"]

config.change_input_files=configYAML["change_input_files"].split(";")
config.changes_input=[]
if not config.change_input_files[0]=="None":
    for ci in config.change_input_files:
        with open(ci) as csvfile:
            csvreader = csv.reader(csvfile, delimiter=';', quotechar='"')
            for row in csvreader:
                config.changes_input.append(row)

config.change_output_files=configYAML["change_output_files"].split(";")
config.changes_output=[]
if not config.change_output_files[0]=="None":
    for co in config.change_output_files:
        with open(co) as csvfile:
            csvreader = csv.reader(csvfile, delimiter=';', quotechar='"')
            for row in csvreader:
                config.changes_output.append(row)

config.change_translation_files=configYAML["change_translation_files"].split(";")
config.changes_translation=[]
if not config.change_translation_files[0]=="None":
    for change_list in config.change_translation_files:
        with open(change_list) as csvfile:
            csvreader = csv.reader(csvfile, delimiter=';', quotechar='"')
            for row in csvreader:
                config.changes_translation.append(row)

config.tagrestorer=TagRestorer()

#sentencepiece
config.sentencepiece=configYAML["Preprocess"]["sentencepiece"]
config.spmodelSL=configYAML["Preprocess"]["sp_model_SL"]
config.spmodelTL=configYAML["Preprocess"]["sp_model_TL"]
config.sp_splitter=configYAML["Preprocess"]["sp_splitter"]

#BPE
config.BPE=configYAML["Preprocess"]["BPE"]
config.bpecodes=configYAML["Preprocess"]["bpecodes"]
config.bpe_joiner=configYAML["Preprocess"]["bpe_joiner"]
if config.BPE:
    from subword_nmt import apply_bpe
    config.bpeobject=apply_bpe.BPE(open(config.bpecodes,encoding="utf-8"))

#bos and eos annotate
config.bos_annotate=configYAML["Preprocess"]["bos_annotate"]
config.bos_symbol=configYAML["Preprocess"]["bos_symbol"]
config.eos_annotate=configYAML["Preprocess"]["eos_annotate"]
config.eos_symbol=configYAML["Preprocess"]["eos_symbol"]

config.pre_replace_NUMs=configYAML["Preprocess"]["replaceNUMs"]
config.code_NUMs=configYAML["Preprocess"]["code_NUMs"]
config.pre_split_NUMs=configYAML["Preprocess"]["splitNUMs"]

if config.sentencepiece:
    import sentencepiece as spm
    config.spSL= spm.SentencePieceProcessor(model_file=config.spmodelSL, out_type=str, add_bos=config.bos_annotate, add_eos=config.eos_annotate)
    config.spTL= spm.SentencePieceProcessor(model_file=config.spmodelTL, out_type=str)

if config.MTUOCServer_MTengine in ["Marian","Moses","OpenNMT"]:
    config.startMTEngineV=configYAML["MTEngine"]["startMTEngine"]
    config.startMTEngineCommand=configYAML["MTEngine"]["startCommand"]
    config.MTEngineIP=configYAML["MTEngine"]["IP"]
    config.MTEnginePort=configYAML["MTEngine"]["port"]
    config.min_len_factor=configYAML["MTEngine"]["min_len_factor"]

if config.MTUOCServer_MTengine=="GoogleTranslate":
    from google.cloud import translate as translateGoogle  # required by the TranslationServiceClient call below
    config.Google_sllang=configYAML["GoogleTranslate"]["sllang"]
    config.Google_tllang=configYAML["GoogleTranslate"]["tllang"]
    config.Google_glossary=configYAML["GoogleTranslate"]["glossary"]
    if config.Google_glossary=="None": config.Google_glossary=None
    #state None if no glossary is used, otherwise the name of the glossary
    config.Google_project_id=configYAML["GoogleTranslate"]["project_id"]
    config.Google_location=configYAML["GoogleTranslate"]["location"]
    config.Google_jsonfile=configYAML["GoogleTranslate"]["jsonfile"]
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = config.Google_jsonfile
    config.client = translateGoogle.TranslationServiceClient()
    config.parent = "projects/"+config.Google_project_id+"/locations/"+config.Google_location

if config.MTUOCServer_MTengine=="DeepL":
    import deepl  # required by deepl.Translator below
    config.DeepL_API_key=configYAML["DeepL"]["API_key"]
    config.DeepL_sl_lang=configYAML["DeepL"]["sllang"]
    config.DeepL_tl_lang=configYAML["DeepL"]["tllang"]
    #tag_handling: html
    #one of html, xml
    config.DeepL_formality=configYAML["DeepL"]["formality"]
    #one of default, less, more
    config.DeepL_split_sentences=configYAML["DeepL"]["split_sentences"]
    #one of off, on, nonewlines
    config.DeepL_glossary=configYAML["DeepL"]["glossary"]
    if config.DeepL_glossary=="None": config.DeepL_glossary=None
    config.DeepLtranslator = deepl.Translator(config.DeepL_API_key)

if config.MTUOCServer_MTengine=="Lucy":
    ###Lucy imports
    import ast
    import xmltodict
    import requests
    config.Lucy_url=configYAML["Lucy"]["url"]
    config.Lucy_TRANSLATION_DIRECTION=configYAML["Lucy"]["TRANSLATION_DIRECTION"]
    config.Lucy_MARK_UNKNOWNS=configYAML["Lucy"]["MARK_UNKNOWNS"]
    config.Lucy_MARK_ALTERNATIVES=configYAML["Lucy"]["MARK_ALTERNATIVES"]
    config.Lucy_MARK_COMPOUNDS=configYAML["Lucy"]["MARK_COMPOUNDS"]
    config.Lucy_CHARSET=configYAML["Lucy"]["CHARSET"]

config.verbosity_level=int(configYAML["MTUOCServer"]["verbosity_level"])
config.log_file=configYAML["MTUOCServer"]["log_file"]

config.MTUOCServer_ONMT_url_root=configYAML["MTUOCServer"]["ONMT_url_root"]

if config.log_file=="None":
    config.log_file=False
else:
    config.sortidalog=codecs.open(config.log_file,"a",encoding="utf-8")
    config.log_file=True

if config.startMTEngineV and not config.MTUOCServer_MTengine in ["GoogleTranslate","DeepL","Lucy"]:
    startMTEngine()

if config.MTUOCServer_MTengine=="Marian":
    connect_to_Marian()
elif config.MTUOCServer_MTengine=="Moses":
    config.proxyMoses=connect_to_Moses()

if config.MTUOCServer_type=="MTUOC":
    start_MTUOC_server()
elif config.MTUOCServer_type=="Moses":
    start_Moses_server()
elif config.MTUOCServer_type=="OpenNMT":
    start_OpenNMT_server()
elif config.MTUOCServer_type=="NMTWizard":
|
306 |
+
start_NMTWizard_server()
|
307 |
+
elif config.MTUOCServer_type=="ModernMT":
|
308 |
+
start_ModernMT_server()
|
MTUOC-stop-server.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# MTUOC_stop_server
|
2 |
+
# Copyright (C) 2023 Antoni Oliver
|
3 |
+
# v. 07/06/2023
|
4 |
+
# This program is free software: you can redistribute it and/or modify
|
5 |
+
# it under the terms of the GNU General Public License as published by
|
6 |
+
# the Free Software Foundation, either version 3 of the License, or
|
7 |
+
# (at your option) any later version.
|
8 |
+
#
|
9 |
+
# This program is distributed in the hope that it will be useful,
|
10 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
12 |
+
# GNU General Public License for more details.
|
13 |
+
|
14 |
+
# You should have received a copy of the GNU General Public License
|
15 |
+
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
16 |
+
|
17 |
+
|
18 |
+
import sys
|
19 |
+
import os
|
20 |
+
import yaml
|
21 |
+
from yaml import load, dump
|
22 |
+
try:
|
23 |
+
from yaml import CLoader as Loader, CDumper as Dumper
|
24 |
+
except ImportError:
|
25 |
+
from yaml import Loader, Dumper
|
26 |
+
import argparse
|
27 |
+
|
28 |
+
#YAML
|
29 |
+
|
30 |
+
parser = argparse.ArgumentParser(description='MTUOC-server. With no arguments the config-server.yaml file will be used.')
|
31 |
+
parser.add_argument('-c','--config', action="store", dest="config", help='The tokenizer to be used.',required=False)
|
32 |
+
parser.add_argument('-p','--port', action="store", dest="port", type=int, help='The MTUOC server port.',required=False)
|
33 |
+
|
34 |
+
args = parser.parse_args()
|
35 |
+
if args.config:
|
36 |
+
configfile=args.config
|
37 |
+
else:
|
38 |
+
configfile="config-server.yaml"
|
39 |
+
|
40 |
+
stream = open(configfile, 'r',encoding="utf-8")
|
41 |
+
config=yaml.load(stream, Loader=yaml.FullLoader)
|
42 |
+
|
43 |
+
MTEnginePort=config["MTEngine"]["port"]
|
44 |
+
MTUOCServer_port=config["MTUOCServer"]["port"]
|
45 |
+
|
46 |
+
if args.port:
|
47 |
+
MTUOCServer_port=args.port
|
48 |
+
|
49 |
+
try:
|
50 |
+
stopcommand2="fuser -k "+str(MTEnginePort)+"/tcp"
|
51 |
+
os.system(stopcommand2)
|
52 |
+
print("MT Engine stopped.")
|
53 |
+
except:
|
54 |
+
print("Unable to stop MT Engine",sys.exc_info())
|
55 |
+
|
56 |
+
try:
|
57 |
+
stopcommand2="fuser -k "+str(MTUOCServer_port)+"/tcp"
|
58 |
+
os.system(stopcommand2)
|
59 |
+
print("MTUOC server stopped.")
|
60 |
+
except:
|
61 |
+
print("Unable to stop MTUOC server",sys.exc_info())
|
MTUOC_DeepL.py
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# MTUOC_DeepL
|
2 |
+
# Copyright (C) 2023 Antoni Oliver
|
3 |
+
# v. 07/06/2023
|
4 |
+
# This program is free software: you can redistribute it and/or modify
|
5 |
+
# it under the terms of the GNU General Public License as published by
|
6 |
+
# the Free Software Foundation, either version 3 of the License, or
|
7 |
+
# (at your option) any later version.
|
8 |
+
#
|
9 |
+
# This program is distributed in the hope that it will be useful,
|
10 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
12 |
+
# GNU General Public License for more details.
|
13 |
+
|
14 |
+
# You should have received a copy of the GNU General Public License
|
15 |
+
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
16 |
+
|
17 |
+
|
18 |
+
import config
|
19 |
+
from MTUOC_misc import printLOG
|
20 |
+
|
21 |
+
###DeepL imports
|
22 |
+
import deepl
|
23 |
+
|
24 |
+
def DeepL_translate(segment):
|
25 |
+
if config.DeepL_glossary==None:
|
26 |
+
cadena="Translating without glossary from "+config.DeepL_sl_lang+" to "+config.DeepL_tl_lang
|
27 |
+
printLOG(3,cadena)
|
28 |
+
cadena="Source segment: "+segment
|
29 |
+
printLOG(3,cadena)
|
30 |
+
try:
|
31 |
+
translation = config.DeepLtranslator.translate_text(segment, source_lang=config.DeepL_sl_lang, target_lang=config.DeepL_tl_lang,formality=config.DeepL_formality,split_sentences=config.DeepL_split_sentences)
|
32 |
+
except:
|
33 |
+
printLOG(1,"ERROR DeepL:",sys.exc_info())
|
34 |
+
cadena="Translation: "+translation.text
|
35 |
+
printLOG(3,cadena)
|
36 |
+
else:
|
37 |
+
cadena="Translating with glossary "+config.DeepL_glossary_name,DeepL_glossary+" from "+config.DeepL_sl_lang+" to "+config.DeepL_tl_lang
|
38 |
+
printLOG(3,cadena)
|
39 |
+
cadena="Source segment: "+segment
|
40 |
+
printLOG(3,cadena)
|
41 |
+
try:
|
42 |
+
translation = config.DeepLtranslator.translate_text(segment, source_lang=config.DeepL_sl_lang, target_lang=config.DeepL_tl_lang, glossary=config.DeepL_glossary, formality=config.DeepL_formality, split_sentences=config.DeepL_split_sentences)
|
43 |
+
except:
|
44 |
+
printLOG(1,"ERROR DeepL:",sys.exc_info())
|
45 |
+
cadena="Translation: "+translation.text
|
46 |
+
printLOG(3,cadena)
|
47 |
+
|
48 |
+
return(translation.text)
|
49 |
+
|
MTUOC_GoogleTranslate.py
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# MTUOC_GoogleTranslate
|
2 |
+
# Copyright (C) 2023 Antoni Oliver
|
3 |
+
# v. 07/06/2023
|
4 |
+
# This program is free software: you can redistribute it and/or modify
|
5 |
+
# it under the terms of the GNU General Public License as published by
|
6 |
+
# the Free Software Foundation, either version 3 of the License, or
|
7 |
+
# (at your option) any later version.
|
8 |
+
#
|
9 |
+
# This program is distributed in the hope that it will be useful,
|
10 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
12 |
+
# GNU General Public License for more details.
|
13 |
+
|
14 |
+
# You should have received a copy of the GNU General Public License
|
15 |
+
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
16 |
+
|
17 |
+
|
18 |
+
from MTUOC_misc import printLOG
|
19 |
+
import config
|
20 |
+
|
21 |
+
###GoogleTranslate imports
|
22 |
+
from google.cloud import translate as translateGoogle
|
23 |
+
from google.cloud import translate
|
24 |
+
|
25 |
+
def Google_translate_text_with_glossary(text):
|
26 |
+
|
27 |
+
glossary = config.client.glossary_path(
|
28 |
+
config.Google_project_id, config.Google_location, config.Google_glossary
|
29 |
+
)
|
30 |
+
|
31 |
+
glossary_config = translateGoogle.TranslateTextGlossaryConfig(glossary=config.Google_glossary)
|
32 |
+
|
33 |
+
# Supported language codes: https://cloud.google.com/translate/docs/languages
|
34 |
+
response = client.translate_text(
|
35 |
+
request={
|
36 |
+
"contents": [text],
|
37 |
+
"target_language_code": config.Google_sllang,
|
38 |
+
"source_language_code": config.Google_tllang,
|
39 |
+
"parent": config.parent,
|
40 |
+
"glossary_config": glossary_config,
|
41 |
+
}
|
42 |
+
)
|
43 |
+
translation=response.glossary_translations[0]
|
44 |
+
return(translation.translated_text)
|
45 |
+
|
46 |
+
def Google_translate_text(text):
|
47 |
+
try:
|
48 |
+
"""Translating Text. segment,config.Google_sllang,config.Google_tllang"""
|
49 |
+
# Detail on supported types can be found here:
|
50 |
+
# https://cloud.google.com/translate/docs/supported-formats
|
51 |
+
response = config.client.translate_text(
|
52 |
+
parent=config.parent,
|
53 |
+
contents=[text],
|
54 |
+
mime_type="text/plain", # mime types: text/plain, text/html
|
55 |
+
source_language_code=config.Google_sllang,
|
56 |
+
target_language_code=config.Google_tllang,
|
57 |
+
)
|
58 |
+
# Display the translation for each input text provided
|
59 |
+
translation=response.translations[0]
|
60 |
+
return(translation.translated_text)
|
61 |
+
except:
|
62 |
+
print("ERROR:",sys.exc_info())
|
63 |
+
|
64 |
+
|
65 |
+
def Google_translate(segment):
|
66 |
+
segment=segment.rstrip()
|
67 |
+
if config.Google_glossary==None:
|
68 |
+
cadena="Translating without glossary from "+config.Google_sllang+" to "+config.Google_tllang
|
69 |
+
printLOG(3,cadena)
|
70 |
+
cadena="Source segment: "+segment
|
71 |
+
printLOG(3,cadena)
|
72 |
+
translation=Google_translate_text(segment)
|
73 |
+
cadena="Translation: "+translation
|
74 |
+
printLOG(3,cadena)
|
75 |
+
else:
|
76 |
+
cadena="Translating with glossary "+Google_glossary+" from "+config.Google_sllang+" to "+config.Google_tllang
|
77 |
+
printLOG(3,cadena)
|
78 |
+
cadena="Source segment: "+segment
|
79 |
+
printLOG(3,cadena)
|
80 |
+
translation=Google_translate_text_with_glossary()
|
81 |
+
cadena="Translation: "+translation
|
82 |
+
printLOG(3,cadena)
|
83 |
+
return(translation)
|
MTUOC_Lucy.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# MTUOC_Lucy
|
2 |
+
# Copyright (C) 2023 Antoni Oliver
|
3 |
+
# v. 07/06/2023
|
4 |
+
# This program is free software: you can redistribute it and/or modify
|
5 |
+
# it under the terms of the GNU General Public License as published by
|
6 |
+
# the Free Software Foundation, either version 3 of the License, or
|
7 |
+
# (at your option) any later version.
|
8 |
+
#
|
9 |
+
# This program is distributed in the hope that it will be useful,
|
10 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
12 |
+
# GNU General Public License for more details.
|
13 |
+
|
14 |
+
# You should have received a copy of the GNU General Public License
|
15 |
+
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
16 |
+
|
17 |
+
|
18 |
+
import config
|
19 |
+
from MTUOC_misc import printLOG
|
20 |
+
|
21 |
+
import json
|
22 |
+
import requests
|
23 |
+
import ast
|
24 |
+
import xmltodict
|
25 |
+
|
26 |
+
def Lucy_translate(slsegment):
|
27 |
+
try:
|
28 |
+
slsegment="[@àéèíóòú@]. "+slsegment
|
29 |
+
url = config.Lucy_url
|
30 |
+
headers = {'Content-Type': 'application/json'}
|
31 |
+
body = {"inputParams": {"param": [{"@name": "TRANSLATION_DIRECTION", "@value": config.Lucy_TRANSLATION_DIRECTION},{"@name": "MARK_UNKNOWNS", "@value": config.Lucy_MARK_UNKNOWNS},{"@name": "MARK_ALTERNATIVES", "@value": config.Lucy_MARK_ALTERNATIVES},{"@name": "MARK_COMPOUNDS", "@value": config.Lucy_MARK_COMPOUNDS}, {"@name": "INPUT", "@value": slsegment},{"@name": "CHARSET","@value":config.Lucy_CHARSET}]}}
|
32 |
+
encoded_body = json.dumps(body)
|
33 |
+
traduccio_xml=requests.post(url = url, auth=('traductor', 'traductor'), headers=headers, data=encoded_body).text
|
34 |
+
response_dict = ast.literal_eval(json.dumps(xmltodict.parse(traduccio_xml)))
|
35 |
+
translation=response_dict["task"]["outputParams"]["param"][0]['@value']
|
36 |
+
translation=translation.replace("[@àéèíóòú@]. ","")
|
37 |
+
except:
|
38 |
+
printLOG(1,"ERROR Lucy:",sys.exc_info())
|
39 |
+
return(translation)
|
40 |
+
|
MTUOC_Marian.py
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# MTUOC_Marian
|
2 |
+
# Copyright (C) 2023 Antoni Oliver
|
3 |
+
# v. 07/06/2023
|
4 |
+
# This program is free software: you can redistribute it and/or modify
|
5 |
+
# it under the terms of the GNU General Public License as published by
|
6 |
+
# the Free Software Foundation, either version 3 of the License, or
|
7 |
+
# (at your option) any later version.
|
8 |
+
#
|
9 |
+
# This program is distributed in the hope that it will be useful,
|
10 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
12 |
+
# GNU General Public License for more details.
|
13 |
+
|
14 |
+
# You should have received a copy of the GNU General Public License
|
15 |
+
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
16 |
+
|
17 |
+
|
18 |
+
import websocket
|
19 |
+
import socket
|
20 |
+
import time
|
21 |
+
import sys
|
22 |
+
import config
|
23 |
+
|
24 |
+
from MTUOC_misc import printLOG
|
25 |
+
|
26 |
+
|
27 |
+
def connect_to_Marian():
|
28 |
+
print("CONNECT TO MARIAN.")
|
29 |
+
from websocket import create_connection
|
30 |
+
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
31 |
+
conn=s.connect_ex((config.MTEngineIP, config.MTEnginePort))
|
32 |
+
if conn == 0:marianstarted=True
|
33 |
+
service="ws://"+config.MTEngineIP+":"+str(config.MTEnginePort)+"/translate"
|
34 |
+
error=True
|
35 |
+
while error:
|
36 |
+
try:
|
37 |
+
config.ws = create_connection(service)
|
38 |
+
printLOG(1,"Connection with Marian Server created","")
|
39 |
+
error=False
|
40 |
+
except:
|
41 |
+
printLOG(1,"Error: waiting for Marian server to start. Retrying in 2 seconds.",sys.exc_info())
|
42 |
+
time.sleep(2)
|
43 |
+
error=True
|
44 |
+
|
45 |
+
def translate_segment_Marian(segmentPre):
|
46 |
+
translation_candidates={}
|
47 |
+
translation_candidates["segmentNOTAGSPre"]=segmentPre
|
48 |
+
lseg=len(segmentPre)
|
49 |
+
try:
|
50 |
+
config.ws.send(segmentPre)
|
51 |
+
except:
|
52 |
+
printLOG(1,"Error sending segment to Marian.",sys.exc_info())
|
53 |
+
translations = config.ws.recv()
|
54 |
+
tc_aux=translations.split("\n")
|
55 |
+
translation_candidates["translationNOTAGSPre"]=[]
|
56 |
+
translation_candidates["alignments"]=[]
|
57 |
+
for tca in tc_aux:
|
58 |
+
printLOG(3,"TCA:",tca)
|
59 |
+
try:
|
60 |
+
segmentaux=tca.split(" ||| ")[1].strip()
|
61 |
+
alignmentaux=tca.split(" ||| ")[2].strip()
|
62 |
+
translation_candidates["translationNOTAGSPre"].append(segmentaux)
|
63 |
+
translation_candidates["alignments"].append(alignmentaux)
|
64 |
+
except:
|
65 |
+
pass
|
66 |
+
return(translation_candidates)
|
MTUOC_Moses.py
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# MTUOC_Moses
|
2 |
+
# Copyright (C) 2023 Antoni Oliver
|
3 |
+
# v. 07/06/2023
|
4 |
+
# This program is free software: you can redistribute it and/or modify
|
5 |
+
# it under the terms of the GNU General Public License as published by
|
6 |
+
# the Free Software Foundation, either version 3 of the License, or
|
7 |
+
# (at your option) any later version.
|
8 |
+
#
|
9 |
+
# This program is distributed in the hope that it will be useful,
|
10 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
12 |
+
# GNU General Public License for more details.
|
13 |
+
|
14 |
+
# You should have received a copy of the GNU General Public License
|
15 |
+
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
16 |
+
|
17 |
+
|
18 |
+
import time
|
19 |
+
import config
|
20 |
+
|
21 |
+
from MTUOC_misc import printLOG
|
22 |
+
|
23 |
+
|
24 |
+
def connect_to_Moses():
|
25 |
+
import xmlrpc.client
|
26 |
+
error=True
|
27 |
+
while error:
|
28 |
+
try:
|
29 |
+
proxyMoses = xmlrpc.client.ServerProxy("http://"+config.MTEngineIP+":"+str(config.MTEnginePort)+"/RPC2")
|
30 |
+
error=False
|
31 |
+
printLOG(1,"Connection with Moses Server created","")
|
32 |
+
except:
|
33 |
+
printLOG(1,"Error: waiting for Moses server to start. Retrying in 2 seconds.","")
|
34 |
+
time.sleep(2)
|
35 |
+
error=True
|
36 |
+
return(proxyMoses)
|
37 |
+
|
38 |
+
def translateAliMoses(aliBRUT):
|
39 |
+
newali=[]
|
40 |
+
for a in aliBRUT:
|
41 |
+
at=str(a['source-word'])+"-"+str(a['target-word'])
|
42 |
+
newali.append(at)
|
43 |
+
newali=" ".join(newali)
|
44 |
+
|
45 |
+
return(newali)
|
46 |
+
|
47 |
+
def translate_segment_Moses(segmentPre):
|
48 |
+
param = {"text": segmentPre}
|
49 |
+
result = config.proxyMoses.translate(param)
|
50 |
+
translation_candidates={}
|
51 |
+
translation_candidates["translationNOTAGSPre"]=[]
|
52 |
+
translation_candidates["alignments"]=[]
|
53 |
+
if "nbest" in result:
|
54 |
+
print("NBEST")
|
55 |
+
else:
|
56 |
+
print("UN SOL RESULTAT")
|
57 |
+
translationREP=result['text']
|
58 |
+
alignmentBRUT=result['word-align']
|
59 |
+
alignments=translateAliMoses(alignmentBRUT)
|
60 |
+
translation_candidates["translationNOTAGSPre"].append(translationREP)
|
61 |
+
translation_candidates["alignments"].append(alignments)
|
62 |
+
|
63 |
+
|
64 |
+
return(translation_candidates)
|
65 |
+
|
66 |
+
'''
|
67 |
+
translation_candidates["translationNOTAGSPre"]=[]
|
68 |
+
translation_candidates["alignments"]=[]
|
69 |
+
for tca in tc_aux:
|
70 |
+
try:
|
71 |
+
segmentaux=tca.split(" ||| ")[1].strip()
|
72 |
+
alignmentaux=tca.split(" ||| ")[2].strip()
|
73 |
+
translation_candidates["translationNOTAGSPre"].append(segmentaux)
|
74 |
+
translation_candidates["alignments"].append(alignmentaux)
|
75 |
+
except:
|
76 |
+
pass
|
77 |
+
'''
|
MTUOC_OpenNMT.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# MTUOC_OpenNMT
|
2 |
+
# Copyright (C) 2023 Antoni Oliver
|
3 |
+
# v. 07/06/2023
|
4 |
+
# This program is free software: you can redistribute it and/or modify
|
5 |
+
# it under the terms of the GNU General Public License as published by
|
6 |
+
# the Free Software Foundation, either version 3 of the License, or
|
7 |
+
# (at your option) any later version.
|
8 |
+
#
|
9 |
+
# This program is distributed in the hope that it will be useful,
|
10 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
12 |
+
# GNU General Public License for more details.
|
13 |
+
|
14 |
+
# You should have received a copy of the GNU General Public License
|
15 |
+
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
16 |
+
|
17 |
+
|
18 |
+
import requests
|
19 |
+
import time
|
20 |
+
import sys
|
21 |
+
import config
|
22 |
+
|
23 |
+
from MTUOC_misc import printLOG
|
24 |
+
|
25 |
+
|
26 |
+
def connect_to_OpenNMT():
|
27 |
+
config.urlOpenNMT = "http://"+config.MTEngineIP+":"+str(config.MTEnginePort)+"/translator/translate"
|
28 |
+
config.headersOpenNMT = {'content-type': 'application/json'}
|
29 |
+
|
30 |
+
|
31 |
+
def translate_segment_OpenNMT(segmentPre):
|
32 |
+
translation_candidates={}
|
33 |
+
translation_candidates["segmentNOTAGSPre"]=segmentPre
|
34 |
+
params = [{ "src" : segmentPre}]
|
35 |
+
response = requests.post(config.urlOpenNMT, json=params, headers=config.headersOpenNMT)
|
36 |
+
target = response.json()
|
37 |
+
selectedtranslationPre=target[0][0]["tgt"]
|
38 |
+
if "align" in target[0][0]:
|
39 |
+
selectedalignments=target[0][0]["align"][0]
|
40 |
+
else:
|
41 |
+
selectedalignments=""
|
42 |
+
translation_candidates["translationNOTAGSPre"]=[selectedtranslationPre]
|
43 |
+
translation_candidates["alignments"]=[selectedalignments]
|
44 |
+
return(translation_candidates)
|
MTUOC_misc.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import socket
|
2 |
+
from datetime import datetime
|
3 |
+
import sys
|
4 |
+
|
5 |
+
import config
|
6 |
+
|
7 |
+
def printLOG(vlevel,m1,m2=""):
|
8 |
+
cadena=str(datetime.now())+"\t"+str(m1)+"\t"+str(m2)
|
9 |
+
if vlevel<=config.verbosity_level:
|
10 |
+
print(cadena)
|
11 |
+
if config.log_file:
|
12 |
+
config.sortidalog.write(cadena+"\n")
|
13 |
+
|
14 |
+
def get_IP_info():
|
15 |
+
try:
|
16 |
+
host_name = socket.gethostname()
|
17 |
+
host_ip = socket.gethostbyname(host_name)
|
18 |
+
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
|
19 |
+
# doesn't even have to be reachable
|
20 |
+
s.connect(('10.255.255.255', 1))
|
21 |
+
IP = s.getsockname()[0]
|
22 |
+
return(IP)
|
23 |
+
except:
|
24 |
+
IP = '127.0.0.1'
|
25 |
+
return(IP)
|
26 |
+
finally:
|
27 |
+
s.close()
|
28 |
+
|
29 |
+
def capitalizeMTUOC(cadena):
|
30 |
+
resultat=cadena
|
31 |
+
try:
|
32 |
+
resultat=cadena[0].upper()+cadena[1:]
|
33 |
+
except:
|
34 |
+
pass
|
35 |
+
return(resultat)
|
MTUOC_preprocess.py
ADDED
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# MTUOC_preprocess
|
2 |
+
# Copyright (C) 2023 Antoni Oliver
|
3 |
+
# v. 23/05/2023
|
4 |
+
# This program is free software: you can redistribute it and/or modify
|
5 |
+
# it under the terms of the GNU General Public License as published by
|
6 |
+
# the Free Software Foundation, either version 3 of the License, or
|
7 |
+
# (at your option) any later version.
|
8 |
+
#
|
9 |
+
# This program is distributed in the hope that it will be useful,
|
10 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
12 |
+
# GNU General Public License for more details.
|
13 |
+
|
14 |
+
# You should have received a copy of the GNU General Public License
|
15 |
+
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
16 |
+
|
17 |
+
|
18 |
+
import config
|
19 |
+
from MTUOC_misc import printLOG
|
20 |
+
import regex as rx
|
21 |
+
import re
|
22 |
+
import html
|
23 |
+
|
24 |
+
|
25 |
+
def remove_control_characters(cadena):
|
26 |
+
return rx.sub(r'\p{C}', '', cadena)
|
27 |
+
|
28 |
+
re_num = re.compile(r'[\d,\.]+')
|
29 |
+
|
30 |
+
def replace_NUMs(segment,code="@NUM@"):
|
31 |
+
trobatsEXPRNUM=re.finditer(re_num,segment)
|
32 |
+
for trobat in trobatsEXPRNUM:
|
33 |
+
if not trobat.group(0) in [".",","]:
|
34 |
+
segment=segment.replace(trobat.group(0),code,1)
|
35 |
+
return(segment)
|
36 |
+
|
37 |
+
def splitnumbers(segment,joiner=""):
|
38 |
+
joiner=joiner+" "
|
39 |
+
xifres = re.findall(re_num,segment)
|
40 |
+
for xifra in xifres:
|
41 |
+
xifrastr=str(xifra)
|
42 |
+
xifrasplit=xifra.split()
|
43 |
+
xifra2=joiner.join(xifra)
|
44 |
+
segment=segment.replace(xifra,xifra2)
|
45 |
+
return(segment)
|
46 |
+
|
47 |
+
def preprocess_segment(segment):
|
48 |
+
if config.escape_html_input:
|
49 |
+
segment=html.escape(segment)
|
50 |
+
if config.unescape_html_input:
|
51 |
+
segment=html.unescape(segment)
|
52 |
+
segment=segment.replace(" <tag0>"," <tag0> ")
|
53 |
+
segment=segment.replace(" </tag0>"," </tag0> ")
|
54 |
+
segment=remove_control_characters(segment)
|
55 |
+
hastags=config.tagrestorer.has_tags(segment)
|
56 |
+
originaltags=config.tagrestorer.get_tags(segment)
|
57 |
+
printLOG(3,"HAS TAGS",hastags)
|
58 |
+
printLOG(3,"TAGS",originaltags)
|
59 |
+
#leading and trailing spaces
|
60 |
+
config.leading_spaces=len(segment)-len(segment.lstrip())
|
61 |
+
config.trailing_spaces=len(segment)-len(segment.rstrip())-1
|
62 |
+
segment=segment.lstrip().rstrip()
|
63 |
+
if config.pre_replace_NUMs:
|
64 |
+
segment=replace_NUMs(segment)
|
65 |
+
if config.pre_split_NUMs:
|
66 |
+
segment=splitnumbers(segment)
|
67 |
+
|
68 |
+
if config.sentencepiece:
|
69 |
+
try:
|
70 |
+
segmentPre=" ".join(config.spSL.encode(segment))
|
71 |
+
except:
|
72 |
+
printLOG(1,"ERROR preprocess segment:",sys.exc_info())
|
73 |
+
else:
|
74 |
+
segmentPre=segment
|
75 |
+
return(segmentPre)
|
76 |
+
|
77 |
+
def postprocess_segment(segmentPre):
|
78 |
+
try:
|
79 |
+
if config.sentencepiece:
|
80 |
+
segmentPost=config.spTL.decode(segmentPre.split())
|
81 |
+
except:
|
82 |
+
printLOG(1,"ERROR preprocess segment:",sys.exc_info())
|
83 |
+
return(segmentPost)
|
84 |
+
|
85 |
+
def tokenizationSL(segment):
|
86 |
+
if config.tokenize_SL and not config.tokenizerSL==None:
|
87 |
+
if config.tokenizerSLType=="MTUOC":
|
88 |
+
tokens=config.tokenizerSL.tokenize(segment)
|
89 |
+
elif config.tokenizerSLType=="Moses":
|
90 |
+
tokens=" ".join(config.tokenizerSL(segment))
|
91 |
+
else:
|
92 |
+
tokens=segment
|
93 |
+
|
94 |
+
return(tokens)
|
95 |
+
|
96 |
+
def tokenizationTL(segment):
|
97 |
+
if config.tokenize_TL and not config.tokenizerTL==None:
|
98 |
+
tokens=config.tokenizerTL.tokenize(segment)
|
99 |
+
else:
|
100 |
+
tokens=segment
|
101 |
+
return(tokens)
|
102 |
+
|
103 |
+
def detokenizationSL(tokens):
|
104 |
+
if not config.tokenizerSL==None:
|
105 |
+
segment=config.tokenizerSL.detokenize(tokens)
|
106 |
+
else:
|
107 |
+
segment=tokens
|
108 |
+
return(tokens)
|
109 |
+
|
110 |
+
def detokenizationTL(tokens):
|
111 |
+
if not config.tokenizerTL==None:
|
112 |
+
segment=config.tokenizerL.detokenize(tokens)
|
113 |
+
else:
|
114 |
+
segment=tokens
|
115 |
+
return(tokens)
|
MTUOC_splitnumbers.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# MTUOC_splitnumbers
|
2 |
+
# Copyright (C) 2023 Antoni Oliver
|
3 |
+
# v. 07/06/2023
|
4 |
+
# This program is free software: you can redistribute it and/or modify
|
5 |
+
# it under the terms of the GNU General Public License as published by
|
6 |
+
# the Free Software Foundation, either version 3 of the License, or
|
7 |
+
# (at your option) any later version.
|
8 |
+
#
|
9 |
+
# This program is distributed in the hope that it will be useful,
|
10 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
12 |
+
# GNU General Public License for more details.
|
13 |
+
|
14 |
+
# You should have received a copy of the GNU General Public License
|
15 |
+
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
16 |
+
|
17 |
+
|
18 |
+
import re
|
19 |
+
import sys
|
20 |
+
|
21 |
+
def splitnumbers(segment,joiner="@@"):
|
22 |
+
joiner=joiner+" "
|
23 |
+
xifres = re.findall(re_num,segment)
|
24 |
+
for xifra in xifres:
|
25 |
+
xifrastr=str(xifra)
|
26 |
+
xifrasplit=xifra.split()
|
27 |
+
xifra2=joiner.join(xifra)
|
28 |
+
segment=segment.replace(xifra,xifra2)
|
29 |
+
return(segment)
|
30 |
+
|
31 |
+
re_num = re.compile(r'[\d,.\-/]+')
|
32 |
+
re_num_tl = re.compile(r'(([\d,.\-/]\s?)+)')
|
33 |
+
|
34 |
+
|
35 |
+
if __name__ == "__main__":
|
36 |
+
for line in sys.stdin:
|
37 |
+
line=line.rstrip()
|
38 |
+
output=splitnumbers(line)
|
39 |
+
print(output)
|
40 |
+
|
41 |
+
|
42 |
+
|
43 |
+
|
44 |
+
|
MTUOC_tags.py
ADDED
@@ -0,0 +1,426 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# MTUOC_tags
|
2 |
+
# Copyright (C) 2023 Antoni Oliver
|
3 |
+
# v. 07/06/2023
|
4 |
+
# This program is free software: you can redistribute it and/or modify
|
5 |
+
# it under the terms of the GNU General Public License as published by
|
6 |
+
# the Free Software Foundation, either version 3 of the License, or
|
7 |
+
# (at your option) any later version.
|
8 |
+
#
|
9 |
+
# This program is distributed in the hope that it will be useful,
|
10 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
12 |
+
# GNU General Public License for more details.
|
13 |
+
|
14 |
+
# You should have received a copy of the GNU General Public License
|
15 |
+
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
16 |
+
|
17 |
+
import re
|
18 |
+
from collections import Counter
|
19 |
+
from bs4 import BeautifulSoup
|
20 |
+
|
21 |
+
|
22 |
+
def lreplace(pattern, sub, string):
|
23 |
+
"""
|
24 |
+
Replaces 'pattern' in 'string' with 'sub' if 'pattern' starts 'string'.
|
25 |
+
"""
|
26 |
+
return re.sub('^%s' % pattern, sub, string)
|
27 |
+
|
28 |
+
def rreplace(pattern, sub, string):
|
29 |
+
"""
|
30 |
+
Replaces 'pattern' in 'string' with 'sub' if 'pattern' ends 'string'.
|
31 |
+
"""
|
32 |
+
return re.sub('%s$' % pattern, sub, string)
|
33 |
+
|
34 |
+
class TagRestorer():
|
35 |
+
def __init__(self):
|
36 |
+
self.taglist=["<tag0>","<tag1>","<tag2>","<tag3>","<tag4>","<tag5>","<tag6>","<tag7>","<tag8>","<tag9>","<tag10>","</tag0>","</tag1>","</tag2>","</tag3>","</tag4>","</tag5>","</tag6>","</tag7>","</tag8>","</tag9>","</tag10>"]
|
37 |
+
|
38 |
+
def has_tags(self, segment):
|
39 |
+
response=False
|
40 |
+
tagsA = re.findall(r'</?.+?/?>', segment)
|
41 |
+
tagsB = re.findall(r'\{[0-9]+\}', segment)
|
42 |
+
if len(tagsA)>0 or len(tagsB)>0:
|
43 |
+
response=True
|
44 |
+
return(response)
|
45 |
+
|
46 |
+
def get_name(self, tag):
|
47 |
+
name=tag.split(" ")[0].replace("<","").replace(">","").replace("/","")
|
48 |
+
return(name)
|
49 |
+
|
50 |
+
def get_tags(self, segment):
|
51 |
+
tagsA = re.findall(r'</?.+?/?>', segment)
|
52 |
+
tagsB = re.findall(r'\{[0-9]+\}', segment)
|
53 |
+
tags=tagsA.copy()
|
54 |
+
tags.extend(tagsB)
|
55 |
+
return(tags)
|
56 |
+
|
57 |
+
def replace_tags(self, segment):
|
58 |
+
equil={}
|
59 |
+
if self.has_tags(segment):
|
60 |
+
tagsA = re.findall(r'</?.+?/?>', segment)
|
61 |
+
tagsB = re.findall(r'\{[0-9]+\}', segment)
|
62 |
+
tags=tagsA.copy()
|
63 |
+
tags.extend(tagsB)
|
64 |
+
conttag=0
|
65 |
+
for tag in tags:
|
66 |
+
if tag.find("</")>-1:
|
67 |
+
tagrep="</tag"+str(conttag)+">"
|
68 |
+
else:
|
69 |
+
tagrep="<tag"+str(conttag)+">"
|
70 |
+
segment=segment.replace(tag,tagrep,1)
|
71 |
+
equil[tagrep]=tag
|
72 |
+
if tag in tagsA:
|
73 |
+
tagclose="</"+self.get_name(tag)+">"
|
74 |
+
tagcloserep="</tag"+str(conttag)+">"
|
75 |
+
if segment.find(tagclose)>-1:
|
76 |
+
segment=segment.replace(tagclose,tagcloserep,1)
|
77 |
+
equil[tagcloserep]=tagclose
|
78 |
+
tags.remove(tagclose)
|
79 |
+
conttag+=1
|
80 |
+
|
81 |
+
return(segment,equil)
|
82 |
+
|
83 |
+
else:
|
84 |
+
return(segment,equil)
|
85 |
+
|
86 |
+
def remove_tags(self, segment):
|
87 |
+
segmentnotags=re.sub('(<[^>]+>)', "",segment)
|
88 |
+
segmentnotags=re.sub('({[0-9]+})', "",segmentnotags)
|
89 |
+
segmentnotags=" ".join(segmentnotags.split())
|
90 |
+
return(segmentnotags)
|
91 |
+
|
92 |
+
def restore_tags_old(self,SOURCENOTAGSTOK, SOURCETAGSTOK, SELECTEDALIGNMENT, TARGETNOTAGSTOK):
|
93 |
+
TARGETTAGLIST=TARGETNOTAGSTOK.split(" ")
|
94 |
+
ali={}
|
95 |
+
for a in SELECTEDALIGNMENT.split():
|
96 |
+
(a1,a2)=a.split("-")
|
97 |
+
a1=int(a1)
|
98 |
+
a2=int(a2)
|
99 |
+
ali[a1]=a2
|
100 |
+
position=0
|
101 |
+
tagpos={}
|
102 |
+
posacu=0
|
103 |
+
SOURCETAGSTOKLIST=SOURCETAGSTOK.split()
|
104 |
+
try:
|
105 |
+
while "▁" in SOURCETAGSTOKLIST: SOURCETAGSTOKLIST.remove("▁")
|
106 |
+
except:
|
107 |
+
pass
|
108 |
+
SOURCENOTAGSTOKLIST=SOURCENOTAGSTOK.split()
|
109 |
+
tagsposition={}
|
110 |
+
cont=0
|
111 |
+
acumulat=0
|
112 |
+
for token in SOURCETAGSTOKLIST:
|
113 |
+
if self.isSTag(token) or self.isSClosingTag(token):
|
114 |
+
if not token in ["<s>","</s>"]:
|
115 |
+
tagsposition[token]=cont
|
116 |
+
TARGETTAGLIST.insert(cont,token)
|
117 |
+
acumulat+=1
|
118 |
+
cont+=1
|
119 |
+
targettags=" ".join(TARGETTAGLIST)
|
120 |
+
return(targettags)
|
121 |
+
|
122 |
+
def remove_start_end_tag(self, segment):
|
123 |
+
try:
|
124 |
+
starttag=re.match("(</?tag[0-9]+>)+",segment)
|
125 |
+
starttag=starttag.group()
|
126 |
+
except:
|
127 |
+
starttag=""
|
128 |
+
try:
|
129 |
+
endtag=re.search("(</?tag[0-9]+>)+$",segment)
|
130 |
+
endtag=endtag.group()
|
131 |
+
except:
|
132 |
+
endtag=""
|
133 |
+
if starttag:
|
134 |
+
segment=lreplace(starttag,"",segment)
|
135 |
+
if endtag:
|
136 |
+
segment=rreplace(endtag,"",segment)
|
137 |
+
return(segment,starttag,endtag)
|
138 |
+
|
139 |
+
def repairSpacesTags(self,slsegment,tlsegment,delimiters=[" ",".",",",":",";","?","!"]):
|
140 |
+
sltags=self.get_tags(slsegment)
|
141 |
+
tltags=self.get_tags(tlsegment)
|
142 |
+
commontags= list((Counter(sltags) & Counter(tltags)).elements())
|
143 |
+
for tag in commontags:
|
144 |
+
try:
|
145 |
+
tagaux=tag
|
146 |
+
chbfSL=slsegment[slsegment.index(tag)-1]
|
147 |
+
chbfTL=tlsegment[tlsegment.index(tag)-1]
|
148 |
+
tagmod=tag
|
149 |
+
if chbfSL in delimiters and chbfTL not in delimiters:
|
150 |
+
tagmod=" "+tagmod
|
151 |
+
if not chbfSL in delimiters and chbfTL in delimiters:
|
152 |
+
tagaux=" "+tagaux
|
153 |
+
try:
|
154 |
+
chafSL=slsegment[slsegment.index(tag)+len(tag)]
|
155 |
+
except:
|
156 |
+
pass
|
157 |
+
try:
|
158 |
+
chafTL=tlsegment[tlsegment.index(tag)+len(tag)]
|
159 |
+
except:
|
160 |
+
pass
|
161 |
+
if chafSL in delimiters and not chafTL in delimiters:
|
162 |
+
tagmod=tagmod+" "
|
163 |
+
if not chafSL in delimiters and chafTL in delimiters:
|
164 |
+
tagaux=tagaux+" "
|
165 |
+
#slsegment=slsegment.replace(tagaux,tagmod,1)
|
166 |
+
tlsegment=tlsegment.replace(tagaux,tagmod,1)
|
167 |
+
tlsegment=tlsegment.replace(" "+tag," "+tag,1)
|
168 |
+
tlsegment=tlsegment.replace(tag+" ",tag+" ",1)
|
169 |
+
|
170 |
+
|
171 |
+
|
172 |
+
except:
|
173 |
+
pass
|
174 |
+
return(tlsegment)
|
175 |
+
|
176 |
+
def numerate(self,segment):
|
177 |
+
numeratedsegment=[]
|
178 |
+
cont=0
|
179 |
+
for token in segment.split():
|
180 |
+
if not token.replace("▁","").strip() in self.taglist:
|
181 |
+
tokenmod=token+"▂"+str(cont)
|
182 |
+
cont+=1
|
183 |
+
else:
|
184 |
+
tokenmod=token
|
185 |
+
numeratedsegment.append(tokenmod)
|
186 |
+
return(" ".join(numeratedsegment))
|
187 |
+
|
188 |
+
def retrieve_indexes(self, segment):
|
189 |
+
indexes=[]
|
190 |
+
for token in segment.split():
|
191 |
+
if token.find("▂")>-1:
|
192 |
+
parts=token.split("▂")
|
193 |
+
try:
|
194 |
+
index=int(parts[-1])
|
195 |
+
indexes.append(index)
|
196 |
+
except:
|
197 |
+
pass
|
198 |
+
if len(indexes)==0:
|
199 |
+
min_value=0
|
200 |
+
max_value=0
|
201 |
+
else:
|
202 |
+
min_value=min(indexes)
|
203 |
+
max_value=max(indexes)
|
204 |
+
return(min_value,max_value)
|
205 |
+
|
206 |
+
def insert_open_close(self, TARGETTAGSTOKNUM,opentag,closetag,minpos,maxpos):
|
207 |
+
position=0
|
208 |
+
num=-1
|
209 |
+
opendone=False
|
210 |
+
closedone=False
|
211 |
+
for token in TARGETTAGSTOKNUM:
|
212 |
+
if token.find("▂")>-1:
|
213 |
+
parts=token.split("▂")
|
214 |
+
try:
|
215 |
+
num=int(parts[-1])
|
216 |
+
except:
|
217 |
+
num=-1
|
218 |
+
if num==minpos and not opendone:
|
219 |
+
TARGETTAGSTOKNUM.insert(position,opentag)
|
220 |
+
opendone=True
|
221 |
+
elif num==maxpos and not closedone:
|
222 |
+
TARGETTAGSTOKNUM.insert(position+1,closetag)
|
223 |
+
closedone=True
|
224 |
+
position+=1
|
225 |
+
return(TARGETTAGSTOKNUM)
|
226 |
+
|
227 |
+
def insert_before(self, segment,insertposition,opentag):
|
228 |
+
position=0
|
229 |
+
num=-1
|
230 |
+
for token in segment:
|
231 |
+
if token.find("▂")>-1:
|
232 |
+
parts=token.split("▂")
|
233 |
+
try:
|
234 |
+
num=int(parts[-1])
|
235 |
+
except:
|
236 |
+
num=-1
|
237 |
+
if num==insertposition:
|
238 |
+
segment.insert(position,opentag)
|
239 |
+
break
|
240 |
+
position+=1
|
241 |
+
return(segment)
|
242 |
+
|
243 |
+
def insert_after(self, segment,insertposition,opentag):
|
244 |
+
position=0
|
245 |
+
num=-1
|
246 |
+
for token in segment:
|
247 |
+
if token.find("▂")>-1:
|
248 |
+
parts=token.split("▂")
|
249 |
+
try:
|
250 |
+
num=int(parts[-1])
|
251 |
+
except:
|
252 |
+
num=-1
|
253 |
+
if num==insertposition:
|
254 |
+
segment.insert(position+1,opentag)
|
255 |
+
break
|
256 |
+
position+=1
|
257 |
+
return(segment)
|
258 |
+
|
259 |
+
|
260 |
+
def insert_opentag(self, TARGETTAGSTOKNUM, position, opentag):
|
261 |
+
alreadydone=[]
|
262 |
+
position2=0
|
263 |
+
num=-1
|
264 |
+
for token in TARGETTAGSTOKNUM:
|
265 |
+
if token.find("▂")>-1:
|
266 |
+
parts=token.split("▂")
|
267 |
+
try:
|
268 |
+
num=int(parts[-1])
|
269 |
+
except:
|
270 |
+
num=-1
|
271 |
+
position2+=1
|
272 |
+
if num==position and not opentag in alreadydone:
|
273 |
+
insertposition=position
|
274 |
+
if insertposition<0: insertposition=0
|
275 |
+
TARGETTAGSTOKNUM=self.insert_before(TARGETTAGSTOKNUM,insertposition,opentag)
|
276 |
+
alreadydone.append(opentag)
|
277 |
+
return(TARGETTAGSTOKNUM)
|
278 |
+
|
279 |
+
def insert_closingtag(self, TARGETTAGSTOKNUM, position, closingtag):
|
280 |
+
alreadydone=[]
|
281 |
+
position2=0
|
282 |
+
num=-1
|
283 |
+
for token in TARGETTAGSTOKNUM:
|
284 |
+
if token.find("▂")>-1:
|
285 |
+
parts=token.split("▂")
|
286 |
+
try:
|
287 |
+
num=int(parts[-1])
|
288 |
+
except:
|
289 |
+
num=-1
|
290 |
+
position2+=1
|
291 |
+
if num==position and not closingtag in alreadydone:
|
292 |
+
insertposition=position
|
293 |
+
if insertposition<0: insertposition=0
|
294 |
+
TARGETTAGSTOKNUM=self.insert_after(TARGETTAGSTOKNUM,insertposition,closingtag)
|
295 |
+
alreadydone.append(closingtag)
|
296 |
+
return(TARGETTAGSTOKNUM)
|
297 |
+
|
298 |
+
|
299 |
+
|
300 |
+
|
301 |
+
return(TARGETTAGSTOKNUM)
|
302 |
+
|
303 |
+
def closest_value(self,input_list, input_value):
|
304 |
+
difference = lambda input_list : abs(input_list - input_value)
|
305 |
+
try:
|
306 |
+
res = min(input_list, key=difference)
|
307 |
+
except:
|
308 |
+
res=""
|
309 |
+
return res
|
310 |
+
|
311 |
+
def restore_tags(self,SOURCENOTAGSTOK, SOURCETAGSTOK, SELECTEDALIGNMENT, TARGETNOTAGSTOK):
|
312 |
+
SOURCETAGSTOK=SOURCETAGSTOK.replace(" ▁ "," ")
|
313 |
+
ali={}
|
314 |
+
nmax=0
|
315 |
+
nmin=100000
|
316 |
+
mmax=0
|
317 |
+
mmin=100000
|
318 |
+
for a in SELECTEDALIGNMENT.split():
|
319 |
+
(a1,a2)=a.split("-")
|
320 |
+
a1=int(a1)
|
321 |
+
a2=int(a2)
|
322 |
+
if not a1 in ali: ####AFEGIT AIXÒ
|
323 |
+
ali[a1]=a2
|
324 |
+
if a1>nmax: nmax=a1
|
325 |
+
if a1<nmin: nmin=a1
|
326 |
+
if a2>mmax: mmax=a2
|
327 |
+
if a2<mmin: mmin=a2
|
328 |
+
#chek is all alignments exists
|
329 |
+
nonexisting=[]
|
330 |
+
for i in range(nmin,nmax):
|
331 |
+
try:
|
332 |
+
b=ali[i]
|
333 |
+
except:
|
334 |
+
nonexisting.append(i)
|
335 |
+
inv_ali = {v: k for k, v in ali.items()}
|
336 |
+
inv_nonexisting=[]
|
337 |
+
for i in range(mmin,mmax):
|
338 |
+
try:
|
339 |
+
b=inv_ali[i]
|
340 |
+
except:
|
341 |
+
inv_nonexisting.append(i)
|
342 |
+
for ne in nonexisting:
|
343 |
+
closest=self.closest_value(inv_nonexisting,ne)
|
344 |
+
ali[ne]=closest
|
345 |
+
SOURCENOTAGSTOKNUM=self.numerate(SOURCENOTAGSTOK)
|
346 |
+
SOURCETAGSTOKNUM=self.numerate(SOURCETAGSTOK)
|
347 |
+
TARGETNOTAGSTOKNUM=self.numerate(TARGETNOTAGSTOK)
|
348 |
+
TARGETTAGSTOKNUM=TARGETNOTAGSTOKNUM.split(" ")
|
349 |
+
taglist=self.taglist.copy()
|
350 |
+
#finding open-close pairs
|
351 |
+
for n in range(0,11):
|
352 |
+
opentag="<tag"+str(n)+">"
|
353 |
+
closetag="</tag"+str(n)+">"
|
354 |
+
regexp=opentag+"(.*?)"+closetag
|
355 |
+
trobat=re.findall(regexp, SOURCETAGSTOKNUM, re.DOTALL)
|
356 |
+
if len(trobat)>0 and opentag in taglist and closetag in taglist:
|
357 |
+
(minpos,maxpos)=self.retrieve_indexes(trobat[0])
|
358 |
+
postrad=[]
|
359 |
+
postrad.append(ali[minpos])
|
360 |
+
postrad.append(ali[maxpos])
|
361 |
+
minpostrad=min(postrad)
|
362 |
+
maxpostrad=max(postrad)
|
363 |
+
TARGETTAGSTOKNUM=self.insert_open_close(TARGETTAGSTOKNUM,opentag,closetag,minpostrad,maxpostrad)
|
364 |
+
taglist.remove(opentag)
|
365 |
+
taglist.remove(closetag)
|
366 |
+
#finding open tags
|
367 |
+
for n in range(0,11):
|
368 |
+
opentag="<tag"+str(n)+">"
|
369 |
+
regexp=opentag+" [^\s]+"
|
370 |
+
trobat=re.findall(regexp, SOURCETAGSTOKNUM, re.DOTALL)
|
371 |
+
if len(trobat)>0 and opentag in taglist:
|
372 |
+
posttoken=trobat[0].replace(opentag,"").strip()
|
373 |
+
try:
|
374 |
+
postnum=int(posttoken.split("▂")[1])
|
375 |
+
except:
|
376 |
+
postnum=None
|
377 |
+
if not postnum==None and opentag in taglist:
|
378 |
+
TARGETTAGSTOKNUM=self.insert_opentag(TARGETTAGSTOKNUM, ali[postnum], opentag)
|
379 |
+
taglist.remove(opentag)
|
380 |
+
#finding closing tags
|
381 |
+
for n in range(0,11):
|
382 |
+
closingtag="</tag"+str(n)+">"
|
383 |
+
regexp="[^\s]+ "+closingtag
|
384 |
+
trobat=re.findall(regexp, SOURCETAGSTOKNUM, re.DOTALL)
|
385 |
+
if len(trobat)>0 and closingtag in taglist:
|
386 |
+
pretoken=trobat[0].replace(closingtag,"").strip()
|
387 |
+
try:
|
388 |
+
prenum=int(pretoken.split("▂")[1])
|
389 |
+
except:
|
390 |
+
prenum=None
|
391 |
+
if not prenum==None and closingtag in taglist:
|
392 |
+
TARGETTAGSTOKNUM=self.insert_closingtag(TARGETTAGSTOKNUM, ali[prenum], closingtag)
|
393 |
+
taglist.remove(closingtag)
|
394 |
+
|
395 |
+
#removing numbering
|
396 |
+
TARGETTAGS=[]
|
397 |
+
for token in TARGETTAGSTOKNUM:
|
398 |
+
TARGETTAGS.append(token.split("▂")[0])
|
399 |
+
|
400 |
+
TARGETTAGS=" ".join(TARGETTAGS)
|
401 |
+
return(TARGETTAGS)
|
402 |
+
|
403 |
+
def fix_xml_tags(self,myxml):
|
404 |
+
if self.has_tags(myxml):
|
405 |
+
tagsPRE=self.get_tags(myxml)
|
406 |
+
myxml2="<fix_xml>"+myxml+"</fix_xml>"
|
407 |
+
soup = BeautifulSoup(myxml2,'xml')
|
408 |
+
fixed=str(soup).replace("<fix_xml>","").replace("</fix_xml>","").split("\n")[-1]
|
409 |
+
tags=self.get_tags(fixed)
|
410 |
+
for TP in tagsPRE:
|
411 |
+
if not TP in tags:
|
412 |
+
return(myxml)
|
413 |
+
for tag in tags:
|
414 |
+
tag2=tag.replace('"',"'")
|
415 |
+
if myxml.find(tag)==-1 and myxml.find(tag2)==-1:
|
416 |
+
fixed=fixed.replace(tag,"")
|
417 |
+
|
418 |
+
if not self.remove_tags(myxml)==self.remove_tags(fixed):
|
419 |
+
fixed=myxml
|
420 |
+
else:
|
421 |
+
fixed=myxml
|
422 |
+
return(fixed)
|
423 |
+
|
424 |
+
|
425 |
+
|
426 |
+
|
MTUOC_tokenizer_eng.py
ADDED
@@ -0,0 +1,360 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# MTUOC_tokenizer_eng 4.0
|
2 |
+
# Copyright (C) 2024 Antoni Oliver
|
3 |
+
# 02/11/2023
|
4 |
+
# This program is free software: you can redistribute it and/or modify
|
5 |
+
# it under the terms of the GNU General Public License as published by
|
6 |
+
# the Free Software Foundation, either version 3 of the License, or
|
7 |
+
# (at your option) any later version.
|
8 |
+
#
|
9 |
+
# This program is distributed in the hope that it will be useful,
|
10 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
12 |
+
# GNU General Public License for more details.
|
13 |
+
|
14 |
+
# You should have received a copy of the GNU General Public License
|
15 |
+
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
16 |
+
|
17 |
+
|
18 |
+
import string
|
19 |
+
import re
|
20 |
+
import sys
|
21 |
+
import html
|
22 |
+
|
23 |
+
QUOTES = (
|
24 |
+
#adapted from: https://gist.github.com/goodmami/ quotes.py
|
25 |
+
'\u0022' # quotation mark (")
|
26 |
+
'\u0027' # apostrophe (')
|
27 |
+
'\u00ab' # left-pointing double-angle quotation mark
|
28 |
+
'\u00bb' # right-pointing double-angle quotation mark
|
29 |
+
'\u2018' # left single quotation mark
|
30 |
+
'\u2019' # right single quotation mark
|
31 |
+
'\u201a' # single low-9 quotation mark
|
32 |
+
'\u201b' # single high-reversed-9 quotation mark
|
33 |
+
'\u201c' # left double quotation mark
|
34 |
+
'\u201d' # right double quotation mark
|
35 |
+
'\u201e' # double low-9 quotation mark
|
36 |
+
'\u201f' # double high-reversed-9 quotation mark
|
37 |
+
'\u2039' # single left-pointing angle quotation mark
|
38 |
+
'\u203a' # single right-pointing angle quotation mark
|
39 |
+
'\u300c' # left corner bracket
|
40 |
+
'\u300d' # right corner bracket
|
41 |
+
'\u300e' # left white corner bracket
|
42 |
+
'\u300f' # right white corner bracket
|
43 |
+
'\u301d' # reversed double prime quotation mark
|
44 |
+
'\u301e' # double prime quotation mark
|
45 |
+
'\u301f' # low double prime quotation mark
|
46 |
+
'\ufe41' # presentation form for vertical left corner bracket
|
47 |
+
'\ufe42' # presentation form for vertical right corner bracket
|
48 |
+
'\ufe43' # presentation form for vertical left corner white bracket
|
49 |
+
'\ufe44' # presentation form for vertical right corner white bracket
|
50 |
+
'\uff02' # fullwidth quotation mark
|
51 |
+
'\uff07' # fullwidth apostrophe
|
52 |
+
'\uff62' # halfwidth left corner bracket
|
53 |
+
'\uff63' # halfwidth right corner bracket
|
54 |
+
)
|
55 |
+
|
56 |
+
HYPENS = ('\u2010','\u2011','\u2012','\u2013')
|
57 |
+
|
58 |
+
class Tokenizer():
|
59 |
+
def __init__(self):
|
60 |
+
self.specialchars=["¿","¡",]
|
61 |
+
self.specialchars.extend(QUOTES)
|
62 |
+
self.specialchars.extend(HYPENS)
|
63 |
+
self.subs=["■'s","■'ll","■'t","■'cause","■'d","■'em","■'ve","■'dn","■'m","■'n","■'re","■'til","■'tween","■'all","ol'■"]
|
64 |
+
self.re_num = re.compile(r'[\d\,\.]+')
|
65 |
+
self.abr=["a.a.","a.a.a.","a.a.u.","a.b.","a.b.a","abbr.","abr.","a.c.","acad.","aclu.","a.d.","adm.","a.e.c.","a.f.l.","afl-cio","afrik.","a.i.a.","a.k.c.","a.l.","a.l.a.","ala.","alt.","alta.","a.m.","a.m.a.","a.m.p.","a.m.u.","antilog.","a.p.","arab.","ariz.","ark.","a.s.","ascap.","at.no.","at.wt.","a.u.","aug.","a.v.","avdp.","ave.","b.a.","b.b.c.","b.c.","b.d.","b.lit.","b.mus.","b.p.","brig.gen.","b.s.","b.t.u.","bul.","bulg.","c.","cal.","calif.","cant.","capt.","c.c.","c.d.","cent.","cento.","c.e.o.","c.g.s.","chem.","chin.","chron.","cie.","c.i.a.","c.i.d.","c.i.o.","cl.","c.m.","cn.","co.","col.","coll.","colo.","comdr.","comp.","com.pop.","conn.","cor.","corp.","cos.","cot.","coul.","cp.","c.p.a.","c.p.l.","c.p.o.","c.s.c.","c.u.","dan.","dar.","d.c.","d.c.l.","d.d.","d.d.s.","d.d.t.","dec.","del.","dept.","deut.","dist.","div.","dr.","ds.","d.sc.","du.","e.c.","e.c.a.","eccles.","ecclus.","ed.","e.d.c.","e.e.","e.e.a.","e.e.c.","e.e.o.c.","e.f.t.a.","e.g.","e.m.f.","e.m.u.","eng.","enl.","eph.","e.r.a.","e.r.p.","e.s.c.","esp.","est.","e.u.","ev.","ex.","ezek.","f.a.a.","fac.","f.a.o.","f.b.i.","f.c.c.","f.d.a.","feb.","f.e.p.c.","finn.","fl.","fla.","floz.","f.m.","fr.","ft.","f.t.c.","g.","ga.","gal.","gall.","gatt.","g.d.p.","gen.","ger.","gm.","g.m.b.","g.m.t.","g.n.p.","g.o.p.","gov.","gr.","grad.","grm.","hab.","hag.","heb.","h.m.s.","hon.","hr.","hung.","hz.","i.a.u.","i.b.m.","i.b.t.","i.c.a.o.","i.c.b.m.","i.c.c.","icel.","i.e.","i.g.y.","ilgwu.","ill.","i.l.o.","i.m.f.","inc.","incl.","ind.","ing.","inst.","introd.","i.q.","i.r.a.","i.r.b.m.","i.r.s.","isa.","ital.","i.t.u.","i.u.p.a.c.","i.w.w.","jan.","jap.","j.d.","jer.","j.g.","jr.","j.","kc.","kg.","kgb.","kgm.","k.k.k.","kl.","km.","kw.","kwh.","ky.","l.","la.","lam.","lat.","lb.","lev.","l.h.d.","lib.","lith.","litt.b.","litt.d.","ll.b.","ll.d.","l.s.d.","lt.","lt.col.","ltd.","lt.gen.","lt.gov.","lts.","m.a.","mac.","maj.","maj.gen.","mal.","mass.","mass.no.","m.b.","m.d.","md.","m.e.","messrs.","mev.","mex.","mfg.","mg.","m.h.g.","mi.","mich.","min.","minn.","miss.","mks.","ml.","mlle.","mls.","mm.","mme.","mo.","mont.","m.p.","m.p.h.","mph.","mr.","mrs.","m.s.","ms.","msgr.","mss.","mt.","mts.","mus.","mus.b.","mus.d.","n.a.a.c.p.","n.a.f.t.a.","n.a.s.a.","n.a.s.d.a.q.","n.a.t.o.","n.b.","n.b.a.","n.c.","n.c.a.a.","n.c.o.","n.dak.","n.e.","n.e.a.","nebr.","neh.","nev.","n.f.","n.f.l.","n.h.","n.h.l.","n.j.","nl.","n.l.r.b.","n.mex.","nnw","no.","non-u.s.","nor.","nov.","n.r.a.","n.r.c.","n.s.","n.s.f.","num.","n.v.","n.y.","n.y.a.","n.y.s.e.","o.a.s.","obad.","oct.","o.e.","o.e.c.d.","o.e.o.","o.e.s.","o.fr.","o.h.g.","okla.","o.n.","ont.","op.","o.p.a.","o.s.","o.s.c.e.","o.s.s.","o.z.","oz.","pa.","p.a.u.","pd.d.","p.e.i.","pers.","p.f.c.","p.g.a.","ph.","ph.b.","ph.d.","philip.","pk.","pkg.","pl.","plc","p.m","p.m.","pn.","po.","pol.","pop.","port.","prof.","prov.","prov(s).","ps.","pseud.","pss.","pt.","pts.","pub.","pvt.","p.w.a.","q.t.","qt.","qts.","que.","r.a.","r.a.f.","rep.","reps.","repr.","rev.","r.i.","r.n.","r.n.a.","rom.","r.o.t.c.","r.p.m.","rpm.","r.r.","r.s.f.s.r.","r.s.v.","rt.rev.","rus.","r.v.","sam.","sask.","s.c.","sc.d.","s.dak.","s.e.","s.e.a.t.o.","sec.","sen.","sens.","sept.","ser.","sgt.","s.j.","skt.","sl.","s.o.s.","span.","s.p.a.","s.p.c.a.","s.p.c.c.","sp.gr.","s.q.","sr.","s.s.","s.s.r.","st.","s.t.d.","s.t.e.","s.t.p.","s.w.","swed.","t.","tablesp.","t.a.n.","t.a.s.s.","tb.","tbl.","tble.","tbles.","tbls.","tblsp.","tbs.","tbsn.","tbsp.","tbsps.","teas.","tenn.","thess.","ti
m.","t.n.t","tr.","ts.","tsp.","tsps.","turk.","t.v.a.","u.a.w.","u.h.f.","ukr.","u.m.w.","u.n.","uninc.","univ.","u.n.r.r.a.","u.p.i.","u.s.","u.s.a.","u.s.a.f.","u.s.c.g.","u.s.m.c.","u.s.n.","u.s.o.","u.s.s.","u.s.s.r.","u.t.","va.","var.","ved.","v.","v.f.w.","v.h.f.","vol.","vs.","vt.","w.a.c.","w.c.t.u.","w.e.u.","w.f.t.u.","wis.","wmo.","wpa.","wt.","wto.","w.va.","wyo.","yd.","y.m.c.a.","y.m.h.a.","y.w.c.a.","y.w.h.a.","zech.","zeph."]
|
66 |
+
abr_aux=[]
|
67 |
+
abr_aux=[]
|
68 |
+
for a in self.abr:
|
69 |
+
am1=a.capitalize()
|
70 |
+
am2=a.upper()
|
71 |
+
abr_aux.append(am1)
|
72 |
+
abr_aux.append(am2)
|
73 |
+
self.abr.extend(abr_aux)
|
74 |
+
self.setabr=set(self.abr)
|
75 |
+
self.abrsig=()
|
76 |
+
def split_numbers(self,segment):
|
77 |
+
xifres = re.findall(self.re_num,segment)
|
78 |
+
for xifra in xifres:
|
79 |
+
xifrastr=str(xifra)
|
80 |
+
xifrasplit=xifra.split()
|
81 |
+
xifra2=[]
|
82 |
+
contpos=0
|
83 |
+
for x in xifrastr:
|
84 |
+
if not contpos==0: xifra2.append(" ■")
|
85 |
+
xifra2.append(x)
|
86 |
+
contpos+=1
|
87 |
+
xifra2="".join(xifra2)
|
88 |
+
segment=segment.replace(xifra,xifra2)
|
89 |
+
return(segment)
|
90 |
+
|
91 |
+
|
92 |
+
|
93 |
+
def protect_tags(self, segment):
|
94 |
+
tags=re.findall(r'<[^>]+>',segment)
|
95 |
+
for tag in tags:
|
96 |
+
ep=False
|
97 |
+
ef=False
|
98 |
+
if segment.find(" "+tag)==-1:ep=True
|
99 |
+
if segment.find(tag+" ")==-1:ef=True
|
100 |
+
tagmod=tag.replace("<","<").replace(">",">").replace("=","=").replace("'","'").replace('"',""").replace("/","/").replace(" "," ")
|
101 |
+
if ep: tagmod=" ■"+tagmod
|
102 |
+
if ef: tagmod=tagmod+"■ "
|
103 |
+
segment=segment.replace(tag,tagmod)
|
104 |
+
tags=re.findall(r'\{[0-9]+\}',segment)
|
105 |
+
for tag in tags:
|
106 |
+
ep=False
|
107 |
+
ef=False
|
108 |
+
if segment.find(" "+tag)==-1:ep=True
|
109 |
+
if segment.find(tag+" ")==-1:ef=True
|
110 |
+
tagmod=tag.replace("{","{").replace("}","}")
|
111 |
+
if ep: tagmod=" ■"+tagmod
|
112 |
+
if ef: tagmod=tagmod+"■ "
|
113 |
+
segment=segment.replace(tag,tagmod)
|
114 |
+
return(segment)
|
115 |
+
|
116 |
+
def protect_abr(self,cadena):
|
117 |
+
setcadena=set(cadena.split(" "))
|
118 |
+
common=(self.setabr & setcadena)
|
119 |
+
self.abrsig=list(common)
|
120 |
+
for a in common:
|
121 |
+
amod=a+" "
|
122 |
+
apro=a.replace(".",".")
|
123 |
+
cadena=cadena.replace(amod,apro)
|
124 |
+
sigles=re.findall(r'((\w\.){2,})',cadena)
|
125 |
+
for s in sigles:
|
126 |
+
a=s[0]
|
127 |
+
self.abrsig.append(a)
|
128 |
+
amod=a+" "
|
129 |
+
apro=a.replace(".",".")
|
130 |
+
cadena=cadena.replace(amod,apro)
|
131 |
+
return(cadena)
|
132 |
+
|
133 |
+
def unprotect(self, cadena):
|
134 |
+
cadena=cadena.replace("'","'").replace("-","-").replace("<","<").replace(">",">").replace(""",'"').replace("=","=").replace(" ","▁").replace("/","/").replace("{","{").replace("}","}")
|
135 |
+
return(cadena)
|
136 |
+
|
137 |
+
def unprotect_tags(self, cadena):
|
138 |
+
cadena=cadena.replace("<","<").replace(">",">")
|
139 |
+
return(cadena)
|
140 |
+
|
141 |
+
def unprotect_abr(self,cadena):
|
142 |
+
for a in self.abrsig:
|
143 |
+
amod=a+" "
|
144 |
+
apro=a.replace(".",".")
|
145 |
+
cadena=cadena.replace(apro,amod)
|
146 |
+
return(cadena)
|
147 |
+
|
148 |
+
def main_tokenizer(self,segment):
|
149 |
+
segment=" "+segment+" "
|
150 |
+
cadena=self.protect_tags(segment)
|
151 |
+
cadena=self.protect_abr(cadena)
|
152 |
+
|
153 |
+
for s in [".",","]:
|
154 |
+
pass
|
155 |
+
for s in self.subs:
|
156 |
+
sA=s.replace("■","")
|
157 |
+
sB=s.replace("'","'").replace("-","-")
|
158 |
+
if s.startswith("■"):sB=" "+sB
|
159 |
+
if s.endswith("■"):sB=sB+" "
|
160 |
+
cadena = re.sub(sA+r'\b', sB, cadena)
|
161 |
+
cadena = re.sub(r'\b'+sA, sB, cadena)
|
162 |
+
cadena = re.sub(sA.upper()+r'\b', sB.upper(), cadena)
|
163 |
+
cadena = re.sub(r'\b'+sA.upper(), sB.upper(), cadena)
|
164 |
+
punt=list(string.punctuation)
|
165 |
+
exceptions=["&",";","#"]
|
166 |
+
for e in exceptions:
|
167 |
+
punt.remove(e)
|
168 |
+
punt.remove(".")
|
169 |
+
punt.remove(",")
|
170 |
+
cadena = re.sub(r'\.(\D)'," . "+r"\1", cadena)
|
171 |
+
cadena = re.sub(r'\,(\D)'," , "+r"\1", cadena)
|
172 |
+
for p in punt:
|
173 |
+
ppre=" ■"+p
|
174 |
+
ppost=p+"■ "
|
175 |
+
try:
|
176 |
+
expr1="(\\S)\\"+p+"(\\s)"
|
177 |
+
expr2=r"\1"+ppre+r"\2"
|
178 |
+
cadena = re.sub(expr1,expr2, cadena)
|
179 |
+
expr1="(\\s)\\"+p+"(\\S)"
|
180 |
+
expr2=r"\1"+ppost+r"\2"
|
181 |
+
cadena = re.sub(expr1,expr2, cadena)
|
182 |
+
except:
|
183 |
+
pass
|
184 |
+
cadena=self.unprotect_tags(cadena)
|
185 |
+
cadena=self.unprotect_abr(cadena)
|
186 |
+
for p in self.specialchars:
|
187 |
+
pmod=p+" "
|
188 |
+
if cadena.find(pmod)>=-1:
|
189 |
+
pmod2=p+"■ "
|
190 |
+
cadena=cadena.replace(p,pmod2)
|
191 |
+
pmod=" "+p
|
192 |
+
if cadena.find(pmod)>=-1:
|
193 |
+
pmod2=" ■"+p
|
194 |
+
cadena=cadena.replace(p,pmod2)
|
195 |
+
|
196 |
+
|
197 |
+
|
198 |
+
|
199 |
+
cadena=self.unprotect(cadena)
|
200 |
+
|
201 |
+
for p in exceptions:
|
202 |
+
pmod=p+" "
|
203 |
+
if cadena.find(pmod)>=-1:
|
204 |
+
pmod2=p+"■ "
|
205 |
+
cadena=cadena.replace(p,pmod2)
|
206 |
+
pmod=" "+p
|
207 |
+
if cadena.find(pmod)>=-1:
|
208 |
+
pmod2=" ■"+p
|
209 |
+
cadena=cadena.replace(p,pmod2)
|
210 |
+
|
211 |
+
cadena=cadena.replace("▁"," ")
|
212 |
+
cadena=' '.join(cadena.split())
|
213 |
+
return(cadena)
|
214 |
+
|
215 |
+
def tokenize(self,segment):
|
216 |
+
tokenized=self.main_tokenizer(segment)
|
217 |
+
tokenized=tokenized.replace("■","")
|
218 |
+
tokenized=' '.join(tokenized.split())
|
219 |
+
return(tokenized)
|
220 |
+
|
221 |
+
def detokenize(self, segment):
|
222 |
+
for sub in self.subs:
|
223 |
+
subA=sub.replace("■"," ")
|
224 |
+
subB=sub.replace("■","")
|
225 |
+
segment=segment.replace(subA,subB)
|
226 |
+
segment=segment.replace(subA.capitalize(),subB.capitalize())
|
227 |
+
segment=segment.replace(subA.upper(),subB.upper())
|
228 |
+
segment=segment.replace(" .",".")
|
229 |
+
segment=segment.replace(" ,",",")
|
230 |
+
segment=segment.replace(" :",":")
|
231 |
+
segment=segment.replace(" ;",";")
|
232 |
+
segment=segment.replace(" :",":")
|
233 |
+
segment=segment.replace(" )",")")
|
234 |
+
segment=segment.replace("( ","(")
|
235 |
+
segment=segment.replace(" ]","]")
|
236 |
+
segment=segment.replace("[ ","[")
|
237 |
+
segment=segment.replace(" }","}")
|
238 |
+
segment=segment.replace("{ ","{")
|
239 |
+
segment=segment.replace(" !","!")
|
240 |
+
segment=segment.replace("¡ ","¡")
|
241 |
+
segment=segment.replace(" ?","?")
|
242 |
+
segment=segment.replace("¿ ","¿")
|
243 |
+
segment=segment.replace(" »","»")
|
244 |
+
segment=segment.replace("« ","«")
|
245 |
+
segment=segment.replace("‘ ","‘")
|
246 |
+
segment=segment.replace(" ’","’")
|
247 |
+
segment=segment.replace("“ ","“")
|
248 |
+
segment=segment.replace(" ”","”")
|
249 |
+
segment=' '.join(segment.split())
|
250 |
+
return(segment)
|
251 |
+
|
252 |
+
def tokenize_j(self,segment):
|
253 |
+
tokenized=self.main_tokenizer(segment)
|
254 |
+
tokenized=' '.join(tokenized.split())
|
255 |
+
return(tokenized)
|
256 |
+
|
257 |
+
def detokenize_j(self,segment):
|
258 |
+
segment=segment.replace(" ■","")
|
259 |
+
segment=segment.replace("■ ","")
|
260 |
+
segment=segment.replace("■","")
|
261 |
+
detok=segment
|
262 |
+
detok=' '.join(detok.split())
|
263 |
+
return(detok)
|
264 |
+
|
265 |
+
def tokenize_jn(self,segment):
|
266 |
+
tokenized=self.tokenize_j(segment)
|
267 |
+
tokenized=self.split_numbers(tokenized)
|
268 |
+
tokenized=' '.join(tokenized.split())
|
269 |
+
return(tokenized)
|
270 |
+
|
271 |
+
def detokenize_jn(self,segment):
|
272 |
+
segment=self.detokenize_j(segment)
|
273 |
+
return(segment)
|
274 |
+
|
275 |
+
def tokenize_s(self,segment):
|
276 |
+
tokenized=self.main_tokenizer(segment)
|
277 |
+
tokenized=tokenized.replace("■ ","■")
|
278 |
+
tokenized=tokenized.replace(" ■","■")
|
279 |
+
tokenized=tokenized.replace(" "," ▁")
|
280 |
+
tokenized=tokenized.replace("■"," ")
|
281 |
+
tokenized=' '.join(tokenized.split())
|
282 |
+
return(tokenized)
|
283 |
+
|
284 |
+
def detokenize_s(self,segment):
|
285 |
+
segment=segment.replace(" ","")
|
286 |
+
segment=segment.replace("▁"," ")
|
287 |
+
detok=segment
|
288 |
+
detok=' '.join(detok.split())
|
289 |
+
return(detok)
|
290 |
+
|
291 |
+
def tokenize_sn(self,segment):
|
292 |
+
tokenized=self.main_tokenizer(segment)
|
293 |
+
tokenized=self.split_numbers(tokenized)
|
294 |
+
tokenized=tokenized.replace("■ ","■")
|
295 |
+
tokenized=tokenized.replace(" ■","■")
|
296 |
+
tokenized=tokenized.replace(" "," ▁")
|
297 |
+
tokenized=tokenized.replace("■"," ")
|
298 |
+
tokenized=' '.join(tokenized.split())
|
299 |
+
return(tokenized)
|
300 |
+
|
301 |
+
def detokenize_sn(self,segment):
|
302 |
+
segment=self.detokenize_s(segment)
|
303 |
+
return(segment)
|
304 |
+
|
305 |
+
def print_help():
|
306 |
+
print("MTUOC_tokenizer_cat.py A tokenizer for Catalan, usage:")
|
307 |
+
print("Simple tokenization:")
|
308 |
+
print(' cat "sentence to tokenize." | python3 MTUOC_tokenizer_cat.py tokenize')
|
309 |
+
print(' python3 MTUOC_tokenizer_eng.py tokenize < file_to_tokenize > tokenized_file')
|
310 |
+
print()
|
311 |
+
print("Simple detokenization:")
|
312 |
+
print(' cat "sentence to tokenize." | python3 MTUOC_tokenizer_cat.py detokenize')
|
313 |
+
print(' python3 MTUOC_tokenizer_eng.py detokenize < file_to_detokenize > detokenized_file')
|
314 |
+
print()
|
315 |
+
print("Advanced options:")
|
316 |
+
print(" tokenization/detokenization with joiner marks (■): tokenize_j / detokenize_j")
|
317 |
+
print(" tokenization/detokenization with joiner marks (■) and number splitting: tokenize_jn / detokenize_jn")
|
318 |
+
print(" tokenization/detokenization with splitter marks (▁): tokenize_s / detokenize_s")
|
319 |
+
print(" tokenization/detokenization with splitter marks (▁) and number splitting: tokenize_sn / detokenize_sn")
|
320 |
+
|
321 |
+
|
322 |
+
if __name__ == "__main__":
|
323 |
+
if len(sys.argv)>1:
|
324 |
+
if sys.argv[1]=="-h" or sys.argv[1]=="--help":
|
325 |
+
print_help()
|
326 |
+
sys.exit()
|
327 |
+
action=sys.argv[1]
|
328 |
+
else:
|
329 |
+
action="tokenize"
|
330 |
+
tokenizer=Tokenizer()
|
331 |
+
for line in sys.stdin:
|
332 |
+
line=line.strip()
|
333 |
+
#normalization of apostrophe
|
334 |
+
line=line.replace("’","'")
|
335 |
+
line=html.unescape(line)
|
336 |
+
if action=="tokenize":
|
337 |
+
outsegment=tokenizer.tokenize(line)
|
338 |
+
elif action=="detokenize":
|
339 |
+
outsegment=tokenizer.detokenize(line)
|
340 |
+
|
341 |
+
elif action=="tokenize_s":
|
342 |
+
outsegment=tokenizer.tokenize_s(line)
|
343 |
+
elif action=="detokenize_s":
|
344 |
+
outsegment=tokenizer.detokenize_s(line)
|
345 |
+
elif action=="tokenize_sn":
|
346 |
+
outsegment=tokenizer.tokenize_sn(line)
|
347 |
+
elif action=="detokenize_sn":
|
348 |
+
outsegment=tokenizer.detokenize_sn(line)
|
349 |
+
|
350 |
+
elif action=="tokenize_j":
|
351 |
+
outsegment=tokenizer.tokenize_j(line)
|
352 |
+
elif action=="detokenize_j":
|
353 |
+
outsegment=tokenizer.detokenize_j(line)
|
354 |
+
elif action=="tokenize_jn":
|
355 |
+
outsegment=tokenizer.tokenize_jn(line)
|
356 |
+
elif action=="detokenize_jn":
|
357 |
+
outsegment=tokenizer.detokenize_jn(line)
|
358 |
+
|
359 |
+
print(outsegment)
|
360 |
+
|
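The tokenizer defined above can also be used programmatically, not only through the command-line actions listed in print_help(). A minimal usage sketch, assuming MTUOC_tokenizer_eng.py is importable from the working directory; the sample sentence is invented for illustration:

from MTUOC_tokenizer_eng import Tokenizer

tokenizer = Tokenizer()
sentence = "Dr. Smith arrived at 10.30, didn't he?"
# tokenize_j keeps joiner marks (■) so that detokenize_j can undo the splitting
tok = tokenizer.tokenize_j(sentence)
print(tok)
print(tokenizer.detokenize_j(tok))
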
MTUOC_tokenizer_spa.py
ADDED
@@ -0,0 +1,359 @@
|
1 |
+
# MTUOC_tokenizer_spa 4.0
|
2 |
+
# Copyright (C) 2024 Antoni Oliver
|
3 |
+
# This program is free software: you can redistribute it and/or modify
|
4 |
+
# it under the terms of the GNU General Public License as published by
|
5 |
+
# the Free Software Foundation, either version 3 of the License, or
|
6 |
+
# (at your option) any later version.
|
7 |
+
#
|
8 |
+
# This program is distributed in the hope that it will be useful,
|
9 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11 |
+
# GNU General Public License for more details.
|
12 |
+
|
13 |
+
# You should have received a copy of the GNU General Public License
|
14 |
+
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
15 |
+
|
16 |
+
|
17 |
+
import string
|
18 |
+
import re
|
19 |
+
import sys
|
20 |
+
import html
|
21 |
+
|
22 |
+
QUOTES = (
|
23 |
+
#adapted from: https://gist.github.com/goodmami/quotes.py
|
24 |
+
'\u0022' # quotation mark (")
|
25 |
+
'\u0027' # apostrophe (')
|
26 |
+
'\u00ab' # left-pointing double-angle quotation mark
|
27 |
+
'\u00bb' # right-pointing double-angle quotation mark
|
28 |
+
'\u2018' # left single quotation mark
|
29 |
+
'\u2019' # right single quotation mark
|
30 |
+
'\u201a' # single low-9 quotation mark
|
31 |
+
'\u201b' # single high-reversed-9 quotation mark
|
32 |
+
'\u201c' # left double quotation mark
|
33 |
+
'\u201d' # right double quotation mark
|
34 |
+
'\u201e' # double low-9 quotation mark
|
35 |
+
'\u201f' # double high-reversed-9 quotation mark
|
36 |
+
'\u2039' # single left-pointing angle quotation mark
|
37 |
+
'\u203a' # single right-pointing angle quotation mark
|
38 |
+
'\u300c' # left corner bracket
|
39 |
+
'\u300d' # right corner bracket
|
40 |
+
'\u300e' # left white corner bracket
|
41 |
+
'\u300f' # right white corner bracket
|
42 |
+
'\u301d' # reversed double prime quotation mark
|
43 |
+
'\u301e' # double prime quotation mark
|
44 |
+
'\u301f' # low double prime quotation mark
|
45 |
+
'\ufe41' # presentation form for vertical left corner bracket
|
46 |
+
'\ufe42' # presentation form for vertical right corner bracket
|
47 |
+
'\ufe43' # presentation form for vertical left corner white bracket
|
48 |
+
'\ufe44' # presentation form for vertical right corner white bracket
|
49 |
+
'\uff02' # fullwidth quotation mark
|
50 |
+
'\uff07' # fullwidth apostrophe
|
51 |
+
'\uff62' # halfwidth left corner bracket
|
52 |
+
'\uff63' # halfwidth right corner bracket
|
53 |
+
)
|
54 |
+
|
55 |
+
HYPENS = ('\u2010','\u2011','\u2012','\u2013')
|
56 |
+
|
57 |
+
class Tokenizer():
|
58 |
+
def __init__(self):
|
59 |
+
self.specialchars=["¿","¡",]
|
60 |
+
self.specialchars.extend(QUOTES)
|
61 |
+
self.specialchars.extend(HYPENS)
|
62 |
+
self.subs=[]
|
63 |
+
self.re_num = re.compile(r'[\d\,\.]+')
|
64 |
+
self.abr=["aa.rr.","abr.","abrev.","a.c.","adj.","adm.","admón.","afma.","afmas.","afmo.","afmos.","ag.","ago.","am.","ap.","apdo.","art.","arts.","arz.","arzbpo.","assn.","atte.","av.","avda.","bros.","bv.","cap.","caps.","cg.","cgo.","cia.","cía.","cit.","co.","col.","corp.","cos.","cta.","cte.","ctra.","cts.","d.c.","dcha.","dept.","depto.","dic.","doc.","docs.","dpt.","dpto.","dr.","dra.","dras.","dres.","dto.","dupdo.","ed.","ee.uu.","ej.","emma.","emmas.","emmo.","emmos.","ene.","entlo.","entpo.","esp.","etc.","ex.","excm.","excma.","excmas.","excmo.","excmos.","fasc.","fdo.","feb.","fig.","figs.","fol.","fra.","gb.","gral.","hnos.","hros.","ib.","ibid.","ibíd.","id.","íd.","ilm.","ilma.","ilmas.","ilmo.","ilmos.","iltre.","inc.","intr.","ít.","izq.","izqda.","izqdo.","jr.","jul.","jun.","lám.","lda.","ldo.","lib.","lim.","loc.","ltd.","ltda.","mar.","máx.","may.","mín.","mons.","mr.","mrs.","ms.","mss.","mtro.","nov.","ntra.","ntro.","núm.","ob.","obpo.","oct.","op.","pág.","págs.","párr.","pd.","ph.","pje.","pl.","plc.","pm.","pp.","ppal.","pral.","prof.","pról.","prov.","ps.","pta.","ptas.","pte.","pts.","pza.","rda.","rdo.","ref.","reg.","rel.","rev.","revda.","revdo.","rma.","rmo.","r.p.m.","rte.","sdad.","sec.","secret.","sep.","sig.","smo.","sr.","sra.","sras.","sres.","srs.","srta.","ss.mm.","sta.","sto.","sust.","tech.","tel.","teléf.","telf.","ten.","tfono.","tít.","tlf.","ud.","uds.","vda.","vdo.","vid.","vol.","vols.","vra.","vro.","vta.","abr.","adj.","admón.","admor.","admora.","admtvo.","admvo.","advo.","ag.","ago.","agt.","a.m.","ap.","apdo.","apénd.","aprox.","aptdo.","art.","asoc.","att.","av.","avda.","ayto.","bach.","bibl.","bol.","calif.","cast.","cat.","c.c.","cc.oo.","cf.","cfr.","cjón.","cta.","ctra.","d.","dª.","dcha.","d.e.p.","depto.","dic.","dicbre.","dir.","disp.","dña.","do.","doc.","dom.","dr.","dra.","dto.","dupl.","e.","ed.","ej.","emmo.","ene.","entlo.","epitomadamente.","esc.","esp.","etc.","excma.","excmo.","exp.","expte.","ext.","fac.","facs.","fasc.","fdo.","feb.","febr.","fig.","fra.","gob.","gral.","hble.","hnos.","hosp.","ib.","ibid.","ibíd.","id.","íd.","il.","ilma.","ilmo.","iltre.","inc.","intr.","izq.","izqda.","j.","j.c.","ju.","jue.","jul.","jun.","k.o.","lcdo.","lda.","ldo.","lic.","ltda.","lu.","lun.","ma.","mar.","mart.","máx.","may.","mi.","mié.","miérc.","min.","mín.","mn.","mons.","my.","mzo.","n.b.","nº.","nov.","novbre.","nro.","n.s.","ntra.","ntro.","núm.","núms.","oct.","op.","pág.","pár.","párr.","p.ej.","pl.","p.m.","p.o.","pp.","pp.","pral.","proc.","prof.","ps.","p.s.","pte.","pts.","p.v.p.","pza.","q.d.e.p.","q.e.g.e.","r.d.","r.d.l.","rdo.","ref.","r.i.p.","r.o.","r.p.m.","rvdo.","s.","s.a.","sa.","s.a.","sáb.","sdad.","sep.","sept.","setbre.","s.l.","s.l.u.","s.m.","sr.","sra.","sras.","sres.","s.s.","ss.mm.","ssp.","stmo.","subsp.","tel.","trad.","ud.","ud.","udes.","uds.","urb.","v.a.","v.a.r.","vd.","vda.","vds.","vi.","vid.","vie.","vnes.","vol."]
|
65 |
+
abr_aux=[]
|
66 |
+
abr_aux=[]
|
67 |
+
for a in self.abr:
|
68 |
+
am1=a.capitalize()
|
69 |
+
am2=a.upper()
|
70 |
+
abr_aux.append(am1)
|
71 |
+
abr_aux.append(am2)
|
72 |
+
self.abr.extend(abr_aux)
|
73 |
+
self.setabr=set(self.abr)
|
74 |
+
self.abrsig=()
|
75 |
+
def split_numbers(self,segment):
|
76 |
+
xifres = re.findall(self.re_num,segment)
|
77 |
+
for xifra in xifres:
|
78 |
+
xifrastr=str(xifra)
|
79 |
+
xifrasplit=xifra.split()
|
80 |
+
xifra2=[]
|
81 |
+
contpos=0
|
82 |
+
for x in xifrastr:
|
83 |
+
if not contpos==0: xifra2.append(" ■")
|
84 |
+
xifra2.append(x)
|
85 |
+
contpos+=1
|
86 |
+
xifra2="".join(xifra2)
|
87 |
+
segment=segment.replace(xifra,xifra2)
|
88 |
+
return(segment)
|
89 |
+
|
90 |
+
|
91 |
+
|
92 |
+
def protect_tags(self, segment):
|
93 |
+
tags=re.findall(r'<[^>]+>',segment)
|
94 |
+
for tag in tags:
|
95 |
+
ep=False
|
96 |
+
ef=False
|
97 |
+
if segment.find(" "+tag)==-1:ep=True
|
98 |
+
if segment.find(tag+" ")==-1:ef=True
|
99 |
+
tagmod=tag.replace("<","<").replace(">",">").replace("=","=").replace("'","'").replace('"',""").replace("/","/").replace(" "," ")
|
100 |
+
if ep: tagmod=" ■"+tagmod
|
101 |
+
if ef: tagmod=tagmod+"■ "
|
102 |
+
segment=segment.replace(tag,tagmod)
|
103 |
+
tags=re.findall(r'\{[0-9]+\}',segment)
|
104 |
+
for tag in tags:
|
105 |
+
ep=False
|
106 |
+
ef=False
|
107 |
+
if segment.find(" "+tag)==-1:ep=True
|
108 |
+
if segment.find(tag+" ")==-1:ef=True
|
109 |
+
tagmod=tag.replace("{","{").replace("}","}")
|
110 |
+
if ep: tagmod=" ■"+tagmod
|
111 |
+
if ef: tagmod=tagmod+"■ "
|
112 |
+
segment=segment.replace(tag,tagmod)
|
113 |
+
return(segment)
|
114 |
+
|
115 |
+
def protect_abr(self,cadena):
|
116 |
+
setcadena=set(cadena.split(" "))
|
117 |
+
common=(self.setabr & setcadena)
|
118 |
+
self.abrsig=list(common)
|
119 |
+
for a in common:
|
120 |
+
amod=a+" "
|
121 |
+
apro=a.replace(".",".")
|
122 |
+
cadena=cadena.replace(amod,apro)
|
123 |
+
sigles=re.findall(r'((\w\.){2,})',cadena)
|
124 |
+
for s in sigles:
|
125 |
+
a=s[0]
|
126 |
+
self.abrsig.append(a)
|
127 |
+
amod=a+" "
|
128 |
+
apro=a.replace(".",".")
|
129 |
+
cadena=cadena.replace(amod,apro)
|
130 |
+
return(cadena)
|
131 |
+
|
132 |
+
def unprotect(self, cadena):
|
133 |
+
cadena=cadena.replace("'","'").replace("-","-").replace("<","<").replace(">",">").replace(""",'"').replace("=","=").replace(" ","▁").replace("/","/").replace("{","{").replace("}","}")
|
134 |
+
return(cadena)
|
135 |
+
|
136 |
+
def unprotect_tags(self, cadena):
|
137 |
+
cadena=cadena.replace("<","<").replace(">",">")
|
138 |
+
return(cadena)
|
139 |
+
|
140 |
+
def unprotect_abr(self,cadena):
|
141 |
+
for a in self.abrsig:
|
142 |
+
amod=a+" "
|
143 |
+
apro=a.replace(".",".")
|
144 |
+
cadena=cadena.replace(apro,amod)
|
145 |
+
return(cadena)
|
146 |
+
|
147 |
+
def main_tokenizer(self,segment):
|
148 |
+
segment=" "+segment+" "
|
149 |
+
cadena=self.protect_tags(segment)
|
150 |
+
cadena=self.protect_abr(cadena)
|
151 |
+
|
152 |
+
for s in [".",","]:
|
153 |
+
pass
|
154 |
+
for s in self.subs:
|
155 |
+
sA=s.replace("■","")
|
156 |
+
sB=s.replace("'","'").replace("-","-")
|
157 |
+
if s.startswith("■"):sB=" "+sB
|
158 |
+
if s.endswith("■"):sB=sB+" "
|
159 |
+
cadena = re.sub(sA+r'\b', sB, cadena)
|
160 |
+
cadena = re.sub(r'\b'+sA, sB, cadena)
|
161 |
+
cadena = re.sub(sA.upper()+r'\b', sB.upper(), cadena)
|
162 |
+
cadena = re.sub(r'\b'+sA.upper(), sB.upper(), cadena)
|
163 |
+
punt=list(string.punctuation)
|
164 |
+
exceptions=["&",";","#"]
|
165 |
+
for e in exceptions:
|
166 |
+
punt.remove(e)
|
167 |
+
punt.remove(".")
|
168 |
+
punt.remove(",")
|
169 |
+
cadena = re.sub(r'\.(\D)'," . "+r"\1", cadena)
|
170 |
+
cadena = re.sub(r'\,(\D)'," , "+r"\1", cadena)
|
171 |
+
for p in punt:
|
172 |
+
ppre=" ■"+p
|
173 |
+
ppost=p+"■ "
|
174 |
+
try:
|
175 |
+
expr1="(\\S)\\"+p+"(\\s)"
|
176 |
+
expr2=r"\1"+ppre+r"\2"
|
177 |
+
cadena = re.sub(expr1,expr2, cadena)
|
178 |
+
expr1="(\\s)\\"+p+"(\\S)"
|
179 |
+
expr2=r"\1"+ppost+r"\2"
|
180 |
+
cadena = re.sub(expr1,expr2, cadena)
|
181 |
+
except:
|
182 |
+
pass
|
183 |
+
cadena=self.unprotect_tags(cadena)
|
184 |
+
cadena=self.unprotect_abr(cadena)
|
185 |
+
for p in self.specialchars:
|
186 |
+
pmod=p+" "
|
187 |
+
if cadena.find(pmod)>=-1:
|
188 |
+
pmod2=p+"■ "
|
189 |
+
cadena=cadena.replace(p,pmod2)
|
190 |
+
pmod=" "+p
|
191 |
+
if cadena.find(pmod)>=-1:
|
192 |
+
pmod2=" ■"+p
|
193 |
+
cadena=cadena.replace(p,pmod2)
|
194 |
+
|
195 |
+
|
196 |
+
|
197 |
+
|
198 |
+
cadena=self.unprotect(cadena)
|
199 |
+
|
200 |
+
for p in exceptions:
|
201 |
+
pmod=p+" "
|
202 |
+
if cadena.find(pmod)>=-1:
|
203 |
+
pmod2=p+"■ "
|
204 |
+
cadena=cadena.replace(p,pmod2)
|
205 |
+
pmod=" "+p
|
206 |
+
if cadena.find(pmod)>=-1:
|
207 |
+
pmod2=" ■"+p
|
208 |
+
cadena=cadena.replace(p,pmod2)
|
209 |
+
|
210 |
+
cadena=cadena.replace("▁"," ")
|
211 |
+
cadena=' '.join(cadena.split())
|
212 |
+
return(cadena)
|
213 |
+
|
214 |
+
def tokenize(self,segment):
|
215 |
+
tokenized=self.main_tokenizer(segment)
|
216 |
+
tokenized=tokenized.replace("■","")
|
217 |
+
tokenized=' '.join(tokenized.split())
|
218 |
+
return(tokenized)
|
219 |
+
|
220 |
+
def detokenize(self, segment):
|
221 |
+
for sub in self.subs:
|
222 |
+
subA=sub.replace("■"," ")
|
223 |
+
subB=sub.replace("■","")
|
224 |
+
segment=segment.replace(subA,subB)
|
225 |
+
segment=segment.replace(subA.capitalize(),subB.capitalize())
|
226 |
+
segment=segment.replace(subA.upper(),subB.upper())
|
227 |
+
segment=segment.replace(" .",".")
|
228 |
+
segment=segment.replace(" ,",",")
|
229 |
+
segment=segment.replace(" :",":")
|
230 |
+
segment=segment.replace(" ;",";")
|
231 |
+
segment=segment.replace(" :",":")
|
232 |
+
segment=segment.replace(" )",")")
|
233 |
+
segment=segment.replace("( ","(")
|
234 |
+
segment=segment.replace(" ]","]")
|
235 |
+
segment=segment.replace("[ ","[")
|
236 |
+
segment=segment.replace(" }","}")
|
237 |
+
segment=segment.replace("{ ","{")
|
238 |
+
segment=segment.replace(" !","!")
|
239 |
+
segment=segment.replace("¡ ","¡")
|
240 |
+
segment=segment.replace(" ?","?")
|
241 |
+
segment=segment.replace("¿ ","¿")
|
242 |
+
segment=segment.replace(" »","»")
|
243 |
+
segment=segment.replace("« ","«")
|
244 |
+
segment=segment.replace("‘ ","‘")
|
245 |
+
segment=segment.replace(" ’","’")
|
246 |
+
segment=segment.replace("“ ","“")
|
247 |
+
segment=segment.replace(" ”","”")
|
248 |
+
segment=' '.join(segment.split())
|
249 |
+
return(segment)
|
250 |
+
|
251 |
+
def tokenize_j(self,segment):
|
252 |
+
tokenized=self.main_tokenizer(segment)
|
253 |
+
tokenized=' '.join(tokenized.split())
|
254 |
+
return(tokenized)
|
255 |
+
|
256 |
+
def detokenize_j(self,segment):
|
257 |
+
segment=segment.replace(" ■","")
|
258 |
+
segment=segment.replace("■ ","")
|
259 |
+
segment=segment.replace("■","")
|
260 |
+
detok=segment
|
261 |
+
detok=' '.join(detok.split())
|
262 |
+
return(detok)
|
263 |
+
|
264 |
+
def tokenize_jn(self,segment):
|
265 |
+
tokenized=self.tokenize_j(segment)
|
266 |
+
tokenized=self.split_numbers(tokenized)
|
267 |
+
tokenized=' '.join(tokenized.split())
|
268 |
+
return(tokenized)
|
269 |
+
|
270 |
+
def detokenize_jn(self,segment):
|
271 |
+
segment=self.detokenize_j(segment)
|
272 |
+
return(segment)
|
273 |
+
|
274 |
+
def tokenize_s(self,segment):
|
275 |
+
tokenized=self.main_tokenizer(segment)
|
276 |
+
tokenized=tokenized.replace("■ ","■")
|
277 |
+
tokenized=tokenized.replace(" ■","■")
|
278 |
+
tokenized=tokenized.replace(" "," ▁")
|
279 |
+
tokenized=tokenized.replace("■"," ")
|
280 |
+
tokenized=' '.join(tokenized.split())
|
281 |
+
return(tokenized)
|
282 |
+
|
283 |
+
def detokenize_s(self,segment):
|
284 |
+
segment=segment.replace(" ","")
|
285 |
+
segment=segment.replace("▁"," ")
|
286 |
+
detok=segment
|
287 |
+
detok=' '.join(detok.split())
|
288 |
+
return(detok)
|
289 |
+
|
290 |
+
def tokenize_sn(self,segment):
|
291 |
+
tokenized=self.main_tokenizer(segment)
|
292 |
+
tokenized=self.split_numbers(tokenized)
|
293 |
+
tokenized=tokenized.replace("■ ","■")
|
294 |
+
tokenized=tokenized.replace(" ■","■")
|
295 |
+
tokenized=tokenized.replace(" "," ▁")
|
296 |
+
tokenized=tokenized.replace("■"," ")
|
297 |
+
tokenized=' '.join(tokenized.split())
|
298 |
+
return(tokenized)
|
299 |
+
|
300 |
+
def detokenize_sn(self,segment):
|
301 |
+
segment=self.detokenize_s(segment)
|
302 |
+
return(segment)
|
303 |
+
|
304 |
+
def print_help():
|
305 |
+
print("MTUOC_tokenizer_cat.py A tokenizer for Spanish, usage:")
|
306 |
+
print("Simple tokenization:")
|
307 |
+
print(' cat "sentence to tokenize." | python3 MTUOC_tokenizer_spa.py tokenize')
|
308 |
+
print(' python3 MTUOC_tokenizer_spa.py tokenize < file_to_tokenize > tokenized_file')
|
309 |
+
print()
|
310 |
+
print("Simple detokenization:")
|
311 |
+
print(' cat "sentence to tokenize." | python3 MTUOC_tokenizer_spa.py detokenize')
|
312 |
+
print(' python3 MTUOC_tokenizer_spa.py detokenize < file_to_detokenize > detokenized_file')
|
313 |
+
print()
|
314 |
+
print("Advanced options:")
|
315 |
+
print(" tokenization/detokenization with joiner marks (■): tokenize_j / detokenize_j")
|
316 |
+
print(" tokenization/detokenization with joiner marks (■) and number splitting: tokenize_jn / detokenize_jn")
|
317 |
+
print(" tokenization/detokenization with splitter marks (▁): tokenize_s / detokenize_s")
|
318 |
+
print(" tokenization/detokenization with splitter marks (▁) and number splitting: tokenize_sn / detokenize_sn")
|
319 |
+
|
320 |
+
|
321 |
+
if __name__ == "__main__":
|
322 |
+
if len(sys.argv)>1:
|
323 |
+
if sys.argv[1]=="-h" or sys.argv[1]=="--help":
|
324 |
+
print_help()
|
325 |
+
sys.exit()
|
326 |
+
action=sys.argv[1]
|
327 |
+
else:
|
328 |
+
action="tokenize"
|
329 |
+
tokenizer=Tokenizer()
|
330 |
+
for line in sys.stdin:
|
331 |
+
line=line.strip()
|
332 |
+
#normalization of apostrophe
|
333 |
+
line=line.replace("’","'")
|
334 |
+
line=html.unescape(line)
|
335 |
+
if action=="tokenize":
|
336 |
+
outsegment=tokenizer.tokenize(line)
|
337 |
+
elif action=="detokenize":
|
338 |
+
outsegment=tokenizer.detokenize(line)
|
339 |
+
|
340 |
+
elif action=="tokenize_s":
|
341 |
+
outsegment=tokenizer.tokenize_s(line)
|
342 |
+
elif action=="detokenize_s":
|
343 |
+
outsegment=tokenizer.detokenize_s(line)
|
344 |
+
elif action=="tokenize_sn":
|
345 |
+
outsegment=tokenizer.tokenize_sn(line)
|
346 |
+
elif action=="detokenize_sn":
|
347 |
+
outsegment=tokenizer.detokenize_sn(line)
|
348 |
+
|
349 |
+
elif action=="tokenize_j":
|
350 |
+
outsegment=tokenizer.tokenize_j(line)
|
351 |
+
elif action=="detokenize_j":
|
352 |
+
outsegment=tokenizer.detokenize_j(line)
|
353 |
+
elif action=="tokenize_jn":
|
354 |
+
outsegment=tokenizer.tokenize_jn(line)
|
355 |
+
elif action=="detokenize_jn":
|
356 |
+
outsegment=tokenizer.detokenize_jn(line)
|
357 |
+
|
358 |
+
print(outsegment)
|
359 |
+
|
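The Spanish tokenizer exposes the same interface; the _s variants mark the original blanks with ▁ (the convention also used by SentencePiece) instead of joiner marks. A short sketch under the same assumptions (importable module, invented sentence):

from MTUOC_tokenizer_spa import Tokenizer

tokenizer = Tokenizer()
frase = "¿Llegó el Dr. García a las 10.30?"
# abbreviations listed in self.abr (such as "dr.") are protected, so their period is not split off
tok = tokenizer.tokenize_s(frase)
print(tok)
# detokenize_s removes the spaces between tokens and turns ▁ back into blanks
print(tokenizer.detokenize_s(tok))
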
MTUOC_translate.py
ADDED
@@ -0,0 +1,436 @@
|
1 |
+
import config
|
2 |
+
import sys
|
3 |
+
import re
|
4 |
+
import string as stringmodule
|
5 |
+
|
6 |
+
|
7 |
+
import srx_segmenter
|
8 |
+
|
9 |
+
from MTUOC_misc import printLOG
|
10 |
+
from MTUOC_misc import capitalizeMTUOC
|
11 |
+
if config.MTUOCServer_MTengine=="GoogleTranslate":
|
12 |
+
from MTUOC_GoogleTranslate import Google_translate
|
13 |
+
if config.MTUOCServer_MTengine=="DeepL":
|
14 |
+
from MTUOC_DeepL import DeepL_translate
|
15 |
+
if config.MTUOCServer_MTengine=="Lucy":
|
16 |
+
from MTUOC_Lucy import Lucy_translate
|
17 |
+
from MTUOC_Marian import translate_segment_Marian
|
18 |
+
from MTUOC_Moses import translate_segment_Moses
|
19 |
+
|
20 |
+
from MTUOC_preprocess import preprocess_segment
|
21 |
+
from MTUOC_preprocess import postprocess_segment
|
22 |
+
|
23 |
+
from MTUOC_preprocess import tokenizationSL
|
24 |
+
from MTUOC_preprocess import tokenizationTL
|
25 |
+
from MTUOC_preprocess import detokenizationSL
|
26 |
+
from MTUOC_preprocess import detokenizationTL
|
27 |
+
|
28 |
+
def segmenta(cadena):
|
29 |
+
segmenter = srx_segmenter.SrxSegmenter(config.rules[config.SRXlang],cadena)
|
30 |
+
segments=segmenter.extract()
|
31 |
+
resposta=[]
|
32 |
+
return(segments)
|
33 |
+
|
34 |
+
def is_first_letter_upper(segment):
|
35 |
+
for character in segment:
|
36 |
+
if character.isalpha() and character.isupper():
|
37 |
+
return(True)
|
38 |
+
elif character.isalpha() and character.islower():
|
39 |
+
return(False)
|
40 |
+
return(False)
|
41 |
+
|
42 |
+
def upper_case_first_letter(segment):
|
43 |
+
pos=0
|
44 |
+
for character in segment:
|
45 |
+
if character.isalpha() and character.islower():
|
46 |
+
llista=list(segment)
|
47 |
+
llista[pos]=llista[pos].upper()
|
48 |
+
segment="".join(llista)
|
49 |
+
return(segment)
|
50 |
+
elif character.isalpha() and character.isupper():
|
51 |
+
return(segment)
|
52 |
+
pos+=1
|
53 |
+
return(segment)
|
54 |
+
|
55 |
+
###URLs EMAILs
|
56 |
+
|
57 |
+
def findEMAILs(string):
|
58 |
+
email=re.findall('\S+@\S+', string)
|
59 |
+
email2=[]
|
60 |
+
for em in email:
|
61 |
+
if em[-1] in stringmodule.punctuation: em=em[0:-1]
|
62 |
+
email2.append(em)
|
63 |
+
return email2
|
64 |
+
|
65 |
+
def findURLs(string):
|
66 |
+
regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
|
67 |
+
url = re.findall(regex,string)
|
68 |
+
return [x[0] for x in url]
|
69 |
+
|
70 |
+
def replace_EMAILs(string,code="@EMAIL@"):
|
71 |
+
EMAILs=findEMAILs(string)
|
72 |
+
for EMAIL in EMAILs:
|
73 |
+
string=string.replace(EMAIL,code)
|
74 |
+
return(string)
|
75 |
+
|
76 |
+
def replace_URLs(string,code="@URL@"):
|
77 |
+
URLs=findURLs(string)
|
78 |
+
for URL in URLs:
|
79 |
+
string=string.replace(URL,code)
|
80 |
+
return(string)
|
81 |
+
|
82 |
+
re_num = re.compile(r'[\d,\.]+')
|
83 |
+
|
84 |
+
def replace_NUMs(segment,code="@NUM@"):
|
85 |
+
trobatsEXPRNUM=re.finditer(re_num,segment)
|
86 |
+
for trobat in trobatsEXPRNUM:
|
87 |
+
if not trobat.group(0) in [".",","]:
|
88 |
+
segment=segment.replace(trobat.group(0),code,1)
|
89 |
+
return(segment)
|
90 |
+
|
91 |
+
def splitnumbers(segment,joiner=""):
|
92 |
+
joiner=joiner+" "
|
93 |
+
xifres = re.findall(re_num,segment)
|
94 |
+
equil={}
|
95 |
+
for xifra in xifres:
|
96 |
+
xifrastr=str(xifra)
|
97 |
+
xifrasplit=xifra.split()
|
98 |
+
xifra2=joiner.join(xifra)
|
99 |
+
segment=segment.replace(xifra,xifra2)
|
100 |
+
if xifra2.find(" ")>-1:
|
101 |
+
equil[xifra2]=xifra
|
102 |
+
return(segment,equil)
|
103 |
+
|
104 |
+
def desplitnumbers(segment,equil):
|
105 |
+
for xifra2 in equil:
|
106 |
+
segment=segment.replace(xifra2,equil[xifra2])
|
107 |
+
return(segment)
|
108 |
+
|
109 |
+
def restore_EMAILs(stringA,stringB,code="@EMAIL@"):
|
110 |
+
EMAILs=findEMAILs(stringA)
|
111 |
+
for email in EMAILs:
|
112 |
+
stringB=stringB.replace(code,email,1)
|
113 |
+
return(stringB)
|
114 |
+
|
115 |
+
def restore_URLs(stringA,stringB,code="@URL@"):
|
116 |
+
URLs=findURLs(stringA)
|
117 |
+
for url in URLs:
|
118 |
+
stringB=stringB.replace(code,url,1)
|
119 |
+
return(stringB)
|
120 |
+
|
121 |
+
def restore_NUMs(segmentSL,segmentTL,code="@NUM@"):
|
122 |
+
trobatsEXPRNUM=re.finditer(re_num,segmentSL)
|
123 |
+
position=0
|
124 |
+
for trobat in trobatsEXPRNUM:
|
125 |
+
if not trobat.group(0) in [".",","]:
|
126 |
+
segmentTL=segmentTL.replace(code,trobat.group(0),1)
|
127 |
+
return(segmentTL)
|
128 |
+
|
129 |
+
|
130 |
+
def translate_para(paragraph):
|
131 |
+
if config.segment_input:
|
132 |
+
(segments,separators)=segmenta(paragraph)
|
133 |
+
translations=[]
|
134 |
+
for segment in segments:
|
135 |
+
translation=translate_segment(segment)
|
136 |
+
if config.fix_xml:
|
137 |
+
translation=config.tagrestorer.fix_xml_tags(translation)
|
138 |
+
translations.append(translation)
|
139 |
+
resultat=[]
|
140 |
+
for i in range(0,len(separators)):
|
141 |
+
resultat.append(separators[i])
|
142 |
+
try:
|
143 |
+
resultat.append(translations[i])
|
144 |
+
except:
|
145 |
+
pass
|
146 |
+
|
147 |
+
|
148 |
+
|
149 |
+
translation="".join(resultat)
|
150 |
+
|
151 |
+
else:
|
152 |
+
translation=translate_segment(paragraph)
|
153 |
+
|
154 |
+
return(translation)
|
155 |
+
|
156 |
+
|
157 |
+
|
158 |
+
|
159 |
+
def restore_tags_translation_candidates(translation_candidates):
|
160 |
+
hastags=config.tagrestorer.has_tags(translation_candidates["segmentTAGS"])
|
161 |
+
if hastags:
|
162 |
+
|
163 |
+
(translation_candidates["segmentTAGS"],equil)=config.tagrestorer.replace_tags(translation_candidates["segmentOrig"])
|
164 |
+
printLOG(3,"replace_tags",translation_candidates["segmentTAGS"])
|
165 |
+
printLOG(3,"equil",equil)
|
166 |
+
(translation_candidates["segmentTAGS"],tagInici,tagFinal)=config.tagrestorer.remove_start_end_tag(translation_candidates["segmentOrig"])
|
167 |
+
printLOG(3,"remove_start_end_tag",translation_candidates["segmentTAGS"])
|
168 |
+
printLOG(3,"TAG initial:",tagInici)
|
169 |
+
printLOG(3,"TAG final:",tagFinal)
|
170 |
+
translation_candidates["segmentNOTAGS"]=config.tagrestorer.remove_tags(translation_candidates["segmentTAGS"])
|
171 |
+
originaltags=config.tagrestorer.get_tags(translation_candidates["segmentTAGS"])
|
172 |
+
segmentNOTAGSTOK=tokenizationSL(translation_candidates["segmentNOTAGS"])
|
173 |
+
segmentTAGSTOK=tokenizationSL(translation_candidates["segmentPreTAGS"])
|
174 |
+
translation_candidates["translationTAGS"]=[None] * len(translation_candidates["translationNOTAGSPre"])
|
175 |
+
for i in range(0,len(translation_candidates["translationNOTAGSPre"])):
|
176 |
+
try:
|
177 |
+
if hastags and config.tag_restoration:
|
178 |
+
try:
|
179 |
+
alignment=translation_candidates["alignments"][i]
|
180 |
+
translationNOTAGSTOK=tokenizationTL(translation_candidates["translationNOTAGSPre"][i])
|
181 |
+
translation_candidates["translationTAGS"][i]=config.tagrestorer.restore_tags(segmentNOTAGSTOK, segmentTAGSTOK, alignment, translationNOTAGSTOK)
|
182 |
+
'''
|
183 |
+
if tagInici:
|
184 |
+
translation_candidates["translationTAGS"][i]=tagInici+translation_candidates["translationTAGS"][i]
|
185 |
+
if tagFinal:
|
186 |
+
translation_candidates["translationTAGS"][i]=translation_candidates["translationTAGS"][i]+tagFinal
|
187 |
+
printLOG(3,"SELECTED TRANSLATION SIMPLE TAGS",translation_candidates["translationTAGS"][i])
|
188 |
+
for t in equil:
|
189 |
+
translation_candidates["translationTAGS"][i]=translation_candidates["translationTAGS"][i].replace(t,equil[t],1)
|
190 |
+
'''
|
191 |
+
except:
|
192 |
+
printLOG(3,"ERROR restoring tags:",sys.exc_info())
|
193 |
+
translation_candidates["translationTAGS"][i]=translationNOTAGSTOK
|
194 |
+
|
195 |
+
else:
|
196 |
+
translation_candidates["translationTAGS"]=translation_candidates["translationNOTAGSPre"]
|
197 |
+
printLOG(3,"translationTAGS:",translation_candidates["translationTAGS"][i])
|
198 |
+
|
199 |
+
|
200 |
+
|
201 |
+
except:
|
202 |
+
pass
|
203 |
+
else:
|
204 |
+
translation_candidates["segmentNOTAGS"]=translation_candidates["segmentTAGS"]
|
205 |
+
translation_candidates["translationTAGS"]=translation_candidates["translationNOTAGSPre"]
|
206 |
+
|
207 |
+
|
208 |
+
|
209 |
+
|
210 |
+
return(translation_candidates)
|
211 |
+
|
212 |
+
|
213 |
+
def translate_segment(segment):
|
214 |
+
printLOG(3,"translate_segment",segment)
|
215 |
+
if config.MTUOCServer_MTengine=="GoogleTranslate":
|
216 |
+
translation=Google_translate(segment)
|
217 |
+
return(translation)
|
218 |
+
elif config.MTUOCServer_MTengine=="DeepL":
|
219 |
+
translation=DeepL_translate(segment)
|
220 |
+
return(translation)
|
221 |
+
elif config.MTUOCServer_MTengine=="Lucy":
|
222 |
+
translation=Lucy_translate(segment)
|
223 |
+
return(translation)
|
224 |
+
segmentOrig=segment
|
225 |
+
if not config.change_input_files[0]=="None":
|
226 |
+
printLOG(3,"CHANGES INPUT:")
|
227 |
+
printLOG(3,"ORIGINAL:",segmentOrig)
|
228 |
+
for change in config.changes_input:
|
229 |
+
tofind=change[0]
|
230 |
+
tochange=change[1]
|
231 |
+
regexp="\\b"+tofind+"\\b"
|
232 |
+
trobat=re.findall(regexp,segment)
|
233 |
+
if trobat:
|
234 |
+
segment=re.sub(regexp, tochange, segment)
|
235 |
+
printLOG(3,tofind,tochange)
|
236 |
+
printLOG(3,"CHANGED:",segment)
|
237 |
+
hastags=config.tagrestorer.has_tags(segment)
|
238 |
+
originaltags=config.tagrestorer.get_tags(segment)
|
239 |
+
printLOG(3,"hastags",hastags)
|
240 |
+
printLOG(3,"originaltags",originaltags)
|
241 |
+
#truecasing
|
242 |
+
totruecase=False
|
243 |
+
toupperfinal=False
|
244 |
+
if not config.truecase==None and config.truecase=="all": totruecase=True
|
245 |
+
segmentnotags=config.tagrestorer.remove_tags(segment)
|
246 |
+
if not config.truecase==None and config.truecase in ["upper","all"] and segmentnotags.isupper() and not segment=="@URL@" and not segment=="@EMAIL@":
|
247 |
+
totruecase=True
|
248 |
+
toupperfinal=True
|
249 |
+
if config.checkistranslatable:
|
250 |
+
segmentNOTAGS=replace_URLs(segment,config.code_URLs)
|
251 |
+
segmentNOTAGS=replace_EMAILs(segment,config.code_EMAILs)
|
252 |
+
tokens=tokenizationSL(segmentNOTAGS)
|
253 |
+
if not is_translatable(tokens):
|
254 |
+
return(segment)
|
255 |
+
if totruecase:
|
256 |
+
segment=config.truecaser.truecase(segment)
|
257 |
+
if config.tokenizerSL:
|
258 |
+
segment=config.tokenizerSL.tokenize(segment)
|
259 |
+
if hastags:
|
260 |
+
segmentTAGS=segment
|
261 |
+
|
262 |
+
(segmentTAGS,equil)=config.tagrestorer.replace_tags(segmentTAGS)
|
263 |
+
printLOG(3,"segmentTAGS:",segmentTAGS)
|
264 |
+
printLOG(3,"equil:",equil)
|
265 |
+
printLOG(3,"segmentTAGS:",segmentTAGS)
|
266 |
+
(segmentTAGS,tagInici,tagFinal)=config.tagrestorer.remove_start_end_tag(segmentTAGS)
|
267 |
+
printLOG(3,"TAG initial:",tagInici)
|
268 |
+
printLOG(3,"TAG final:",tagFinal)
|
269 |
+
|
270 |
+
|
271 |
+
segmentNOTAGS=config.tagrestorer.remove_tags(segment)
|
272 |
+
else:
|
273 |
+
segmentTAGS=segment
|
274 |
+
segmentNOTAGS=segment
|
275 |
+
if len(segmentNOTAGS)<config.min_chars_segment:
|
276 |
+
return(segment)
|
277 |
+
|
278 |
+
|
279 |
+
|
280 |
+
if config.MTUOCServer_EMAILs:
|
281 |
+
segmentTAGS=replace_EMAILs(segmentTAGS)
|
282 |
+
segmentNOTAGS=replace_EMAILs(segmentNOTAGS)
|
283 |
+
printLOG(3,"Replace EMAILs:",segmentTAGS)
|
284 |
+
if config.MTUOCServer_URLs:
|
285 |
+
segmentTAGS=replace_URLs(segmentTAGS)
|
286 |
+
segmentNOTAGS=replace_URLs(segmentNOTAGS)
|
287 |
+
printLOG(3,"Replace URLs:",segmentTAGS)
|
288 |
+
|
289 |
+
if config.pre_replace_NUMs:
|
290 |
+
segmentTAGS=replace_NUMs(segmentTAGS,code=config.code_NUMs)
|
291 |
+
segmentNOTAGS=replace_NUMs(segmentNOTAGS,code=config.code_NUMs)
|
292 |
+
printLOG(3,"Replace NUMs:",segmentTAGS)
|
293 |
+
if config.pre_split_NUMs:
|
294 |
+
(segmentTAGS,equilSplitNum)=splitnumbers(segmentTAGS)
|
295 |
+
(segmentNOTAGS,equilSplitNum2)=splitnumbers(segmentNOTAGS)
|
296 |
+
printLOG(3,"Split NUMs:",segmentTAGS)
|
297 |
+
|
298 |
+
|
299 |
+
#leading and trailing spaces
|
300 |
+
leading_spaces=len(segment)-len(segment.lstrip())
|
301 |
+
trailing_spaces=len(segment)-len(segment.rstrip())-1
|
302 |
+
segmentPre=preprocess_segment(segmentNOTAGS)
|
303 |
+
segmentPreTAGS=preprocess_segment(segmentTAGS)
|
304 |
+
if config.MTUOCServer_MTengine=="Marian":
|
305 |
+
translation_candidates=translate_segment_Marian(segmentPre)
|
306 |
+
elif config.MTUOCServer_MTengine=="Moses":
|
307 |
+
translation_candidates=translate_segment_Moses(segmentPre)
|
308 |
+
|
309 |
+
translation_candidates["segment"]=segment
|
310 |
+
translation_candidates["segmentOrig"]=segmentOrig
|
311 |
+
translation_candidates["segmentTAGS"]=segmentTAGS
|
312 |
+
translation_candidates["segmentPre"]=segmentPre
|
313 |
+
translation_candidates["segmentPreTAGS"]=segmentPreTAGS
|
314 |
+
translation_candidates["segmentNOTAGS"]=segmentNOTAGS
|
315 |
+
print("*****1",translation_candidates)
|
316 |
+
if hastags:
|
317 |
+
translation_candidates=restore_tags_translation_candidates(translation_candidates)
|
318 |
+
#(translation_candidates["segmentTAGS"],equil)=config.tagrestorer.replace_tags(translation_candidates["segmentOrig"])
|
319 |
+
|
320 |
+
else:
|
321 |
+
translation_candidates["translationTAGS"]=translation_candidates["translationNOTAGSPre"]
|
322 |
+
print("*****2",translation_candidates)
|
323 |
+
for i in range(0,len(translation_candidates["translationNOTAGSPre"])):
|
324 |
+
translation_candidates["translationTAGS"][i]=postprocess_segment(translation_candidates["translationTAGS"][i])
|
325 |
+
|
326 |
+
if hastags:
|
327 |
+
if tagInici:
|
328 |
+
translation_candidates["translationTAGS"][i]=tagInici+translation_candidates["translationTAGS"][i]
|
329 |
+
if tagFinal:
|
330 |
+
translation_candidates["translationTAGS"][i]=translation_candidates["translationTAGS"][i]+tagFinal
|
331 |
+
for t in equil:
|
332 |
+
translation_candidates["translationTAGS"][i]=translation_candidates["translationTAGS"][i].replace(t,equil[t],1)
|
333 |
+
#if not config.truecaser==None and is_first_letter_upper(segmentOrig):
|
334 |
+
# translation_candidates["translationTAGS"][i]=upper_case_first_letter(translation_candidates["translationTAGS"][i])
|
335 |
+
print("*****3",translation_candidates)
|
336 |
+
if totruecase:
|
337 |
+
translation_candidates["translationTAGS"][i]=capitalizeMTUOC(translation_candidates["translationTAGS"][i])
|
338 |
+
if toupperfinal:
|
339 |
+
translation_candidates["translationTAGS"][i]=translation_candidates["translationTAGS"][i].upper()
|
340 |
+
###LOWERCASE UPPERCASED TAGS
|
341 |
+
hastagstranslation=config.tagrestorer.has_tags(translation_candidates["translationTAGS"][i])
|
342 |
+
if hastagstranslation:
|
343 |
+
translationtags=config.tagrestorer.get_tags(translation_candidates["translationTAGS"][i])
|
344 |
+
for tt in translationtags:
|
345 |
+
if not tt in originaltags and tt.lower() in originaltags:
|
346 |
+
translation_candidates["translationTAGS"][i]=translation_candidates["translationTAGS"][i].replace(tt,tt.lower())
|
347 |
+
|
348 |
+
if config.MTUOCServer_EMAILs:
|
349 |
+
translation_candidates["translationTAGS"][i]=restore_EMAILs(segmentOrig,translation_candidates["translationTAGS"][i],code=config.code_EMAILs)
|
350 |
+
if config.MTUOCServer_URLs:
|
351 |
+
translation_candidates["translationTAGS"][i]=restore_URLs(segmentOrig,translation_candidates["translationTAGS"][i],code=config.code_URLs)
|
352 |
+
'''
|
353 |
+
#config.pre_replace_NUMs
|
354 |
+
if config.pre_replace_NUMs:
|
355 |
+
translation_candidates["translationTAGS"][i]=restore_NUMs(segmentOrig,translation_candidates["translationTAGS"][i],code=config.code_NUMs)
|
356 |
+
#config.pre_split_NUMs
|
357 |
+
if config.pre_split_NUMs:
|
358 |
+
translation_candidates["translationTAGS"][i]=desplitnumbers(translation_candidates["translationTAGS"][i],equilSplitNum)
|
359 |
+
#detruecase
|
360 |
+
if totruecase:
|
361 |
+
translation_candidates["translationTAGS"][i]=translation_candidates["translationTAGS"][i][0].upper()+translation_candidates["translationTAGS"][i][1:]
|
362 |
+
#detokenize
|
363 |
+
'''
|
364 |
+
if config.tokenizerSL and not config.tokenizerTL==None:
|
365 |
+
translation_candidates["translationTAGS"][i]=config.tokenizerTL.detokenize(translation_candidates["translationTAGS"][i])
|
366 |
+
|
367 |
+
|
368 |
+
|
369 |
+
translation_candidates["translationTAGS"][i]=config.tagrestorer.repairSpacesTags(translation_candidates["segmentOrig"],translation_candidates["translationTAGS"][i])
|
370 |
+
printLOG(3,"SELECTED TRANSLATION REAL TAGS",translation_candidates["translationTAGS"][i])
|
371 |
+
best_translation=select_best_candidate(translation_candidates,config.translation_selection_strategy)
|
372 |
+
|
373 |
+
translation=best_translation
|
374 |
+
|
375 |
+
if not config.change_output_files[0]=="None":
|
376 |
+
printLOG(3,"CHANGES OUTPUT:")
|
377 |
+
printLOG(3,"ORIGINAL:",translation)
|
378 |
+
for change in config.changes_output:
|
379 |
+
tofind=change[0]
|
380 |
+
tochange=change[1]
|
381 |
+
regexp="\\b"+tofind+"\\b"
|
382 |
+
trobat=re.findall(regexp,translation)
|
383 |
+
if trobat:
|
384 |
+
translation=re.sub(regexp, tochange, translation)
|
385 |
+
printLOG(3,tofind,tochange)
|
386 |
+
printLOG(3,"CHANGED:",translation)
|
387 |
+
|
388 |
+
if not config.change_translation_files[0]=="None":
|
389 |
+
printLOG(3,"CHANGES TRANSLATION:")
|
390 |
+
printLOG(3,"ORIGINAL SOURCE:",segmentOrig)
|
391 |
+
printLOG(3,"ORIGINAL TARGET:",translation)
|
392 |
+
for change in config.changes_translation:
|
393 |
+
tofindSOURCE=change[0]
|
394 |
+
tofindTARGET=change[1]
|
395 |
+
tochange=change[2]
|
396 |
+
regexpSOURCE="\\b"+tofindSOURCE+"\\b"
|
397 |
+
regexpTARGET="\\b"+tofindTARGET+"\\b"
|
398 |
+
trobatSOURCE=re.findall(regexpSOURCE,segmentOrig)
|
399 |
+
trobatTARGET=re.findall(regexpTARGET,translation)
|
400 |
+
if trobatSOURCE and trobatTARGET:
|
401 |
+
translation=re.sub(regexpTARGET, tochange, translation)
|
402 |
+
printLOG(3,tofindTARGET,tochange)
|
403 |
+
printLOG(3,"CHANGED TARGET:",translation)
|
404 |
+
|
405 |
+
return(translation)
|
406 |
+
|
407 |
+
def is_translatable_old(tokens):
|
408 |
+
tokens=tokens.split(" ")
|
409 |
+
translatable=False
|
410 |
+
for token in tokens:
|
411 |
+
if token.isalpha():
|
412 |
+
translatable=True
|
413 |
+
break
|
414 |
+
return(translatable)
|
415 |
+
|
416 |
+
def is_translatable(tokens):
|
417 |
+
translatable=False
|
418 |
+
for token in tokens.split():
|
419 |
+
transtoken=True
|
420 |
+
for character in token:
|
421 |
+
if str(character) in ["0","1","2","3","4","5","6","7","8","9"]:
|
422 |
+
transtoken=False
|
423 |
+
break
|
424 |
+
if transtoken:
|
425 |
+
translatable=True
|
426 |
+
return(translatable)
|
427 |
+
|
428 |
+
def select_best_candidate(translation_candidates,strategy):
|
429 |
+
'''To implement several strategies to select the best candidate. Now it returns the first one.'''
|
430 |
+
if strategy=="First":
|
431 |
+
best_translation=translation_candidates["translationTAGS"][0]
|
432 |
+
return(best_translation)
|
433 |
+
|
434 |
+
|
435 |
+
|
436 |
+
|
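MTUOC_translate.py protects e-mail addresses, URLs and (optionally) numbers by swapping them for placeholder codes such as @EMAIL@ before a segment reaches the MT engine, and restores them in the returned translation. A standalone sketch of that round trip, simplified so it needs neither config.py nor a running engine; the sample strings and the fake engine call are invented:

import re

CODE_EMAIL = "@EMAIL@"  # same placeholder code as replace_EMAILs/restore_EMAILs above

def replace_emails(segment, code=CODE_EMAIL):
    # find e-mail-like tokens and hide them behind the placeholder
    emails = re.findall(r'\S+@\S+', segment)
    for email in emails:
        segment = segment.replace(email, code)
    return segment, emails

def restore_emails(translation, emails, code=CODE_EMAIL):
    # put the original addresses back, one occurrence per placeholder
    for email in emails:
        translation = translation.replace(code, email, 1)
    return translation

source = "Escriba a soporte@example.com para más información."
protected, found = replace_emails(source)
fake_translation = protected.replace("Escriba a", "Write to")  # stand-in for the MT engine
print(restore_emails(fake_translation, found))
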
MTUOC_truecaser.py
ADDED
@@ -0,0 +1,199 @@
|
1 |
+
# MTUOC_truecaser
|
2 |
+
# v. 07/06/2023
|
3 |
+
# Copyright (C) 2021 Antoni Oliver
|
4 |
+
#
|
5 |
+
# This program is free software: you can redistribute it and/or modify
|
6 |
+
# it under the terms of the GNU General Public License as published by
|
7 |
+
# the Free Software Foundation, either version 3 of the License, or
|
8 |
+
# (at your option) any later version.
|
9 |
+
#
|
10 |
+
# This program is distributed in the hope that it will be useful,
|
11 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13 |
+
# GNU General Public License for more details.
|
14 |
+
|
15 |
+
# You should have received a copy of the GNU General Public License
|
16 |
+
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
17 |
+
|
18 |
+
import sys
|
19 |
+
import codecs
|
20 |
+
import pickle
|
21 |
+
import argparse
|
22 |
+
import importlib
|
23 |
+
|
24 |
+
class Truecaser():
|
25 |
+
def __init__(self, MTUOCPath=".", tokenizer=None, tc_model=None):
|
26 |
+
|
27 |
+
self.initchars=["¡","¿","-","*","+","'",'"',"«","»","—","‘","’","“","”","„",]
|
28 |
+
|
29 |
+
if tokenizer==None:
|
30 |
+
self.tokenizer=None
|
31 |
+
else:
|
32 |
+
sys.path.append(MTUOCPath)
|
33 |
+
if tokenizer.endswith(".py"):tokenizer=tokenizer.replace(".py","")
|
34 |
+
self.module = importlib.import_module(tokenizer)
|
35 |
+
self.tokenizer=self.module.Tokenizer()
|
36 |
+
if tc_model:
|
37 |
+
self.tc_model = pickle.load(open(tc_model, "rb" ) )
|
38 |
+
else:
|
39 |
+
self.tc_model={}
|
40 |
+
def set_tc_model(self, tc_model):
|
41 |
+
self.tc_model = pickle.load(open(tc_model, "rb" ) )
|
42 |
+
|
43 |
+
def set_tokenizer(self, tokenizer):
|
44 |
+
if tokenizer.endswith(".py"):tokenizer=tokenizer.replace(".py","")
|
45 |
+
self.module = importlib.import_module(tokenizer)
|
46 |
+
self.tokenizer=self.module.Tokenizer()
|
47 |
+
|
48 |
+
def set_MTUOCPath(self, path):
|
49 |
+
sys.path.append(path)
|
50 |
+
|
51 |
+
def isinitsymbol(self, token):
|
52 |
+
if len(token)==1 and token in self.initchars:
|
53 |
+
return(True)
|
54 |
+
else:
|
55 |
+
return(False)
|
56 |
+
|
57 |
+
def detect_type(self, segment):
|
58 |
+
tipus="unknown"
|
59 |
+
if self.tokenizer==None:
|
60 |
+
tokens=segment.split(" ")
|
61 |
+
else:
|
62 |
+
tokens=self.tokenizer.tokenize(segment)
|
63 |
+
ntok=0
|
64 |
+
utokens=0
|
65 |
+
ltokens=0
|
66 |
+
leadingsplitter=False
|
67 |
+
trailingsplitter=False
|
68 |
+
leadingjoiner=False
|
69 |
+
trailingsplitter=False
|
70 |
+
for token in tokens:
|
71 |
+
token=token.replace("▁","")
|
72 |
+
token=token.replace("■","")
|
73 |
+
if token.isalpha():ntok+=1
|
74 |
+
if token.isalpha() and token==token.lower():
|
75 |
+
ltokens+=1
|
76 |
+
elif token.isalpha() and not token==token.lower():
|
77 |
+
utokens+=1
|
78 |
+
if ntok>=5 and utokens>=ntok/2 and not segment==segment.upper():
|
79 |
+
tipus="titled"
|
80 |
+
elif segment==segment.upper():
|
81 |
+
tipus="uppercased"
|
82 |
+
else:
|
83 |
+
tipus="regular"
|
84 |
+
return(tipus)
|
85 |
+
|
86 |
+
def truecase(self, line, ucf=False, restoreCase=False):
|
87 |
+
if self.tokenizer:
|
88 |
+
tokens=self.tokenizer.tokenize_s(line).split(" ")
|
89 |
+
else:
|
90 |
+
tokens=line.split(" ")
|
91 |
+
nsegment=[]
|
92 |
+
for token in tokens:
|
93 |
+
|
94 |
+
try:
|
95 |
+
leadingsplitter=False
|
96 |
+
trailingsplitter=False
|
97 |
+
leadingjoiner=False
|
98 |
+
trailingjoiner=False
|
99 |
+
if token.startswith("▁"):leadingsplitter=True
|
100 |
+
if token.endswith("▁"):trailingsplitter=True
|
101 |
+
if token.startswith("■"):leadingjoiner=True
|
102 |
+
if token.endswith("■"):trailingjoiner=True
|
103 |
+
token=token.replace("▁","")
|
104 |
+
token=token.replace("■","")
|
105 |
+
try:
|
106 |
+
nlc=self.tc_model[token.lower()]["lc"]
|
107 |
+
except:
|
108 |
+
nlc=0
|
109 |
+
try:
|
110 |
+
nu1=self.tc_model[token.lower()]["u1"]
|
111 |
+
except:
|
112 |
+
nu1=0
|
113 |
+
try:
|
114 |
+
nuc=self.tc_model[token.lower()]["uc"]
|
115 |
+
except:
|
116 |
+
nuc=0
|
117 |
+
proceed=False
|
118 |
+
if not token==token.lower(): proceed=True
|
119 |
+
if restoreCase: proceed=True
|
120 |
+
if proceed:
|
121 |
+
if nlc>0 and nlc>=nu1 and nlc>=nuc:
|
122 |
+
token=token.lower()
|
123 |
+
elif nu1>0 and nu1>nlc and nu1>nuc:
|
124 |
+
token=token.lower().capitalize()
|
125 |
+
elif nuc>0 and nuc>nlc and nuc>nu1:
|
126 |
+
token=token.upper()
|
127 |
+
|
128 |
+
if leadingsplitter:token="▁"+token
|
129 |
+
if trailingsplitter:token=token+"▁"
|
130 |
+
if leadingjoiner:token="■"+token
|
131 |
+
if trailingjoiner:token=token+"■"
|
132 |
+
nsegment.append(token)
|
133 |
+
except:
|
134 |
+
print("ERROR",sys.exc_info())
|
135 |
+
nsegment.append(token)
|
136 |
+
if self.tokenizer:
|
137 |
+
nsegment=self.tokenizer.detokenize_s(" ".join(nsegment))
|
138 |
+
else:
|
139 |
+
nsegment=" ".join(nsegment)
|
140 |
+
if ucf:
|
141 |
+
if self.isinitsymbol(nsegment[0]):firstchar=1
|
142 |
+
else: firstchar=0
|
143 |
+
try:
|
144 |
+
if firstchar==0:
|
145 |
+
nsegment=nsegment[firstchar].upper()+"".join(nsegment[1:])
|
146 |
+
elif firstchar==1:
|
147 |
+
nsegment=nsegment[0]+nsegment[firstchar].upper()+"".join(nsegment[2:])
|
148 |
+
except:
|
149 |
+
pass
|
150 |
+
return(nsegment)
|
151 |
+
|
152 |
+
def detruecase_old(self,line,tokenizer):
|
153 |
+
tokens=line.split(" ")
|
154 |
+
new=[]
|
155 |
+
yet=False
|
156 |
+
if tokenizer:
|
157 |
+
tokens=tokenizer.tokenize_j(line).split(" ")
|
158 |
+
else:
|
159 |
+
tokens=line.split(" ")
|
160 |
+
for token in tokens:
|
161 |
+
if not yet and token.isalpha():
|
162 |
+
yet=True
|
163 |
+
new.append(token[0].upper()+token[1:])
|
164 |
+
else:
|
165 |
+
new.append(token)
|
166 |
+
line=" ".join(new)
|
167 |
+
detrue=tokenizer.detokenize_j(line)
|
168 |
+
return(line)
|
169 |
+
|
170 |
+
def detruecase(self,line):
|
171 |
+
detruecased=line.capitalize()
|
172 |
+
return(detruecased)
|
173 |
+
|
174 |
+
|
175 |
+
if __name__ == "__main__":
|
176 |
+
parser = argparse.ArgumentParser(description='MTUOC program for truecasing.')
|
177 |
+
parser.add_argument('-m','--model', action="store", dest="model", help='The truecasing model to use.',required=True)
|
178 |
+
parser.add_argument('-t','--tokenizer', action="store", dest="tokenizer", help='The tokenizer to used',required=False)
|
179 |
+
parser.add_argument('-u','--ucf', action="store_true", dest="ucf", help='Set if you want first word capitalized',required=False)
|
180 |
+
parser.add_argument('-r','--restore', action="store_true", dest="restore", help='Set if you want to restore case (uppercase lower cased)',required=False)
|
181 |
+
parser.add_argument('--mtuoc','--MTUOC', action="store", dest="MTUOC", help='The path to the MTUOC components',required=False)
|
182 |
+
|
183 |
+
args = parser.parse_args()
|
184 |
+
model=args.model
|
185 |
+
ucf=args.ucf
|
186 |
+
restore=args.restore
|
187 |
+
|
188 |
+
|
189 |
+
if args.MTUOC:
|
190 |
+
MTUOCPath=args.MTUOC
|
191 |
+
else:
|
192 |
+
MTUOCPath=""
|
193 |
+
|
194 |
+
truecaser=Truecaser(MTUOCPath, args.tokenizer, args.model)
|
195 |
+
for line in sys.stdin:
|
196 |
+
line=line.strip()
|
197 |
+
tcline=truecaser.truecase(line, ucf, restore)
|
198 |
+
print(tcline)
|
199 |
+
|
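Besides the command-line filter shown in the __main__ block above, the Truecaser class can be used directly from Python. A minimal sketch, assuming the constructor arguments used in the __main__ block (MTUOC path, tokenizer, model) and the tc.es model shipped in this upload; the empty path and the sample sentence are placeholders:

    from MTUOC_truecaser import Truecaser

    # Placeholder arguments: empty MTUOC path, no tokenizer (plain whitespace split), tc.es model.
    truecaser = Truecaser("", None, "tc.es")

    # ucf=True capitalizes the first word; restoreCase=True re-cases every token from the model,
    # not only tokens that are not already lowercase.
    print(truecaser.truecase("ESTA ES UNA FRASE DE PRUEBA.", ucf=True, restoreCase=True))
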
MTUOC_typeMTUOC.py
ADDED
@@ -0,0 +1,75 @@
+# MTUOC_typeMTUOC
+# Copyright (C) 2023 Antoni Oliver
+# v. 07/06/2023
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+
+import sys
+from flask import Flask, jsonify, request, make_response
+
+from MTUOC_misc import get_IP_info
+from MTUOC_misc import printLOG
+import config
+
+from MTUOC_translate import translate_para
+
+def start_MTUOC_server():
+    cli = sys.modules['flask.cli']
+    cli.show_server_banner = lambda *x: None
+    STATUS_OK = "ok"
+    STATUS_ERROR = "error"
+    printLOG(1,config.verbosity_level,"MTUOC server started using MTUOC protocol")
+    STATUS_ERROR = "error"
+    out={}
+    def start(url_root="./translator",
+              host="0.0.0.0",
+              port=5000,
+              debug=True):
+        def prefix_route(route_function, prefix='', mask='{0}{1}'):
+            def newroute(route, *args, **kwargs):
+                return route_function(mask.format(prefix, route), *args, **kwargs)
+            return newroute
+
+        app = Flask(__name__)
+        app.route = prefix_route(app.route, url_root)
+
+        @app.route('/translate', methods=['POST'])
+        def translateMTUOC():
+            try:
+                body = request.get_json()
+                ts=translate_para(body["src"])
+                jsonObject = {
+                    "id": body["id"],
+                    "src": body["src"],
+
+                    "tgt": ts # Call MTUOC
+                }
+                printLOG(0,config.verbosity_level,jsonObject)
+                return jsonify(jsonObject)
+            except:
+                return make_response("Server Error", 500)
+
+        from waitress import serve
+        serve(app, host=host, port=port,threads= 1)
+        #app.run(debug=debug, host=host, port=port, use_reloader=False,threaded=True)
+    url_root="/"
+    ip="0.0.0.0"
+    ip=get_IP_info()
+    debug="store_true"
+
+    print("MTUOC server IP: ", ip)
+    print("MTUOC server port: ", config.MTUOCServer_port)
+    print("MTUOC server type: MTUOC")
+
+    start(url_root=url_root, host=ip, port=config.MTUOCServer_port,debug=debug)
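For quick testing of the MTUOC protocol handler above, a minimal client sketch; the host, port 8000 (the MTUOCServer port in config-server.yaml) and the source segment are placeholders:

    import requests

    # Placeholder endpoint; the route registered above is /translate under the server's url_root.
    URL = "http://localhost:8000/translate"

    payload = {"id": 1, "src": "Esta es una frase de prueba."}
    response = requests.post(URL, json=payload)
    # The server echoes "id" and "src" and adds "tgt" with the translation.
    print(response.json()["tgt"])
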
MTUOC_typeModernMT.py
ADDED
@@ -0,0 +1,69 @@
+# MTUOC_typeModernMT
+# Copyright (C) 2023 Antoni Oliver
+# v. 07/06/2023
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import sys
+from flask import Flask, jsonify, request
+
+from MTUOC_misc import get_IP_info
+from MTUOC_misc import printLOG
+import config
+
+from MTUOC_translate import translate_para
+
+def start_ModernMT_server():
+    cli = sys.modules['flask.cli']
+    cli.show_server_banner = lambda *x: None
+    STATUS_OK = "ok"
+    STATUS_ERROR = "error"
+    printLOG(1,"MTUOC server started as ModernMT server")
+    def start(
+        url_root="",
+        host="",
+        port=config.MTUOCServer_port,
+        debug=True):
+        def prefix_route(route_function, prefix='', mask='{0}{1}'):
+            def newroute(route, *args, **kwargs):
+                return route_function(mask.format(prefix, route), *args, **kwargs)
+            return newroute
+        app = Flask(__name__)
+        app.route = prefix_route(app.route, url_root)
+        @app.route('/translate', methods=['GET'])
+        def translateModernMT():
+            out = {}
+            try:
+                out['data']={}
+                segment=request.args['q']
+                translation=translate_para(segment)
+                out['data']['translation']=translation
+            except:
+                out['status'] = STATUS_ERROR
+            return jsonify(out)
+        #ip=get_IP_info()
+        #app.run(debug=True, host=ip, port=MTUOCServer_port, use_reloader=False, threaded=True)
+        #start()
+        from waitress import serve
+        serve(app, host=host, port=port,threads= 8)
+        #app.run(debug=debug, host=host, port=port, use_reloader=False,threaded=True)
+    url_root="/"
+    ip="0.0.0.0"
+    ip=get_IP_info()
+    debug="store_true"
+
+    print("MTUOC server IP: ", ip)
+    print("MTUOC server port: ", config.MTUOCServer_port)
+    print("MTUOC server type: ModernMT")
+
+    start(url_root=url_root, host=ip, port=config.MTUOCServer_port,debug=debug)
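A minimal client sketch for this ModernMT-compatible endpoint; host, port and query text are placeholders:

    import requests

    # Placeholder endpoint; the handler above serves GET /translate with the text in the q parameter.
    URL = "http://localhost:8000/translate"

    response = requests.get(URL, params={"q": "Esta es una frase de prueba."})
    # The translation is returned under data -> translation.
    print(response.json()["data"]["translation"])
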
MTUOC_typeMoses.py
ADDED
@@ -0,0 +1,45 @@
+# MTUOC_typeMoses
+# Copyright (C) 2023 Antoni Oliver
+# v. 07/06/2023
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import sys
+from xmlrpc.server import SimpleXMLRPCServer
+
+from MTUOC_misc import get_IP_info
+from MTUOC_misc import printLOG
+import config
+
+from MTUOC_translate import translate_para
+
+def translate(segment):
+    #function for Moses server
+    translation=translate_para(segment['text'])
+    translationdict={}
+    translationdict["text"]=translation
+    return(translationdict)
+
+def start_Moses_server():
+    server = SimpleXMLRPCServer(("", config.MTUOCServer_port),logRequests=True,)
+    server.register_function(translate)
+    server.register_introspection_functions()
+    # Start the server
+    try:
+        ip=get_IP_info()
+        printLOG(1,"MTUOC server IP: ", ip)
+        printLOG(1,"MTUOC server port: ", config.MTUOCServer_port)
+        printLOG(1,"MTUOC server type: Moses")
+        server.serve_forever()
+    except KeyboardInterrupt:
+        printLOG(1,'Exiting')
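A minimal XML-RPC client sketch for the Moses-compatible mode above; host and port are placeholders (SimpleXMLRPCServer answers on its default /RPC2 path):

    import xmlrpc.client

    # Placeholder endpoint; the server binds config.MTUOCServer_port on all interfaces.
    proxy = xmlrpc.client.ServerProxy("http://localhost:8000/RPC2")

    # The registered translate() function expects and returns a dict with a "text" field.
    result = proxy.translate({"text": "Esta es una frase de prueba."})
    print(result["text"])
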
MTUOC_typeNMTWizard.py
ADDED
@@ -0,0 +1,68 @@
+# MTUOC_typeNMTWizard
+# Copyright (C) 2023 Antoni Oliver
+# v. 07/06/2023
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import sys
+from flask import Flask, jsonify, request
+
+from MTUOC_misc import get_IP_info
+from MTUOC_misc import printLOG
+import config
+
+from MTUOC_translate import translate_para
+
+def start_NMTWizard_server():
+    cli = sys.modules['flask.cli']
+    cli.show_server_banner = lambda *x: None
+    STATUS_OK = "ok"
+    STATUS_ERROR = "error"
+    printLOG(1,"MTUOC server started as NMTWizard server")
+    out={}
+    def start(url_root="",
+              host="0.0.0.0",
+              port=5000,
+              debug=True):
+        def prefix_route(route_function, prefix='', mask='{0}{1}'):
+            def newroute(route, *args, **kwargs):
+                return route_function(mask.format(prefix, route), *args, **kwargs)
+            return newroute
+
+        app = Flask(__name__)
+        app.route = prefix_route(app.route, url_root)
+
+        @app.route('/translate', methods=['POST'])
+        def translateONMT():
+            inputs = request.get_json(force=True)
+            sourcetext=inputs["src"][0]["text"]
+            try:
+                targettext=translate_para(sourcetext)
+                out={"tgt": [[{"text": targettext}]]}
+            except:
+                out['error'] = "Error"
+                out['status'] = STATUS_ERROR
+            return jsonify(out)
+        from waitress import serve
+        serve(app, host=host, port=port,threads= 1)
+        #app.run(debug=debug, host=host, port=port, use_reloader=False,threaded=True)
+    url_root="/"
+    ip="0.0.0.0"
+    ip=get_IP_info()
+    debug="store_true"
+
+    print("MTUOC server IP: ", ip)
+    print("MTUOC server port: ", config.MTUOCServer_port)
+    print("MTUOC server type: NMTWizard")
+
+    start(url_root=url_root, host=ip, port=config.MTUOCServer_port,debug=debug)
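A minimal client sketch for the NMTWizard-style handler above; host, port and text are placeholders:

    import requests

    # Placeholder endpoint; the handler reads src[0]["text"] and answers with tgt[0][0]["text"].
    URL = "http://localhost:8000/translate"

    payload = {"src": [{"text": "Esta es una frase de prueba."}]}
    response = requests.post(URL, json=payload)
    print(response.json()["tgt"][0][0]["text"])
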
MTUOC_typeOpenNMT.py
ADDED
@@ -0,0 +1,78 @@
+# MTUOC_typeOpenNMT
+# Copyright (C) 2023 Antoni Oliver
+# v. 07/06/2023
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import sys
+from flask import Flask, jsonify, request
+
+from MTUOC_misc import get_IP_info
+from MTUOC_misc import printLOG
+import config
+
+from MTUOC_translate import translate_para
+
+def start_OpenNMT_server():
+    cli = sys.modules['flask.cli']
+    cli.show_server_banner = lambda *x: None
+    STATUS_OK = "ok"
+    STATUS_ERROR = "error"
+    printLOG(1,"MTUOC server started as OpenNMT server")
+    STATUS_ERROR = "error"
+    out={}
+    def start(url_root="./translator",
+              host="0.0.0.0",
+              port=5000,
+              debug=True):
+        def prefix_route(route_function, prefix='', mask='{0}{1}'):
+            def newroute(route, *args, **kwargs):
+                return route_function(mask.format(prefix, route), *args, **kwargs)
+            return newroute
+
+        app = Flask(__name__)
+        app.route = prefix_route(app.route, url_root)
+
+        @app.route('/translate', methods=['POST'])
+        def translateONMT():
+            inputs = request.get_json(force=True)
+            inputs0=inputs[0]
+            out = {}
+            try:
+                out = [[]]
+                ss=inputs0['src']
+                ts=translate_para(ss)
+                response = {"src": ss, "tgt": ts,
+                            "n_best": 0, "pred_score": 0}
+
+                out[0].append(response)
+            except:
+                out['error'] = "Error"
+                out['status'] = STATUS_ERROR
+
+            return jsonify(out)
+
+
+        from waitress import serve
+        serve(app, host=host, port=port,threads= 1)
+        #app.run(debug=debug, host=host, port=port, use_reloader=False,threaded=True)
+    url_root=config.MTUOCServer_ONMT_url_root
+    ip="0.0.0.0"
+    ip=get_IP_info()
+    debug="store_true"
+
+    print("MTUOC server IP: ", ip)
+    print("MTUOC server port: ", config.MTUOCServer_port)
+    print("MTUOC server type: OpenNMT")
+
+    start(url_root=url_root, host=ip, port=config.MTUOCServer_port,debug=debug)
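A minimal client sketch for the OpenNMT-style handler above; host and port are placeholders, and the /translator prefix follows ONMT_url_root in config-server.yaml:

    import requests

    # Placeholder endpoint: url_root ("/translator") followed by the /translate route registered above.
    URL = "http://localhost:8000/translator/translate"

    payload = [{"src": "Esta es una frase de prueba."}]
    response = requests.post(URL, json=payload)
    # The reply is a list of lists of hypotheses; take the first one.
    print(response.json()[0][0]["tgt"])
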
README.md
CHANGED
@@ -1,3 +1 @@
-
-license: gpl-3.0
----
+# MTUOC-server
config-server.yaml
ADDED
@@ -0,0 +1,115 @@
+MTEngine:
+  MTengine: Marian
+  #one of Marian, OpenNMT, Moses, GoogleTranslate, DeepL, Lucy
+  startMTEngine: True
+  startCommand: "./marian-server-GPU -m model.bin -v vocab-es.yml vocab-en.yml -p 8200 --n-best --alignment hard --normalize 1 --quiet &"
+  IP: localhost
+  port: 8200
+  min_len_factor: 0.5
+
+MTUOCServer:
+  port: 8000
+  type: MTUOC
+  #one of MTUOC, Moses, ModernMT, OpenNMT, NMTWizard
+  verbosity_level: 3
+
+  checkistranslatable: True
+  restore_tags: True
+  fix_xml: True
+
+  URLs: True
+  code_URLs: "@URL@"
+  EMAILs: True
+  code_EMAILs: "@EMAIL@"
+
+  replaceNUMs: True
+
+  splitNUMs: False
+
+  min_chars_segment: 2
+  translation_selection_strategy: First
+  log_file: log.log
+  ONMT_url_root: "/translator"
+
+  SRX_file: segment.srx
+
+
+Preprocess:
+  sl_lang: es
+  tl_lang: en
+  tokenize_SL: False
+  sl_tokenizer: MTUOC_tokenizer_spa.py
+  #one of the MTUOC tokenizers, or Moses if you use the Moses tokenizers
+  tl_tokenizer: MTUOC_tokenizer_eng.py
+  #one of the MTUOC tokenizers, or Moses if you use the Moses tokenizers
+
+  segment_input: True
+  SRXfile: segmentSC.srx
+  SRXlang: Spanish
+
+  replaceNUMs: False
+  code_NUMs: "@NUM@"
+  splitNUMs: False
+  #Truecasing
+  tcmodel: tc.es
+  #or None if not used
+  truecase: upper
+  #one of None, all, upper
+  ###SentencePiece###
+  sentencepiece: True
+  sp_model_SL: spmodel.model
+  sp_model_TL: spmodel.model
+  sp_splitter: "▁"
+
+  #BPE
+  BPE: False
+  #True or False: if sentencepiece is set to True, sentencepiece will be used
+  bpecodes: codes_file
+  bpe_joiner: "@@"
+
+  bos_annotate: True
+  bos_symbol: <s>
+  #None or <s> (or other)
+  eos_annotate: True
+  eos_symbol: </s>
+
+  #html entities
+
+  unescape_html_input: True
+  escape_html_input: False
+
+  unescape_html_output: True
+  escape_html_output: False
+
+  change_input_files: None
+  change_output_files: None
+  change_translation_files: None
+
+GoogleTranslate:
+  sllang: en
+  tllang: es
+  glossary: None
+  #state None if no glossary is used, otherwise the name of the glossary
+  project_id: XXxxXXXXX
+  location: us-central1
+  jsonfile: XXxxXXXXX.json
+
+DeepL:
+  API_key: XXxxXXXXX
+  sllang: EN
+  tllang: ES
+  #tag_handling: html
+  #one of html, xml
+  formality: default
+  #one of default, less, more
+  split_sentences: "off"
+  #one of "off", "on", "nonewlines"
+  glossary: None
+
+Lucy:
+  url: http://XXX.XXX.XXX:8080/AutoTranslateRS/V1.3/mtrans/exec/
+  TRANSLATION_DIRECTION: SPANISH-CATALAN
+  MARK_UNKNOWNS: 0
+  MARK_ALTERNATIVES: 0
+  MARK_COMPOUNDS: 0
+  CHARSET: UTF
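The values in this YAML file end up in the flat module-level variables of config.py (next file). The code that actually reads them lives in MTUOC-server.py, which is not shown in this excerpt, so the following is only an illustrative sketch of that mapping, assuming PyYAML (listed in requirements.txt):

    import yaml

    import config

    # Illustrative only: map a few YAML keys onto the config.py globals they correspond to.
    with open("config-server.yaml") as f:
        cfg = yaml.safe_load(f)

    config.MTUOCServer_port = cfg["MTUOCServer"]["port"]
    config.MTUOCServer_type = cfg["MTUOCServer"]["type"]
    config.verbosity_level = cfg["MTUOCServer"]["verbosity_level"]
    config.MTEngineIP = cfg["MTEngine"]["IP"]
    config.MTEnginePort = cfg["MTEngine"]["port"]
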
config.py
ADDED
@@ -0,0 +1,114 @@
+MTUOCServer_port=""
+MTUOCServer_MTengine=""
+startMTEngineV=False
+startMTEngineCommand=""
+MTUOCServer_type=""
+MTEngineIP=""
+MTEnginePort=""
+verbosity_level=0
+log_file=""
+sortidalog=""
+min_len_factor=0.5
+min_chars_segment=1
+tag_restoration=False
+
+
+segment_input=False
+SRXfile=None
+SRXlang=None
+rules=None
+
+sllang=""
+tllang=""
+tokenize_SL=False
+tokenize_TL=False
+tokenizerSL=None
+tokenizerTL=None
+tokenizerSLType=""
+tokenizerTLType=""
+
+tcmodel=""
+truecase=""
+truecaser=None
+checkistranslatable=False
+
+MTUOCServer_URLs=False
+MTUOCServer_EMAILs=False
+
+code_URLs="@URL@"
+code_EMAILs="@EMAIL@"
+
+pre_replace_NUMs=False
+code_NUMs="@NUM@"
+pre_split_NUMs=False
+
+#sentencepiece
+sentencepiece=""
+spmodelSL=""
+spmodelTL=""
+sp_splitter=""
+
+#BPE
+BPE=""
+bpecodes=""
+bpe_joiner=""
+bpeobject=None
+
+
+leading_spaces=0
+trailing_spaces=0
+
+MTUOCServer_NUMs=False
+MTUOCServer_splitNUMs=False
+
+escape_html_input=False
+unescape_html_input=True
+
+change_input_files=None
+change_output_files=None
+change_translation_files=None
+changes_input=[]
+changes_output=[]
+changes_translation=[]
+
+
+tagrestorer=None
+
+bos_annotate=False
+bos_symbol="<s>"
+#None or <s> (or other)
+eos_annotate=False
+eos_symbol="</s>"
+
+spSL=None
+spTL=None
+
+translation_selection_strategy="First"
+
+Google_sllang=""
+Google_tllang=""
+Google_glossary=""
+Google_project_id=""
+Google_location=""
+Google_jsonfile=""
+client=None
+parent=None
+
+DeepL_API_key=""
+DeepL_sl_lang=""
+DeepL_tl_lang=""
+DeepL_formality=""
+DeepL_split_sentences=""
+DeepL_glossary=""
+DeepLtranslator=None
+
+Lucy_url=""
+Lucy_TRANSLATION_DIRECTION=""
+Lucy_MARK_UNKNOWNS=""
+Lucy_MARK_ALTERNATIVES=""
+Lucy_MARK_COMPOUNDS=""
+Lucy_CHARSET=""
+
+MTUOCServer_ONMT_url_root=None
+
+proxyMoses=None
marian-server-GPU
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5146caf265138f2b7fd84cfa69309072e1c2128600afdd857a199593aff4d7b3
+size 487624648
model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:229f37d69935a792c5c2a01dc8bb961ca8364a6e3cbc57ee3d6e206dd27e5c69
+size 528817672
model.npz
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:352051764d584a60946b7133f0e268e9c4af70e0bacbbf5ae00142caea1759b7
+size 528823268
requirements.txt
ADDED
@@ -0,0 +1,27 @@
+pip3 install websockets
+pip3 install lxml
+pip3 install regex
+pip3 install websocket
+pip3 install websocket-client
+pip3 install requests
+pip3 install flask
+pip3 install PyYAML
+pip3 install sentencepiece
+pip3 install waitress
+
+For GoogleTranslate
+pip3 install google-cloud
+pip3 install google-cloud-vision
+pip3 install google-cloud-translate
+pip3 install google-api-python-client
+
+
+For DeepL
+pip3 install deepl
+
+For Lucy
+pip3 install xmltodict
+
+
+To build executables
+pip3 install pyinstaller
segment.srx
ADDED
The diff for this file is too large to render.
segmentSC.srx
ADDED
The diff for this file is too large to render.
spmodel.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c314df995019c6bbe81cc35452090270f3e481db8b2306f73e5d565ef774c260
+size 1349219
spmodel.vocab
ADDED
The diff for this file is too large to render.
srx_segmenter.py
ADDED
@@ -0,0 +1,106 @@
+"""Segment text with SRX.
+"""
+__version__ = '0.0.2'
+
+import lxml.etree
+import regex
+from typing import (
+    List,
+    Set,
+    Tuple,
+    Dict,
+    Optional
+)
+
+
+class SrxSegmenter:
+    """Handle segmentation with SRX regex format.
+    """
+    def __init__(self, rule: Dict[str, List[Tuple[str, Optional[str]]]], source_text: str) -> None:
+        self.source_text = source_text
+        self.non_breaks = rule.get('non_breaks', [])
+        self.breaks = rule.get('breaks', [])
+
+    def _get_break_points(self, regexes: List[Tuple[str, str]]) -> Set[int]:
+        return set([
+            match.span(1)[1]
+            for before, after in regexes
+            for match in regex.finditer('({})({})'.format(before, after), self.source_text)
+        ])
+
+    def get_non_break_points(self) -> Set[int]:
+        """Return segment non break points
+        """
+        return self._get_break_points(self.non_breaks)
+
+    def get_break_points(self) -> Set[int]:
+        """Return segment break points
+        """
+        return self._get_break_points(self.breaks)
+
+    def extract(self) -> Tuple[List[str], List[str]]:
+        """Return segments and whitespaces.
+        """
+        non_break_points = self.get_non_break_points()
+        candidate_break_points = self.get_break_points()
+
+        break_point = sorted(candidate_break_points - non_break_points)
+        source_text = self.source_text
+
+        segments = []  # type: List[str]
+        whitespaces = []  # type: List[str]
+        previous_foot = ""
+        for start, end in zip([0] + break_point, break_point + [len(source_text)]):
+            segment_with_space = source_text[start:end]
+            candidate_segment = segment_with_space.strip()
+            if not candidate_segment:
+                previous_foot += segment_with_space
+                continue
+
+            head, segment, foot = segment_with_space.partition(candidate_segment)
+
+            segments.append(segment)
+            whitespaces.append('{}{}'.format(previous_foot, head))
+            previous_foot = foot
+        whitespaces.append(previous_foot)
+
+        return segments, whitespaces
+
+
+def parse(srx_filepath: str) -> Dict[str, Dict[str, List[Tuple[str, Optional[str]]]]]:
+    """Parse SRX file and return it.
+    :param srx_filepath: is the source SRX file.
+    :return: dict
+    """
+    tree = lxml.etree.parse(srx_filepath)
+    namespaces = {
+        'ns': 'http://www.lisa.org/srx20'
+    }
+
+    rules = {}
+
+    for languagerule in tree.xpath('//ns:languagerule', namespaces=namespaces):
+        rule_name = languagerule.attrib.get('languagerulename')
+        if rule_name is None:
+            continue
+
+        current_rule = {
+            'breaks': [],
+            'non_breaks': [],
+        }
+
+        for rule in languagerule.xpath('ns:rule', namespaces=namespaces):
+            is_break = rule.attrib.get('break', 'yes') == 'yes'
+            rule_holder = current_rule['breaks'] if is_break else current_rule['non_breaks']
+
+            beforebreak = rule.find('ns:beforebreak', namespaces=namespaces)
+            beforebreak_text = '' if beforebreak.text is None else beforebreak.text
+
+            afterbreak = rule.find('ns:afterbreak', namespaces=namespaces)
+            afterbreak_text = '' if afterbreak.text is None else afterbreak.text
+
+            rule_holder.append((beforebreak_text, afterbreak_text))
+
+        rules[rule_name] = current_rule
+
+    return rules
start
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b884a8ee6c060a3332d6f72e716ab72f6ec855668ac3080853ee2136cc9eecf9
+size 5742224
stop
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:59ce505c15d74857bbfa9095dbf5c0aa0bd601d4d251b074fbdb2956f21a97b9
+size 5628368
tc.es
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f2fa32066ef0df781509a2c3817073f56d6398f69a7121993b1f7431ac984b90
+size 147397351
vocab-en.yml
ADDED
The diff for this file is too large to render.
vocab-es.yml
ADDED
The diff for this file is too large to render.