imdbo committed on
Commit
3e9f026
1 Parent(s): 31011b3

Upload tokenizer.perl

Browse files
Files changed (1) hide show
  1. tokenizer.perl +218 -0
tokenizer.perl ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+
3
+ # ProLNat Tokenizer (provided with Sentence Identifier)
4
+ # autor: Grupo ProLNat@GE, CiTIUS
5
+ # Universidade de Santiago de Compostela
6
+
7
+
8
+ # Script that bundles 2 Perl functions: sentences and tokens
9
+ package Tokens;
10
+
11
+ #<ignore-block>
12
+ use strict;
13
+ binmode STDIN, ':utf8';
14
+ binmode STDOUT, ':utf8';
15
+ use utf8;
16
+ #<ignore-block>
17
+
18
+ # Pipe mode flag: true when this file is run directly (no caller), false when it is loaded by other code
19
+ my $pipe = !defined (caller);#<ignore-line>
20
+
21
+ # Directory that contains this file (dirname of __FILE__; "." is only the initial value)
22
+ use File::Basename;#<ignore-line>
23
+ my $abs_path = ".";#<string>
24
+ $abs_path = dirname(__FILE__);#<ignore-line>
25
+
26
+ ## global variables
27
+ ## for sentences and tokens:
28
+ my $UpperCase = "[A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÑÇÜ]";#<string>
29
+ my $LowerCase = "[a-záéíóúàèìòùâêîôûñçü]";#<string>
30
+ my $Punct = qr/[\,\;\«\»\“\”\'\"\&\$\#\=\(\)\<\>\!\¡\?\¿\\\[\]\{\}\|\^\*\€\·\¬\…]/;#<string>
31
+ my $Punct_urls = qr/[\:\/\~]/;#<string>
32
+
33
+ ## for splitter:
34
+ ##########LANGUAGE-DEPENDENT INFORMATION###################
35
+ #my $pron = "(me|te|se|le|les|la|lo|las|los|nos|os)";
36
+ # Forms that are not split from 's (whereas proper nouns are) - English only
37
+ my $contr = "([Hh]e|[Hh]ere|[Hh]ow|[Ii]t|[Ss]he|[Tt]hat|[Tt]here|[Ww]hat|[Ww]hen|[Ww]here|[Ww]ho|[Ww]hy)";#<string>
38
+ ###########################################################
39
+ my $w = "[A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÑÇÜa-záéíóúàèìòùâêîôûñçü]";#<string>
40
+
41
# Tokenize a list of sentences.
#
# Parameters:
#   $sentences - reference to an array of sentence strings.  The strings are
#                only read; the caller's array is left untouched.
# Returns:
#   Reference to an array of tokens.  The tokens of every sentence are
#   followed by one empty string.  When the file-level $pipe flag is true the
#   tokens are printed to STDOUT instead (each token followed by a blank, one
#   sentence per output line) and the returned array stays empty.
sub tokens {

    my ($sentences) = @_;#<ref><list><string>

    ### placeholders for multi-character punctuation marks
    my $susp = "3SUSP012";#<string>
    my $duplo1 = "2DOBR111";#<string>
    my $duplo2 = "2DOBR222";#<string>
    my $duplo3 = "2DOBR333";#<string>
    my $duplo4 = "2DOBR444";#<string>

    ## placeholders for dots, commas and quotes between digits
    my $dot_quant = "44DOTQUANT77";#<string>
    my $comma_quant = "44COMMQUANT77";#<string>
    my $quote_quant = "44QUOTQUANT77";#<string>

    my @saida = ();#<list><string>

    foreach my $original (@{$sentences}) {

        # Work on a copy: the loop variable aliases the caller's array
        # element, so editing it in place would overwrite the caller's
        # sentences with the placeholder-laden intermediate text.
        my $sentence = $original;#<string>

        chomp $sentence;

        # Drop every kind of trailing whitespace (blanks, tabs, the CR of a
        # CRLF line ending) so the end-of-sentence rules below see the last
        # real character at the end of the string.
        $sentence =~ s/\s+$//;

        # protect multi-character punctuation with placeholders
        $sentence =~ s/\.\.\./ $susp /g ;
        $sentence =~ s/\<\</ $duplo1 /g ;
        $sentence =~ s/\>\>/ $duplo2 /g ;
        $sentence =~ s/\'\'/ $duplo3 /g ;
        $sentence =~ s/\`\`/ $duplo4 /g ;

        # protect separators that sit between two runs of digits
        $sentence =~ s/([0-9]+)\.([0-9]+)/${1}$dot_quant$2 /g ;
        $sentence =~ s/([0-9]+)\,([0-9]+)/${1}$comma_quant$2 /g ;
        $sentence =~ s/([0-9]+)\'([0-9]+)/${1}$quote_quant$2 /g ;

        # surround every punctuation character with blanks
        $sentence =~ s/($Punct)/ $1 /g ;
        # URL-like punctuation is split only before whitespace or at the end
        $sentence =~ s/($Punct_urls)(?:[\s\n]|$)/ $1 /g ;

        ## hyphen at the end or at the beginning of a word
        $sentence =~ s/(\w)- /$1 - /g ;
        $sentence =~ s/ -(\w)/ - $1/g ;
        $sentence =~ s/(\w)-$/$1 -/g ;
        $sentence =~ s/^-(\w)/- $1/g ;

        $sentence =~ s/\.$/ \. /g ; ## sentence-final period

        my @tokens = split (" ", $sentence);#<array><string>

        foreach my $token (@tokens) {

            $token =~ s/^[\s]*//;
            $token =~ s/[\s]*$//;

            # turn the placeholders back into the original punctuation
            $token =~ s/$susp/\.\.\./;
            $token =~ s/$duplo1/\<\</;
            $token =~ s/$duplo2/\>\>/;
            $token =~ s/$duplo3/\'\'/;
            $token =~ s/$duplo4/\`\`/;
            $token =~ s/$dot_quant/\./;
            $token =~ s/$comma_quant/\,/;
            $token =~ s/$quote_quant/\'/;

            if($pipe){#<ignore-line>
                print "$token ";#<ignore-line>
            }else{#<ignore-line>
                push (@saida, $token);
            }#<ignore-line>
        }

        # end of sentence: newline in pipe mode, empty token otherwise
        if($pipe){#<ignore-line>
            print "\n";#<ignore-line>
        }else{#<ignore-line>
            push (@saida, "");
        }#<ignore-line>
    }

    return \@saida;
}
122
+
123
+ #<ignore-block>
124
# Script entry point: in pipe mode, tokenize standard input.  tokens() itself
# prints the tokens of each line, so its return value is not used here.
if ($pipe) {
    # Read one line at a time instead of slurping all of STDIN: the output is
    # the same, but memory use stays constant and results appear before EOF.
    while (my $line = <STDIN>) {
        tokens([$line]);
    }
}
128
+ #<ignore-block>
129
+
130
+ ###OTHER FUNCTIONS
131
+
132
# Map a punctuation token to its tag code ("Fp", "Fc", ...).
#
# Parameters:
#   $p - the punctuation token.
# Returns:
#   The tag string, or the empty string when the token is not recognised.
sub punct {
    my ($p) = @_ ;#<string>

    # Tokens that have to match exactly.
    my %tag_for = (
        '.'   => "Fp",
        ','   => "Fc",
        ':'   => "Fd",
        ';'   => "Fx",
        '...' => "Fs",
        '%'   => "Ft",
        '('   => "Fpa",
        ')'   => "Fpt",
        '¿'   => "Fia",
        '?'   => "Fit",
        '¡'   => "Faa",
        '!'   => "Fat",
        '['   => "Fca",
        ']'   => "Fct",
        '{'   => "Fla",
        '}'   => "Flt",
    );
    return $tag_for{$p} if exists $tag_for{$p};

    # Token families described by a pattern.  The two angle-quote patterns
    # are anchored at the start only, so they also accept longer tokens that
    # merely begin with the quote.
    return "Fg"  if $p =~ /^(\-|\-\-)$/;
    return "Fe"  if $p =~ /^(\'|\"|\`\`|\'\')$/;
    return "Fra" if $p =~ /^(\<\<|«)/;
    return "Frc" if $p =~ /^(\>\>|»)/;
    return "Fh"  if $p =~ /^(\/|\\)$/;

    return "";
}
201
+
202
+
203
# Return a lower-cased copy of a string.
#
# Parameters:
#   $x - the string to convert.
# Returns:
#   The result of lc() on $x, with the accented capitals listed in the tr///
#   below additionally mapped to their lower-case forms.
sub lowercase {
    my ($x) = @_ ;#<string>

    # lc first, then the explicit accent mapping, applied to the fresh copy
    (my $lowered = lc ($x)) =~ tr/ÁÉÍÓÚÇÑ/áéíóúçñ/;

    return $lowered;
}
210
+
211
# Remove all leading and trailing whitespace from a string.
#
# Parameters:
#   $x - the string to trim.
# Returns:
#   A copy of $x without surrounding whitespace; inner whitespace is kept.
sub Trim {
    my ($x) = @_ ;#<string>

    $x =~ s/^\s+//;
    # \s+ rather than a single \s: the whole trailing run must go, not just
    # its last character.
    $x =~ s/\s+$//;

    return $x;
}