diff --git "a/uroman/lib/NLP/English.pm" "b/uroman/lib/NLP/English.pm" new file mode 100644--- /dev/null +++ "b/uroman/lib/NLP/English.pm" @@ -0,0 +1,3112 @@ +################################################################ +# # +# English # +# # +################################################################ + +package NLP::English; + +use File::Basename; +use File::Spec; + +# tok v1.3.7 (May 16, 2019) + +$chinesePM = NLP::Chinese; +$ParseEntry = NLP::ParseEntry; +$util = NLP::utilities; +$utf8 = NLP::UTF8; +$logfile = ""; +# $logfile2 = (-d "/nfs/isd/ulf/smt/agile") ? "/nfs/isd/ulf/smt/agile/minilog" : ""; +# $util->init_log($logfile2); + +$currency_symbol_list = "\$|\xC2\xA5|\xE2\x82\xAC|\xE2\x82\xA4"; +$english_resources_skeleton_dir = ""; +%dummy_ht = (); + +sub build_language_hashtables { + local($caller, $primary_entity_style_filename, $data_dir) = @_; + + unless ($data_dir) { + $default_data_dir = "/nfs/nlg/users/textmap/brahms-ml/arabic/bin/modules/NLP"; + $data_dir = $default_data_dir if -d $default_data_dir; + } + my $english_word_filename = "$data_dir/EnglishWordlist.txt"; + my $default_entity_style_MT_filename = "$data_dir/EntityStyleMT-zh.txt"; + my $entity_style_all_filename = "$data_dir/EntityStyleAll.txt"; + my $EnglishNonNameCapWords_filename = "$data_dir/EnglishNonNameCapWords.txt"; + $english_resources_skeleton_dir = "$data_dir/EnglishResources/skeleton"; + %english_annotation_ht = (); + %annotation_english_ht = (); + %english_ht = (); + $CardinalMaxWithoutComma = 99999; + $CardinalMaxNonLex = 9999000; + + $primary_entity_style_filename = $default_entity_style_MT_filename unless defined($primary_entity_style_filename); + if ($primary_entity_style_filename =~ /^(ar|zh)$/) { + $languageCode = $primary_entity_style_filename; + $primary_entity_style_filename + = File::Spec->catfile($data_dir, "EntityStyleMT-$languageCode.txt"); + } + + open(IN,$english_word_filename) || die "Can't open $english_word_filename"; + while () { + next unless $_ =~ /^s*[^#\s]/; # unless blank/comment line + $_ =~ s/\s+$//; + $line = $_; + @lines = ($line); + if (($line =~ /::gpe:/) + && (($annotation) = ($line =~ /^.*?::(.*)$/)) + && (($pre_annotation, $singular_english, $post_annotation) = ($annotation =~ /^(.*)::plural-of:([^:]+)(|::.*)\s*$/))) { + $derived_annotation = $singular_english . "::$pre_annotation$post_annotation"; + # print STDERR "derived_annotation: $derived_annotation\n"; + push(@lines, $derived_annotation); + } + foreach $line (@lines) { + ($english,@slots) = split("::",$line); + next unless defined($english); + $english =~ s/\s+$//; + $lc_english = $english; + $lc_english =~ tr/[A-Z]/[a-z]/; + $annotation = "::" . join("::",@slots) . "::"; + $english_annotation_ht{$english} = $annotation; + $english_annotation_ht{$lc_english} = $annotation; + $english_annotation_ht{"_ALT_"}->{$english}->{$annotation} = 1; + $english_annotation_ht{"_ALT_"}->{$lc_english}->{$annotation} = 1; + $synt = ""; + foreach $slot_value (@slots) { + ($slot,$value) = ($slot_value =~ /\s*(\w[^:]+):\s*(\S.*)$/); + next unless defined($value); + $slot =~ s/\s+$//; + $value =~ s/\s+$//; + $synt = $value if $slot eq "synt"; + if (defined($annotation_english_ht{$slot_value})) { + push(@{$annotation_english_ht{$slot_value}},$english); + } else { + my @elist = ($english); + $annotation_english_ht{$slot_value} = \@elist; + } + if ($synt && defined($slot_value) && ($slot ne "synt")) { + $annot = "synt:$synt" . "::$slot_value"; + if (defined($annotation_english_ht{$annot})) { + push(@{$annotation_english_ht{$annot}},$english); + } else { + my @elist = ($english); + $annotation_english_ht{$annot} = \@elist; + } + $english_annotation_ht{"_EN_SYNT_"}->{$english}->{$synt}->{$slot} = $value; + } + } + } + } + close(IN); + + if (open(IN,$EnglishNonNameCapWords_filename)) { + while () { + next unless $_ =~ /^s*[^#\s]/; # unless blank/comment line + $_ =~ s/\s+$//; + $english_ht{(lc $_)}->{COMMON_NON_NAME_CAP} = 1; + } + close(IN); + } else { + print STDERR "Can't open $EnglishNonNameCapWords_filename\n"; + } + + foreach $style ("primary", "all") { + if ($style eq "primary") { + $entity_style_filename = $primary_entity_style_filename || $default_entity_style_MT_filename; + } elsif ($style eq "all") { + $entity_style_filename = $entity_style_all_filename; + } else { + next; + } + %ht = (); + open(IN,$entity_style_filename) || die("Can't open $entity_style_filename (stylefile)"); + my $n_entries = 0; + while () { + next unless $_ =~ /^s*[^#\s]/; # unless blank/comment line + $_ =~ s/\s+$//; + ($slot,$value_string) = ($_ =~ /^([^:]+):\s*(\S.*)$/); + next unless defined($value_string); + if (defined($ht{$slot})) { + print STDERR "Warning: ignoring duplicate entry for $slot in $entity_style_filename\n"; + next; + } + @values = split("::", $value_string); + foreach $value (@values) { + $value =~ s/^\s+//g; + $value =~ s/\s+$//g; + } + my @values_copy = @values; + $ht{$slot} = \@values_copy; + $n_entries++; + } + # print STDERR "Processed $n_entries entries in $entity_style_filename\n"; + close(IN); + if ($style eq "primary") { + %english_entity_style_ht = %ht; + } elsif ($style eq "all") { + %english_entity_style_all_ht = %ht; + } + } + + if (defined($raw = $english_entity_style_ht{CardinalMaxWithoutComma}) + && (@styles = @{$raw}) && ($n = $styles[0]) && ($n =~ /^\d+$/) && ($n >= 999)) { + $CardinalMaxWithoutComma = $n; + } + if (defined($raw = $english_entity_style_ht{CardinalMaxNonLex}) + && (@styles = @{$raw}) && ($n = $styles[0]) && ($n =~ /^\d+$/) && ($n >= 999999)) { + $CardinalMaxNonLex = $n; + } + + return (*english_annotation_ht,*annotation_english_ht,*english_entity_style_ht); +} + +sub read_language_variations { + local($this, $filename, *ht) = @_; + + my $n = 0; + my $line_number = 0; + if (open(IN, $filename)) { + while () { + $line_number++; + $us = $util->slot_value_in_double_colon_del_list($_, "us"); + $uk = $util->slot_value_in_double_colon_del_list($_, "uk"); + $formal = $util->slot_value_in_double_colon_del_list($_, "formal"); + $informal = $util->slot_value_in_double_colon_del_list($_, "informal"); + if ($us && $uk) { + $ht{VARIATION_UK_US}->{$uk}->{$us} = 1; + $n++; + } + if ($informal && $formal) { + $ht{VARIATION_INFORMAL_FORMAL}->{$informal}->{$formal} = 1; + $n++; + } + } + close(IN); + # print STDERR "Read $n spelling variation entries from $filename\n"; + } +} + +sub entity_style_listing { + local($caller,$attr) = @_; + + if (defined($l = $english_entity_style_ht{$attr})) { + @sl = @{$l}; + if (($#sl == 0) && ($sl[0] eq "all")) { + if (defined($al = $english_entity_style_all_ht{$attr})) { + return @{$al}; + } else { + return (); + } + } else { + return @sl; + } + } else { + return (); + } +} + +sub is_abbreviation { + local($caller,$noun) = @_; + + $result = defined($annotation_s = $english_annotation_ht{$noun}) + && ($annotation_s =~ /::abbreviation:true::/); +# print "is_abbreviation($noun): $result\n"; + return $result; +} + +sub noun_adv_sem { + local($caller,$noun) = @_; + + return "" unless defined($annotation_s = $english_annotation_ht{$noun}); + ($adv_sem) = ($annotation_s =~ /::adv_sem:([-_a-z]+)::/); + return "" unless defined($adv_sem); + return $adv_sem; +} + +sub numeral_value { + local($caller,$numeral) = @_; + + return "" unless defined($annotation_s = $english_annotation_ht{$numeral}); + ($value) = ($annotation_s =~ /::value:(\d+)::/); + return "" unless defined($value); + return $value; +} + +sub annot_slot_value { + local($caller,$lex, $slot) = @_; + + return "" unless defined($annotation_s = $english_annotation_ht{$lex}); + ($value) = ($annotation_s =~ /::$slot:([-_a-z]+)(?:::.*|)\s*$/i); + return "" unless defined($value); + return $value; +} + +sub annot_slot_values { + local($caller,$lex, $slot) = @_; + + return () unless @annotations = keys %{$english_annotation_ht{"_ALT_"}->{$lex}}; + @annot_slot_values = (); + foreach $annotation_s (@annotations) { + ($value) = ($annotation_s =~ /::$slot:([^:]+)(?:::.*|)\s*$/i); + if (defined($value)) { + $value =~ s/\s*$//; + push(@annot_slot_values, $value); + } + } + return @annot_slot_values; +} + +# quick and dirty +sub noun_number_form { + local($caller,$noun,$number) = @_; + + $noun = "rupee" if $noun =~ /^Rs\.?$/; + $noun = "kilometer" if $noun =~ /^km$/; + $noun = "kilogram" if $noun =~ /^kg$/; + $noun = "meter" if $noun =~ /^m$/; + $noun = "second" if $noun =~ /^(s|secs?\.?)$/; + $noun = "minute" if $noun =~ /^(mins?\.?)$/; + $noun = "hour" if $noun =~ /^(h|hrs?\.?)$/; + $noun = "year" if $noun =~ /^(yrs?\.?)$/; + $noun = "degree" if $noun =~ /^(deg\.?)$/; + $noun = "foot" if $noun =~ /^(feet|ft\.?)$/; + $noun = "square kilometer" if $noun =~ /^sq\.? km/; + $noun =~ s/metre$/meter/; + $noun =~ s/litre$/liter/; + $noun =~ s/gramme$/gram/; + $noun =~ s/tonne$/ton/; + return $noun if $noun =~ /\$$/; + return $noun unless $number =~ /^[0-9.]+$/; + return $noun if $util->member($noun,"percent"); # no change in plural + return $noun if $noun =~ /\b(yuan|renminbi|RMB|rand|won|yen|ringgit|birr)$/; # no change in plural + return $noun if $number <= 1; + + return $noun if $caller->is_abbreviation($noun); + + $noun =~ s/^(hundred|thousand|million|billion|trillion)\s+//; + return $noun if $noun =~ /^(dollar|kilometer|pound|ton|year)s$/i; + + $original_noun = $noun; + #check for irregular plural + $annot = "synt:noun::plural-of:$noun"; + if (defined($annotation_english_ht{$annot})) { + @elist = @{$annotation_english_ht{$annot}}; + return $elist[0] if @elist; + } + + $noun = $noun . "s"; + return $noun if $noun =~ /(a|e|o|u)ys$/; # days, keys, toys, guys + $noun =~ s/ys$/ies/; # babies + $noun =~ s/ss$/ses/; # buses + $noun =~ s/xs$/xes/; # taxes + $noun =~ s/shs$/shes/; # dishes + $noun =~ s/chs$/ches/; # churches + $noun =~ s/mans$/men/; # women + # print STDERR "NNF: $original_noun($number): $noun\n"; + return $noun; +} + +# quick and dirty +sub lex_candidates { + local($caller,$surf) = @_; + + @lex_cands = ($surf); + $lex_cand = $surf; + $lex_cand =~ s/ies$/y/; + push(@lex_cands,$lex_cand) unless $util->member($lex_cand, @lex_cands); + $lex_cand = $surf; + $lex_cand =~ s/s$//; + push(@lex_cands,$lex_cand) unless $util->member($lex_cand, @lex_cands); + $lex_cand = $surf; + $lex_cand =~ s/es$//; + push(@lex_cands,$lex_cand) unless $util->member($lex_cand, @lex_cands); + $lex_cand = $surf; + $lex_cand =~ s/\.$//; + push(@lex_cands,$lex_cand) unless $util->member($lex_cand, @lex_cands); + $lex_cand = $surf; + $lex_cand =~ s/men$/man/; + push(@lex_cands,$lex_cand) unless $util->member($lex_cand, @lex_cands); + + return @lex_cands; +} + +# quick and dirty +sub pos_tag { + local($caller,$surf) = @_; + + return CD if ($surf =~ /^-?[0-9,\.]+$/); + return NN if ($surf =~ /^($currency_symbol_list\d)/); + @lex_candidates = $caller->lex_candidates($surf); +# print " lex_candidates: @lex_candidates\n"; + foreach $lex_cand (@lex_candidates) { + if (defined($annotation_s = $english_annotation_ht{$lex_cand})) { +# print " annotation: $annotation_s\n"; + ($synt) = ($annotation_s =~ /::synt:([^:]+)::/); + if (defined($synt)) { + if ($synt eq "art") { + return "DT"; + } elsif ($synt eq "adj") { + ($grade) = ($annotation_s =~ /::grade:([^:]+)::/); + if (defined($grade) && ($grade eq "superlative")) { + return "JJS"; + } elsif (defined($grade) && ($grade eq "comparative")) { + return "JJR"; + } else { + return "JJ"; + } + } elsif ($synt eq "noun") { + if ($lex_cand eq $surf) { + return "NN"; + } else { + return "NNS"; + } + } elsif ($synt eq "name") { + return "NNP"; + } elsif ($synt eq "cardinal") { + return "CD"; + } elsif ($synt eq "ordinal") { + return "JJ"; + } elsif ($synt eq "prep") { + return "IN"; + } elsif ($synt eq "conj") { + return "CC"; + } elsif ($synt eq "wh_pron") { + return "WP"; + } elsif ($synt eq "adv") { + return "RB"; + } elsif ($synt eq "genetive_particle") { + return "POS"; + } elsif ($synt eq "ordinal_particle") { + return "NN"; + } elsif ($synt eq "suffix_particle") { + return "NN"; + } elsif ($synt =~ /^int(erjection)?$/) { + return "UH"; + } elsif (($synt =~ /^punctuation$/) + && $util->is_rare_punctuation_string_p($surf)) { + return "SYM"; + } elsif ($synt =~ /\bverb$/) { + if ($surf =~ /^(is)$/) { + return "VBZ"; + } else { + return "VB"; + } + } + } + } + } + return ""; +} + +sub indef_art_filter { + local($caller,$surf) = @_; + + # check article in lexical annotation + # e.g. hour::synt:noun::unit:temporal::indef-article:an + # uniform::synt:noun::indef-article:a + ($surf_article,$word) = ($surf =~ /^(an?) (\S+)\s*/); + if (defined($surf_article) + && defined($word) + && defined($annotation = $english_annotation_ht{$word})) { + ($ann_article) = ($annotation =~ /::indef-article:([^:]+)::/); + if (defined($ann_article)) { + return ($surf_article eq $ann_article) ? $surf : ""; + } + } + return "" if $surf =~ /\ban [bcdfghjklmnpqrstvwxyz]/; + return "" if $surf =~ /\ban (US)\b/; + return "" if $surf =~ /\ba [aeio]/; + return "" if $surf =~ /\ba (under)/; + return $surf; +} + +sub wordlist_synt { + local($caller,$word) = @_; + + return "" unless defined($annotation = $english_annotation_ht{$word}); + ($synt) = ($annotation =~ /::synt:([^:]+)::/); + return $synt || ""; +} + +sub qualifier_filter { + local($caller,$surf) = @_; + + return "" if $surf =~ /\b(over|more than|approximately) (million|billion|trillion)/; + return "" if $surf =~ /\b(over) (once|twice)/; + return $surf; +} + +sub quantity_filter { + local($caller,$surf) = @_; + + return "" if $surf =~ /^(a|an)-/; # avoid "the a-week meeting" + return $surf; +} + +sub value_to_english { + local($caller,$number) = @_; + + $result = ""; + + $annot = "value:$number"; + if (defined($annotation_english_ht{$annot})) { + @elist = @{$annotation_english_ht{$annot}}; + $result = $elist[0] if @elist; + } +# print "value_to_english($number)=$result\n"; + return $result; +} + +sub value_to_english_ordinal { + local($caller,$number) = @_; + + $result = ""; + + $annot = "synt:ordinal::value:$number"; + if (defined($annotation_english_ht{$annot})) { + @elist = @{$annotation_english_ht{$annot}}; + $result = $elist[0] if @elist; + } else { + $annot = "value:$number"; + if (defined($annotation_english_ht{$annot})) { + @elist = @{$annotation_english_ht{$annot}}; + $cardinal = $elist[0] if @elist; + $result = $cardinal . "th"; + $result =~ s/yth$/ieth/; + } + } +# print "value_to_english($number)=$result\n"; + return $result; +} + +sub english_with_synt_slot_value { + local($caller, $english, $synt, $slot) = @_; + + return $english_annotation_ht{"_EN_SYNT_"}->{$english}->{$synt}->{$slot}; +} + +sub english_with_synt_slot_value_defined { + local($caller, $synt, $slot) = @_; + + @englishes_with_synt_slot_value_defined = (); + foreach $english (keys %{$english_annotation_ht{"_EN_SYNT_"}}) { + push(@englishes_with_synt_slot_value_defined, $english) + if defined($english_annotation_ht{"_EN_SYNT_"}->{$english}->{$synt}->{$slot}) + && ! $util->member($english, @englishes_with_synt_slot_value_defined) + } + return @englishes_with_synt_slot_value_defined; +} + +sub number_composed_surface_form { + local($caller,$number,$leave_num_section_p) = @_; + + return "" unless $number =~ /^\d+$/; + $leave_num_section_p = 0 unless defined($leave_num_section_p); + $anchor = "1000000000000000000000000"; + while (($number < $anchor) && ($anchor >= 1000000)) { + $anchor =~ s/000//; + } +# print "number_composed_surface_form number: $number anchor:$anchor\n"; + return "" unless $anchor >= 1000000; + return "" unless $english = $caller->value_to_english($anchor); + $ending = $anchor; + $ending =~ s/^1000//; + return "" unless ($number =~ /$ending$/) || (($number * 1000) % $anchor) == 0; + $num_section = $number / $anchor; + if (($num_section =~ /^[1-9]0?$/) && ! $leave_num_section_p) { + $num_section_english = $caller->value_to_english($num_section); + $num_section = $num_section_english if $num_section_english; + } + $num_section = $caller->commify($num_section); # only for extremely large numbers + return "$num_section $english"; +} + +sub de_scientify { + local($caller,$number) = @_; + +# print "de_scientify: $number\n"; + if ($number =~ /[eE][-+]/) { + ($n,$exp) = ($number =~ /^(\d+)[eE]\+(\d+)$/); + if (defined($exp)) { + $result = $n; + foreach $i (0 .. $exp-1) { + $result .= "0" + } + return $result; + } else { + ($n,$f,$exp) = ($number =~ /^(\d+)\.(\d+)[eE]\+(\d+)$/); + if (defined($exp) && ($exp >= length($f))) { + $result = "$n$f"; + foreach $i (0 .. $exp-1-length($f)) { + $result .= "0"; + } + return $result; + } + } + } + return $number; +} + +sub commify { + local($caller,$number) = @_; + + my $text = reverse $number; + $text =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g; + return scalar reverse $text; +} + +my %plural_rough_number_ht = ( + 10 => "tens", + 12 => "dozens", + 20 => "scores", + 100 => "hundreds", + 1000 => "thousands", + 10000 => "tens of thousands", + 100000 => "hundreds of thousands", + 1000000 => "millions", + 10000000 => "tens of millions", + 100000000 => "hundreds of millions", + 1000000000 => "billions", + 10000000000 => "tens of billions", + 100000000000 => "hundreds of billions", + 1000000000000 => "trillions", + 10000000000000 => "tens of trillions", + 100000000000000 => "hundreds of trillions", +); + +sub plural_rough_plural_number { + local($caller,$number) = @_; + + return $plural_rough_number_ht{$number} || ""; +} + +my %roman_numeral_ht = ( + "I" => 1, + "II" => 2, + "III" => 3, + "IIII" => 4, + "IV" => 4, + "V" => 5, + "VI" => 6, + "VII" => 7, + "VIII" => 8, + "VIIII" => 9, + "IX" => 9, + "X" => 10, + "XX" => 20, + "XXX" => 30, + "XXXX" => 40, + "XL" => 40, + "L" => 50, + "LX" => 60, + "LXX" => 70, + "LXXX" => 80, + "LXXXX" => 90, + "XC" => 90, + "C" => 100, + "CC" => 200, + "CCC" => 300, + "CCCC" => 400, + "CD" => 400, + "D" => 500, + "DC" => 600, + "DCC" => 700, + "DCCC" => 800, + "DCCCC" => 900, + "CM" => 900, + "M" => 1000, + "MM" => 2000, + "MMM" => 3000, + "MMM" => 3000, +); + +sub roman_numeral_value { + local($caller,$s) = @_; + + if (($m, $c, $x, $i) = ((uc $s) =~ /^(M{0,3})(C{1,4}|CD|DC{0,4}|CM|)(X{1,4}|XL|LX{0,4}|XC|)(I{1,4}|IV|VI{0,4}|IX|)$/)) { + $sum = ($roman_numeral_ht{$m} || 0) + + ($roman_numeral_ht{$c} || 0) + + ($roman_numeral_ht{$x} || 0) + + ($roman_numeral_ht{$i} || 0); + return $sum; + } else { + return 0; + } +} + +sub number_surface_forms { + local($caller,$number,$pe) = @_; + + print STDERR "Warning from number_surface_forms: $number not a number\n" + if $logfile && !($number =~ /^(\d+(\.\d+)?|\.\d+)$/); + # $util->log("number_surface_forms number:$number", $logfile); + # $util->log(" surf:$surf", $logfile) if $surf = ($pe && $pe->surf); + + $pe = "" unless defined($pe); + + @num_style_list = @{$english_entity_style_ht{"FollowSourceLanguageNumberStyle"}}; + $follow_num_style = $util->member("yes", @num_style_list) + && (! (($number =~ /^([1-9]|10)$/) && + $util->member("except-small-numbers", @num_style_list))); + $num_style = ($pe) ? $pe->get("num_style") : ""; + if ($follow_num_style) { + if ($num_style =~ /digits_plus_alpha/) { + if ($number =~ /^[1-9]\d?\d?000$/) { + $digital_portion = $number; + $digital_portion =~ s/000$//; + return ("$digital_portion thousand"); + } elsif ($number =~ /^[1-9]\d?\d?000000$/) { + $digital_portion = $number; + $digital_portion =~ s/000000$//; + return ("$digital_portion million"); + } elsif ($number =~ /^[1-9]\d?\d?000000000$/) { + $digital_portion = $number; + $digital_portion =~ s/000000000$//; + return ("$digital_portion billion"); + } + } elsif ($num_style eq "digits") { + if ($number =~ /^\d{1,4}$/) { + return ($number); + } + } + } + + $number = $caller->de_scientify($number); + + $composed_form = $caller->number_composed_surface_form($number); + $composed_form2 = $caller->number_composed_surface_form($number,1); + $lex_form = $caller->value_to_english($number); + $commified_form = $caller->commify($number); + + if ($lex_form) { + if ($number >= 1000000) { + @result = ("one $lex_form", "1 $lex_form", "a $lex_form", $lex_form, $commified_form); + push(@result, $commified_form) if ($number <= $CardinalMaxNonLex); + } elsif ($number >= 100) { + @result = ($commified_form, "one $lex_form", "a $lex_form", $lex_form); + } elsif ($number >= 10) { + @result = ($number, $lex_form); + } elsif ($number == 1) { + @result = ("a", "an", $lex_form); + } elsif ($number == 0) { + @result = ($number, $lex_form); + } else { + @result = ($lex_form); + } + } elsif ($composed_form) { + if ($composed_form eq $composed_form2) { + @result = ($composed_form); + } elsif (($number >= 10000000) && ($composed_form2 =~ /^[1-9]0/)) { + @result = ($composed_form2, $composed_form); + } else { + @result = ($composed_form, $composed_form2); + } + push(@result, $commified_form) if $number <= $CardinalMaxNonLex; + } else { + ($ten,$one) = ($number =~ /^([2-9])([1-9])$/); + ($hundred) = ($number =~ /^([1-9])00$/) unless defined($one); + ($thousand) = ($number =~ /^([1-9]\d?)000$/) unless defined($one) || defined($hundred); + if (defined($one) && defined($ten) + && ($part1 = $caller->value_to_english($ten * 10)) + && ($part2 = $caller->value_to_english($one))) { + $wordy_form = "$part1-$part2"; + @result = ($commified_form, $wordy_form); + } elsif (defined($hundred) + && ($part1 = $caller->value_to_english($hundred))) { + $wordy_form = "$part1 hundred"; + @result = ($commified_form, $wordy_form); + } elsif (defined($thousand) + && ($part1 = $caller->value_to_english($thousand))) { + $wordy_form = "$part1 thousand"; + @result = ($commified_form, $wordy_form); + } elsif ($number =~ /^100000$/) { + @result = ($commified_form, "one hundred thousand", "a hundred thousand", "hundred thousand"); + } elsif ($pe && ($pe->surf eq $number) && ($number =~ /^\d\d\d\d(\.\d+)?$/)) { + @result = ($number); + push(@result, $commified_form) unless $commified_form eq $number; + } elsif ($number =~ /^\d{4,5}$/) { + if ($commified_form eq $number) { + @result = ($number); + } else { + @result = ($commified_form, $number); + } + } else { + @result = ($commified_form); + } + } + push (@result, $number) + unless $util->member($number, @result) || ($number > $CardinalMaxWithoutComma); +# $util->log("number_surface_forms result:@result", $logfile); + + # filter according to num_style + if ($follow_num_style) { + my @filtered_result = (); + foreach $r (@result) { + push(@filtered_result, $r) + if (($num_style eq "digits") && ($r =~ /^\d+$/)) + || (($num_style eq "alpha") && ($r =~ /^[-\@ a-z]*$/i)) + || (($num_style eq "digits_plus_alpha") && ($r =~ /\d.*[a-z]/i)); + } + @result = @filtered_result if @filtered_result; + } + + if ($pe && $pe->childGloss("and")) { + @new_result = (); + foreach $r (@result) { + if ($r =~ /^and /) { + push(@new_result, $r); + } else { + push(@new_result, "and $r"); + } + } + @result = @new_result; + } + return @result; +} + +sub number_range_surface_forms { + local($caller,$pe) = @_; + + $value = $pe->value; + $value_coord = $pe->get("value-coord"); + unless ($value_coord) { + return $caller->number_surface_forms($value); + } + $prefix = ""; + if ($conj = $pe->get("conj")) { + $connector = $conj; + } else { + $connector = ($value_coord == $value + 1) ? "or" : "to"; + } + if ($pe->get("between")) { + $prefix = "between "; + $connector = "and"; + } + + $pe1 = $pe->child("head"); + $pe2 = $pe->child("coord"); + @result1 = $caller->number_surface_forms($value, $pe1); + @result2 = $caller->number_surface_forms($value_coord, $pe2); + @num_style_list = @{$english_entity_style_ht{"FollowSourceLanguageNumberStyle"}}; + $follow_num_style = 1 if $util->member("yes", @num_style_list); + + # between two thousand and three thousand => between two and three thousand + # 3 million to 5 million => 3 to 5 million + if ($follow_num_style && ($#result1 == 0) && ($#result2 == 0)) { + $range = $prefix . $result1[0] . " $connector " . $result2[0]; + $util->log(" range1: $range", $logfile); + $gazillion = "thousand|million|billion|trillion"; + ($a,$gaz1,$b,$gaz2) = ($range =~ /^(.+) ($gazillion) ($connector .+) ($gazillion)$/); + if (defined($a) && defined($gaz1) && defined($b) && defined($gaz2) && ($gaz1 eq $gaz2)) { + $range = "$a $b $gaz1"; + $util->log(" range2: $range", $logfile); + return ($range); + } + } + + @result = (); + foreach $result1 (@result1) { + next if ($value >= 1000) && ($result1 =~ /^\d+$/); + foreach $result2 (@result2) { + next if $result1 =~ /^an?\b/; + push(@result, "$prefix$result1 $connector $result2") + if ($result1 =~ /^[a-z]+$/) && ($result2 =~ /^[a-z]+$/); + next if ($result1 =~ /^[a-z]/) || ($result2 =~ /^[a-z]/); + next if ($value_coord >= 1000) && ($result2 =~ /^\d+$/); + ($digits1,$letters1) = ($result1 =~ /^(\d+(?:.\d+)?) ([a-z].*)$/); + ($digits2,$letters2) = ($result2 =~ /^(\d+(?:.\d+)?) ([a-z].*)$/); + if (defined($digits1) && defined($letters1) + && defined($digits2) && defined($letters2) + && ($letters1 eq $letters2)) { + push(@result, "$prefix$digits1 $connector $digits2 $letters1"); + } elsif (($result1 =~ /^\d{1,3}$/) && ($result2 =~ /^\d{1,3}$/) && !$prefix) { + push(@result, "$result1-$result2"); + if ($connector eq "to") { + my $span = "$result1 to $result2"; + push(@result, $span) unless $util->member($span, @result); + } + } else { + push(@result, "$prefix$result1 $connector $result2"); + } + } + } + unless (@result) { + $result1 = (@result1) ? $result1[0] : $value; + $result2 = (@result2) ? $result2[0] : $value_coord; + @result = "$prefix$result1 $connector $result2"; + } + return @result; +} + +sub q_number_surface_forms { + local($caller,$pe) = @_; + + $surf = $pe->surf; + return ($pe->gloss) unless $value = $pe->value; + if (($value >= 1961) && ($value <= 2030) + && + (($pe->get("struct") eq "sequence of digits") + || + ($surf =~ /^\d+$/))) { + $value = "$prefix $value" if $prefix = $pe->get("prefix"); + @result = ("$value"); + } else { + @result = $caller->number_surface_forms($value,$pe); + @result = $caller->qualify_entities($pe,@result); + } + return @result; +} + +sub ordinal_surface_forms { + local($caller,$number,$exclude_cardinals_p,$exclude_adverbials_p, $pe) = @_; + + if (defined($os = $english_entity_style_ht{"Ordinal"})) { + @ordinal_styles = @{$os}; + } else { + return (); + } + $exclude_cardinals_p = 0 unless defined($exclude_cardinals_p); + @num_style_list = @{$english_entity_style_ht{"FollowSourceLanguageNumberStyle"}}; + $follow_num_style = 1 if $util->member("yes", @num_style_list); + $num_style = ($pe) ? $pe->get("num_style") : ""; + $alpha_ok = ! ($follow_num_style && ($num_style =~ /^digits$/)); + my $c_number = $caller->commify($number); + my $lex_form = ""; + $lex_form = $caller->value_to_english_ordinal($number) if $alpha_ok; + my $adverbial_form + = (($number =~ /^\d+$/) && ($number >= 1) && ($number <= 10) + && $lex_form && $util->member("secondly", @ordinal_styles)) + ? $lex_form . "ly" : ""; + my $num_form = $caller->numeric_ordinal_form($number); + my $c_num_form = $caller->numeric_ordinal_form($c_number); + my @result = (); + +# print "lex_form: $lex_form num_form:$num_form c_num_form:$c_num_form\n"; + if ($lex_form && $util->member("second", @ordinal_styles)) { + if (! $util->member("2nd", @ordinal_styles)) { + @result = ($lex_form); + } elsif ($c_num_form ne $num_form) { + @result = ($c_num_form, $lex_form, $num_form); + } elsif ($number >= 10) { + @result = ($num_form, $lex_form); + } else { + @result = ($lex_form, $num_form); + } + } elsif ($util->member("2nd", @ordinal_styles)) { + if ($c_num_form ne $num_form) { + @result = ($c_num_form, $num_form); + } else { + @result = ($num_form); + } + } + unless ($number =~ /^\d+$/) { + print STDERR "Warning: $number not an integer (for ordinal)\n"; + } + unless ($exclude_cardinals_p) { + $incl_num_card = $util->member("2", @ordinal_styles); + $incl_lex_card = $util->member("two", @ordinal_styles); + foreach $card ($caller->number_surface_forms($number)) { + if ($card =~ /^an?$/) { + # don't include + } elsif ($card =~ /^[0-9,]+$/) { + push(@result, $card) if $incl_num_card; + } else { + push(@result, $card) if $incl_lex_card && $alpha_ok; + } + } + } + push(@result,$adverbial_form) if $adverbial_form && ! $exclude_adverbials_p; + push(@result, $num_form) unless @result; + return @result; +} + +sub ordinal_surface_form { + local($caller,$number,$exclude_cardinals_p,$exclude_adverbials_p, $pe) = @_; + + my @surf_forms = $caller->ordinal_surface_forms($number,$exclude_cardinals_p,$exclude_adverbials_p, $pe); + return (@surf_forms) ? $surf_forms[0] : $caller->numeric_ordinal_form($number); +} + +sub fraction_surface_forms { + local($caller,$pe,$modp) = @_; + + my @result = (); + $numerator = $pe->get("numerator"); + $denominator = $pe->get("denominator"); +# print "numerator: $numerator denominator:$denominator\n"; + @surf_nums = $caller->number_surface_forms($numerator,$pe); + @surf_nums = ("one") if $numerator == 1; + @surf_dens = $caller->ordinal_surface_forms($denominator,1,1); + @surf_dens = ("half") if $denominator == 2; + @surf_dens = ("quarter") if $denominator == 4; + @surf_dens = ("tenth") if $denominator == 10; +# print "surf_nums: @surf_nums surf_dens: @surf_dens\n"; + @fraction_patterns = @{$english_entity_style_ht{"Fraction"}}; + if (@surf_nums && @surf_dens) { + $surf_num = $surf_nums[0]; + $surf_den = $surf_dens[0]; + $surf_num_den = ""; + foreach $sd (@surf_dens) { + $surf_num_den = $sd if $sd =~ /^\d/; + } + $surf_den_w_proper_number = $caller->noun_number_form($surf_den, $numerator); + foreach $fp (@fraction_patterns) { + if ($fp eq "one tenth") { + push(@result, $surf_num . " " . $surf_den_w_proper_number) unless $modp; + } elsif ($fp eq "one-tenth") { + if ($modp) { + push(@result, $surf_num . "-" . $surf_den); + } else { + push(@result, $surf_num . "-" . $surf_den_w_proper_number); + } + } elsif ($fp eq "1/10") { + push(@result, $numerator . "/" . $denominator); + } elsif ($fp eq "1/10th") { + push(@result, $numerator . "/" . $surf_num_den) if $surf_num_den; + } + } + return @result; + } else { + return ($pe->gloss); + } +} + +sub currency_surface_forms { + local($caller,$pe) = @_; + + @currency_surf_forms = (); + return @currency_surf_forms unless $pe->sem =~ /monetary quantity/; + $unit = $pe->get("unit"); + return ($pe->gloss) unless $quant = $pe->get("quant"); + return ($pe->gloss) if $pe->childSem("head") eq "currency symbol"; + $quant_pe = $pe->child("quant"); + if ($unit =~ /^(US|Hongkong) dollar$/) { + @units = $caller->entity_style_listing($unit); + } elsif ($unit eq "yuan") { + @units = $caller->entity_style_listing("Chinese yuan"); + @rmb_pos = @{$english_entity_style_ht{"Chinese RMB position"}}; + @rmb_pos = ("before-number", "after-number") if $util->member("all",@units); + } else { + @units = ($unit); + } + if (($pe->sem =~ /range$/) && $quant_pe) { + @quants = $caller->number_range_surface_forms($quant_pe); + } else { + @quants = $caller->number_surface_forms($quant, $quant_pe); + } + @quants = ($quant) unless @quants; + # print STDERR "units: @units \n"; + foreach $q (@quants) { + foreach $u_sing (@units) { + $u = ($modp) ? $u_sing : $caller->noun_number_form($u_sing, $quant); +# print " q: $q unit: $u value: $quant\n"; + if ($u eq "RMB") { + if ($util->member("before-number", @rmb_pos)) { + if ($q =~ /^\d/) { + push(@currency_surf_forms, "RMB" . $q); + } + } + if ($util->member("after-number", @rmb_pos)) { + push(@currency_surf_forms, $q . " RMB"); + } + } elsif ($u =~ /\$$/) { + if ($q =~ /^\d/) { + $currency_surf_form = $u . $q; + push(@currency_surf_forms, $currency_surf_form); + } + } else { + $new_form = "$q $u"; + push(@currency_surf_forms, $new_form) if $caller->indef_art_filter($new_form); + } + } + } + @currency_surf_forms = $caller->qualify_entities($pe,@currency_surf_forms); + + # print STDERR "currency_surface_forms: @currency_surf_forms \n"; + return @currency_surf_forms; +} + +sub age_surface_forms { + local($caller,$pe, $modp) = @_; + + $gloss = $pe->gloss; + @age_surf_forms = (); + return @age_surf_forms unless $pe->sem =~ /age quantity/; + $unit = $pe->get("unit"); + return ($gloss) unless $quant = $pe->get("quant"); + $temporal_quant_pe = $pe->child("head"); + $synt = $pe->synt; + if ($synt =~ /parenthetical/) { + if ($pe->get("slashed")) { + @age_markers = $caller->entity_style_listing("ParentheticalAgeFormatSlashed"); + @age_markers = $caller->entity_style_listing("ParentheticalAgeFormat") unless @age_markers; + } else { + @age_markers = $caller->entity_style_listing("ParentheticalAgeFormat"); + } + return ($gloss) unless @age_markers; + foreach $a (@age_markers) { + $age_surf_form = $a; + $age_surf_form =~ s/8/$quant/; + push(@age_surf_forms, $age_surf_form); + } + } elsif (($quant =~ /^\d+$/) && ($temporal_quant_pe->sem eq "age unit")) { + @quants = $caller->number_surface_forms($quant); + @quants = ($quant) if $pe->childSurf("quant") =~ /^\d+$/; + foreach $quant2 (@quants) { + if ($modp) { + push(@age_surf_forms, "$quant2-year-old"); + } else { + $plural_marker = ($quant >= 2) ? "s" : ""; + push(@age_surf_forms, "$quant2 year$plural_marker old"); + } + } + } elsif ($temporal_quant_pe && ($temporal_quant_pe->sem eq "temporal quantity")) { + @temporal_quants = $caller->quantity_surface_forms($temporal_quant_pe, $modp); + foreach $temporal_quant (@temporal_quants) { + push(@age_surf_forms, $temporal_quant . (($modp) ? "-" : " ") . "old"); + } + } else { + return ($gloss); + } + + @age_surf_forms = ($gloss) unless @age_surf_forms; + return @age_surf_forms; +} + +sub occurrence_surface_forms { + local($caller,$pe,$modp) = @_; + + @quantity_surf_forms = (); + return ($pe->gloss) unless $quant = $pe->get("quant"); + $quant_coord = $pe->get("quant-coord"); + $quant_pe = $pe->child("quant"); + $unit = "time"; + if (($pe->sem =~ /range$/) && $quant_pe) { + @quants = $caller->number_range_surface_forms($quant_pe); + } else { + @quants = $caller->number_surface_forms($quant, $quant_pe); + } + @quants = ($quant) unless @quants; + if ($modp) { + return () if $pe->get("qualifier") || $quant_coord; + return ("one-time") if $quant eq "1"; + return ("two-time", "two-fold", "2-fold") if $quant eq "2"; + } else { + if ($quant_coord) { + return $caller->qualify_entities($pe, ("once or twice")) + if $quant eq "1" and $quant_coord eq "2"; + } else { + return $caller->qualify_entities($pe, ("once")) if $quant eq "1"; + return $caller->qualify_entities($pe, ("twice", "two times", "2 times", + "2-fold", "two fold")) if $quant eq "2"; + } + } + foreach $q (@quants) { + $u = ($modp) ? $unit : $caller->noun_number_form($unit, $quant); + $new_form = "$q $u"; + if ($modp) { + # for the time being, no "more than/over/..." in modifiers: more than 20-ton + if ($pe->get("qualifier")) { + $new_form = ""; + } else { + $new_form =~ s/-/-to-/; + $new_form =~ s/ /-/g; + } + } + push(@quantity_surf_forms, $new_form) if $new_form; + push(@quantity_surf_forms, "$q-fold") if $q =~ /\d/ || ($quant <= 9); + } + @quantity_surf_forms = $caller->qualify_entities($pe,@quantity_surf_forms); + + return @quantity_surf_forms; +} + +sub quantity_surface_forms { + local($caller,$pe,$modp) = @_; + + if ($pe->get("complex") eq "true") { + return () if $modp; + $quantity_surf_form = $pe->gloss; + return ($quantity_surf_form); + } + + @quantity_surf_forms = (); + $sem = $pe->get("sem"); + $scale = $pe->get("scale"); + $scale_mod = $pe->get("scale_mod"); + $unit = $pe->get("unit") || $scale; + $mod_gloss = $pe->get("mod"); + return ($pe->gloss) unless $quant = $pe->get("quant"); + $quant_coord = $pe->get("quant-coord"); + $quant_comb = $quant_coord || $quant; + $quant_pe = $pe->child("quant"); + if (defined($u_style = $english_entity_style_ht{"\u$unit"})) { + @units = @{$u_style}; + } else { + @units = ($unit); + } + if (($pe->sem =~ /range$/) && $quant_pe) { + @quants = $caller->number_range_surface_forms($quant_pe); + } else { + @quants = $caller->number_surface_forms($quant, $quant_pe); + } + @quants = ($quant) unless @quants; + foreach $q (@quants) { + foreach $u_sing (@units) { + my $u = $u_sing; + if (($sem =~ /seismic quantity/) && $scale) { + $scale =~ s/(\w+)\s*/\u\L$1/g if $scale =~ /^(Richter|Mercalli)/i; + $u = "on the $scale_mod $scale scale"; + $u =~ s/\s+/ /g; + } elsif (($u_sing =~ /\S/) && ! $modp) { + $u = $caller->noun_number_form($u_sing, $quant_comb); + } +# print " q: $q unit: $u value: $quant modp: $modp\n"; + @mods = (""); + @mods = ("consecutive", "in a row") if $mod_gloss eq "continuous"; + foreach $mod (@mods) { + $pre_quant_mod = ""; + $in_quant_mod = ($mod =~ /(consecutive)/) ? "$mod " : ""; + $post_quant_mod = ($mod =~ /(in a row)/) ? " $mod" : ""; + $new_form = "$pre_quant_mod$q $in_quant_mod$u$post_quant_mod"; + if ($caller->is_abbreviation($u)) { + if (($pe->sem =~ /range/) && ($q =~ /^[-0-9,\. to]+$/) + && $modp && !($new_form =~ / (to|or) /)) { + $new_form =~ s/-/-to-/; + $new_form =~ s/ /-/g; + } elsif ($q =~ /^[-0-9,\.]+$/) { +# $new_form =~ s/ //g; + } else { + $new_form = ""; + } + } elsif ($modp) { + # for the time being, no "more than/over/..." in modifiers: more than 20-ton + if (($pe->get("qualifier")) || $mod) { + $new_form = ""; + } elsif ($u =~ /(square|cubic|metric|short)/) { + # no hyphenation for the time being (based on CTE style) + } elsif (($pe->sem =~ /range/) && !($new_form =~ / (to|or) /)) { + $new_form =~ s/-/-to-/; + $new_form =~ s/ /-/g; + } else { + $new_form =~ s/ /-/g; + } + } + push(@quantity_surf_forms, $new_form) + if $new_form && $caller->quantity_filter($new_form) && $caller->indef_art_filter($new_form); + } + } + } + @quantity_surf_forms = $caller->qualify_entities($pe,@quantity_surf_forms); + + # print STDERR "QSF unit:$unit sem:$sem Result(s): " . join("; ", @quantity_surf_forms) . "\n"; + return @quantity_surf_forms; +} + +sub qualify_entities { + local($caller,$pe,@surf_forms) = @_; + + $prefix = $pe->get("prefix"); + $prefix_clause = ($prefix) ? "$prefix " : ""; + if ($qualifier = $pe->get("qualifier")) { + $qualifier =~ s/-/ /g; + $qualifier_key = $qualifier; + $qualifier_key =~ s/(\w+)\s*/\u\L$1/g; + # print "qualifier_key: $qualifier_key\n"; + @new_list = (); + if (defined($value = $english_entity_style_ht{$qualifier_key})) { + @quals = @{$value}; + # print STDERR " qk $qualifier_key in ht: @quals :: @surf_forms\n"; + foreach $q (@quals) { + foreach $surf_form (@surf_forms) { + $new_form = "$prefix_clause$q $surf_form"; + push(@new_list, $new_form) if $caller->qualifier_filter($new_form); + } + } + return @new_list if @new_list; + } else { + @keys = sort keys %english_entity_style_ht; + # print STDERR " did not find qk $qualifier_key in ht: @keys\n"; + foreach $surf_form (@surf_forms) { + if (($qualifier =~ /^(couple|few|lot|many|number|several|some)$/i) + && (($art, $lex) = ($surf_form =~ /^(an?)\s+(\S|\S.*\S)\s*$/i))) { + $plural_form = $caller->noun_number_form($lex,2); + $new_form = "$prefix_clause$qualifier $plural_form"; + } else { + $new_form = "$prefix_clause$qualifier $surf_form"; + } + push(@new_list, $new_form) if $caller->qualifier_filter($new_form); + } + return @new_list if @new_list; + } + } + if ($prefix) { + @prefixed_surf_forms = (); + foreach $surf_form (@surf_forms) { + if ($surf_form =~ /^$prefix /) { # already prefixed + push(@prefixed_surf_forms, $surf_form); + } else { + push(@prefixed_surf_forms, "$prefix $surf_form"); + } + } + return @prefixed_surf_forms; + } else { + return @surf_forms; + } +} + +sub percent_surface_forms { + local($caller,$pe,$modp) = @_; + + @percent_surf_forms = (); + return @percent_surf_forms unless $pe->sem eq "percentage"; + $prefix = ""; + $quant = $pe->gloss; + $quant =~ s/%$//; + $quant =~ s/^and //; + if ($pe->gloss =~ /^and /) { + $prefix = "and"; + } + @percent_markers = $caller->entity_style_listing("Percentage"); + @quants = $caller->number_surface_forms($quant); + @quants = ($quant) unless @quants; + foreach $p (@percent_markers) { + foreach $q (@quants) { + if ($p =~ /%$/) { + if ($q =~ /\d$/) { + $percent_surf_form = $q . "%"; + $percent_surf_form = "$prefix $percent_surf_form" if $prefix; + push(@percent_surf_forms, $percent_surf_form); + push(@percent_surf_forms, "by $percent_surf_form") unless $modp || $percent_surf_form =~ /^and /; + } + } else { + if ((($p =~ /^\d/) && ($q =~ /^\d/)) + || + (($p =~ /^[a-z]/) && ($q =~ /^[a-z]/))) { + if ($p =~ /percentage point/) { + if ($quant == 1) { + $percent_surf_form = $q . " percentage point"; + } else { + $percent_surf_form = $q . " percentage points"; + } + } else { + $percent_surf_form = $q . " percent"; + } + $percent_surf_form = "$prefix $percent_surf_form" if $prefix; + $percent_surf_form =~ s/ /-/g if $modp; + push(@percent_surf_forms, $percent_surf_form); + push(@percent_surf_forms, "by $percent_surf_form") unless $modp || $percent_surf_form =~ /^and /; + } + } + } + } + return @percent_surf_forms; +} + +sub decade_century_surface_forms { + local($caller,$pe) = @_; + + if ($pe->sem =~ /century/) { + $gloss = $pe->gloss; + return ("the $gloss", "in the $gloss", $gloss); + } + @decade_surf_forms = (); + return @decade_surf_forms unless $pe->sem =~ /year range\b.*\bdecade/; + @decade_markers = @{$english_entity_style_ht{"Decade"}}; + @extend_decades = @{$english_entity_style_ht{"ExtendDecades"}}; + @extended_decades = @{$english_entity_style_ht{"ExtendedDecade"}}; + $extended_decade = (@extended_decades) ? $extended_decades[0] : "none"; + + $value = $pe->value; + $extended_value = ""; + foreach $extend_decade (@extend_decades) { + if ($extend_decade =~ /$value$/) { + $extended_value = $extend_decade unless $extended_value eq $extend_decade; + last; + } + } + if ($sub = $pe->get("sub")) { + $sub_clause = "$sub "; + $sub_clause =~ s/(mid) /$1-/; + } else { + $sub_clause = ""; + } + + if (! $extended_value) { + @values = ($value); + } elsif ($extended_decade eq "ignore") { + @values = ($value); + } elsif ($extended_decade eq "only") { + @values = ($extended_value); + } elsif ($extended_decade eq "primary") { + @values = ($extended_value, $value); + } elsif ($extended_decade eq "secondary") { + @values = ($value, $extended_value); + } else { + @values = ($value); + } + foreach $v (@values) { + foreach $dm (@decade_markers) { + $dm_ending = $dm; + $dm_ending =~ s/^\d+//; + push (@decade_surf_forms, "the $sub_clause$v$dm_ending"); + push (@decade_surf_forms, "in the $sub_clause$v$dm_ending"); + push (@decade_surf_forms, "$sub_clause$v$dm_ending"); + } + } + return @decade_surf_forms; +} + +sub day_of_the_month_surface_forms { + local($caller,$pe) = @_; + + @dom_surf_forms = (); + return @dom_surf_forms + unless ($pe->sem eq "day of the month") + && ($day_number = $pe->get("day-number")); + @dom_markers = @{$english_entity_style_ht{"DayOfTheMonth"}}; + foreach $dm (@dom_markers) { + $ord = $caller->numeric_ordinal_form($day_number); + if ($dm eq "on the 5th") { + push (@dom_surf_forms, "on the $ord"); + } elsif ($dm eq "the 5th") { + push (@dom_surf_forms, "the $ord"); + } elsif ($dm eq "5th") { + push (@dom_surf_forms, $ord); + } + } + return @dom_surf_forms; +} + +sub score_surface_forms { + local($caller,$pe) = @_; + + @score_surf_forms = (); + if (($score1 = $pe->get("score1")) + && ($score2 = $pe->get("score2"))) { + @score_markers = @{$english_entity_style_ht{"ScoreMarker"}}; + @score_markers = (":") unless @score_markers; + foreach $sm (@score_markers) { + push (@score_surf_forms, "$score1$sm$score2"); + } + } + push(@score_surf_forms, $pe->gloss) unless @score_surf_forms; + return @score_surf_forms; +} + +sub day_of_the_week_surface_forms { + local($caller,$pe) = @_; + + @dom_surf_forms = (); + @dom_markers = @{$english_entity_style_ht{"DayOfTheWeek"}}; + $gloss = $pe->get("gloss"); + $weekday = $pe->get("weekday"); + $weekday = $gloss if ($weekday eq "") && ($gloss =~ /^\S+$/); + $relday = $pe->get("relday"); + $period = $pe->get("period"); + foreach $dm (@dom_markers) { + if (($dm =~ /NOPERIOD/) && $period) { + $surf = ""; # bad combination + } elsif (($dm eq "Sunday") || ! $relday) { + $surf = $weekday; + $surf .= " $period" if $period; + } elsif ($dm =~ /morning/) { + if ($period) { + $surf = $dm; + $surf =~ s/tomorrow/$relday/; + $surf =~ s/morning/$period/; + $surf =~ s/Sunday/$weekday/; + } else { + $surf = ""; # bad combination + } + } else { + $surf = $dm; + if ($period) { + if ($relday eq "today") { + $core_surf = "this $period"; + } else { + $core_surf = "$relday $period"; + } + } else { + $core_surf = $relday; + } + $surf =~ s/tomorrow/$core_surf/; + $surf =~ s/Sunday/$weekday/; + } + $surf =~ s/yesterday night/last night/; + $surf =~ s/this noon, ($weekday)(,\s*)?/today, $1, at noon/; + $surf =~ s/this noon/today at noon/; + $surf =~ s/this night/tonight/; + $surf =~ s/\s*NOPERIOD\s*$//; + push (@dom_surf_forms, $surf) unless $util->member($surf, @dom_surf_forms) || ! $surf; + $on_weekday = "on $surf"; + push (@dom_surf_forms, $on_weekday) + if ($surf eq $weekday) && ! $util->member($on_weekday, @dom_surf_forms); + } + return @dom_surf_forms; +} + +sub date_surface_forms { + local($caller,$pe,$modp) = @_; + + @date_surf_forms = (); + $sem = $pe->sem; + $synt = $pe->synt; + return @date_surf_forms unless $sem =~ /date(\+year)?/; + $day = $pe->get("day"); + $weekday = $pe->get("weekday"); + $month_name = $pe->get("month-name"); + $month_number = $pe->get("month-number"); + $year = $pe->get("year"); + $era = $pe->get("era"); + $era_clause = ""; + $calendar_type = $pe->get("calendar"); + $calendar_type_clause = ""; + $calendar_type_clause = " AH" if $calendar_type eq "Islamic"; + $ad_year = $year; + if ($era eq "Republic era") { + $ad_year = $year + 1911; + $era_clause = " (year $year of the $era)"; + } + $rel = $pe->get("rel"); + if ($sep = $pe->get("sep")) { + $date_surf_form = "$month_number$sep$day"; + $date_surf_form .= "$sep$year" if $year; + $date_surf_form = "$weekday, $date_surf_form" if $weekday; + $date_surf_form = "on $date_surf_form" if $synt eq "pp"; + return ($date_surf_form); + } + @date_months = @{$english_entity_style_ht{"DateMonth"}}; + @date_days = @{$english_entity_style_ht{"DateDay"}}; + @date_order = @{$english_entity_style_ht{"DateOrder"}}; + foreach $m (@date_months) { + if ($m eq "September") { + $surf_month = $month_name; + } elsif ($m =~ /^Sep(\.)?$/) { + if ($month_name eq "May") { + $surf_month = $month_name; + } else { + $period_clause = ($m =~ /\.$/) ? "." : ""; + $surf_month = substr($month_name, 0, 3) . $period_clause; + } + } elsif ($m =~ /^Sept(\.)?$/) { + if ($util->member($month_name, "February", "September")) { + $period_clause = ($m =~ /\.$/) ? "." : ""; + $surf_month = substr($month_name, 0, 4) . $period_clause; + } else { + $surf_month = ""; + } + } else { + $surf_month = ""; + } + foreach $d (@date_days) { + if ($d =~ /^\d+$/) { + $surf_day = $day; + } elsif ($d =~ /^\d+[sthrd]+$/) { + $surf_day = $caller->numeric_ordinal_form($day); + } else { + $surf_day = ""; + } + if ($surf_month && $surf_day) { + foreach $o (@date_order) { + if ($calendar_type eq "Islamic") { + $date_surf_form = "$surf_day $surf_month"; + } elsif ($o eq "September 6, 1998") { + $date_surf_form = "$surf_month $surf_day"; + } elsif ($o eq "6 September, 1998") { + $date_surf_form = "$surf_day $surf_month"; + } + $date_surf_form = "$weekday, $date_surf_form" if $weekday; + $consider_on_p = 1; + if ($year) { + $date_surf_form .= "," unless $calendar_type eq "Islamic"; + $date_surf_form .= " $ad_year$calendar_type_clause$era_clause"; + } elsif ($rel) { + if ($rel eq "current") { + $date_surf_form = "this $date_surf_form"; + } else { + $date_surf_form = "$rel $date_surf_form"; + } + $consider_on_p = 0; + } + push(@date_surf_forms, $date_surf_form) + unless $util->member($date_surf_form, @date_surf_forms) || ($synt eq "pp"); + if ($consider_on_p) { + $on_date_surf_form = "on $date_surf_form"; + push(@date_surf_forms, $on_date_surf_form) + unless $modp || $util->member($on_date_surf_form, @date_surf_forms); + } + + if (($synt eq "pp") && ($sem eq "date")) { + push(@date_surf_forms, $date_surf_form) + unless $util->member($date_surf_form, @date_surf_forms); + } + } + } + } + } + return @date_surf_forms; + # rel, last, next, this +} + +sub numeric_ordinal_form { + local($caller,$cardinal) = @_; + + return $cardinal . "th" if $cardinal =~ /1\d$/; + return $cardinal . "st" if $cardinal =~ /1$/; + return $cardinal . "nd" if $cardinal =~ /2$/; + return $cardinal . "rd" if $cardinal =~ /3$/; + return $cardinal . "h" if $cardinal =~ /t$/; + $cardinal =~ s/y$/ie/; + return $cardinal . "th"; +} + +sub guard_urls_x045 { + local($caller, $s) = @_; + + # URLs (http/https/ftp/mailto) + my $result = ""; + while (($pre, $url, $post) = ($s =~ /^(.*?)((?:(?:https?|ftp):\/\/|mailto:)[#%-;=?-Z_-z~]*[-a-zA-Z0-9\/#])(.*)$/)) { + $result .= "$pre\x04$url\x05"; + $s = $post; + } + $result .= $s; + + # emails + $s = $result; + $result = ""; + while (($pre, $email, $post) = ($s =~ /^(.*?[ ,;:()\/\[\]{}<>|"'])([a-z][-_.a-z0-9]*[a-z0-9]\@[a-z][-_.a-z0-9]*[a-z0-9]\.(?:[a-z]{2,}))([ .,;:?!()\/\[\]{}<>|"'].*)$/i)) { + $result .= "$pre\x04$email\x05"; + $s = $post; + } + $result .= $s; + + # (Twitter style) #hashtag or @handle + $s = $result; + $result = ""; + while (($pre, $hashtag, $post) = ($s =~ /^(.*?[ .,;()\[\]{}'])([#@](?:[a-z]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|HHERE)(?:[_a-z0-9]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF])*(?:[a-z0-9]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF]))(.*)$/i)) { + $result .= "$pre\x04$hashtag\x05"; + $s = $post; + } + $result .= $s; + + # Keep together number+letter in: Fig. 4g; Chromosome 12p + $result =~ s/((?:\b(?:fig))(?:_DONTBREAK_)?\.?|\b(?:figures?|tables?|chromosomes?)|]*\b(?:fig)\b[^<>]*>)\s*(\d+[a-z])\b/$1 \x04$2\x05/gi; + + # special combinations, e.g. =/= emoticons such as :) + $s = $result; + $result = ""; + while (($pre, $special, $post) = ($s =~ /^(.*?)(:-?\)|:-?\(|=\/=?|\?+\/\?+|=\[)(.*)$/)) { + $result .= "$pre\x04$special\x05"; + $s = $post; + } + $result .= $s; + + return $result; +} + +sub guard_xml_tags_x0123 { + local($caller, $s) = @_; + + my $result = ""; + # xml tag might or might not already have "@" on left and/or right end: @
@ + while (($pre, $tag, $post) = ($s =~ /^(.*?)(\@?<\/?(?:[a-z][-_:a-z0-9]*)(?:\s+[a-z][-_:a-z0-9]*="[^"]*")*\s*\/?>\@?|&(?:amp|gt|lt|quot);|\[(?:QUOTE|URL)=[^ \t\n\[\]]+\]|\[\/?(?:QUOTE|IMG|INDENT|URL)\]|<\$[-_a-z0-9]+\$>|<\!--.*?-->)(.*)$/si)) { + $result .= $pre; + if (($pre =~ /\S$/) && ($tag =~ /^\S/)) { + $result .= " \x01"; + $result .= "\@" if ($tag =~ /^<[a-z]/i) && (! ($pre =~ /[,;(>]$/)); #) + } else { + $result .= "\x01"; + } + $guarded_tag = $tag; + $guarded_tag =~ s/ /\x02/g; + # print STDERR "tag: $tag\nguarded_tag: $guarded_tag\n" if ($result =~ /Harvey/) || ($s =~ /Harvey/); + $result .= $guarded_tag; + if (($tag =~ /\S$/) && ($post =~ /^\S/)) { # ( + $result .= "\@" if (($tag =~ /^<\//) || ($tag =~ /\/>$/)) && (! ($result =~ /\@$/)) && (! ($post =~ /^[,;)<]/)); + $result .= "\x03 "; + } else { + $result .= "\x03"; + } + $s = $post; + } + $result .= $s; + return $result; +} + +sub restore_urls_x045_guarded_string { + local($caller, $s) = @_; + + my $orig = $s; + while (($pre, $url, $post) = ($s =~ /^(.*?)\x04([^\x04\x05]*?)\x05(.*)$/)) { + $url =~ s/ \@([-:\/])/$1/g; + $url =~ s/([-:\/])\@ /$1/g; + $url =~ s/ //g; + $url =~ s/\x02/ /g; + $s = "$pre$url$post"; + } + if ($s =~ /[\x04\x05]/) { + print STDERR "Removing unexpectedly unremoved x04/x05 marks from $s\n"; + $s =~ s/[\x04\x05]//g; + } + return $s; +} + +sub restore_xml_tags_x0123_guarded_string { + local($caller, $s) = @_; + + my $result = ""; + while (($pre, $tag, $post) = ($s =~ /^(.*?)\x01(.*?)\x03(.*)$/)) { + $result .= $pre; + $tag =~ s/ \@([-:\/])/$1/g; + $tag =~ s/([-:\/])\@ /$1/g; + $tag =~ s/ //g; + $tag =~ s/\x02/ /g; + $result .= $tag; + $s = $post; + } + $result .= $s; + return $result; +} + +sub load_english_abbreviations { + local($caller, $filename, *ht, $verbose) = @_; + # e.g. /nfs/nlg/users/textmap/brahms-ml/arabic/data/EnglishAbbreviations.txt + + $verbose = 1 unless defined($verbose); + my $n = 0; + if (open(IN, $filename)) { + while () { + next if /^\# /; + s/\s*$//; + my @expansions; + if (@expansions = split(/\s*::\s*/, $_)) { + my $abbrev = shift @expansions; + $ht{IS_ABBREVIATION}->{$abbrev} = 1; + $ht{IS_LC_ABBREVIATION}->{(lc $abbrev)} = 1; + foreach $expansion (@expansions) { + $ht{ABBREV_EXPANSION}->{$abbrev}->{$expansion} = 1; + $ht{ABBREV_EXPANSION_OF}->{$expansion}->{$abbrev} = 1; + } + $n++; + } + } + close(IN); + print STDERR "Loaded $n entries from $filename\n" if $verbose; + } else { + print STDERR "Can't open $filename\n"; + } +} + +sub load_split_patterns { + local($caller, $filename, *ht) = @_; + # e.g. /nfs/nlg/users/textmap/brahms-ml/arabic/data/BioSplitPatterns.txt + + my $n = 0; + if (open(IN, $filename)) { + while () { + next if /^\# /; + s/\s*$//; + if (($s) = ($_ =~ /^SPLIT-DASH-X\s+(\S.*\S|\S)\s*$/)) { + $ht{SPLIT_DASH_X}->{$s} = 1; + $ht{LC_SPLIT_DASH_X}->{(lc $s)} = 1; + $n++; + } elsif (($s) = ($_ =~ /^SPLIT-X-DASH\s+(\S.*\S|\S)\s*$/)) { + $ht{SPLIT_X_DASH}->{$s} = 1; + $ht{LC_SPLIT_X_DASH}->{(lc $s)} = 1; + $n++; + } elsif (($s) = ($_ =~ /^DO-NOT-SPLIT-DASH-X\s+(\S.*\S|\S)\s*$/)) { + $ht{DO_NOT_SPLIT_DASH_X}->{$s} = 1; + $ht{LC_DO_NOT_SPLIT_DASH_X}->{(lc $s)} = 1; + $n++; + } elsif (($s) = ($_ =~ /^DO-NOT-SPLIT-X-DASH\s+(\S.*\S|\S)\s*$/)) { + $ht{DO_NOT_SPLIT_X_DASH}->{$s} = 1; + $ht{LC_DO_NOT_SPLIT_X_DASH}->{(lc $s)} = 1; + $n++; + } elsif (($s) = ($_ =~ /^DO-NOT-SPLIT\s+(\S.*\S|\S)\s*$/)) { + $ht{DO_NOT_SPLIT}->{$s} = 1; + $ht{LC_DO_NOT_SPLIT}->{(lc $s)} = 1; + $n++; + } elsif (($s) = ($_ =~ /^SPLIT\s+(\S.*\S|\S)\s*$/)) { + $ht{SPLIT}->{$s} = 1; + $ht{LC_SPLIT}->{(lc $s)} = 1; + $n++; + } + } + close(IN); + print STDERR "Loaded $n entries from $filename\n"; + } else { + print STDERR "Can't open $filename\n"; + } +} + +sub guard_abbreviations_with_dontbreak { + local($caller, $s, *ht) = @_; + + my $orig = $s; + my $result = ""; + while (($pre,$potential_abbrev,$period,$post) = ($s =~ /^(.*?)((?:[a-z]+\.-?)*(?:[a-z]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF])+)(\.)(.*)$/i)) { + if (($pre =~ /([-&\/0-9]|[-\/]\@ )$/) + && (! ($pre =~ /\b[DR](?: \@)?-(?:\@ )?$/))) { # D-Ariz. + $result .= "$pre$potential_abbrev$period"; + } else { + $result .= $pre . $potential_abbrev; + $potential_abbrev_with_period = $potential_abbrev . $period; + if ($ht{IS_ABBREVIATION}->{$potential_abbrev_with_period}) { + $result .= "_DONTBREAK_"; + } elsif ($ht{IS_LC_ABBREVIATION}->{(lc $potential_abbrev_with_period)}) { + $result .= "_DONTBREAK_"; + } + $result .= $period; + } + $s = $post; + } + $result .= $s; + $result =~ s/\b([Nn])o\.(\s*\d)/$1o_DONTBREAK_.$2/g; + return $result; +} + +$alpha = "(?:[a-z]|\xCE[\xB1-\xBF]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF])"; +$alphanum = "(?:[a-z0-9]|\xCE[\xB1-\xBF]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF])(?:[-_a-z0-9]|\xCE[\xB1-\xBF]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF])*(?:[a-z0-9]|\xCE[\xB1-\xBF]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF])|(?:[a-z0-9]|\xCE[\xB1-\xBF]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF])"; + +sub normalize_punctuation { + local($caller, $s) = @_; + + $s =~ s/\xE2\x80[\x93\x94]/-/g; # ndash, mdash to hyphen + $s =~ s/ \@([-\/])/$1/g; + $s =~ s/([-\/])\@ /$1/g; + return $s; +} + +sub update_replace_characters_based_on_context { + local($caller, $s) = @_; + + # This is just a start. Collect stats over text with non-ASCII, e.g. K?ln. + # HHERE + my $rest = $s; + $s = ""; + while (($pre, $left, $repl_char, $right, $post) = ($rest =~ /^(.*?\s+)(\S*)(\xEF\xBF\xBD)(\S*)(\s.*)$/)) { + $s .= "$pre$left"; + if (($left =~ /[a-z]$/i) && ($right =~ /^s(?:[-.,:;?!].*)?$/i)) { # China's etc. + $repl_char = "\xE2\x80\x99"; # right single quotation mark + } elsif (($left =~ /n$/i) && ($right =~ /^t$/i)) { # don't etc. + $repl_char = "\xE2\x80\x99"; # right single quotation mark + } elsif (($left =~ /[a-z]\s*[.]$/i) && ($right eq "")) { # end of sentence + $repl_char = "\xE2\x80\x9D"; # right double quotation mark + } elsif (($left eq "") && ($right =~ /^[A-Z]/i)) { # start of word + $repl_char = "\xE2\x80\x9C"; # left double quotation mark + } + $s .= "$repl_char$right"; + $rest = $post; + } + $s .= $rest; + + return $s; +} + +sub tokenize { + local($caller, $s, *ht, $control) = @_; + + my $local_verbose = 0; + print "Point A: $s\n" if $local_verbose; + $control = "" unless defined($control); + my $bio_p = ($control =~ /\bbio\b/); + + $s = $utf8->repair_misconverted_windows_to_utf8_strings($s); + print "Point A2: $s\n" if $local_verbose; + $s = $utf8->delete_weird_stuff($s); + print "Point B: $s\n" if $local_verbose; + + # reposition xml-tag with odd space + $s =~ s/( +)((?:<\/[a-z][-_a-z0-9]*>)+)(\S)/$2$1$3/ig; + $s =~ s/(\S)((?:<[a-z][^<>]*>)+)( +)/$1$3$2/ig; + print "Point C: $s\n" if $local_verbose; + + $a_value = $ht{IS_ABBREVIATION}->{"Fig."} || "n/a"; + $s = $caller->guard_abbreviations_with_dontbreak($s, *ht); + my $standard_abbrev_s = "Adm|al|Apr|Aug|Calif|Co|Dec|Dr|etc|e.g|Feb|Febr|Gen|Gov|i.e|Jan|Ltd|Lt|Mr|Mrs|Nov|Oct|Pfc|Pres|Prof|Sen|Sept|U.S.A|U.S|vs"; + my $pre; + my $core; + my $post; + $s = " $core " if ($pre,$core,$post) = ($s =~ /^(\s*)(.*?)(\s*)$/i); + $s =~ s/\xE2\x80\x89/ /g; # thin space + $standard_abbrev_s =~ s/\./\\\./g; + $s =~ s/[\x01-\x05]//g; + $s = $caller->guard_urls_x045($s); + $s = $caller->guard_xml_tags_x0123($s); + $s = $caller->update_replace_characters_based_on_context($s); + $s =~ s/((?:[a-zA-Z_]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF])\.)([,;]) /$1 $2 /g; + $s =~ s/((?:[a-zA-Z_]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF])\.)(\x04)/$1 $2/g; + if ($bio_p) { + $s =~ s/(\S)((?:wt\/|onc\/)?(?:[-+]|\?+|\xE2\x80[\x93\x94])\/(?:[-+]|\?+|\xE2\x80[\x93\x94]))/$1 $2/g; + $s =~ s/((?:[-+]|\xE2\x80[\x93\x94])\/(?:[-+]|\xE2\x80[\x93\x94]))(\S)/$1 $2/g; + } + print "Point D: $s\n" if $local_verbose; + $s =~ s/(~+)/ $1 /g; + $s =~ s/((?:\xE2\x80\xB9|\xE2\x80\xBA|\xC2\xAB|\xC2\xBB|\xE2\x80\x9E)+)/ $1 /g; # triangular bracket(s) "<" or ">" etc. + $s =~ s/(``)([A-Za-z])/$1 $2/g; # added Nov. 30, 2017 + $s =~ s/((?:<|<)?=+(?:>|>)?)/ $1 /g; # include arrows + $s =~ s/(\\")/ $1 /g; + $s =~ s/([^\\])("+)/$1 $2 /g; + $s =~ s/([^\\])((?:\xE2\x80\x9C)+)/$1 $2 /g; # open " + $s =~ s/([^\\])((?:\xE2\x80\x9D)+)/$1 $2 /g; # close " + $s =~ s/((?:<|<)?-{2,}(?:>|>)?)/ $1 /g; # include arrows + $s =~ s/((?:\xE2\x80\xA6)+)/ $1 /g; # ellipsis + print "Point E: $s\n" if $local_verbose; + foreach $_ ((1..2)) { + # colon + $s =~ s/([.,;])(:+)/$1 \@$2/g; + $s =~ s/(:+)([.,;])/$1 \@\@ $2/g; + # # question mark/exclamation mark blocks + # $s =~ s/([^!?])([!?]+)([^!?])/$1 $2 $3/g; + } + print "Point F: $s\n" if $local_verbose; + $s =~ s/(\?)/ $1 /g; + $s =~ s/(\!)/ $1 /g; + $s =~ s/ +/ /g; + $s =~ s/(\$+|\xC2\xA3|\xE2\x82[\xA0-\xBE])/ $1 /g; # currency signs (Euro sign; British pound sign; Yen sign etc.) + $s =~ s/(\xC2\xA9|\xE2\x84\xA2)/ $1 /g; # copyright/trademark signs + $s =~ s/(\xC2\xB2)([-.,;:!?()])/$1 $2/g; # superscript 2 + $s =~ s/([^ ])( )/$1 $2/g; + $s =~ s/( )([^ ])/$1 $2/g; + $s =~ s/(&#\d+|&#x[0-9A-F]+);/$1_DONTBREAK_;/gi; + $s =~ s/([\@\.]\S*\d)([a-z][A-z])/$1_DONTBREAK_$2/g; # email address, URL + $s =~ s/ ($standard_abbrev_s)\./ $1_DONTBREAK_\./gi; + $s =~ s/ ($standard_abbrev_s) \. (\S)/ $1_DONTBREAK_\. $2/gi; + $s =~ s/\b((?:[A-Za-z]\.){1,3}[A-Za-z])\.\s+/$1_DONTBREAK_\. /g; # e.g. a.m. O.B.E. + $s =~ s/([ ])([A-Z])\. ([A-Z])/$1$2_DONTBREAK_\. $3/; # e.g. George W. Bush + $s =~ s/(\S\.*?[ ])([A-Z])_DONTBREAK_\. (After|All|And|But|Each|Every|He|How|In|It|My|She|So|That|The|Then|There|These|They|This|Those|We|What|When|Which|Who|Why|You)([', ])/$1$2\. $3$4/; # Exceptions to previous line, e.g. "plan B. This" + $s =~ s/\b(degrees C|[Ff]ig\.? \d+ ?[A-Z]|(?:plan|Scud) [A-Z])_DONTBREAK_\./$1\./g; # Exception, e.g. "plan B"; + $s =~ s/([^-_a-z0-9])(art|fig|no|p)((?:_DONTBREAK_)?\.)(\d)/$1$2$3 $4/gi; # Fig.2 No.14 + $s =~ s/([^-_A-Za-z0-9])(\d+(?:\.\d+)?)(?:_DONTBREAK_)?(thousand|million|billion|trillion|min|mol|sec|kg|km|g|m|p)\b/$1$2 $3/g; # 3.4kg 1.7million 49.9p + $s =~ s/([^-_a-z0-9])((?:[1-9]|1[0-2])(?:[.:][0-5]\d)?)(?:_DONTBREAK_)?([ap]m\b|[ap]\.m(?:_DONTBREAK_)?\.)/$1$2 $3/gi; # 3.15pm 12:00p.m. 8am + print "Point H: $s\n" if $local_verbose; + + $s =~ s/(\d)([a-z][A-z])/$1 $2/g; + $s =~ s/(\w|`|'|%|[a-zA-Z]\.|[a-zA-Z]_DONTBREAK_\.)(-|\xE2\x80\x93)(\w|`|')/$1 \@$2\@ $3/g; + $s =~ s/(\w|`|'|%|[a-zA-Z]\.|[a-zA-Z]_DONTBREAK_\.)(-|\xE2\x80\x93)(\w|`|')/$1 \@$2\@ $3/g; + $s =~ s/(\w)- /$1 \@- /g; + $s =~ s/ -(\w)/ -\@ $1/g; + $s =~ s/(\d):(\d)/$1 \@:\@ $2/g; + $s =~ s/(\d)\/(\d)/$1 \@\/\@ $2/g; + $s =~ s/($alphanum)\/([,;:!?])/$1 \@\/\@ $2/g; + $s =~ s/($alphanum)([-+]+)\/($alphanum)/$1$2 \@\/\@ $3/gi; + print "Point I: $s\n" if $local_verbose; + foreach $_ ((1..5)) { + $s =~ s/([ \/()])($alphanum) ?\/ ?($alphanum)([-+ \/().,;])/$1$2 \@\/\@ $3$4/gi; + } + $s =~ s/([a-zA-Z%\/\[\]]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF]|\x05|[a-zA-Z]_DONTBREAK_\.)([,;:!?])\s*(\S)/$1 $2 $3/g; + # asterisk + $s =~ s/( [(\[]?)(\*)([a-z0-9])/$1$2\@ $3/gi; + $s =~ s/([a-z0-9])(\*)([.,;:)\]]* )/$1 \@$2$3/gi; + print "Point J: $s\n" if $local_verbose; + + # Arabic script + if ($s =~ /[\xD8-\xDB]/) { + for (my $i=0; $i <= 1; $i++) { + $s =~ s/([\xD8-\xDB][\x80-\xBF])([,;:!?.\(\)\[\]\/]|\xD8\x8C|\xD8\x9B|\xD8\x9F|\xD9\xAA|\xC2\xAB|\xC2\xBB|\xE2[\x80-\x9F][\x80-\xBF])/$1 $2/gi; # punctuation includes Arabic ,;?% + $s =~ s/([,;:!?.\(\)\[\]\/]|\xD8\x8C|\xD8\x9B|\xD8\x9F|\xD9\xAA|\xC2\xAB|\xC2\xBB|\xE2[\x80-\x9F][\x80-\xBF])([\xD8-\xDB][\x80-\xBF])/$1 $2/gi; + } + } + $s =~ s/(\d|[a-zA-Z]|[\xD8-\xDB][\x80-\xBF])([-])([\xD8-\xDB][\x80-\xBF])/$1 \@$2\@ $3/g; + $s =~ s/(\d|[a-zA-Z])([\xD8-\xDB][\x80-\xBF])/$1 \@\@ $2/g; + print "Point K: $s\n" if $local_verbose; + + # misc. non-ASCII punctuation + $s =~ s/(\xC2[\xA1\xBF]|\xD5\x9D|\xD6\x89|\xD8[\x8C\x9B]|\xD8\x9F|\xD9[\xAA\xAC]|\xDB\x94|\xDC[\x80\x82])/ $1 /g; + $s =~ s/(\xE0\xA5[\xA4\xA5]|\xE0\xBC[\x84-\x86\x8D-\x8F\x91\xBC\xBD])/ $1 /g; + $s =~ s/(\xE1\x81[\x8A\x8B]|\xE1\x8D[\xA2-\xA6])/ $1 /g; + $s =~ s/(\xE1\x81[\x8A\x8B]|\xE1\x8D[\xA2-\xA6]|\xE1\x9F[\x94\x96])/ $1 /g; + $s =~ s/([^0-9])(5\xE2\x80\xB2)(-)([ACGTU])/$1 $2 \@$3\@ $4/g; # 5-prime-DNA-seq. + $s =~ s/([^0-9])([35]\xE2\x80\xB2)/$1 $2 /g; # prime (keep 3-prime/5-prime together for bio domain) + $s =~ s/([^0-9])(\xE2\x80\xB2)/$1 $2 /g; # prime + $s =~ s/(\xE2\x81\x99)/ $1 /g; # five dot punctuation + $s =~ s/(\xE3\x80[\x81\x82\x8A-\x91]|\xE3\x83\xBB|xEF\xB8\xB0|\xEF\xBC\x8C)/ $1 /g; + $s =~ s/(\xEF\xBC[\x81-\x8F\x9A\x9F])/ $1 /g; # CJK fullwidth punctuation (e.g. fullwidth exclamation mark) + print "Point L: $s\n" if $local_verbose; + # spaces + $s =~ s/((?:\xE3\x80\x80)+)/ $1 /g; # idiographic space + $s =~ s/((?:\xE1\x8D\xA1)+)/ $1 /g; # Ethiopic space + + # isolate \xF0 and up from much more normal characters + $s =~ s/([\xF0-\xFE][\x80-\xBF]*)([\x00-\x7F\xC0-\xDF][\x80-\xBF]*)/$1 $2/g; + $s =~ s/([\x00-\x7F\xC0-\xDF][\x80-\xBF]*)([\xF0-\xFE][\x80-\xBF]*)/$1 $2/g; + print "Point M: $s\n" if $local_verbose; + + $s =~ s/( \d+)([,;:!?] )/$1 $2/g; + $s =~ s/ ([,;()\[\]])([a-zA-Z0-9.,;])/ $1 $2/g; + $s =~ s/(\)+)([-\/])([a-zA-Z0-9])/$1 $2 $3/g; + $s =~ s/([0-9\*\[\]()]|\xE2\x80\xB2)([.,;:] )/$1 $2/g; + $s =~ s/([a-zA-Z%]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF]|\x05)([,;:.!?])([")]|''|\xE2\x80[\x99\x9D]|)(\s)/$1 $2 $3$4/g; + $s =~ s/([a-zA-Z%]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF]|\x05)([,;:.!?])([")]|''|\xE2\x80[\x99\x9D]|)\s*$/$1 $2 $3/g; + $s =~ s/([.,;:]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF]|\x05)('|\xE2\x80[\x99\x9D])/$1 $2/g; + $s =~ s/('|\xE2\x80[\x99\x9D])([.,;:]|\x04)/$1 $2/g; + $s =~ s/([(){}\[\]]|\xC2\xB1)/ $1 /g; + $s =~ s/([a-zA-Z0-9]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF]|\x05)\.\s*$/$1 ./g; + $s =~ s/([a-zA-Z]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF]|\x05)\.\s+/$1 . /g; + $s =~ s/([a-zA-Z]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF]|\x05)\.(\x04)/$1 . $2/g; + $s =~ s/([0-9]),\s+(\S)/$1 , $2/g; + $s =~ s/([a-zA-Z])(\$)/$1 $2/g; + $s =~ s/(\$|[~<=>]|\xC2\xB1|\xE2\x89[\xA4\xA5]|\xE2\xA9[\xBD\xBE])(\d)/$1 $2/g; + $s =~ s/(RMB)(\d)/$1 $2/g; + print "Point N: $s\n" if $local_verbose; + foreach $_ ((1..2)) { + $s =~ s/([ '"]|\xE2\x80\x9C)(are|could|did|do|does|had|has|have|is|should|was|were|would)(n't|n\xE2\x80\x99t)([ '"]|\xE2\x80\x9D)/$1 $2 $3 $4/gi; + $s =~ s/ (can)(not) / $1 $2 /gi; + $s =~ s/ (ca)\s*(n)('t|\xE2\x80\x99t) / $1$2 $2$3 /gi; + $s =~ s/ ([Ww])o\s*n('|\xE2\x80\x99)t / $1ill n$2t /g; + $s =~ s/ WO\s*N('|\xE2\x80\x99)T / WILL N$1T /g; + $s =~ s/ ([Ss])ha\s*n('|\xE2\x80\x99)t / $1hall n$2t /g; + $s =~ s/ SHAN('|\xE2\x80\x99)T / SHALL N$1T /g; + # $s =~ s/ ain('|\xE2\x80\x99)t / is n$1t /g; + # $s =~ s/ Ain('|\xE2\x80\x99)t / Is n$1t /g; + # $s =~ s/ AIN('|\xE2\x80\x99)T / IS N$1T /g; + } + print "Point O: $s\n" if $local_verbose; + $s =~ s/(\d)%/$1 %/g; + $s =~ s/ '(d|ll|m|re|s|ve|em) / '_DONTBREAK_$1 /g; # 'd = would; 'll = will; 'em = them + $s =~ s/ \xE2\x80\x99t(d|ll|m|re|s|ve) / \xE2\x80\x99t_DONTBREAK_$1 /g; + $s =~ s/([^0-9a-z'.])('|\xE2\x80\x98)([0-9a-z])/$1$2 $3/gi; + $s =~ s/([0-9a-z])(\.(?:'|\xE2\x80\x99))([^0-9a-z']|\xE2\x80\x99)/$1 $2$3/gi; + $s =~ s/([0-9a-z]_?\.?)((?:'|\xE2\x80\x99)(?:d|ll|m|re|s|ve|))([^0-9a-z'])/$1 $2$3/gi; + $s =~ s/([("]|\xE2\x80\x9C|'')(\w)/$1 $2/g; + print "Point P: $s\n" if $local_verbose; + $s =~ s/(\w|[.,;:?!])([")]|''|\xE2\x80\x9D)/$1 $2/g; + $s =~ s/ ([,;()\[\]])([a-zA-Z0-9.,;])/ $1 $2/g; + $s =~ s/([a-z0-9]) ?(\()([-+_ a-z0-9\/]+)(\))/$1 $2 $3 $4 /ig; + $s =~ s/([a-z0-9]) ?(\[)([-+_ a-z0-9\/]+)(\])/$1 $2 $3 $4 /ig; + $s =~ s/([a-z0-9]) ?(\{)([-+_ a-z0-9\/]+)(\})/$1 $2 $3 $4 /ig; + $s =~ s/([%])-(\d+(?:\.\d+)? ?%)/$1 \@-\@ $2/g; + $s =~ s/( )(art|No)_DONTBREAK_(\.{2,})/$1 $2$3/gi; + $s =~ s/(_DONTBREAK_\.)(\.{1,})/$1 $2/g; + print "Point Q: $s\n" if $local_verbose; + foreach $_ ((1 .. 2)) { + $s =~ s/(\s(?:[-a-z0-9()']|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF])*)(\.{2,})((?:[-a-z0-9()?!:\/']|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF])*\s|(?:[-a-z0-9()'\/]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF])+\.\s)/$1 $2 $3/gi; + } + $s =~ s/0s\b/0 s/g; + $s =~ s/([0-9])(\x04)/$1 $2/g; + $s =~ s/ +/ /g; + print "Point R: $s\n" if $local_verbose; + + if ($bio_p) { + foreach $_ ((1 .. 2)) { + $s =~ s/([a-z]) \@(-|\xE2\x80[\x93\x94])\@ (\d+(?:$alpha)?\d*\+?)([- \/])/$1$2$3$4/ig; + $s =~ s/([a-z]) \@(-|\xE2\x80[\x93\x94])\@ ((?:alpha|beta|kappa)\d+)([- \/])/$1$2$3$4/ig; + $s =~ s/([a-z]) \@(-|\xE2\x80[\x93\x94])\@ ((?:a|b|h|k)\d)([- \/])/$1$2$3$4/ig; + $s =~ s/([a-z0-9]) \@(-|\xE2\x80[\x93\x94])\@ ([a-z])([- \/])/$1$2$3$4/ig; + $s =~ s/([- \/])(\d*[a-z]) \@(-|\xE2\x80[\x93\x94])\@ ([a-z0-9])/$1$2$3$4/ig; + } + # mutation indicators such -/- etc. + $s =~ s/(\?\/) +(\?)/$1$2/g; + $s =~ s/([^ ?])((?:wt\/|onc\/)?(?:[-+]|\?+|\xE2\x80[\x93\x94])\/(?:[-+]|\?+|\xE2\x80[\x93\x94]))/$1 $2/g; + $s =~ s/((?:[-+]|\xE2\x80[\x93\x94])\/(?:[-+]|\xE2\x80[\x93\x94]))(\S)/$1 $2/g; + + # Erk1/2 + $rest = $s; + $s = ""; + while (($pre, $stem, $slashed_number_s, $post) = ($rest =~ /^(.*?[^-_a-z0-9])([a-z][-_a-z]*)(\d+(?:(?: \@)?\/(?:\@ )?(?:\d+))+)([^-+a-z0-9].*|)$/i)) { + if ((($pre =~ /\x04[^\x05]*$/) && ($post =~ /^[^\x04]*\x05/)) + || ($stem =~ /^(mid|pre|post|sub|to)$/i)) { + $s .= "$pre$stem$slashed_number_s"; + } else { + $s .= $pre; + my @slashed_numbers = split(/(?: \@)?\/(?:\@ )?/, $slashed_number_s); + foreach $i ((0 .. $#slashed_numbers)) { + my $number = $slashed_numbers[$i]; + $s .= "$stem$number"; + $s .= " @\/@ " unless $i == $#slashed_numbers; + } + } + $rest = $post; + } + $s .= $rest; + + # Erk-1/-2 + while (($pre, $stem, $dash1, $number1, $dash2, $number2, $post) = ($s =~ /^(.*[^-_a-z0-9])([a-z][-_a-z]*)(?: \@)?(-|\xE2\x80[\x93\x94])(?:\@ )?(\d+)(?: \@)?\/(?:\@ )?(?:\@ )?(-|\xE2\x80[\x93\x94])(?:\@ )?(\d+)([^-+a-z0-9].*|)$/i)) { + $s = "$pre$stem$dash1$number1 \@\/\@ $stem$dash2$number2$post"; + } + $rest = $s; + $s = ""; + # IFN-a/b (Slac2-a/b/c) + while (($pre, $stem, $dash, $slashed_letter_s, $post) = ($rest =~ /^(.*[^-_a-z0-9])([a-z][-_a-z0-9]*)(-|\xE2\x80[\x93\x94])([a-z](?:(?: \@)?\/(?:\@ )?(?:[a-z]))+)([^-+a-z0-9].*|)$/i)) { + if (($pre =~ /\x04[^\x05]*$/) && ($post =~ /^[^\x04]*\x05/)) { + $s .= "$pre$stem$dash1$number1$dash2$number2"; + } else { + $s .= $pre; + my @slashed_letters = split(/(?: \@)?\/(?:\@ )?/, $slashed_letter_s); + foreach $i ((0 .. $#slashed_letters)) { + my $letter = $slashed_letters[$i]; + $s .= "$stem$dash$letter"; + $s .= " @\/@ " unless $i == $#slashed_letters; + } + } + $rest = $post; + } + $s .= $rest; + + # SPLIT X-induced + my $rest = $s; + my $new_s = ""; + while (($pre, $dash, $right, $post) = ($rest =~ /^(.*?)(-|\xE2\x80[\x93\x94])([a-z]+)( .*|)$/i)) { + $new_s .= $pre; + if (($right eq "I") && ($pre =~ / [a-zA-Z][a-z]*$/)) { + # compatriots-I have a dream + $new_s .= " \@" . $dash . "\@ "; + } elsif ($ht{LC_SPLIT_DASH_X}->{($caller->normalize_punctuation(lc $right))}) { + $new_s .= " \@" . $dash . "\@ "; + } else { + $new_s .= $dash; + } + $new_s .= $right; + $rest = $post; + } + $new_s .= $rest; + $s = $new_s; + + # SPLIT ubiquinated-X + $rest = $s; + $new_s = ""; + while (($pre, $left, $dash, $post) = ($rest =~ /^(.*? |)([a-z0-9]+|'s)(-|\xE2\x80[\x93\x94])([a-z0-9].*)$/i)) { + $new_s .= "$pre$left"; + if ($ht{LC_SPLIT_X_DASH}->{($caller->normalize_punctuation(lc $left))}) { + $new_s .= " \@" . $dash . "\@ "; + } else { + $new_s .= $dash; + } + $rest = $post; + } + $new_s .= $rest; + $s = $new_s; + + # SPLIT low-frequency + $rest = $s; + $new_s = ""; + if (($pre, $left, $dash, $right, $post) = ($rest =~ /^(.*?[- ]|)([a-z]+)([-\/]|\xE2\x80[\x93\x94])([a-z]+)([- ].*|)$/i)) { + } + while (($pre, $left, $dash, $right, $post) = ($rest =~ /^(.*?[-\/ ]|)([a-z]+)((?: \@)?(?:[-\/]|\xE2\x80[\x93\x94])(?:\@ )?)([a-z]+)([-\/ ].*|)$/i)) { + $x = $caller->normalize_punctuation(lc ($left . $dash . $right)); + if ($ht{LC_SPLIT}->{($caller->normalize_punctuation(lc ($left . $dash . $right)))}) { + $pre =~ s/([-\/])$/ \@$1\@ /; + $post =~ s/^([-\/])/ \@$1\@ /; + $dash = $caller->normalize_punctuation($dash); + $new_s .= "$pre$left"; + $new_s .= " \@" . $dash . "\@ "; + $new_s .= $right; + $rest = $post; + } elsif ($pre =~ /[-\/]$/) { + $new_s .= $pre; + $rest = "$left$dash$right$post"; + } else { + $new_s .= "$pre$left"; + $rest = "$dash$right$post"; + } + } + $new_s .= $rest; + $s = $new_s; + + # DO-NOT-SPLIT X-ras + $rest = $s; + $new_s = ""; + while (($pre, $dash, $right, $post) = ($rest =~ /^(.*?) \@(-|\xE2\x80[\x93\x94])\@ ([a-z0-9]+)( .*|)$/i)) { + $new_s .= $pre; + if ($ht{LC_DO_NOT_SPLIT_DASH_X}->{($caller->normalize_punctuation(lc $right))}) { + $new_s .= $dash; + } else { + $new_s .= " \@" . $dash . "\@ "; + } + $new_s .= $right; + $rest = $post; + } + $new_s .= $rest; + $s = $new_s; + + # DO-NOT-SPLIT Caco-X + $rest = $s; + $new_s = ""; + while (($pre, $left, $dash, $post) = ($rest =~ /^(.*? |)([a-z0-9]+) \@([-\/]|\xE2\x80[\x93\x94]])\@ ([a-z0-9].*)$/i)) { + $new_s .= "$pre$left"; + if ($ht{LC_DO_NOT_SPLIT_X_DASH}->{($caller->normalize_punctuation(lc $left))}) { + $new_s .= $dash; + } else { + $new_s .= " \@" . $dash . "\@ "; + } + $rest = $post; + } + $new_s .= $rest; + $s = $new_s; + + # DO-NOT-SPLIT down-modulate (2 elements) + $rest = $s; + $new_s = ""; + while (($pre, $left, $dash, $right, $post) = ($rest =~ /^(.*? |)([a-z0-9]+) \@([-\/]|\xE2\x80[\x93\x94]])\@ ([a-z0-9]+)( .*|)$/i)) { + $new_s .= "$pre$left"; + if ($ht{LC_DO_NOT_SPLIT}->{($caller->normalize_punctuation(lc ($left . $dash . $right)))}) { + $new_s .= $dash; + } else { + $new_s .= " \@" . $dash . "\@ "; + } + $new_s .= $right; + $rest = $post; + } + $new_s .= $rest; + $s = $new_s; + + # DO-NOT-SPLIT 14-3-3 (3 elements) + $rest = $s; + $new_s = ""; + while (($pre, $left, $dash_group1, $dash1, $middle, $dash_group2, $dash2, $right, $post) = ($rest =~ /^(.*? |)([a-z0-9]+)((?: \@)?([-\/]|\xE2\x80[\x93\x94]])(?:\@ )?)([a-z0-9]+)((?: \@)?([-\/]|\xE2\x80[\x93\x94]])(?:\@ )?)([a-z0-9]+)( .*|)$/i)) { + $new_s .= "$pre$left"; + if ($ht{LC_DO_NOT_SPLIT}->{($caller->normalize_punctuation(lc ($left . $dash1 . $middle . $dash2 . $right)))}) { + $new_s .= $dash1; + } else { + $new_s .= $dash_group1; + } + $new_s .= $middle; + if ($ht{LC_DO_NOT_SPLIT}->{($caller->normalize_punctuation(lc ($left . $dash1 . $middle . $dash2 . $right)))}) { + $new_s .= $dash2; + } else { + $new_s .= $dash_group2; + } + $new_s .= $right; + $rest = $post; + } + $new_s .= $rest; + $s = $new_s; + + $s =~ s/ +/ /g; + } + print "Point S: $s\n" if $local_verbose; + + $s =~ s/_DONTBREAK_//g; + $s =~ s/( )(ark|ill|mass|miss|wash|GA|LA|MO|OP|PA|VA|VT)(\.)( )/$1$2 $3$4/g; + print "Point T: $s\n" if $local_verbose; + $s = $caller->restore_urls_x045_guarded_string($s); + $s = $caller->restore_xml_tags_x0123_guarded_string($s); + print "Point U: $s\n" if $local_verbose; + $s =~ s/(https?|ftp)\s*(:)\s*(\/\/)/$1$2$3/gi; + $s =~ s/\b(mailto)\s*(:)\s*([a-z])/$1$2$3/gi; + $s =~ s/(\d)\s*(:)\s*([0-5]\d[^0-9])/$1$2$3/gi; + print "Point V: $s\n" if $local_verbose; + $s =~ s/(5\xE2\x80\xB2-[ACGT]+)\s*(-|\xE2\x80[\x93\x94])\s*(3\xE2\x80\xB2)/$1$2$3/g; # repair broken DNA sequence + $s =~ s/ (etc) \. / $1. /g; # repair most egrareous separations + print "Point W: $s\n" if $local_verbose; + $s = $caller->repair_separated_periods($s); + print "Point X: $s\n" if $local_verbose; + $s =~ s/^\s+//; + $s =~ s/\s+$//; + $s = "$pre$s$post" if defined($pre) && defined($post); + $s =~ s/ +/ /g; + print "Point Y: $s\n" if $local_verbose; + + return $s; +} + +sub tokenize_plus_for_noisy_text { + local($caller, $s, *ht, $control) = @_; + + $control = "" unless defined($control); + my $pre; + my $code; + my $post; + $s = " $core " if ($pre,$core,$post) = ($s =~ /^(\s*)(.*?)(\s*)$/i); + foreach $i ((1 .. 2)) { + $s =~ s/ ([A-Z][a-z]+'?[a-z]+)(-) / $1 $2 /gi; # Example: Beijing- + $s =~ s/ (\d+(?:\.\d+)?)(-|:-|:|_|\.|'|;)([A-Z][a-z]+'?[a-z]+|[A-Z]{3,}) / $1 $2 $3 /gi; # Example: 3:-Maxkamado + $s =~ s/ (\d+(?:\.\d+)?)(')([A-Za-z]{3,}) / $1 $2 $3 /gi; # Example: 42'daqiiqo + $s =~ s/ (-|:-|:|_|\.)([A-Z][a-z]+'?[a-z]+|[A-Z]{3,}) / $1 $2 /gi; # Example: -Xassan + $s =~ s/ ((?:[A-Z]\.[A-Z]|[A-Z]|Amb|Col|Dr|Eng|Gen|Inj|Lt|Maj|Md|Miss|Mr|Mrs|Ms|Pres|Prof|Sen)\.)([A-Z][a-z]+|[A-Z]{2,}) / $1 $2 /gi; # Example: Dr.Smith + $s =~ s/ (\d+)(,)([a-z]{3,}) / $1 $2 $3 /gi; # Example: 24,October + $s =~ s/ (%)(\d+(?:\.\d+)?) / $1 $2 /gi; # Example: %0.6 + $s =~ s/ ([A-Za-z][a-z]{3,}\d*)([.,\/]|:\()([A-Za-z][a-z]{3,}|[A-Z]{3,}) / $1 $2 $3 /gi; # Example: Windows8,falanqeeyaal + $s =~ s/ ([A-Za-z]{3,}\d*?|[A-Za-z]+'[A-Za-z]+)([,\/]|:\()([A-Za-z]{3,}|[A-Za-z]+'[A-Za-z]+) / $1 $2 $3 /gi; # Example: GAROOWE:(SHL + $s =~ s/ (\d[0-9.,]*\d)(;)([a-z]+) / $1 $2 $3 /gi; # Example: 2.1.2014;Waraka + } + $s =~ s/^\s+//; + $s =~ s/\s+$//; + $s = "$pre$s$post" if defined($pre) && defined($post); + return $s; +} + +# preparation for sub repair_separated_periods: + +my $abbrev_s = "etc.|e.g.|i.e.|U.K.|S.p.A.|A.F.P."; +my @abbrevs = split(/\|/, $abbrev_s); +my @exp_abbrevs = (); +foreach $abbrev (@abbrevs) { + if (($core,$period) = ($abbrev =~ /^(.*?)(\.|)$/)) { + $core =~ s/\./\\s*\\.\\s*/g; + $abbrev = $core; + $abbrev .= "\\b" if $abbrev =~ /[a-z]$/i; # don't split etcetera -> etc. etera + $abbrev .= "(?:\\s*\\.|)" if $period; + push(@exp_abbrevs, $abbrev); + } +} +my $exp_abbrev_s = join("|", @exp_abbrevs); + +sub repair_separated_periods { + local($caller,$s) = @_; + + # separated or missing period + my $result = ""; + while (($pre, $abbrev, $post) = ($s =~ /^(.*? |)($exp_abbrev_s)(.*)$/)) { + $abbrev =~ s/ //g; + $abbrev .= "." unless $abbrev =~ /\.$/; + $result .= "$pre$abbrev "; + $s = $post; + } + $result .= $s; + $result =~ s/ +/ /g; + return $result; +} + +# provided by Alex Fraser +sub fix_tokenize { + local($caller,$s) = @_; + + ## change "2:15" to "2 @:@ 15" + $s =~ s/(\d)\:(\d)/$1 \@:\@ $2/g; + + ## strip leading zeros from numbers + $s =~ s/(^|\s)0+(\d)/$1$2/g; + + ## fix rule typo + $s =~ s/associatedpress/associated press/g; + + ## fix _ entities + $s =~ s/hong_kong/hong kong/g; + $s =~ s/united_states/united states/g; + + return $s; +} + +sub de_mt_tokenize { + local($caller,$s) = @_; + + $s =~ s/\s+\@([-:\/])/$1/g; + $s =~ s/([-:\/])\@\s+/$1/g; + $s =~ s/\s+\/\s+/\//g; + return $s; +} + +sub surface_forms { + local($caller,$pe,$modp) = @_; + + $sem = $pe->sem; + $surf = $pe->surf; + $synt = $pe->synt; + $value = $pe->value; + $gloss = $pe->gloss; +# $util->log("surface_forms surf:$surf sem:$sem gloss:$gloss value:$value", $logfile); + if ($sem eq "integer") { + return ($gloss) if ($gloss =~ /several/) && !($value =~ /\S/); + print STDERR "Warning: $value not an integer\n" unless $value =~ /^\d+(e\+\d+)?$/; + if ($pe->get("reliable") =~ /sequence of digits/) { + $english = $value; + $english = "$prefix $english" if $prefix = $pe->get("prefix"); + @result = ($english); + } else { + @result = $caller->q_number_surface_forms($pe); + } + } elsif ($sem eq "decimal number") { + @result = $caller->q_number_surface_forms($pe); + } elsif ($sem =~ /(integer|decimal number) range/) { + @result = $caller->number_range_surface_forms($pe); + } elsif ($sem eq "ordinal") { + if ($pe->get("definite")) { + $exclude_adverbials_p = 1; + } elsif (defined($chinesePM) && ($hao = $chinesePM->e2c("hao-day")) + && ($gc = $chinesePM->e2c("generic counter"))) { + $exclude_adverbials_p = ($surf =~ /($hao|$gc)$/); + } else { + $exclude_adverbials_p = 1; + } + @result = $caller->ordinal_surface_forms($pe->get("ordvalue") || $pe->value,0,$exclude_adverbials_p, $pe); + } elsif ($sem eq "fraction") { + @result = $caller->fraction_surface_forms($pe,$modp); + } elsif ($sem =~ /monetary quantity/) { + @result = $caller->currency_surface_forms($pe); + } elsif ($sem =~ /occurrence quantity/) { + @result = $caller->occurrence_surface_forms($pe,$modp); + } elsif ($sem =~ /score quantity/) { + @result = $caller->score_surface_forms($pe); + } elsif ($sem =~ /age quantity/) { + @result = $caller->age_surface_forms($pe, $modp); + } elsif ($sem =~ /quantity/) { + @result = $caller->quantity_surface_forms($pe,$modp); + } elsif ($sem eq "percentage") { + @result = $caller->percent_surface_forms($pe,$modp); + } elsif ($sem eq "percentage range") { + if ($gloss =~ /^and /) { + @result = ($gloss); + } else { + @result = ($gloss, "by $gloss", "of $gloss"); + } + } elsif ($sem =~ /^(month of the year|month\+year|year)$/) { + if ($synt eq "pp") { + @result = ($gloss); + } elsif ($gloss =~ /^the (beginning|end) of/) { + @result = ($gloss, "at $gloss"); + } elsif ($gloss =~ /^(last|this|current|next)/) { + @result = ($gloss); + } else { + # in November; in mid-November + @result = ($gloss, "in $gloss"); + } + } elsif ($sem =~ /date(\+year)?$/) { + @result = $caller->date_surface_forms($pe,$modp); + } elsif ($sem =~ /year range\b.*\b(decade|century)$/) { + @result = $caller->decade_century_surface_forms($pe); + } elsif ($sem eq "day of the month") { + @result = $caller->day_of_the_month_surface_forms($pe); + } elsif ($sem =~ /period of the day\+day of the week/) { + @result = ($gloss); + push(@result, "on $gloss") if $gloss =~ /^the night/; + } elsif ($sem =~ /day of the week/) { + @result = $caller->day_of_the_week_surface_forms($pe); + } elsif ($sem =~ /^(time)$/) { + if ($gloss =~ /^at /) { + @result = ($gloss); + } else { + @result = ($gloss, "at $gloss"); + } + } elsif ($sem =~ /^date range$/) { + if ($synt eq "pp") { + @result = ($gloss); + } elsif ($pe->get("between")) { + $b_gloss = "between $gloss"; + $b_gloss =~ s/-/ and /; + @result = ($b_gloss, $gloss, "from $gloss"); + } else { + @result = ($gloss, "from $gloss"); + } + } elsif ($sem =~ /^date enumeration$/) { + if ($synt eq "pp") { + @result = ($gloss); + } else { + @result = ($gloss, "on $gloss"); + } + } elsif ($pe->get("unknown-in-pc")) { + @result = (); + foreach $unknown_pos_en (split(/;;/, $pe->get("unknown-pos-en-list"))) { + ($engl) = ($unknown_pos_en =~ /^[^:]+:[^:]+:(.*)$/); + push(@result, $engl) if defined($engl) && ! $util->member($engl, @result); + } + @result = ($gloss) unless @result; + } elsif (($sem =~ /\b(name|unknown)\b/) && (($en_s = $pe->get("english")) =~ /[a-z]/i)) { + @result = split(/\s*\|\s*/, $en_s); + } elsif (($sem =~ /^proper\b/) && (($en_s = $pe->get("english")) =~ /[a-z]/i)) { + @result = split(/\s*\|\s*/, $en_s); + } else { + @result = ($gloss); + } + + if (($sem =~ /^(date\+year|month\+year|year)$/) + && ($year = $pe->get("year")) + && ($year =~ /^\d\d$/) + && (@extend_years = @{$english_entity_style_ht{"ExtendYears"}}) + && ($#extend_years == 1) + && ($extended_year_start = $extend_years[0]) + && ($extended_year_end = $extend_years[1]) + && ($extended_year_start <= $extended_year_end) + && ($extended_year_start + 99 >= $extended_year_end) + && ($extended_year_start =~ /^\d\d\d\d$/) + && ($extended_year_end =~ /^\d\d\d\d$/)) { + $century1 = substr($extended_year_start, 0, 2); + $century2 = substr($extended_year_end, 0, 2); + $exp_year1 = "$century1$year"; + $exp_year2 = "$century2$year"; + if (($extended_year_start <= $exp_year1) && ($exp_year1 <= $extended_year_end)) { + $exp_year = $exp_year1; + } elsif (($extended_year_start <= $exp_year2) && ($exp_year2 <= $extended_year_end)) { + $exp_year = $exp_year2; + } else { + $exp_year = ""; + } + if ($exp_year) { + @new_glosses = (); + foreach $old_gloss (@result) { + $new_gloss = $old_gloss; + $new_gloss =~ s/\b$year$/$exp_year/; + push (@new_glosses, $new_gloss) unless $new_gloss eq $old_gloss; + } + push (@result, @new_glosses); + } + } + + # tokenize as requested + @tokenize_list = @{$english_entity_style_ht{"Tokenize"}}; + $tokenize_p = 1 if $util->member("yes", @tokenize_list) + || $util->member("all", @tokenize_list); + $dont_tokenize_p = 1 if $util->member("no", @tokenize_list) + || $util->member("all", @tokenize_list); + if ($tokenize_p) { + @new_result = (); + foreach $item (@result) { + $t_item = $caller->tokenize($item, *dummy_ht); + push(@new_result, $item) if $dont_tokenize_p && ($item ne $t_item); + push(@new_result, $t_item); + } + @result = @new_result; + } + + # case as requested + @case_list = @{$english_entity_style_ht{"Case"}}; + $lower_case_p = $util->member("lower", @case_list) + || $util->member("all", @case_list); + $reg_case_p = $util->member("regular", @case_list) + || $util->member("all", @case_list); + if ($lower_case_p) { + @new_result = (); + foreach $item (@result) { + $l_item = "\L$item"; + push(@new_result, $item) if $reg_case_p && ($item ne $l_item); + push(@new_result, $l_item) unless $util->member($l_item, @new_result); + } + @result = @new_result; + } + # $value = "n/a" unless $value; + # print STDERR "SF surf:$surf sem:$sem gloss:$gloss value:$value Result(s): " . join("; ", @result) . "\n"; + return @result; +} + +sub case_list { + return @{$english_entity_style_ht{"Case"}}; +} + +sub right_cased_list { + local($caller, $word) = @_; + + @case_list = @{$english_entity_style_ht{"Case"}}; + + @right_cased_core_list = (); + push(@right_cased_core_list, $word) + if ($util->member("regular", @case_list) || $util->member("all", @case_list)) + && ! $util->member($word, @right_cased_core_list); + push(@right_cased_core_list, lc $word) + if ($util->member("lower", @case_list) || $util->member("all", @case_list)) + && ! $util->member(lc $word, @right_cased_core_list); + + return @right_cased_core_list; +} + +sub string2surf_forms { + local($caller, $text, $lang, $alt_sep) = @_; + + $alt_sep = " | " unless defined($alt_sep); + $lang = "zh" unless defined($lang); + + if ($lang eq "zh") { + @pes = $chinesePM->parse_entities_in_string($text); + $n = $#pes + 1; +# print " $n pes\n"; + @pes = $chinesePM->select_reliable_entities(@pes); + my @res_surf_forms_copy = $caller->reliable_pes2surf_forms($alt_sep, @pes); + return @res_surf_forms_copy; + } else { + return (); + } +} + +sub reliable_pe2surf_forms { + local($caller, $pe, $parent_reliant_suffices_p) = @_; + + $parent_reliant_suffices_p = 0 unless defined($parent_reliant_suffices_p); + if ((defined($r = $pe->get("reliable")) && $r) + || ($parent_reliant_suffices_p && ($parent_pe = $pe->get("parent")) && + $parent_pe->get("reliable"))) { + @surf_forms = $caller->surface_forms($pe); + if ((($pe->sem =~ /quantity( range)?$/) && !($pe->sem =~ /monetary quantity/)) + || ($util->member($pe->sem, "percentage","fraction"))) { + foreach $mod_form ($caller->surface_forms($pe, 1)) { + push(@surf_forms, $mod_form) unless $util->member($mod_form, @surf_forms); + } + } + return @surf_forms; + } + return (); +} + +sub reliable_pe2surf_form { + local($caller, $alt_sep, $pe) = @_; + + if (@surf_forms = $caller->reliable_pe2surf_forms($pe)) { + return $pe->surf . " == " . join($alt_sep, @surf_forms); + } else { + return ""; + } +} + +sub reliable_pes2surf_forms { + local($caller, $alt_sep, @pes) = @_; + + my @res_surf_forms = (); + foreach $pe (@pes) { + if ($new_surf_form = $caller->reliable_pe2surf_form($alt_sep, $pe)) { + push(@res_surf_forms, $new_surf_form); + } + } + return @res_surf_forms; +} + +sub string_contains_ascii_letter { + local($caller,$string) = @_; + return $string =~ /[a-zA-Z]/; +} + +sub string_starts_w_ascii_letter { + local($caller,$string) = @_; + return $string =~ /^[a-zA-Z]/; +} + +sub en_lex_bin { + local($caller, $word) = @_; + + $word =~ s/\s+//g; + $word =~ s/[-_'\/]//g; + $word =~ tr/A-Z/a-z/; + return "digit" if $word =~ /^\d/; + return "special" unless $word =~ /^[a-z]/; + return substr($word, 0, 1); +} + +sub skeleton_bin { + local($caller, $sk_bin_control, $word) = @_; + + $word =~ s/\s+//g; + $word =~ s/[-_'\/]//g; + $word =~ tr/A-Z/a-z/; + return "E" unless $word; + if ($sk_bin_control =~ /^v1/i) { + return $word if length($word) <= 2; + return substr($word, 0, 3) if $word =~ /^(b|f[lnrt]|gr|j[nr]|k|l[nt]|m|n[kmst]|r[knst]|s|t)/; + return substr($word, 0, 2); + } elsif ($sk_bin_control =~ /d6f$/) { + return $word if length($word) <= 6; + return substr($word, 0, 6); + } elsif ($sk_bin_control =~ /d5f$/) { + return $word if length($word) <= 5; + return substr($word, 0, 5); + } elsif ($sk_bin_control =~ /d4f$/) { + return $word if length($word) <= 4; + return substr($word, 0, 4); + } else { + return $word if length($word) <= 4; + return substr($word, 0, 5) if $word =~ /^(bnts|brnt|brst|brtk|brtn|brts|frst|frts|klts|kntr|knts|krst|krtn|krts|ksks|kstr|lktr|ntrs|sbrt|skrt|sntr|strn|strt|trns|trts|ts)/; + return substr($word, 0, 4); + } +} + +sub skeleton_bin_sub_dir { + local($caller, $sk_bin_control, $skeleton_bin) = @_; + + $sk_bin_control = "v1" unless defined($sk_bin_control); + return "" if $sk_bin_control =~ /^v1/i; + if ($sk_bin_control =~ /^2d4d\df$/) { + return "SH/SHOR" if (length($skeleton_bin) < 2); + return substr($skeleton_bin, 0, 2) . "/" . substr($skeleton_bin, 0, 2) . "SH" if (length($skeleton_bin) < 4); + return substr($skeleton_bin, 0, 2) . "/" . substr($skeleton_bin, 0, 4); + } elsif ($sk_bin_control =~ /^2d3d\df$/) { + return "SH/SHO" if (length($skeleton_bin) < 2); + return substr($skeleton_bin, 0, 2) . "/" . substr($skeleton_bin, 0, 2) . "S" if (length($skeleton_bin) < 3); + return substr($skeleton_bin, 0, 2) . "/" . substr($skeleton_bin, 0, 3); + } + $bin3 = "ts"; + return "SH" if (length($skeleton_bin) < 2) || ($skeleton_bin =~ /^($bin3)$/); + return substr($skeleton_bin, 0, 3) if $skeleton_bin =~ /^($bin3)/; + return substr($skeleton_bin, 0, 2); +} + +sub en_words_and_counts_matching_skeletons { + local($caller, $sk_bin_version, @skeletons) = @_; + + return () unless @skeletons; + + @rem_skeletons = sort @skeletons; + $previous_skeleton = ""; + $current_skeleton = shift @rem_skeletons; + @list = ($current_skeleton); + @lists = (); + + $current_bin = ""; + while ($current_skeleton) { + unless ($current_skeleton eq $previous_skeleton) { + $current_skeleton_bin = $caller->skeleton_bin($sk_bin_version, $current_skeleton); + unless ($current_skeleton_bin eq $current_bin) { + # need to read from new file + close(IN) if $current_bin; + $current_bin = $current_skeleton_bin; + $current_bin_subdir + = $caller->skeleton_bin_sub_dir($sk_bin_version, $current_bin); + if ($current_bin_subdir) { + $en_skeleton_file = File::Spec->catfile($english_resources_skeleton_dir, + $current_bin_subdir, + "$current_bin.txt"); + } else { + $en_skeleton_file = File::Spec->catfile($english_resources_skeleton_dir, + "$current_bin.txt"); + } + # print STDERR " Perusing $en_skeleton_file ...\n"; + if (open(IN, $en_skeleton_file)) { + $en_skeleton_file_exists = 1; + } else { + $en_skeleton_file_exists = 0; + print STDERR "Can't open $en_skeleton_file (Point A)\n"; + } + } + $previous_skeleton = $current_skeleton; + } + $_ = if $en_skeleton_file_exists; + unless ($en_skeleton_file_exists && defined($_)) { + push(@lists, join(' ; ', @list)); + if (@rem_skeletons) { + $current_skeleton = shift @rem_skeletons; + @list = ($current_skeleton); + } else { + $current_skeleton = ""; + } + next; + } + ($skeleton) = ($_ =~ /^(\S+)\t/); + next unless defined($skeleton); + $skeletons_match_p = $caller->skeletons_match_p($skeleton, $current_skeleton); + next if ($skeleton lt $current_skeleton) && ! $skeletons_match_p; + if ($skeletons_match_p) { + ($token, $count) = ($_ =~ /^\S+\t(\S|\S[-' a-zA-Z]*\S)\t(\d+)\s*$/); + push(@list, "$token : $count") if defined($token) && defined($count); + } else { + while ($current_skeleton lt $skeleton) { + push(@lists, join(' ; ', @list)); + unless (@rem_skeletons) { + close(IN) if $current_bin; + $current_skeleton = ""; + last; + } + $current_skeleton = shift @rem_skeletons; + @list = ($current_skeleton); + } + if ($caller->skeletons_match_p($skeleton, $current_skeleton)) { + ($token, $count) = ($_ =~ /^\S+\t(\S|\S[-' a-zA-Z]*\S)\t(\d+)\s*$/); + push(@list, "$token : $count") if defined($token) && defined($count); + } + } + } + close(IN) if $current_bin; + return @lists; +} + +sub skeletons_match_p { +# one of the skeletons might have been cut off at max + local($caller, $skeleton1, $skeleton2, $max) = @_; + + return 1 if $skeleton1 eq $skeleton2; + + $max = 5 unless defined($max); + if ((length($skeleton1) > length($skeleton2)) && (length($skeleton2) == $max)) { + return ($skeleton1 =~ /^$skeleton2/) ? 1 : 0; + } elsif ((length($skeleton2) > length($skeleton1)) && (length($skeleton1) == $max)) { + return ($skeleton2 =~ /^$skeleton1/) ? 1 : 0; + } else { + return 0; + } +} + +sub token_weird_or_too_long { + local($caller, *WARNING_FH, $token) = @_; + + $lc_token = lc $token; + $norm_token = $lc_token; + $norm_token =~ s/[-' ,]//g; + $snippet4_5 = ""; + $snippet4_5 = substr($norm_token, 4, 2) if length($norm_token) >= 10; + $snippet4_6 = ""; + $snippet4_6 = substr($norm_token, 4, 3) if length($norm_token) >= 10; + if (($norm_token =~ /(kkk|vvv|www|xxx|yyy|zzz)/) || + ($norm_token =~ /[acgt]{15,}/) || # DNA sequence + ($snippet4_5 && ($norm_token =~ /($snippet4_5){5,}/)) || # 2-letter repetition + ($snippet4_6 && ($norm_token =~ /($snippet4_6){4,}/)) || # 3-letter repetition + ($norm_token =~ /[bcdfghjklmnpqrstvwxz]{8,}/) || # too many consonants + ($token =~ /(DDD)/) || + (($lc_token =~ /fff/) && ! ($lc_token =~ /schifff/))) { + print WARNING_FH "skipping (WEIRD): $_"; + return 1; + } + if ((length($norm_token) >= 50) || + ((length($norm_token) >= 28) + + # typical German compound noun components + && ! ($norm_token =~ /entwicklung/) + && ! ($norm_token =~ /fabrik/) + && ! ($norm_token =~ /finanz/) + && ! ($norm_token =~ /forschung/) + && ! ($norm_token =~ /geschwindigkeit/) + && ! ($norm_token =~ /gesundheit/) + && ! ($norm_token =~ /gewohnheit/) + && ! ($norm_token =~ /schaft/) + && ! ($norm_token =~ /schifffahrt/) + && ! ($norm_token =~ /sicherheit/) + && ! ($norm_token =~ /vergangen/) + && ! ($norm_token =~ /versicherung/) + && ! ($norm_token =~ /unternehmen/) + && ! ($norm_token =~ /verwaltung/) + + # Other Germanic languages + && ! ($norm_token =~ /aktiebolag/) + && ! ($norm_token =~ /aktieselskab/) + && ! ($norm_token =~ /ontwikkeling/) + + # chemical + && ! ($norm_token =~ /phetamine/) + && ! ($norm_token =~ /ethyl/) + + # medical + && ! ($norm_token =~ /^pneumonaultramicroscopicsilicovolcanoconios[ei]s$/) + + # business + && ! ($norm_token =~ /PriceWaterhouse/) + )) { + print WARNING_FH "skipping (TOO LONG): $_"; + return 1; + } + return 0; +} + +sub xml_de_accent { + local($caller, $string) = @_; + + # for the time being, unlauts are mapped to main vowel (without "e") + + $string =~ s/\[2-7];/A/g; + $string =~ s/\Æ/Ae/g; + $string =~ s/\Ç/C/g; + $string =~ s/\[0-3];/E/g; + $string =~ s/\[4-7];/I/g; + $string =~ s/\Ð/Dh/g; + $string =~ s/\Ñ/N/g; + $string =~ s/\[0-4];/O/g; + $string =~ s/\Ø/O/g; + $string =~ s/\[7-9];/U/g; + $string =~ s/\Ü/U/g; + $string =~ s/\Ý/Y/g; + $string =~ s/\Þ/Th/g; + + $string =~ s/\ß/ss/g; + $string =~ s/\[4-9];/a/g; + $string =~ s/\æ/ae/g; + $string =~ s/\ç/c/g; + $string =~ s/\[2-5];/e/g; + $string =~ s/\[6-9];/i/g; + $string =~ s/\ð/dh/g; + $string =~ s/\ñ/n/g; + $string =~ s/\[2-6];/o/g; + $string =~ s/\ø/o/g; + $string =~ s/\ù/u/g; + $string =~ s/\[0-2];/u/g; + $string =~ s/\ý/y/g; + $string =~ s/\þ/th/g; + $string =~ s/\ÿ/y/g; + $string =~ s/\xE2\x80\x99/'/g; + + return $string; +} + +sub de_accent { + local($caller, $string) = @_; + + # for the time being, unlauts are mapped to main vowel (without "e") + + $string =~ s/\xC3[\x80-\x85]/A/g; + $string =~ s/\xC3\x86/Ae/g; + $string =~ s/\xC3\x87/C/g; + $string =~ s/\xC3[\x88-\x8B]/E/g; + $string =~ s/\xC3[\x8C-\x8F]/I/g; + $string =~ s/\xC3\x90/Dh/g; + $string =~ s/\xC3\x91/N/g; + $string =~ s/\xC3[\x92-\x96]/O/g; + $string =~ s/\xC3\x98/O/g; + $string =~ s/\xC3[\x99-\x9C]/U/g; + $string =~ s/\xC3\x9D/Y/g; + $string =~ s/\xC3\x9E/Th/g; + + $string =~ s/\xC3\x9F/ss/g; + $string =~ s/\xC3[\xA0-\xA5]/a/g; + $string =~ s/\xC3\xA6/ae/g; + $string =~ s/\xC3\xA7/c/g; + $string =~ s/\xC3[\xA8-\xAB]/e/g; + $string =~ s/\xC3[\xAC-\xAF]/i/g; + $string =~ s/\xC3\xB0/dh/g; + $string =~ s/\xC3\xB1/n/g; + $string =~ s/\xC3[\xB2-\xB6]/o/g; + $string =~ s/\xC3\xB8/o/g; + $string =~ s/\xC3[\xB9-\xBC]/u/g; + $string =~ s/\xC3\xBD/y/g; + $string =~ s/\xC3\xBE/th/g; + $string =~ s/\xC3\xBF/y/g; + $string =~ s/\xE2\x80\x99/'/g; + + return $string; +} + +sub common_non_name_cap_p { + local($caller, $word) = @_; + return defined($english_ht{(lc $word)}->{COMMON_NON_NAME_CAP}); +} + +sub language { + return "English"; +} + +sub language_id { + return "en"; +} + +sub parse_entities_in_string { + local($caller, $string) = @_; + + $ParseEntry->set_current_lang("en"); + @pes = $ParseEntry->init_ParseEntry_list($string); + @pes = $caller->lexical_heuristic(@pes); + @pes = $caller->base_number_heuristic(@pes); + + return @pes; +} + +sub lexical_heuristic { + local($caller, @pes) = @_; + + $i = 0; + while ($i <= $#pes) { + $pe = $pes[$i]; + if ($pe->undefined("synt")) { + if ($pe->surf =~ /^\d+(,\d\d\d)*\.\d+/) { + $pe->set("synt", "cardinal"); + $pe->set("sem", "decimal number"); + $value = $pe->surf; + $value =~ s/,//g; + $pe->set("value", $value); + } elsif ($pe->surf =~ /^\d+(,\d\d\d)*$/) { + $pe->set("synt", "cardinal"); + $pe->set("sem", "integer"); + $value = $pe->surf; + $value =~ s/,//g; + $pe->set("value", $value); + } elsif ($pe->surf =~ /^([-",\.;\s:()\/%]|\@[-:\/]\@|[-:\/]\@|\@[-:\/])$/) { + $pe->set("gloss", $pe->surf); + $pe->set("synt", "punctuation"); + } else { + ($length,$english) = $caller->find_max_lex_match($i,3,@pes); + if ($length) { + if ($length > 1) { + @slot_value_list = (); + @children = splice(@pes,$i,$length); + @roles = $util->list_with_same_elem($length,"lex"); + $pe = $ParseEntry->newParent(*slot_value_list,*children,*roles); + $pe->set("surf",$english); + $pe->set("eot",1) if $pe->eot_p; + splice(@pes,$i,0,$pe); + } else { + $pe = $pes[$i]; + } + $annot_s = $english_annotation_ht{$english}; + $annot_s =~ s/^\s*:+//; + $annot_s =~ s/^\s+//; + $annot_s =~ s/\s+$//; + $annot_s =~ s/#.*$//; + foreach $annot (split('::', $annot_s)) { + ($slot, $value) = ($annot =~ /^([^:]+):(.*)$/); + if (defined($slot) && defined($value)) { + $pe->set($slot, $value); + } + $pe->set("sem", "integer") if ($slot eq "synt") && ($value eq "cardinal"); + } + $pe->set("ord-value", $ord_value) + if $ord_value = $english_annotation_ht{"_EN_SYNT_"}->{(lc $english)}->{"ordinal"}->{"value"}; + $pe->set("card-value", $card_value) + if $card_value = $english_annotation_ht{"_EN_SYNT_"}->{(lc $english)}->{"cardinal"}->{"value"}; + } + } + } + $i++; + } + return @pes; +} + +# builds numbers, incl. integers, decimal numbers, fractions, percentages, ordinals +sub base_number_heuristic { + local($caller, @pes) = @_; + + $i = 0; + # $ParseEntry->print_pes("start base_number_heuristic",$i,@pes); + while ($i <= $#pes) { + # forty-five + ($head_pe, @pes) = + $ParseEntry->build_parse_entry("composite number plus","",$i,*pes, + ' :head :($pe->sem eq "integer") && ($pe->value =~ /^[1-9]0$/)', + 'optional:dummy:$pe->surf eq "\@-\@"', + ' :mod :($pe->sem eq "integer") && ($pe->value =~ /^[1-9]$/)'); + if ($head_pe) { # match succeeded + $value1 = $head_pe->childValue("head"); + $value2 = $head_pe->childValue("mod"); + $head_pe->set("value", $value1 + $value2); + } + # six billion + ($head_pe, @pes) = + $ParseEntry->build_parse_entry("composite number 1000","",$i,*pes, + ' :mod :(($value1 = $pe->value) =~ /^\d+(.\d+)?$/) && ($value1 < 1000)', + ' :head:($value2 = $pe->value) =~ /^1(000)+$/'); + if ($head_pe) { # match succeeded + $value1 = $head_pe->childValue("mod"); + $value2 = $head_pe->childValue("head"); + $head_pe->set("value", $value1 * $value2); + } + # twenty-second + ($head_pe, @pes) = + $ParseEntry->build_parse_entry("composite ordinal","",$i,*pes, + ' :mod :($pe->sem eq "integer") && ($pe->value =~ /^[1-9]0$/)', + 'optional:dummy:$pe->surf eq "\@-\@"', + ' :head :$pe->get("ord-value") =~ /^[1-9]$/'); + if ($head_pe) { # match succeeded + $value1 = $head_pe->childSlot("head", "ord-value"); + $value2 = $head_pe->childValue("mod"); + $head_pe->set("value", $value1 + $value2); + } + $i++; + } + + return @pes; +} + +sub find_max_lex_match { + local($caller,$start,$maxlength,@pes) = @_; + + while ($maxlength > 0) { + if (($english = $util->pes_subseq_surf($start,$maxlength,"en",@pes)) + && defined($english_annotation_ht{$english}) + && ($english =~ /\S/)) { + return ($maxlength, $english); + } else { + $maxlength--; + } + } + return (0,""); +} + +sub select_reliable_entities { + local($caller, @pes) = @_; + + foreach $i (0 .. $#pes) { + $pe = $pes[$i]; + $surf = $pe->surf; + + $pe->set("reliable",1); + } + return @pes; +} + +sub negatives_p { + # (cool <-> uncool), (improper <-> proper), ... + local($caller, $s1, $s2) = @_; + + my $g_s1 = $util->regex_guard($s1); + my $g_s2 = $util->regex_guard($s2); + return 1 if $s1 =~ /^[iu]n$g_s2$/; + return 1 if $s1 =~ /^il$g_s2$/ && ($s2 =~ /^l/); + return 1 if $s1 =~ /^im$g_s2$/ && ($s2 =~ /^[mp]/); + + return 1 if $s2 =~ /^[iu]n$g_s1$/; + return 1 if $s2 =~ /^il$g_s1$/ && ($s1 =~ /^l/); + return 1 if $s2 =~ /^im$g_s1$/ && ($s1 =~ /^[mp]/); + + return 0; +} + +sub present_participle_p { + local($caller, $pe) = @_; + + my $aux_pe = $pe->child("aux"); + return $caller->present_participle_p($aux_pe) if $aux_pe; + my $head_pe = $pe->child("head"); + return $caller->present_participle_p($head_pe) if $head_pe; + return ($pe->synt =~ /^VBG/); +} + + +%engl_value_ht = ( + "monday" => 1, + "tuesday" => 2, + "wednesday" => 3, + "thursday" => 4, + "friday" => 5, + "saturday" => 6, + "sunday" => 7, + + "january" => 1, + "february" => 2, + "march" => 3, + "april" => 4, + "may" => 5, + "june" => 6, + "july" => 7, + "august" => 8, + "september" => 9, + "october" => 10, + "november" => 11, + "december" => 12, + + "spring" => 1, + "summer" => 2, + "fall" => 3, + "autumn" => 3, + "winter" => 4, + + "morning" => 1, + "noon" => 2, + "afternoon" => 3, + "evening" => 4, + "night" => 5, + + "picosecond" => 1, + "nanosecond" => 2, + "microsecond" => 3, + "millisecond" => 4, + "second" => 5, + "minute" => 6, + "hour" => 7, + "day" => 8, + "week" => 9, + "fortnight" => 10, + "month" => 11, + "year" => 12, + "decade" => 13, + "century" => 14, + "millennium" => 15, + + "nanometer" => 2, + "micrometer" => 3, + "millimeter" => 4, + "centimeter" => 5, + "decimeter" => 6, + "meter" => 7, + "kilometer" => 8, + "inch" => 11, + "foot" => 12, + "yard" => 13, + "mile" => 14, + "lightyear" => 20, + + "microgram" => 2, + "milligram" => 3, + "gram" => 4, + "kilogram" => 5, + "ton" => 6, + "ounce" => 14, +); + +sub engl_order_value { + local($this, $s) = @_; + + return $value = $engl_value_ht{(lc $s)} || 0; +} + +1; +