|
|
|
|
|
|
|
|
|
|
|
|
|
package NLP::Romanizer; |
|
|
|
use NLP::Chinese; |
|
use NLP::UTF8; |
|
use NLP::utilities; |
|
use JSON; |
|
$utf8 = NLP::UTF8; |
|
$util = NLP::utilities; |
|
$chinesePM = NLP::Chinese; |
|
|
|
my $verbosePM = 0; |
|
%empty_ht = (); |
|
|
|
my $braille_capital_letter_indicator = "\xE2\xA0\xA0"; |
|
my $braille_number_indicator = "\xE2\xA0\xBC"; |
|
my $braille_decimal_point = "\xE2\xA0\xA8"; |
|
my $braille_comma = "\xE2\xA0\x82"; |
|
my $braille_solidus = "\xE2\xA0\x8C"; |
|
my $braille_numeric_space = "\xE2\xA0\x90"; |
|
my $braille_letter_indicator = "\xE2\xA0\xB0"; |
|
my $braille_period = "\xE2\xA0\xB2"; |
|
|
|
sub new { |
|
local($caller) = @_; |
|
|
|
my $object = {}; |
|
my $class = ref( $caller ) || $caller; |
|
bless($object, $class); |
|
return $object; |
|
} |
|
|
|
sub load_unicode_data { |
|
local($this, *ht, $filename) = @_; |
|
|
|
|
|
$n = 0; |
|
if (open(IN, $filename)) { |
|
while (<IN>) { |
|
if (($unicode_value, $char_name, $general_category, $canon_comb_classes, $bidir_category, $char_decomp_mapping, $decimal_digit_value, $digit_value, $numeric_value, $mirrored, $unicode_1_0_name, $comment_field, $uc_mapping, $lc_mapping, $title_case_mapping) = split(";", $_)) { |
|
$utf8_code = $utf8->unicode_hex_string2string($unicode_value); |
|
$ht{UTF_TO_CHAR_NAME}->{$utf8_code} = $char_name; |
|
$ht{UTF_NAME_TO_UNICODE}->{$char_name} = $unicode_value; |
|
$ht{UTF_NAME_TO_CODE}->{$char_name} = $utf8_code; |
|
$ht{UTF_TO_CAT}->{$utf8_code} = $general_category; |
|
$ht{UTF_TO_NUMERIC}->{$utf8_code} = $numeric_value unless $numeric_value eq ""; |
|
$n++; |
|
} |
|
} |
|
close(IN); |
|
|
|
} else { |
|
print STDERR "Can't open $filename\n"; |
|
} |
|
} |
|
|
|
sub load_unicode_overwrite_romanization { |
|
local($this, *ht, $filename) = @_; |
|
|
|
|
|
$n = 0; |
|
if (open(IN, $filename)) { |
|
while (<IN>) { |
|
next if /^ |
|
$unicode_value = $util->slot_value_in_double_colon_del_list($_, "u"); |
|
$romanization = $util->slot_value_in_double_colon_del_list($_, "r"); |
|
$numeric = $util->slot_value_in_double_colon_del_list($_, "num"); |
|
$picture = $util->slot_value_in_double_colon_del_list($_, "pic"); |
|
$syllable_info = $util->slot_value_in_double_colon_del_list($_, "syllable-info"); |
|
$tone_mark = $util->slot_value_in_double_colon_del_list($_, "tone-mark"); |
|
$char_name = $util->slot_value_in_double_colon_del_list($_, "name"); |
|
$entry_processed_p = 0; |
|
$utf8_code = $utf8->unicode_hex_string2string($unicode_value); |
|
if ($unicode_value) { |
|
$ht{UTF_TO_CHAR_ROMANIZATION}->{$utf8_code} = $romanization if $romanization; |
|
$ht{UTF_TO_NUMERIC}->{$utf8_code} = $numeric if defined($numeric) && ($numeric ne ""); |
|
$ht{UTF_TO_PICTURE_DESCR}->{$utf8_code} = $picture if $picture; |
|
$ht{UTF_TO_SYLLABLE_INFO}->{$utf8_code} = $syllable_info if $syllable_info; |
|
$ht{UTF_TO_TONE_MARK}->{$utf8_code} = $tone_mark if $tone_mark; |
|
$ht{UTF_TO_CHAR_NAME}->{$utf8_code} = $char_name if $char_name; |
|
$entry_processed_p = 1 if $romanization || $numeric || $picture || $syllable_info || $tone_mark; |
|
} |
|
$n++ if $entry_processed_p; |
|
} |
|
close(IN); |
|
} else { |
|
print STDERR "Can't open $filename\n"; |
|
} |
|
} |
|
|
|
sub load_script_data { |
|
local($this, *ht, $filename) = @_; |
|
|
|
|
|
$n = 0; |
|
if (open(IN, $filename)) { |
|
while (<IN>) { |
|
next unless $script_name = $util->slot_value_in_double_colon_del_list($_, "script-name"); |
|
$abugida_default_vowel_s = $util->slot_value_in_double_colon_del_list($_, "abugida-default-vowel"); |
|
$alt_script_name_s = $util->slot_value_in_double_colon_del_list($_, "alt-script-name"); |
|
$language_s = $util->slot_value_in_double_colon_del_list($_, "language"); |
|
$direction = $util->slot_value_in_double_colon_del_list($_, "direction"); |
|
$font_family_s = $util->slot_value_in_double_colon_del_list($_, "font-family"); |
|
$ht{SCRIPT_P}->{$script_name} = 1; |
|
$ht{SCRIPT_NORM}->{(uc $script_name)} = $script_name; |
|
$ht{DIRECTION}->{$script_name} = $direction if $direction; |
|
foreach $language (split(/,\s*/, $language_s)) { |
|
$ht{SCRIPT_LANGUAGE}->{$script_name}->{$language} = 1; |
|
$ht{LANGUAGE_SCRIPT}->{$language}->{$script_name} = 1; |
|
} |
|
foreach $alt_script_name (split(/,\s*/, $alt_script_name_s)) { |
|
$ht{SCRIPT_NORM}->{$alt_script_name} = $script_name; |
|
$ht{SCRIPT_NORM}->{(uc $alt_script_name)} = $script_name; |
|
} |
|
foreach $abugida_default_vowel (split(/,\s*/, $abugida_default_vowel_s)) { |
|
$ht{SCRIPT_ABUDIGA_DEFAULT_VOWEL}->{$script_name}->{$abugida_default_vowel} = 1 if $abugida_default_vowel; |
|
} |
|
foreach $font_family (split(/,\s*/, $font_family_s)) { |
|
$ht{SCRIPT_FONT}->{$script_name}->{$font_family} = 1 if $font_family; |
|
} |
|
$n++; |
|
} |
|
close(IN); |
|
|
|
} else { |
|
print STDERR "Can't open $filename\n"; |
|
} |
|
} |
|
|
|
sub unicode_hangul_romanization { |
|
local($this, $s, $pass_through_p) = @_; |
|
|
|
$pass_through_p = 0 unless defined($pass_through_p); |
|
@leads = split(/\s+/, "g gg n d dd r m b bb s ss - j jj c k t p h"); |
|
|
|
@vowels = split(/\s+/, "a ae ya yae eo e yeo ye o wa wai oe yo u weo we wi yu eu yi i"); |
|
@tails = split(/\s+/, "- g gg gs n nj nh d l lg lm lb ls lt lp lh m b bs s ss ng j c k t p h"); |
|
$result = ""; |
|
@chars = $utf8->split_into_utf8_characters($s, "return only chars", *empty_ht); |
|
foreach $char (@chars) { |
|
$unicode = $utf8->utf8_to_unicode($char); |
|
if (($unicode >= 0xAC00) && ($unicode <= 0xD7A3)) { |
|
$code = $unicode - 0xAC00; |
|
$lead_index = int($code / (28*21)); |
|
$vowel_index = int($code/28) % 21; |
|
$tail_index = $code % 28; |
|
$rom = $leads[$lead_index] . $vowels[$vowel_index] . $tails[$tail_index]; |
|
$rom =~ s/-//g; |
|
$result .= $rom; |
|
} elsif ($pass_through_p) { |
|
$result .= $char; |
|
} |
|
} |
|
return $result; |
|
} |
|
|
|
sub listify_comma_sep_string { |
|
local($this, $s) = @_; |
|
|
|
@result_list = (); |
|
return @result_list unless $s =~ /\S/; |
|
$s = $util->trim2($s); |
|
my $elem; |
|
|
|
while (($elem, $rest) = ($s =~ /^("(?:\\"|[^"])*"|'(?:\\'|[^'])*'|[^"', ]+),\s*(.*)$/)) { |
|
push(@result_list, $util->dequote_string($elem)); |
|
$s = $rest; |
|
} |
|
push(@result_list, $util->dequote_string($s)) if $s =~ /\S/; |
|
|
|
return @result_list; |
|
} |
|
|
|
sub braille_string_p { |
|
local($this, $s) = @_; |
|
|
|
return ($s =~ /^(\xE2[\xA0-\xA3][\x80-\xBF])+$/); |
|
} |
|
|
|
sub register_word_boundary_info { |
|
local($this, *ht, $lang_code, $utf8_source_string, $utf8_target_string, $use_only_for_whole_word_p, |
|
$use_only_at_start_of_word_p, $use_only_at_end_of_word_p, |
|
$dont_use_at_start_of_word_p, $dont_use_at_end_of_word_p) = @_; |
|
|
|
if ($use_only_for_whole_word_p) { |
|
if ($lang_code) { |
|
$ht{USE_ONLY_FOR_WHOLE_WORD_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_target_string} = 1; |
|
} else { |
|
$ht{USE_ONLY_FOR_WHOLE_WORD}->{$utf8_source_string}->{$utf8_target_string} = 1; |
|
} |
|
} |
|
if ($use_only_at_start_of_word_p) { |
|
if ($lang_code) { |
|
$ht{USE_ONLY_AT_START_OF_WORD_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_target_string} = 1; |
|
} else { |
|
$ht{USE_ONLY_AT_START_OF_WORD}->{$utf8_source_string}->{$utf8_target_string} = 1; |
|
} |
|
} |
|
if ($use_only_at_end_of_word_p) { |
|
if ($lang_code) { |
|
$ht{USE_ONLY_AT_END_OF_WORD_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_target_string} = 1; |
|
} else { |
|
$ht{USE_ONLY_AT_END_OF_WORD}->{$utf8_source_string}->{$utf8_target_string} = 1; |
|
} |
|
} |
|
if ($dont_use_at_start_of_word_p) { |
|
if ($lang_code) { |
|
$ht{DONT_USE_AT_START_OF_WORD_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_target_string} = 1; |
|
} else { |
|
$ht{DONT_USE_AT_START_OF_WORD}->{$utf8_source_string}->{$utf8_target_string} = 1; |
|
} |
|
} |
|
if ($dont_use_at_end_of_word_p) { |
|
if ($lang_code) { |
|
$ht{DONT_USE_AT_END_OF_WORD_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_target_string} = 1; |
|
} else { |
|
$ht{DONT_USE_AT_END_OF_WORD}->{$utf8_source_string}->{$utf8_target_string} = 1; |
|
} |
|
} |
|
} |
|
|
|
sub load_romanization_table { |
|
local($this, *ht, $filename) = @_; |
|
|
|
|
|
$n = 0; |
|
$line_number = 0; |
|
if (open(IN, $filename)) { |
|
while (<IN>) { |
|
$line_number++; |
|
next if /^ |
|
if ($_ =~ /^::preserve\s/) { |
|
$from_unicode = $util->slot_value_in_double_colon_del_list($_, "from"); |
|
$to_unicode = $util->slot_value_in_double_colon_del_list($_, "to"); |
|
if ($from_unicode =~ /^(?:U\+|\\u)[0-9A-F]{4,}$/i) { |
|
$from_unicode =~ s/^(?:U\+|\\u)//; |
|
$from_code_point = hex($from_unicode); |
|
} else { |
|
$from_code_point = ""; |
|
} |
|
if ($to_unicode =~ /^(?:U\+|\\u)[0-9A-F]{4,}$/i) { |
|
$to_unicode =~ s/^(?:U\+|\\u)//; |
|
$to_code_point = hex($to_unicode); |
|
} else { |
|
$to_code_point = $from_code_point; |
|
} |
|
if ($from_code_point ne "") { |
|
|
|
foreach $code_point (($from_code_point .. $to_code_point)) { |
|
$utf8_string = $utf8->unicode2string($code_point); |
|
$ht{UTF_CHAR_MAPPING}->{$utf8_string}->{$utf8_string} = 1; |
|
} |
|
$n++; |
|
} |
|
next; |
|
} |
|
$utf8_source_string = $util->slot_value_in_double_colon_del_list($_, "s"); |
|
$utf8_target_string = $util->slot_value_in_double_colon_del_list($_, "t"); |
|
$utf8_alt_target_string_s = $util->slot_value_in_double_colon_del_list($_, "t-alt"); |
|
$use_alt_in_pointed_p = ($_ =~ /::use-alt-in-pointed\b/); |
|
$use_only_for_whole_word_p = ($_ =~ /::use-only-for-whole-word\b/); |
|
$use_only_at_start_of_word_p = ($_ =~ /::use-only-at-start-of-word\b/); |
|
$use_only_at_end_of_word_p = ($_ =~ /::use-only-at-end-of-word\b/); |
|
$dont_use_at_start_of_word_p = ($_ =~ /::dont-use-at-start-of-word\b/); |
|
$dont_use_at_end_of_word_p = ($_ =~ /::dont-use-at-end-of-word\b/); |
|
$use_only_in_lower_case_enviroment_p = ($_ =~ /::use-only-in-lower-case-enviroment\b/); |
|
$word_external_punctuation_p = ($_ =~ /::word-external-punctuation\b/); |
|
$utf8_source_string =~ s/\s*$//; |
|
$utf8_target_string =~ s/\s*$//; |
|
$utf8_alt_target_string_s =~ s/\s*$//; |
|
$utf8_target_string =~ s/^"(.*)"$/$1/; |
|
$utf8_target_string =~ s/^'(.*)'$/$1/; |
|
@utf8_alt_targets = $this->listify_comma_sep_string($utf8_alt_target_string_s); |
|
$numeric = $util->slot_value_in_double_colon_del_list($_, "num"); |
|
$numeric =~ s/\s*$//; |
|
$annotation = $util->slot_value_in_double_colon_del_list($_, "annotation"); |
|
$annotation =~ s/\s*$//; |
|
$lang_code = $util->slot_value_in_double_colon_del_list($_, "lcode"); |
|
$prob = $util->slot_value_in_double_colon_del_list($_, "p") || 1; |
|
unless (($utf8_target_string eq "") && ($numeric =~ /\d/)) { |
|
if ($lang_code) { |
|
$ht{UTF_CHAR_MAPPING_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_target_string} = $prob; |
|
} else { |
|
$ht{UTF_CHAR_MAPPING}->{$utf8_source_string}->{$utf8_target_string} = $prob; |
|
} |
|
if ($word_external_punctuation_p) { |
|
if ($lang_code) { |
|
$ht{WORD_EXTERNAL_PUNCTUATION_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_target_string} = $prob; |
|
} else { |
|
$ht{WORD_EXTERNAL_PUNCTUATION}->{$utf8_source_string}->{$utf8_target_string} = $prob; |
|
} |
|
} |
|
if ($this->braille_string_p($utf8_source_string)) { |
|
if (($utf8_target_string =~ /^[a-z]+$/) |
|
&& (! ($utf8_source_string =~ /^$braille_capital_letter_indicator/))) { |
|
my $uc_utf8_source_string = "$braille_capital_letter_indicator$utf8_source_string"; |
|
my $uc_utf8_target_string = ucfirst $utf8_target_string; |
|
if ($lang_code) { |
|
$ht{UTF_CHAR_MAPPING_LANG_SPEC}->{$lang_code}->{$uc_utf8_source_string}->{$uc_utf8_target_string} = $prob; |
|
} else { |
|
$ht{UTF_CHAR_MAPPING}->{$uc_utf8_source_string}->{$uc_utf8_target_string} = $prob; |
|
} |
|
$this->register_word_boundary_info(*ht, $lang_code, $uc_utf8_source_string, $uc_utf8_target_string, |
|
$use_only_for_whole_word_p, $use_only_at_start_of_word_p, $use_only_at_end_of_word_p, |
|
$dont_use_at_start_of_word_p, $dont_use_at_end_of_word_p); |
|
} |
|
if (($utf8_target_string =~ /^[0-9]$/) |
|
&& ($utf8_source_string =~ /^$braille_number_indicator./)) { |
|
my $core_number_char = $utf8_source_string; |
|
$core_number_char =~ s/$braille_number_indicator//; |
|
$ht{BRAILLE_TO_DIGIT}->{$core_number_char} = $utf8_target_string; |
|
} |
|
} |
|
} |
|
if ($use_only_in_lower_case_enviroment_p) { |
|
if ($lang_code) { |
|
$ht{USE_ONLY_IN_LOWER_CASE_ENVIROMENT_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_target_string} = 1; |
|
} else { |
|
$ht{USE_ONLY_IN_LOWER_CASE_ENVIROMENT}->{$utf8_source_string}->{$utf8_target_string} = 1; |
|
} |
|
} |
|
$this->register_word_boundary_info(*ht, $lang_code, $utf8_source_string, $utf8_target_string, |
|
$use_only_for_whole_word_p, $use_only_at_start_of_word_p, $use_only_at_end_of_word_p, |
|
$dont_use_at_start_of_word_p, $dont_use_at_end_of_word_p); |
|
foreach $utf8_alt_target (@utf8_alt_targets) { |
|
if ($lang_code) { |
|
$ht{UTF_CHAR_ALT_MAPPING_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_alt_target} = $prob; |
|
$ht{USE_ALT_IN_POINTED_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_alt_target} = 1 if $use_alt_in_pointed_p; |
|
} else { |
|
$ht{UTF_CHAR_ALT_MAPPING}->{$utf8_source_string}->{$utf8_alt_target} = $prob; |
|
$ht{USE_ALT_IN_POINTED}->{$utf8_source_string}->{$utf8_alt_target} = 1 if $use_alt_in_pointed_p; |
|
} |
|
if ($use_only_for_whole_word_p) { |
|
if ($lang_code) { |
|
$ht{USE_ALT_ONLY_FOR_WHOLE_WORD_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_alt_target} = 1; |
|
} else { |
|
$ht{USE_ALT_ONLY_FOR_WHOLE_WORD}->{$utf8_source_string}->{$utf8_alt_target} = 1; |
|
} |
|
} |
|
if ($use_only_at_start_of_word_p) { |
|
if ($lang_code) { |
|
$ht{USE_ALT_ONLY_AT_START_OF_WORD_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_alt_target} = 1; |
|
} else { |
|
$ht{USE_ALT_ONLY_AT_START_OF_WORD}->{$utf8_source_string}->{$utf8_alt_target} = 1; |
|
} |
|
} |
|
if ($use_only_at_end_of_word_p) { |
|
if ($lang_code) { |
|
$ht{USE_ALT_ONLY_AT_END_OF_WORD_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_alt_target} = 1; |
|
} else { |
|
$ht{USE_ALT_ONLY_AT_END_OF_WORD}->{$utf8_source_string}->{$utf8_alt_target} = 1; |
|
} |
|
} |
|
} |
|
if ($numeric =~ /\d/) { |
|
$ht{UTF_TO_NUMERIC}->{$utf8_source_string} = $numeric; |
|
} |
|
if ($annotation =~ /\S/) { |
|
$ht{UTF_ANNOTATION}->{$utf8_source_string} = $annotation; |
|
} |
|
$n++; |
|
} |
|
close(IN); |
|
|
|
} else { |
|
print STDERR "Can't open $filename\n"; |
|
} |
|
} |
|
|
|
sub char_name_to_script { |
|
local($this, $char_name, *ht) = @_; |
|
|
|
return $cached_result if $cached_result = $ht{CHAR_NAME_TO_SCRIPT}->{$char_name}; |
|
$orig_char_name = $char_name; |
|
$char_name =~ s/\s+(CONSONANT|LETTER|LIGATURE|SIGN|SYLLABLE|SYLLABICS|VOWEL)\b.*$//; |
|
my $script_name; |
|
while ($char_name) { |
|
last if $script_name = $ht{SCRIPT_NORM}->{(uc $char_name)}; |
|
$char_name =~ s/\s*\S+\s*$//; |
|
} |
|
$script_name = "" unless defined($script_name); |
|
$ht{CHAR_NAME_TO_SCRIPT}->{$char_name} = $script_name; |
|
return $script_name; |
|
} |
|
|
|
sub letter_plus_char_p { |
|
local($this, $char_name) = @_; |
|
|
|
return $cached_result if $cached_result = $ht{CHAR_NAME_LETTER_PLUS}->{$char_name}; |
|
my $letter_plus_p = ($char_name =~ /\b(?:LETTER|VOWEL SIGN|AU LENGTH MARK|CONSONANT SIGN|SIGN VIRAMA|SIGN PAMAAEH|SIGN COENG|SIGN AL-LAKUNA|SIGN ASAT|SIGN ANUSVARA|SIGN ANUSVARAYA|SIGN BINDI|TIPPI|SIGN NIKAHIT|SIGN CANDRABINDU|SIGN VISARGA|SIGN REAHMUK|SIGN NUKTA|SIGN DOT BELOW|HEBREW POINT)\b/) ? 1 : 0; |
|
$ht{CHAR_NAME_LETTER_PLUS}->{$char_name} = $letter_plus_p; |
|
return $letter_plus_p; |
|
} |
|
|
|
sub subjoined_char_p { |
|
local($this, $char_name) = @_; |
|
|
|
return $cached_result if $cached_result = $ht{CHAR_NAME_SUBJOINED}->{$char_name}; |
|
my $subjoined_p = (($char_name =~ /\b(?:SUBJOINED LETTER|VOWEL SIGN|AU LENGTH MARK|EMPHASIS MARK|CONSONANT SIGN|SIGN VIRAMA|SIGN PAMAAEH|SIGN COENG|SIGN ASAT|SIGN ANUSVARA|SIGN ANUSVARAYA|SIGN BINDI|TIPPI|SIGN NIKAHIT|SIGN CANDRABINDU|SIGN VISARGA|SIGN REAHMUK|SIGN DOT BELOW|HEBREW (POINT|PUNCTUATION GERESH)|ARABIC (?:DAMMA|DAMMATAN|FATHA|FATHATAN|HAMZA|KASRA|KASRATAN|MADDAH|SHADDA|SUKUN))\b/)) ? 1 : 0; |
|
$ht{CHAR_NAME_SUBJOINED}->{$char_name} = $subjoined_p; |
|
return $subjoined_p; |
|
} |
|
|
|
sub new_node_id { |
|
local($this, *chart_ht) = @_; |
|
|
|
my $n_nodes = $chart_ht{N_NODES}; |
|
$n_nodes++; |
|
$chart_ht{N_NODES} = $n_nodes; |
|
return $n_nodes; |
|
} |
|
|
|
sub add_node { |
|
local($this, $s, $start, $end, *chart_ht, $type, $comment) = @_; |
|
|
|
my $node_id = $this->new_node_id(*chart_ht); |
|
|
|
|
|
$chart_ht{NODE_START}->{$node_id} = $start; |
|
$chart_ht{NODE_END}->{$node_id} = $end; |
|
$chart_ht{NODES_STARTING_AT}->{$start}->{$node_id} = 1; |
|
$chart_ht{NODES_ENDING_AT}->{$end}->{$node_id} = 1; |
|
$chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end}->{$node_id} = 1; |
|
$chart_ht{NODE_TYPE}->{$node_id} = $type; |
|
$chart_ht{NODE_COMMENT}->{$node_id} = $comment; |
|
$chart_ht{NODE_ROMAN}->{$node_id} = $s; |
|
return $node_id; |
|
} |
|
|
|
sub get_node_for_span { |
|
local($this, $start, $end, *chart_ht) = @_; |
|
|
|
return "" unless defined($chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end}); |
|
my @node_ids = sort { $a <=> $b } keys %{$chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end}}; |
|
|
|
return (@node_ids) ? $node_ids[0] : ""; |
|
} |
|
|
|
sub get_node_for_span_and_type { |
|
local($this, $start, $end, *chart_ht, $type) = @_; |
|
|
|
return "" unless defined($chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end}); |
|
my @node_ids = sort { $a <=> $b } keys %{$chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end}}; |
|
|
|
foreach $node_id (@node_ids) { |
|
return $node_id if $chart_ht{NODE_TYPE}->{$node_id} eq $type; |
|
} |
|
return ""; |
|
} |
|
|
|
sub get_node_roman { |
|
local($this, $node_id, *chart_id, $default) = @_; |
|
|
|
$default = "" unless defined($default); |
|
my $roman = $chart_ht{NODE_ROMAN}->{$node_id}; |
|
return (defined($roman)) ? $roman : $default; |
|
} |
|
|
|
sub set_node_id_slot_value { |
|
local($this, $node_id, $slot, $value, *chart_id) = @_; |
|
|
|
$chart_ht{NODE_SLOT}->{$node_id}->{$slot} = $value; |
|
} |
|
|
|
sub copy_slot_values { |
|
local($this, $old_node_id, $new_node_id, *chart_id, @slots) = @_; |
|
|
|
if (@slots) { |
|
foreach $slot (keys %{$chart_ht{NODE_SLOT}->{$old_node_id}}) { |
|
if (($slots[0] eq "all") || $util->member($slot, @slots)) { |
|
my $value = $chart_ht{NODE_SLOT}->{$old_node_id}->{$slot}; |
|
$chart_ht{NODE_SLOT}->{$new_node_id}->{$slot} = $value if defined($value); |
|
} |
|
} |
|
} |
|
} |
|
|
|
sub get_node_id_slot_value { |
|
local($this, $node_id, $slot, *chart_id, $default) = @_; |
|
|
|
$default = "" unless defined($default); |
|
my $value = $chart_ht{NODE_SLOT}->{$node_id}->{$slot}; |
|
return (defined($value)) ? $value : $default; |
|
} |
|
|
|
sub get_node_for_span_with_slot_value { |
|
local($this, $start, $end, $slot, *chart_id, $default) = @_; |
|
|
|
$default = "" unless defined($default); |
|
return $default unless defined($chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end}); |
|
my @node_ids = sort { $a <=> $b } keys %{$chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end}}; |
|
foreach $node_id (@node_ids) { |
|
my $value = $chart_ht{NODE_SLOT}->{$node_id}->{$slot}; |
|
return $value if defined($value); |
|
} |
|
return $default; |
|
} |
|
|
|
sub get_node_for_span_with_slot { |
|
local($this, $start, $end, $slot, *chart_id, $default) = @_; |
|
|
|
$default = "" unless defined($default); |
|
return $default unless defined($chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end}); |
|
my @node_ids = sort { $a <=> $b } keys %{$chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end}}; |
|
foreach $node_id (@node_ids) { |
|
my $value = $chart_ht{NODE_SLOT}->{$node_id}->{$slot}; |
|
return $node_id if defined($value); |
|
} |
|
return $default; |
|
} |
|
|
|
sub register_new_complex_number_span_segment { |
|
local($this, $start, $mid, $end, *chart_id, $line_number) = @_; |
|
|
|
|
|
|
|
|
|
if (defined($old_start = $chart_ht{COMPLEX_NUMERIC_END_START}->{$mid})) { |
|
undef($chart_ht{COMPLEX_NUMERIC_END_START}->{$mid}); |
|
$chart_ht{COMPLEX_NUMERIC_START_END}->{$old_start} = $end; |
|
$chart_ht{COMPLEX_NUMERIC_END_START}->{$end} = $old_start; |
|
} else { |
|
$chart_ht{COMPLEX_NUMERIC_START_END}->{$start} = $end; |
|
$chart_ht{COMPLEX_NUMERIC_END_START}->{$end} = $start; |
|
} |
|
} |
|
|
|
sub romanize_by_token_with_caching { |
|
local($this, $s, $lang_code, $output_style, *ht, *pinyin_ht, $initial_char_offset, $control, $line_number) = @_; |
|
|
|
$control = "" unless defined($control); |
|
my $return_chart_p = ($control =~ /return chart/i); |
|
my $return_offset_mappings_p = ($control =~ /return offset mappings/i); |
|
return $this->romanize($s, $lang_code, $output_style, *ht, *pinyin_ht, $initial_char_offset, $control, $line_number) |
|
if $return_chart_p || $return_offset_mappings_p; |
|
my $result = ""; |
|
my @separators = (); |
|
my @tokens = (); |
|
$s =~ s/\n$//; |
|
while (($sep, $token, $rest) = ($s =~ /^(\s*)(\S+)(.*)$/)) { |
|
push(@separators, $sep); |
|
push(@tokens, $token); |
|
$s = $rest; |
|
} |
|
push(@separators, $s); |
|
while (@tokens) { |
|
my $sep = shift @separators; |
|
my $token = shift @tokens; |
|
$result .= $sep; |
|
if ($token =~ /^[\x00-\x7F]*$/) { |
|
$result .= $token; |
|
} else { |
|
my $rom_token = $ht{CACHED_ROMANIZATION}->{$lang_code}->{$token}; |
|
unless (defined($rom_token)) { |
|
$rom_token = $this->romanize($token, $lang_code, $output_style, *ht, *pinyin_ht, $initial_char_offset, $control, $line_number); |
|
$ht{CACHED_ROMANIZATION}->{$lang_code}->{$token} = $rom_token if defined($rom_token); |
|
} |
|
$result .= $rom_token; |
|
} |
|
} |
|
my $sep = shift @separators; |
|
$result .= $sep if defined($sep); |
|
|
|
return $result; |
|
} |
|
|
|
sub romanize { |
|
local($this, $s, $lang_code, $output_style, *ht, *pinyin_ht, $initial_char_offset, $control, $line_number, $initial_rom_char_offset) = @_; |
|
|
|
my $orig_lang_code = $lang_code; |
|
|
|
if (($line_lang_code) = ($s =~ /^::lcode\s+([a-z][a-z][a-z])\s/)) { |
|
$lang_code = $line_lang_code; |
|
} |
|
$initial_char_offset = 0 unless defined($initial_char_offset); |
|
$initial_rom_char_offset = 0 unless defined($initial_rom_char_offset); |
|
$control = "" unless defined($control); |
|
my $return_chart_p = ($control =~ /return chart/i); |
|
my $return_offset_mappings_p = ($control =~ /return offset mappings/i); |
|
$line_number = "" unless defined($line_number); |
|
my @chars = $utf8->split_into_utf8_characters($s, "return only chars", *empty_ht); |
|
my $n_characters = $#chars + 1; |
|
%chart_ht = (); |
|
$chart_ht{N_CHARS} = $n_characters; |
|
$chart_ht{N_NODES} = 0; |
|
my $char = ""; |
|
my $char_name = ""; |
|
my $prev_script = ""; |
|
my $current_script = ""; |
|
my $script_start = 0; |
|
my $script_end = 0; |
|
my $prev_letter_plus_script = ""; |
|
my $current_letter_plus_script = ""; |
|
my $letter_plus_script_start = 0; |
|
my $letter_plus_script_end = 0; |
|
my $log =""; |
|
my $n_right_to_left_chars = 0; |
|
my $n_left_to_right_chars = 0; |
|
my $hebrew_word_start = ""; |
|
my $hebrew_word_contains_point = 0; |
|
my $current_word_start = ""; |
|
my $current_word_script = ""; |
|
my $braille_all_caps_p = 0; |
|
|
|
|
|
foreach $i ((0 .. ($#chars + 1))) { |
|
if ($i <= $#chars) { |
|
$char = $chars[$i]; |
|
$chart_ht{ORIG_CHAR}->{$i} = $char; |
|
$char_name = $ht{UTF_TO_CHAR_NAME}->{$char} || ""; |
|
$chart_ht{CHAR_NAME}->{$i} = $char_name; |
|
$current_script = $this->char_name_to_script($char_name, *ht); |
|
$current_script_direction = $ht{DIRECTION}->{$current_script} || ''; |
|
if ($current_script_direction eq 'right-to-left') { |
|
$n_right_to_left_chars++; |
|
} elsif (($char =~ /^[a-z]$/i) || ! ($char =~ /^[\x00-\x7F]$/)) { |
|
$n_left_to_right_chars++; |
|
} |
|
$chart_ht{CHAR_SCRIPT}->{$i} = $current_script; |
|
$chart_ht{SCRIPT_SEGMENT_START}->{$i} = ""; |
|
$chart_ht{SCRIPT_SEGMENT_END}->{$i} = ""; |
|
$chart_ht{LETTER_TOKEN_SEGMENT_START}->{$i} = ""; |
|
$chart_ht{LETTER_TOKEN_SEGMENT_END}->{$i} = ""; |
|
$subjoined_char_p = $this->subjoined_char_p($char_name); |
|
$chart_ht{CHAR_SUBJOINED}->{$i} = $subjoined_char_p; |
|
$letter_plus_char_p = $this->letter_plus_char_p($char_name); |
|
$chart_ht{CHAR_LETTER_PLUS}->{$i} = $letter_plus_char_p; |
|
$current_letter_plus_script = ($letter_plus_char_p) ? $current_script : ""; |
|
$numeric_value = $ht{UTF_TO_NUMERIC}->{$char}; |
|
$numeric_value = "" unless defined($numeric_value); |
|
$annotation = $ht{UTF_ANNOTATION}->{$char}; |
|
$annotation = "" unless defined($annotation); |
|
$chart_ht{CHAR_NUMERIC_VALUE}->{$i} = $numeric_value; |
|
$chart_ht{CHAR_ANNOTATION}->{$i} = $annotation; |
|
$syllable_info = $ht{UTF_TO_SYLLABLE_INFO}->{$char} || ""; |
|
$chart_ht{CHAR_SYLLABLE_INFO}->{$i} = $syllable_info; |
|
$tone_mark = $ht{UTF_TO_TONE_MARK}->{$char} || ""; |
|
$chart_ht{CHAR_TONE_MARK}->{$i} = $tone_mark; |
|
} else { |
|
$char = ""; |
|
$char_name = ""; |
|
$current_script = ""; |
|
$current_letter_plus_script = ""; |
|
} |
|
if ($char_name =~ /^HEBREW (LETTER|POINT|PUNCTUATION GERESH) /) { |
|
$hebrew_word_start = $i if $hebrew_word_start eq ""; |
|
$hebrew_word_contains_point = 1 if $char_name =~ /^HEBREW POINT /; |
|
} elsif ($hebrew_word_start ne "") { |
|
if ($hebrew_word_contains_point) { |
|
foreach $j (($hebrew_word_start .. ($i-1))) { |
|
$chart_ht{CHAR_PART_OF_POINTED_HEBREW_WORD}->{$j} = 1; |
|
} |
|
$chart_ht{CHAR_START_OF_WORD}->{$hebrew_word_start} = 1; |
|
$chart_ht{CHAR_END_OF_WORD}->{($i-1)} = 1; |
|
} |
|
$hebrew_word_start = ""; |
|
$hebrew_word_contains_point = 0; |
|
} |
|
my $part_of_word_p = $current_script |
|
&& ($this->letter_plus_char_p($char_name) |
|
|| $this->subjoined_char_p($char_name) |
|
|| ($char_name =~ /\b(LETTER|SYLLABLE|SYLLABICS|LIGATURE)\b/)); |
|
|
|
|
|
my $end_offset = 0; |
|
if ($char_name =~ /^Braille\b/i) { |
|
if (($char =~ /^\s*$/) || ($char_name =~ /BLANK/)) { |
|
$part_of_word_p = 0; |
|
$braille_all_caps_p = 0; |
|
} elsif ($chart_ht{NOT_PART_OF_WORD_P}->{$i}) { |
|
$part_of_word_p = 0; |
|
$braille_all_caps_p = 0; |
|
} elsif ((keys %{$ht{WORD_EXTERNAL_PUNCTUATION_LANG_SPEC}->{$lang_code}->{$char}}) |
|
|| (keys %{$ht{WORD_EXTERNAL_PUNCTUATION}->{$char}})) { |
|
$part_of_word_p = 0; |
|
$braille_all_caps_p = 0; |
|
} elsif (($i+1 <= $#chars) |
|
&& ($s1 = $char . $chars[$i+1]) |
|
&& ((keys %{$ht{WORD_EXTERNAL_PUNCTUATION_LANG_SPEC}->{$lang_code}->{$s1}}) |
|
|| (keys %{$ht{WORD_EXTERNAL_PUNCTUATION}->{$s1}}))) { |
|
$part_of_word_p = 0; |
|
$braille_all_caps_p = 0; |
|
$chart_ht{NOT_PART_OF_WORD_P}->{($i+1)} = 1; |
|
} elsif (($i+2 <= $#chars) |
|
&& ($s2 = $char . $chars[$i+1] . $chars[$i+2]) |
|
&& ((keys %{$ht{WORD_EXTERNAL_PUNCTUATION_LANG_SPEC}->{$lang_code}->{$s2}}) |
|
|| (keys %{$ht{WORD_EXTERNAL_PUNCTUATION}->{$s2}}))) { |
|
$part_of_word_p = 0; |
|
$braille_all_caps_p = 0; |
|
$chart_ht{NOT_PART_OF_WORD_P}->{($i+1)} = 1; |
|
$chart_ht{NOT_PART_OF_WORD_P}->{($i+2)} = 1; |
|
} elsif (($i+1 <= $#chars) |
|
&& ($char eq $braille_capital_letter_indicator) |
|
&& ($chars[$i+1] eq $braille_capital_letter_indicator)) { |
|
$braille_all_caps_p = 1; |
|
} else { |
|
$part_of_word_p = 1; |
|
} |
|
|
|
if (($char eq $braille_period) |
|
&& (($i == $#chars) |
|
|| (($i < $#chars) |
|
&& (! $this->braille_string_p($chars[$i+1]))))) { |
|
$part_of_word_p = 0; |
|
} |
|
|
|
if (($i > 0) |
|
&& ($chars[$i-1] eq $braille_period) |
|
&& (! $part_of_word_p) |
|
&& ($current_word_start ne "")) { |
|
$end_offset = -1; |
|
} |
|
} else { |
|
$braille_all_caps_p = 0; |
|
} |
|
$chart_ht{BRAILLE_ALL_CAPS_P}->{$i} = $braille_all_caps_p; |
|
|
|
if (($current_word_start ne "") |
|
&& ((! $part_of_word_p) |
|
|| ($current_script ne $current_word_script))) { |
|
|
|
$chart_ht{CHAR_START_OF_WORD}->{$current_word_start} = 1; |
|
$chart_ht{CHAR_END_OF_WORD}->{($i-1+$end_offset)} = 1; |
|
my $word = join("", @chars[$current_word_start .. ($i-1+$end_offset)]); |
|
$chart_ht{WORD_START_END}->{$current_word_start}->{$i} = $word; |
|
$chart_ht{WORD_END_START}->{$i+$end_offset}->{$current_word_start} = $word; |
|
|
|
$current_word_start = ""; |
|
$current_word_script = ""; |
|
} |
|
if ($part_of_word_p && ($current_word_start eq "")) { |
|
|
|
$current_word_start = $i; |
|
$current_word_script = $current_script; |
|
} |
|
|
|
unless ($current_script eq $prev_script) { |
|
if ($prev_script && ($i-1 >= $script_start)) { |
|
my $script_end = $i; |
|
$chart_ht{SCRIPT_SEGMENT_START_TO_END}->{$script_start} = $script_end; |
|
$chart_ht{SCRIPT_SEGMENT_END_TO_START}->{$script_end} = $script_start; |
|
foreach $i (($script_start .. $script_end)) { |
|
$chart_ht{SCRIPT_SEGMENT_START}->{$i} = $script_start; |
|
$chart_ht{SCRIPT_SEGMENT_END}->{$i} = $script_end; |
|
} |
|
|
|
} |
|
$script_start = $i; |
|
} |
|
unless ($current_letter_plus_script eq $prev_letter_plus_script) { |
|
if ($prev_letter_plus_script && ($i-1 >= $letter_plus_script_start)) { |
|
my $letter_plus_script_end = $i; |
|
$chart_ht{LETTER_TOKEN_SEGMENT_START_TO_END}->{$letter_plus_script_start} = $letter_plus_script_end; |
|
$chart_ht{LETTER_TOKEN_SEGMENT_END_TO_START}->{$letter_plus_script_end} = $letter_plus_script_start; |
|
foreach $i (($letter_plus_script_start .. $letter_plus_script_end)) { |
|
$chart_ht{LETTER_TOKEN_SEGMENT_START}->{$i} = $letter_plus_script_start; |
|
$chart_ht{LETTER_TOKEN_SEGMENT_END}->{$i} = $letter_plus_script_end; |
|
} |
|
|
|
} |
|
$letter_plus_script_start = $i; |
|
} |
|
$prev_script = $current_script; |
|
$prev_letter_plus_script = $current_letter_plus_script; |
|
} |
|
$ht{STRING_IS_DOMINANTLY_RIGHT_TO_LEFT}->{$s} = 1 if $n_right_to_left_chars > $n_left_to_right_chars; |
|
|
|
|
|
my $i = 0; |
|
while ($i <= $#chars) { |
|
my $char = $chart_ht{ORIG_CHAR}->{$i}; |
|
my $current_script = $chart_ht{CHAR_SCRIPT}->{$i}; |
|
$chart_ht{CHART_CONTAINS_SCRIPT}->{$current_script} = 1; |
|
my $script_segment_start = $chart_ht{SCRIPT_SEGMENT_START}->{$i}; |
|
my $script_segment_end = $chart_ht{SCRIPT_SEGMENT_END}->{$i}; |
|
my $char_name = $chart_ht{CHAR_NAME}->{$i}; |
|
my $subjoined_char_p = $chart_ht{CHAR_SUBJOINED}->{$i}; |
|
my $letter_plus_char_p = $chart_ht{CHAR_LETTER_PLUS}->{$i}; |
|
my $numeric_value = $chart_ht{CHAR_NUMERIC_VALUE}->{$i}; |
|
my $annotation = $chart_ht{CHAR_ANNOTATION}->{$i}; |
|
|
|
my $tone_mark = $chart_ht{CHAR_TONE_MARK}->{$i}; |
|
my $found_char_mapping_p = 0; |
|
my $prev_char_name = ($i >= 1) ? $chart_ht{CHAR_NAME}->{($i-1)} : ""; |
|
my $prev2_script = ($i >= 2) ? $chart_ht{CHAR_SCRIPT}->{($i-2)} : ""; |
|
my $prev_script = ($i >= 1) ? $chart_ht{CHAR_SCRIPT}->{($i-1)} : ""; |
|
my $next_script = ($i < $#chars) ? $chart_ht{CHAR_SCRIPT}->{($i+1)} : ""; |
|
my $next_char = ($i < $#chars) ? $chart_ht{ORIG_CHAR}->{($i+1)} : ""; |
|
my $next_char_name = $ht{UTF_TO_CHAR_NAME}->{$next_char} || ""; |
|
my $prev2_letter_plus_char_p = ($i >= 2) ? $chart_ht{CHAR_LETTER_PLUS}->{($i-2)} : 0; |
|
my $prev_letter_plus_char_p = ($i >= 1) ? $chart_ht{CHAR_LETTER_PLUS}->{($i-1)} : 0; |
|
my $next_letter_plus_char_p = ($i < $#chars) ? $chart_ht{CHAR_LETTER_PLUS}->{($i+1)} : 0; |
|
my $next_index = $i + 1; |
|
|
|
|
|
if ($char eq $braille_number_indicator) { |
|
my $offset = 0; |
|
my $numeric_value = ""; |
|
my $digit; |
|
while ($i+$offset < $#chars) { |
|
$offset++; |
|
my $offset_char = $chart_ht{ORIG_CHAR}->{$i+$offset}; |
|
if (defined($digit = $ht{BRAILLE_TO_DIGIT}->{$offset_char})) { |
|
$numeric_value .= $digit; |
|
} elsif (($offset_char eq $braille_decimal_point) |
|
|| ($ht{UTF_CHAR_MAPPING}->{$offset_char}->{"."})) { |
|
$numeric_value .= "."; |
|
} elsif ($offset_char eq $braille_comma) { |
|
$numeric_value .= ","; |
|
} elsif ($offset_char eq $braille_numeric_space) { |
|
$numeric_value .= " "; |
|
} elsif ($offset_char eq $braille_solidus) { |
|
$numeric_value .= "/"; |
|
} elsif ($offset_char eq $braille_number_indicator) { |
|
|
|
} elsif ($offset_char eq $braille_letter_indicator) { |
|
|
|
last; |
|
} else { |
|
$offset--; |
|
last; |
|
} |
|
} |
|
if ($offset) { |
|
$next_index = $i + $offset + 1; |
|
$node_id = $this->add_node($numeric_value, $i, $next_index, *chart_ht, "", "braille number"); |
|
$found_char_mapping_p = 1; |
|
} |
|
} |
|
|
|
unless ($found_char_mapping_p) { |
|
foreach $string_length (reverse(1 .. 6)) { |
|
next if ($i + $string_length-1) > $#chars; |
|
my $start_of_word_p = $chart_ht{CHAR_START_OF_WORD}->{$i} || 0; |
|
my $end_of_word_p = $chart_ht{CHAR_END_OF_WORD}->{($i+$string_length-1)} || 0; |
|
my $multi_char_substring = join("", @chars[$i..($i+$string_length-1)]); |
|
my @mappings = keys %{$ht{UTF_CHAR_MAPPING_LANG_SPEC}->{$lang_code}->{$multi_char_substring}}; |
|
@mappings = keys %{$ht{UTF_CHAR_MAPPING}->{$multi_char_substring}} unless @mappings; |
|
my @mappings_whole = (); |
|
my @mappings_start_or_end = (); |
|
my @mappings_other = (); |
|
foreach $mapping (@mappings) { |
|
next if $mapping =~ /\(__.*__\)/; |
|
if ($ht{USE_ONLY_FOR_WHOLE_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$mapping} |
|
|| $ht{USE_ONLY_FOR_WHOLE_WORD}->{$multi_char_substring}->{$mapping}) { |
|
push(@mappings_whole, $mapping) if $start_of_word_p && $end_of_word_p; |
|
} elsif ($ht{USE_ONLY_AT_START_OF_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$mapping} |
|
|| $ht{USE_ONLY_AT_START_OF_WORD}->{$multi_char_substring}->{$mapping}) { |
|
push(@mappings_start_or_end, $mapping) if $start_of_word_p; |
|
} elsif ($ht{USE_ONLY_AT_END_OF_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$mapping} |
|
|| $ht{USE_ONLY_AT_END_OF_WORD}->{$multi_char_substring}->{$mapping}) { |
|
push(@mappings_start_or_end, $mapping) if $end_of_word_p; |
|
} else { |
|
push(@mappings_other, $mapping); |
|
} |
|
} |
|
@mappings = @mappings_whole; |
|
@mappings = @mappings_start_or_end unless @mappings; |
|
@mappings = @mappings_other unless @mappings; |
|
foreach $mapping (@mappings) { |
|
next if $mapping =~ /\(__.*__\)/; |
|
if ($ht{DONT_USE_AT_START_OF_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$mapping} |
|
|| $ht{DONT_USE_AT_START_OF_WORD}->{$multi_char_substring}->{$mapping}) { |
|
next if $start_of_word_p; |
|
} |
|
if ($ht{DONT_USE_AT_END_OF_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$mapping} |
|
|| $ht{DONT_USE_AT_END_OF_WORD}->{$multi_char_substring}->{$mapping}) { |
|
next if $end_of_word_p; |
|
} |
|
my $mapping2 = ($chart_ht{BRAILLE_ALL_CAPS_P}->{$i}) ? (uc $mapping) : $mapping; |
|
$node_id = $this->add_node($mapping2, $i, $i+$string_length, *chart_ht, "", "multi-char-mapping"); |
|
$next_index = $i + $string_length; |
|
$found_char_mapping_p = 1; |
|
if ($annotation) { |
|
@annotation_elems = split(/,\s*/, $annotation); |
|
foreach $annotation_elem (@annotation_elems) { |
|
if (($a_slot, $a_value) = ($annotation_elem =~ /^(\S+?):(\S+)\s*$/)) { |
|
$this->set_node_id_slot_value($node_id, $a_slot, $a_value, *chart_ht); |
|
} else { |
|
$this->set_node_id_slot_value($node_id, $annotation_elem, 1, *chart_ht); |
|
} |
|
} |
|
} |
|
} |
|
my @alt_mappings = keys %{$ht{UTF_CHAR_ALT_MAPPING_LANG_SPEC}->{$lang_code}->{$multi_char_substring}}; |
|
@alt_mappings = keys %{$ht{UTF_CHAR_ALT_MAPPING}->{$multi_char_substring}} unless @alt_mappings; |
|
@alt_mappings = () if ($#alt_mappings == 0) && ($alt_mappings[0] eq "_NONE_"); |
|
foreach $alt_mapping (@alt_mappings) { |
|
if ($chart_ht{CHAR_PART_OF_POINTED_HEBREW_WORD}->{$i}) { |
|
next unless |
|
$ht{USE_ALT_IN_POINTED_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$alt_mapping} |
|
|| $ht{USE_ALT_IN_POINTED}->{$multi_char_substring}->{$alt_mapping}; |
|
} |
|
if ($ht{USE_ALT_ONLY_FOR_WHOLE_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$alt_mapping} |
|
|| $ht{USE_ALT_ONLY_FOR_WHOLE_WORD}->{$multi_char_substring}->{$alt_mapping}) { |
|
next unless $start_of_word_p && $end_of_word_p; |
|
} |
|
if ($ht{USE_ALT_ONLY_AT_START_OF_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$alt_mapping} |
|
|| $ht{USE_ALT_ONLY_AT_START_OF_WORD}->{$multi_char_substring}->{$alt_mapping}) { |
|
next unless $start_of_word_p; |
|
} |
|
if ($ht{USE_ALT_ONLY_AT_END_OF_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$alt_mapping} |
|
|| $ht{USE_ALT_ONLY_AT_END_OF_WORD}->{$multi_char_substring}->{$alt_mapping}) { |
|
next unless $end_of_word_p; |
|
} |
|
my $alt_mapping2 = ($chart_ht{BRAILLE_ALL_CAPS_P}->{$i}) ? (uc $alt_mapping) : $alt_mapping; |
|
$node_id = $this->add_node($alt_mapping2, $i, $i+$string_length, *chart_ht, "alt", "multi-char-mapping"); |
|
if ($annotation) { |
|
@annotation_elems = split(/,\s*/, $annotation); |
|
foreach $annotation_elem (@annotation_elems) { |
|
if (($a_slot, $a_value) = ($annotation_elem =~ /^(\S+?):(\S+)\s*$/)) { |
|
$this->set_node_id_slot_value($node_id, $a_slot, $a_value, *chart_ht); |
|
} else { |
|
$this->set_node_id_slot_value($node_id, $annotation_elem, 1, *chart_ht); |
|
} |
|
} |
|
} |
|
} |
|
} |
|
} |
|
unless ($found_char_mapping_p) { |
|
my $prev_node_id = $this->get_node_for_span($i-4, $i, *chart_ht) |
|
|| $this->get_node_for_span($i-3, $i, *chart_ht) |
|
|| $this->get_node_for_span($i-2, $i, *chart_ht) |
|
|| $this->get_node_for_span($i-1, $i, *chart_ht); |
|
my $prev_char_roman = ($prev_node_id) ? $this->get_node_roman($prev_node_id, *chart_id) : ""; |
|
my $prev_node_start = ($prev_node_id) ? $chart_ht{NODE_START}->{$prev_node_id} : ""; |
|
|
|
|
|
if (($numeric_value =~ /\d/) |
|
&& (! ($char_name =~ /SUPERSCRIPT/))) { |
|
my $prev_numeric_value = $this->get_node_for_span_with_slot_value($i-1, $i, "numeric-value", *chart_id); |
|
my $sep = ""; |
|
$sep = " " if ($char_name =~ /^vulgar fraction /i) && ($prev_numeric_value =~ /\d/); |
|
$node_id = $this->add_node("$sep$numeric_value", $i, $i+1, *chart_ht, "", "number"); |
|
$this->set_node_id_slot_value($node_id, "numeric-value", $numeric_value, *chart_ht); |
|
if ((($prev_numeric_value =~ /\d/) && ($numeric_value =~ /\d\d/)) |
|
|| (($prev_numeric_value =~ /\d\d/) && ($numeric_value =~ /\d/))) { |
|
|
|
my $j = 1; |
|
|
|
if ($prev_numeric_value =~ /^\d$/) { |
|
while (1) { |
|
if (($i-$j-1 >= 0) |
|
&& defined($digit_value = $this->get_node_for_span_with_slot_value($i-$j-1, $i-$j, "numeric-value", *chart_id)) |
|
&& ($digit_value =~ /^\d$/)) { |
|
$j++; |
|
} elsif (($i-$j-2 >= 0) |
|
&& ($chart_ht{ORIG_CHAR}->{($i-$j-1)} =~ /^[.,]$/) |
|
&& defined($digit_value = $this->get_node_for_span_with_slot_value($i-$j-2, $i-$j-1, "numeric-value", *chart_id)) |
|
&& ($digit_value =~ /^\d$/)) { |
|
$j += 2; |
|
} else { |
|
last; |
|
} |
|
} |
|
} |
|
|
|
my $k = 0; |
|
if ($numeric_value =~ /^\d$/) { |
|
while (1) { |
|
if (defined($next_numeric_value = $chart_ht{CHAR_NUMERIC_VALUE}->{($i+$k+1)}) |
|
&& ($next_numeric_value =~ /^\d$/)) { |
|
$k++; |
|
} else { |
|
last; |
|
} |
|
} |
|
} |
|
$this->register_new_complex_number_span_segment($i-$j, $i, $i+$k+1, *chart_ht, $line_number); |
|
} |
|
if ($chinesePM->string_contains_utf8_cjk_unified_ideograph_p($char) |
|
&& ($tonal_translit = $chinesePM->tonal_pinyin($char, *pinyin_ht, ""))) { |
|
$de_accented_translit = $util->de_accent_string($tonal_translit); |
|
if ($numeric_value =~ /^(10000|1000000000000|10000000000000000)$/) { |
|
$chart_ht{NODE_TYPE}->{$node_id} = "alt"; |
|
$alt_node_id = $this->add_node($de_accented_translit, $i, $i+1, *chart_ht, "", "CJK"); |
|
} else { |
|
$alt_node_id = $this->add_node($de_accented_translit, $i, $i+1, *chart_ht, "alt", "CJK"); |
|
} |
|
} |
|
|
|
|
|
} elsif ($char =~ /^[\x00-\x7F]$/) { |
|
$this->add_node($char, $i, $i+1, *chart_ht, "", "ASCII"); |
|
|
|
|
|
} elsif ($char =~ /^(\xE2[\x98-\x9E]|\xF0\x9F[\x8C-\xA7])/) { |
|
$this->add_node($char, $i, $i+1, *chart_ht, "", "pictograph"); |
|
|
|
|
|
} elsif (($char =~ /^[\xEA-\xED]/) |
|
&& ($romanized_char = $this->unicode_hangul_romanization($char))) { |
|
$this->add_node($romanized_char, $i, $i+1, *chart_ht, "", "Hangul"); |
|
|
|
|
|
} elsif ($chinesePM->string_contains_utf8_cjk_unified_ideograph_p($char) |
|
&& ($tonal_translit = $chinesePM->tonal_pinyin($char, *pinyin_ht, ""))) { |
|
$de_accented_translit = $util->de_accent_string($tonal_translit); |
|
$this->add_node($de_accented_translit, $i, $i+1, *chart_ht, "", "CJK"); |
|
|
|
|
|
} elsif ($char_name =~ /\bSIGN (?:VIRAMA|AL-LAKUNA|ASAT|COENG|PAMAAEH)\b/) { |
|
|
|
if (($prev_script eq $current_script) |
|
&& (($prev_char_roman_consonant, $prev_char_roman_vowel) = ($prev_char_roman =~ /^(.*[bcdfghjklmnpqrstvwxyz])([aeiou]+)$/i)) |
|
&& ($ht{SCRIPT_ABUDIGA_DEFAULT_VOWEL}->{$current_script}->{(lc $prev_char_roman_vowel)})) { |
|
$this->add_node($prev_char_roman_consonant, $prev_node_start, $i+1, *chart_ht, "", "virama"); |
|
} else { |
|
$this->add_node("", $i, $i+1, *chart_ht, "", "unexpected-virama"); |
|
} |
|
|
|
|
|
} elsif ($char_name =~ /\bSIGN (?:NUKTA)\b/) { |
|
|
|
if ($prev_script eq $current_script) { |
|
my $node_id = $this->add_node($prev_char_roman, $prev_node_start, $i+1, *chart_ht, "", "nukta"); |
|
$this->copy_slot_values($prev_node_id, $node_id, *chart_id, "all"); |
|
$this->set_node_id_slot_value($node_id, "nukta", 1, *chart_ht); |
|
} else { |
|
$this->add_node("", $i, $i+1, *chart_ht, "", "unexpected-nukta"); |
|
} |
|
|
|
|
|
} elsif ($char =~ /^\xE2\x80[\x8B-\x8F\xAA-\xAE]$/) { |
|
if ($prev_node_id) { |
|
my $node_id = $this->add_node($prev_char_roman, $prev_node_start, $i+1, *chart_ht, "", "zero-width-char"); |
|
$this->copy_slot_values($prev_node_id, $node_id, *chart_id, "all"); |
|
} else { |
|
$this->add_node("", $i, $i+1, *chart_ht, "", "zero-width-char"); |
|
} |
|
} elsif (($char =~ /^\xEF\xBB\xBF$/) && $prev_node_id) { |
|
my $node_id = $this->add_node($prev_char_roman, $prev_node_start, $i+1, *chart_ht, "", "zero-width-char"); |
|
$this->copy_slot_values($prev_node_id, $node_id, *chart_id, "all"); |
|
|
|
|
|
} elsif ($tone_mark) { |
|
if ($prev_script eq $current_script) { |
|
my $node_id = $this->add_node($prev_char_roman, $prev_node_start, $i+1, *chart_ht, "", "tone-mark"); |
|
$this->copy_slot_values($prev_node_id, $node_id, *chart_id, "all"); |
|
$this->set_node_id_slot_value($node_id, "tone-mark", $tone_mark, *chart_ht); |
|
} else { |
|
$this->add_node("", $i, $i+1, *chart_ht, "", "unexpected-tone-mark"); |
|
} |
|
|
|
|
|
} elsif (($char_name =~ /\b(ACCENT|TONE|COMBINING DIAERESIS|COMBINING DIAERESIS BELOW|COMBINING MACRON|COMBINING VERTICAL LINE ABOVE|COMBINING DOT ABOVE RIGHT|COMBINING TILDE|COMBINING CYRILLIC|MUUSIKATOAN|TRIISAP)\b/) && ($ht{UTF_TO_CAT}->{$char} =~ /^Mn/)) { |
|
if ($prev_script eq $current_script) { |
|
my $node_id = $this->add_node($prev_char_roman, $prev_node_start, $i+1, *chart_ht, "", "diacritic"); |
|
$this->copy_slot_values($prev_node_id, $node_id, *chart_id, "all"); |
|
$diacritic = lc $char_name; |
|
$diacritic =~ s/^.*(?:COMBINING CYRILLIC|COMBINING|SIGN)\s+//i; |
|
$diacritic =~ s/^.*(ACCENT|TONE)/$1/i; |
|
$diacritic =~ s/^\s*//; |
|
$this->set_node_id_slot_value($node_id, "diacritic", $diacritic, *chart_ht); |
|
|
|
} else { |
|
$this->add_node("", $i, $i+1, *chart_ht, "", "unexpected-diacritic"); |
|
} |
|
|
|
|
|
} elsif ($char_name) { |
|
if (defined($romanized_char = $this->romanize_char_at_position($i, $lang_code, $output_style, *ht, *chart_ht))) { |
|
|
|
print STDOUT "ROM l.$line_number/$i: $romanized_char\n" if $verbosePM; |
|
|
|
|
|
if ($romanized_char eq "\"\"") { |
|
$this->add_node("", $i, $i+1, *chart_ht, "", "empty-string-mapping"); |
|
|
|
|
|
|
|
} elsif (($romanized_char =~ /^(character|lengthener|modifier)/)) { |
|
$this->add_node($char, $i, $i+1, *chart_ht, "", "nevermind-keep-original"); |
|
|
|
|
|
} elsif (($romanized_char =~ /^\+(H|M|N|NG)$/i) |
|
&& ($prev_script eq $current_script) |
|
&& ($ht{SCRIPT_ABUDIGA_DEFAULT_VOWEL}->{$current_script}->{"a"})) { |
|
my $core_suffix = $romanized_char; |
|
$core_suffix =~ s/^\+//; |
|
if ($prev_char_roman =~ /[aeiou]$/i) { |
|
$this->add_node($core_suffix, $i, $i+1, *chart_ht, "", "syllable-end-consonant"); |
|
} else { |
|
$this->add_node(join("", $prev_char_roman, "a", $core_suffix), $prev_node_start, $i+1, *chart_ht, "", "syllable-end-consonant-with-added-a"); |
|
$this->add_node(join("", "a", $core_suffix), $i, $i+1, *chart_ht, "backup", "syllable-end-consonant"); |
|
} |
|
|
|
|
|
} elsif ($char_name =~ /(?:HIRAGANA|KATAKANA) LETTER SMALL Y/) { |
|
if (($prev_script eq $current_script) |
|
&& (($prev_char_roman_consonant) = ($prev_char_roman =~ /^(.*[bcdfghjklmnpqrstvwxyz])i$/i))) { |
|
unless ($this->get_node_for_span_and_type($prev_node_start, $i+1, *chart_ht, "")) { |
|
$this->add_node("$prev_char_roman_consonant$romanized_char", $prev_node_start, $i+1, *chart_ht, "", "japanese-contraction"); |
|
} |
|
} else { |
|
$this->add_node($romanized_char, $i, $i+1, *chart_ht, "", "unexpected-japanese-contraction-character"); |
|
} |
|
} elsif (($prev_script =~ /^(HIRAGANA|KATAKANA)$/i) |
|
&& ($char_name eq "KATAKANA-HIRAGANA PROLONGED SOUND MARK") |
|
&& (($prev_char_roman_vowel) = ($prev_char_roman =~ /([aeiou])$/i))) { |
|
$this->add_node("$prev_char_roman$prev_char_roman_vowel", $prev_node_start, $i+1, *chart_ht, "", "japanese-vowel-lengthening"); |
|
} elsif (($current_script =~ /^(Hiragana|Katakana)$/i) |
|
&& ($char_name =~ /^(HIRAGANA|KATAKANA) LETTER SMALL TU$/i) |
|
&& ($next_script eq $current_script) |
|
&& ($romanized_next_char = $this->romanize_char_at_position_incl_multi($i+1, $lang_code, $output_style, *ht, *chart_ht)) |
|
&& (($doubled_consonant) = ($romanized_next_char =~ /^(ch|[bcdfghjklmnpqrstwz])/i))) { |
|
|
|
|
|
$doubled_consonant = "t" if $doubled_consonant eq "ch"; |
|
$this->add_node($doubled_consonant, $i, $i+1, *chart_ht, "", "japanese-consonant-doubling"); |
|
|
|
|
|
} elsif (($char_name eq "GREEK SMALL LETTER MU") |
|
&& (! ($prev_script =~ /^GREEK$/)) |
|
&& ($i < $#chars) |
|
&& ($chart_ht{ORIG_CHAR}->{($i+1)} =~ /^[cfgjlmstv]$/i)) { |
|
$this->add_node("\xC2\xB5", $i, $i+1, *chart_ht, "", "greek-mu-to-micro-sign"); |
|
|
|
|
|
} elsif (($current_script eq "Gurmukhi") |
|
&& ($char_name eq "GURMUKHI ADDAK")) { |
|
if (($next_script eq $current_script) |
|
&& ($romanized_next_char = $this->romanize_char_at_position_incl_multi($i+1, $lang_code, $output_style, *ht, *chart_ht)) |
|
&& (($doubled_consonant) = ($romanized_next_char =~ /^([bcdfghjklmnpqrstvwxz])/i))) { |
|
$this->add_node($doubled_consonant, $i, $i+1, *chart_ht, "", "gurmukhi-consonant-doubling"); |
|
} else { |
|
$this->add_node("'", $i, $i+1, *chart_ht, "", "gurmukhi-unexpected-addak"); |
|
} |
|
|
|
|
|
} elsif ($subjoined_char_p |
|
&& ($prev_script eq $current_script) |
|
&& (($prev_char_roman_consonant, $prev_char_roman_vowel) = ($prev_char_roman =~ /^(.*[bcdfghjklmnpqrstvwxyz])([aeiou]+)$/i)) |
|
&& ($ht{SCRIPT_ABUDIGA_DEFAULT_VOWEL}->{$current_script}->{(lc $prev_char_roman_vowel)})) { |
|
my $new_roman = "$prev_char_roman_consonant$romanized_char"; |
|
$this->add_node($new_roman, $prev_node_start, $i+1, *chart_ht, "", "subjoined-character"); |
|
|
|
|
|
|
|
} elsif (($char_name =~ /THAI CHARACTER/) |
|
&& ($prev_script eq $current_script) |
|
&& ($chart_ht{CHAR_SYLLABLE_INFO}->{($i-1)} =~ /written-pre-consonant-spoken-post-consonant/i) |
|
&& ($prev_char_roman =~ /^[aeiou]+$/i) |
|
&& ($romanized_char =~ /^[bcdfghjklmnpqrstvwxyz]/)) { |
|
$this->add_node("$romanized_char$prev_char_roman", $prev_node_start, $i+1, *chart_ht, "", "thai-vowel-consonant-swap"); |
|
|
|
|
|
} elsif ($char_name eq "THAI CHARACTER O ANG") { |
|
if ($prev_script ne $current_script) { |
|
$this->add_node("", $i, $i+1, *chart_ht, "", "thai-initial-o-ang-drop"); |
|
} elsif ($next_script ne $current_script) { |
|
$this->add_node("", $i, $i+1, *chart_ht, "", "thai-final-o-ang-drop"); |
|
} else { |
|
my $romanized_next_char = $this->romanize_char_at_position($i+1, $lang_code, $output_style, *ht, *chart_ht); |
|
my $romanized_prev2_char = $this->romanize_char_at_position($i-2, $lang_code, $output_style, *ht, *chart_ht); |
|
if (($prev_char_roman =~ /^[bcdfghjklmnpqrstvwxz]+$/i) |
|
&& ($romanized_next_char =~ /^[bcdfghjklmnpqrstvwxz]+$/i)) { |
|
$this->add_node("o", $i, $i+1, *chart_ht, "", "thai-middle-o-ang"); |
|
} elsif (($prev2_script eq $current_script) |
|
&& 0 |
|
&& ($prev_char_name =~ /^THAI CHARACTER MAI [A-Z]+$/) |
|
&& ($romanized_prev2_char =~ /^[bcdfghjklmnpqrstvwxz]+$/i) |
|
&& ($romanized_next_char =~ /^[bcdfghjklmnpqrstvwxz]+$/i)) { |
|
$this->add_node("o", $i, $i+1, *chart_ht, "", "thai-middle-o-ang"); |
|
} else { |
|
$this->add_node("", $i, $i+1, *chart_ht, "", "thai-middle-o-ang-drop"); |
|
} |
|
} |
|
|
|
|
|
} elsif ($romanized_char =~ /\s/) { |
|
$this->add_node($char, $i, $i+1, *chart_ht, "", "space"); |
|
|
|
|
|
} elsif ($current_script eq "Tibetan") { |
|
|
|
if ($subjoined_char_p |
|
&& ($prev_script eq $current_script) |
|
&& $prev_letter_plus_char_p |
|
&& ($prev_char_roman =~ /^[bcdfghjklmnpqrstvwxyz]+$/i)) { |
|
$this->add_node("$prev_char_roman$romanized_char", $prev_node_start, $i+1, *chart_ht, "", "subjoined-tibetan-character"); |
|
} elsif ($romanized_char =~ /^-A$/i) { |
|
my $romanized_next_char = $this->romanize_char_at_position($i+1, $lang_code, $output_style, *ht, *chart_ht); |
|
if (! $prev_letter_plus_char_p) { |
|
$this->add_node("'", $i, $i+1, *chart_ht, "", "tibetan-frontal-dash-a"); |
|
} elsif (($prev_script eq $current_script) |
|
&& ($next_script eq $current_script) |
|
&& ($prev_char_roman =~ /[bcdfghjklmnpqrstvwxyz]$/) |
|
&& ($romanized_next_char =~ /^[aeiou]/)) { |
|
$this->add_node("a'", $i, $i+1, *chart_ht, "", "tibetan-medial-dash-a"); |
|
} elsif (($prev_script eq $current_script) |
|
&& ($next_script eq $current_script) |
|
&& ($prev_char_roman =~ /[aeiou]$/) |
|
&& ($romanized_next_char =~ /[aeiou]/)) { |
|
$this->add_node("'", $i, $i+1, *chart_ht, "", "tibetan-reduced-medial-dash-a"); |
|
} elsif (($prev_script eq $current_script) |
|
&& (! ($prev_char_roman =~ /[aeiou]/)) |
|
&& (! $next_letter_plus_char_p)) { |
|
$this->add_node("a", $i, $i+1, *chart_ht, "", "tibetan-final-dash-a"); |
|
} else { |
|
$this->add_node("a", $i, $i+1, *chart_ht, "", "unexpected-tibetan-dash-a"); |
|
} |
|
} elsif (($romanized_char =~ /^[AEIOU]/i) |
|
&& ($prev_script eq $current_script) |
|
&& ($prev_char_roman =~ /^A$/i) |
|
&& (! $prev2_letter_plus_char_p)) { |
|
$this->add_node($romanized_char, $prev_node_start, $i+1, *chart_ht, "", "tibetan-dropped-word-initial-a"); |
|
} else { |
|
$this->add_node($romanized_char, $i, $i+1, *chart_ht, "", "standard-unicode-based-romanization"); |
|
} |
|
|
|
|
|
} elsif (($current_script eq "Khmer") |
|
&& (($char_roman_consonant, $char_roman_vowel) = ($romanized_char =~ /^(.*[bcdfghjklmnpqrstvwxyz])([ao]+)-$/i))) { |
|
my $romanized_next_char = $this->romanize_char_at_position($i+1, $lang_code, $output_style, *ht, *chart_ht); |
|
if (($next_script eq $current_script) |
|
&& ($romanized_next_char =~ /^[aeiouy]/i)) { |
|
$this->add_node($char_roman_consonant, $i, $i+1, *chart_ht, "", "khmer-vowel-drop"); |
|
} else { |
|
$this->add_node("$char_roman_consonant$char_roman_vowel", $i, $i+1, *chart_ht, "", "khmer-standard-unicode-based-romanization"); |
|
} |
|
|
|
|
|
} elsif ((@abudiga_default_vowels = sort keys %{$ht{SCRIPT_ABUDIGA_DEFAULT_VOWEL}->{$current_script}}) |
|
&& ($abudiga_default_vowel = $abudiga_default_vowels[0]) |
|
&& ($romanized_char =~ /^[bcdfghjklmnpqrstvwxyz]+$/i)) { |
|
my $new_roman = join("", $romanized_char, $abudiga_default_vowel); |
|
$this->add_node($new_roman, $i, $i+1, *chart_ht, "", "standard-unicode-based-romanization-plus-abudiga-default-vowel"); |
|
|
|
|
|
|
|
} else { |
|
$node_id = $this->add_node($romanized_char, $i, $i+1, *chart_ht, "", "standard-unicode-based-romanization"); |
|
} |
|
} else { |
|
$this->add_node($char, $i, $i+1, *chart_ht, "", "unexpected-original"); |
|
} |
|
} elsif (defined($romanized_char = $this->romanize_char_at_position($i, $lang_code, $output_style, *ht, *chart_ht)) |
|
&& ((length($romanized_char) <= 2) |
|
|| ($ht{UTF_TO_CHAR_ROMANIZATION}->{$char}))) { |
|
$romanized_char =~ s/^""$//; |
|
$this->add_node($romanized_char, $i, $i+1, *chart_ht, "", "romanized-without-character-name"); |
|
} else { |
|
$this->add_node($char, $i, $i+1, *chart_ht, "", "unexpected-original-without-character-name"); |
|
} |
|
} |
|
$i = $next_index; |
|
} |
|
|
|
$this->schwa_deletion(0, $n_characters, *chart_ht, $lang_code); |
|
$this->default_vowelize_tibetan(0, $n_characters, *chart_ht, $lang_code, $line_number) if $chart_ht{CHART_CONTAINS_SCRIPT}->{"Tibetan"}; |
|
$this->assemble_numbers_in_chart(*chart_ht, $line_number); |
|
|
|
if ($return_chart_p) { |
|
} elsif ($return_offset_mappings_p) { |
|
($result, $offset_mappings, $new_char_offset, $new_rom_char_offset) = $this->best_romanized_string(0, $n_characters, *chart_ht, $control, $initial_char_offset, $initial_rom_char_offset); |
|
} else { |
|
$result = $this->best_romanized_string(0, $n_characters, *chart_ht) unless $return_chart_p; |
|
} |
|
|
|
if ($verbosePM) { |
|
my $logfile = "/nfs/isd/ulf/cgi-mt/amr-tmp/uroman-log.txt"; |
|
$util->append_to_file($logfile, $log) if $log && (-r $logfile); |
|
} |
|
|
|
return ($result, $offset_mappings) if $return_offset_mappings_p; |
|
return *chart_ht if $return_chart_p; |
|
return $result; |
|
} |
|
|
|
sub string_to_json_string { |
|
local($this, $s) = @_; |
|
|
|
utf8::decode($s); |
|
my $j = JSON->new->utf8->encode([$s]); |
|
$j =~ s/^\[(.*)\]$/$1/; |
|
return $j; |
|
} |
|
|
|
sub chart_to_json_romanization_elements { |
|
local($this, $chart_start, $chart_end, *chart_ht, $line_number) = @_; |
|
|
|
my $result = ""; |
|
my $start = $chart_start; |
|
my $end; |
|
while ($start < $chart_end) { |
|
$end = $this->find_end_of_rom_segment($start, $chart_end, *chart_ht); |
|
my @best_romanizations; |
|
if (($end && ($start < $end)) |
|
&& (@best_romanizations = $this->best_romanizations($start, $end, *chart_ht))) { |
|
$orig_segment = $this->orig_string_at_span($start, $end, *chart_ht); |
|
$next_start = $end; |
|
} else { |
|
$orig_segment = $chart_ht{ORIG_CHAR}->{$start}; |
|
@best_romanizations = ($orig); |
|
$next_start = $start + 1; |
|
} |
|
$exclusive_end = $end - 1; |
|
|
|
$guarded_orig = $this->string_to_json_string($orig_segment); |
|
$result .= " { \"line\": $line_number, \"start\": $start, \"end\": $exclusive_end, \"orig\": $guarded_orig, \"roms\": ["; |
|
foreach $i ((0 .. $#best_romanizations)) { |
|
my $rom = $best_romanizations[$i]; |
|
|
|
my $guarded_rom = $this->string_to_json_string($rom); |
|
$result .= " { \"rom\": $guarded_rom"; |
|
|
|
$result .= " }"; |
|
$result .= "," if $i < $#best_romanizations; |
|
} |
|
$result .= " ] },\n"; |
|
$start = $next_start; |
|
} |
|
return $result; |
|
} |
|
|
|
sub default_vowelize_tibetan { |
|
local($this, $chart_start, $chart_end, *chart_ht, $lang_code, $line_number) = @_; |
|
|
|
|
|
|
|
my $token_start = $chart_start; |
|
my $next_token_start = $chart_start; |
|
while (($token_start = $next_token_start) < $chart_end) { |
|
$next_token_start = $token_start + 1; |
|
|
|
next unless $chart_ht{CHAR_LETTER_PLUS}->{$token_start}; |
|
my $current_script = $chart_ht{CHAR_SCRIPT}->{$token_start}; |
|
next unless ($current_script eq "Tibetan"); |
|
my $token_end = $chart_ht{LETTER_TOKEN_SEGMENT_START_TO_END}->{$token_start}; |
|
next unless $token_end; |
|
next unless $token_end > $token_start; |
|
$next_token_start = $token_end; |
|
|
|
my $start = $token_start; |
|
my $end; |
|
my @node_ids = (); |
|
while ($start < $token_end) { |
|
$end = $this->find_end_of_rom_segment($start, $chart_end, *chart_ht); |
|
last unless $end && ($end > $start); |
|
my @alt_node_ids = sort { $a <=> $b } keys %{$chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end}}; |
|
last unless @alt_node_ids; |
|
push(@node_ids, $alt_node_ids[0]); |
|
$start = $end; |
|
} |
|
my $contains_vowel_p = 0; |
|
my @romanizations = (); |
|
foreach $node_id (@node_ids) { |
|
my $roman = $chart_ht{NODE_ROMAN}->{$node_id}; |
|
$roman = "" unless defined($roman); |
|
push(@romanizations, $roman); |
|
$contains_vowel_p = 1 if $roman =~ /[aeiou]/i; |
|
} |
|
|
|
unless ($contains_vowel_p) { |
|
my $default_vowel_target_index; |
|
if ($#node_ids <= 1) { |
|
$default_vowel_target_index = 0; |
|
} elsif ($romanizations[$#romanizations] eq "s") { |
|
if ($romanizations[($#romanizations-1)] eq "y") { |
|
$default_vowel_target_index = $#romanizations-1; |
|
} else { |
|
$default_vowel_target_index = $#romanizations-2; |
|
} |
|
} else { |
|
$default_vowel_target_index = $#romanizations-1; |
|
} |
|
$romanizations[$default_vowel_target_index] .= "a"; |
|
my $old_node_id = $node_ids[$default_vowel_target_index]; |
|
my $old_start = $chart_ht{NODE_START}->{$old_node_id}; |
|
my $old_end = $chart_ht{NODE_END}->{$old_node_id}; |
|
my $old_roman = $chart_ht{NODE_ROMAN}->{$old_node_id}; |
|
my $new_roman = $old_roman . "a"; |
|
my $new_node_id = $this->add_node($new_roman, $old_start, $old_end, *chart_ht, "", "tibetan-default-vowel"); |
|
$this->copy_slot_values($old_node_id, $new_node_id, *chart_id, "all"); |
|
$chart_ht{NODE_TYPE}->{$old_node_id} = "backup"; |
|
} |
|
if (($romanizations[0] eq "'") |
|
&& ($#romanizations >= 1) |
|
&& ($romanizations[1] =~ /^[o]$/)) { |
|
my $old_node_id = $node_ids[0]; |
|
my $old_start = $chart_ht{NODE_START}->{$old_node_id}; |
|
my $old_end = $chart_ht{NODE_END}->{$old_node_id}; |
|
my $new_node_id = $this->add_node("", $old_start, $old_end, *chart_ht, "", "tibetan-delete-apostrophe"); |
|
$this->copy_slot_values($old_node_id, $new_node_id, *chart_id, "all"); |
|
$chart_ht{NODE_TYPE}->{$old_node_id} = "alt"; |
|
} |
|
if (($#node_ids >= 1) |
|
&& ($romanizations[$#romanizations] =~ /^[bcdfghjklmnpqrstvwxz]+y$/)) { |
|
my $old_node_id = $node_ids[$#romanizations]; |
|
my $old_start = $chart_ht{NODE_START}->{$old_node_id}; |
|
my $old_end = $chart_ht{NODE_END}->{$old_node_id}; |
|
my $old_roman = $chart_ht{NODE_ROMAN}->{$old_node_id}; |
|
my $new_roman = $old_roman . "a"; |
|
my $new_node_id = $this->add_node($new_roman, $old_start, $old_end, *chart_ht, "", "tibetan-syllable-final-vowel"); |
|
$this->copy_slot_values($old_node_id, $new_node_id, *chart_id, "all"); |
|
$chart_ht{NODE_TYPE}->{$old_node_id} = "alt"; |
|
} |
|
foreach $old_node_id (@node_ids) { |
|
my $old_roman = $chart_ht{NODE_ROMAN}->{$old_node_id}; |
|
next unless $old_roman =~ /-a/; |
|
my $old_start = $chart_ht{NODE_START}->{$old_node_id}; |
|
my $old_end = $chart_ht{NODE_END}->{$old_node_id}; |
|
my $new_roman = $old_roman; |
|
$new_roman =~ s/-a/a/; |
|
my $new_node_id = $this->add_node($new_roman, $old_start, $old_end, *chart_ht, "", "tibetan-syllable-delete-dash"); |
|
$this->copy_slot_values($old_node_id, $new_node_id, *chart_id, "all"); |
|
$chart_ht{NODE_TYPE}->{$old_node_id} = "alt"; |
|
} |
|
} |
|
} |
|
|
|
sub schwa_deletion { |
|
local($this, $chart_start, $chart_end, *chart_ht, $lang_code) = @_; |
|
|
|
|
|
|
|
if ($chart_ht{CHART_CONTAINS_SCRIPT}->{"Devanagari"}) { |
|
my $script_start = $chart_start; |
|
my $next_script_start = $chart_start; |
|
while (($script_start = $next_script_start) < $chart_end) { |
|
$next_script_start = $script_start + 1; |
|
|
|
my $current_script = $chart_ht{CHAR_SCRIPT}->{$script_start}; |
|
next unless ($current_script eq "Devanagari"); |
|
my $script_end = $chart_ht{SCRIPT_SEGMENT_START_TO_END}->{$script_start}; |
|
next unless $script_end; |
|
next unless $script_end - $script_start >= 2; |
|
$next_script_start = $script_end; |
|
my $end_node_id = $this->get_node_for_span($script_end-1, $script_end, *chart_ht); |
|
next unless $end_node_id; |
|
my $end_roman = $chart_ht{NODE_ROMAN}->{$end_node_id}; |
|
next unless ($end_consonant) = ($end_roman =~ /^([bcdfghjklmnpqrstvwxz]+)a$/i); |
|
my $prev_node_id = $this->get_node_for_span($script_end-4, $script_end-1, *chart_ht) |
|
|| $this->get_node_for_span($script_end-3, $script_end-1, *chart_ht) |
|
|| $this->get_node_for_span($script_end-2, $script_end-1, *chart_ht); |
|
next unless $prev_node_id; |
|
my $prev_roman = $chart_ht{NODE_ROMAN}->{$prev_node_id}; |
|
next unless $prev_roman =~ /[aeiou]/i; |
|
|
|
|
|
$chart_ht{NODE_TYPE}->{$end_node_id} = "alt"; |
|
|
|
$this->add_node($end_consonant, $script_end-1, $script_end, *chart_ht, "", "devanagari-with-deleted-final-schwa"); |
|
} |
|
} |
|
} |
|
|
|
sub best_romanized_string { |
|
local($this, $chart_start, $chart_end, *chart_ht, $control, $orig_char_offset, $rom_char_offset) = @_; |
|
|
|
$control = "" unless defined($control); |
|
my $current_orig_char_offset = $orig_char_offset || 0; |
|
my $current_rom_char_offset = $rom_char_offset || 0; |
|
my $return_offset_mappings_p = ($control =~ /\breturn offset mappings\b/); |
|
my $result = ""; |
|
my $start = $chart_start; |
|
my $end; |
|
my @char_offsets = ("$current_orig_char_offset:$current_rom_char_offset"); |
|
while ($start < $chart_end) { |
|
$end = $this->find_end_of_rom_segment($start, $chart_end, *chart_ht); |
|
my $n_orig_chars_in_segment = 0; |
|
my $n_rom_chars_in_segment = 0; |
|
if ($end && ($start < $end)) { |
|
my @best_romanizations = $this->best_romanizations($start, $end, *chart_ht); |
|
my $best_romanization = (@best_romanizations) ? $best_romanizations[0] : undef; |
|
if (defined($best_romanization)) { |
|
$result .= $best_romanization; |
|
if ($return_offset_mappings_p) { |
|
$n_orig_chars_in_segment = $end-$start; |
|
$n_rom_chars_in_segment = $utf8->length_in_utf8_chars($best_romanization); |
|
} |
|
$start = $end; |
|
} else { |
|
my $best_romanization = $chart_ht{ORIG_CHAR}->{$start}; |
|
$result .= $best_romanization; |
|
$start++; |
|
if ($return_offset_mappings_p) { |
|
$n_orig_chars_in_segment = 1; |
|
$n_rom_chars_in_segment = $utf8->length_in_utf8_chars($best_romanization); |
|
} |
|
} |
|
} else { |
|
my $best_romanization = $chart_ht{ORIG_CHAR}->{$start}; |
|
$result .= $best_romanization; |
|
$start++; |
|
if ($return_offset_mappings_p) { |
|
$n_orig_chars_in_segment = 1; |
|
$n_rom_chars_in_segment = $utf8->length_in_utf8_chars($best_romanization); |
|
} |
|
} |
|
if ($return_offset_mappings_p) { |
|
my $new_orig_char_offset = $current_orig_char_offset + $n_orig_chars_in_segment; |
|
my $new_rom_char_offset = $current_rom_char_offset + $n_rom_chars_in_segment; |
|
my $offset_mapping = "$new_orig_char_offset:$new_rom_char_offset"; |
|
push(@char_offsets, $offset_mapping); |
|
$current_orig_char_offset = $new_orig_char_offset; |
|
$current_rom_char_offset = $new_rom_char_offset; |
|
} |
|
} |
|
return ($result, join(",", @char_offsets), $current_orig_char_offset, $current_rom_char_offset) if $return_offset_mappings_p; |
|
return $result; |
|
} |
|
|
|
sub orig_string_at_span { |
|
local($this, $start, $end, *chart_ht) = @_; |
|
|
|
my $result = ""; |
|
foreach $i (($start .. ($end-1))) { |
|
$result .= $chart_ht{ORIG_CHAR}->{$i}; |
|
} |
|
return $result; |
|
} |
|
|
|
sub find_end_of_rom_segment { |
|
local($this, $start, $chart_end, *chart_ht) = @_; |
|
|
|
my @ends = sort { $a <=> $b } keys %{$chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}}; |
|
my $end_index = $#ends; |
|
while (($end_index >= 0) && ($ends[$end_index] > $chart_end)) { |
|
$end_index--; |
|
} |
|
if (($end_index >= 0) |
|
&& defined($end = $ends[$end_index]) |
|
&& ($start < $end)) { |
|
return $end; |
|
} else { |
|
return ""; |
|
} |
|
} |
|
|
|
sub best_romanizations { |
|
local($this, $start, $end, *chart_ht) = @_; |
|
|
|
@regular_romanizations = (); |
|
@alt_romanizations = (); |
|
@backup_romanizations = (); |
|
|
|
foreach $node_id (sort { $a <=> $b } keys %{$chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end}}) { |
|
my $type = $chart_ht{NODE_TYPE}->{$node_id}; |
|
my $roman = $chart_ht{NODE_ROMAN}->{$node_id}; |
|
if (! defined($roman)) { |
|
|
|
} elsif (($type eq "backup") && ! defined($backup_romanization)) { |
|
push(@backup_romanizations, $roman) unless $util->member($roman, @backup_romanizations); |
|
} elsif (($type eq "alt") && ! defined($alt_romanization)) { |
|
push(@alt_romanizations, $roman) unless $util->member($roman, @alt_romanizations); |
|
} else { |
|
push(@regular_romanizations, $roman) unless $util->member($roman, @regular_romanizations); |
|
} |
|
} |
|
@regular_alt_romanizations = sort @regular_romanizations; |
|
foreach $alt_romanization (sort @alt_romanizations) { |
|
push(@regular_alt_romanizations, $alt_romanization) unless $util->member($alt_romanization, @regular_alt_romanizations); |
|
} |
|
return @regular_alt_romanizations if @regular_alt_romanizations; |
|
return sort @backup_romanizations; |
|
} |
|
|
|
sub join_alt_romanizations_for_viz { |
|
local($this, @list) = @_; |
|
|
|
my @viz_romanizations = (); |
|
|
|
foreach $alt_rom (@list) { |
|
if ($alt_rom eq "") { |
|
push(@viz_romanizations, "-"); |
|
} else { |
|
push(@viz_romanizations, $alt_rom); |
|
} |
|
} |
|
return join(", ", @viz_romanizations); |
|
} |
|
|
|
sub markup_orig_rom_strings { |
|
local($this, $chart_start, $chart_end, *ht, *chart_ht, *pinyin_ht, $last_group_id_index) = @_; |
|
|
|
my $marked_up_rom = ""; |
|
my $marked_up_orig = ""; |
|
my $start = $chart_start; |
|
my $end; |
|
while ($start < $chart_end) { |
|
my $segment_start = $start; |
|
my $segment_end = $start+1; |
|
my $end = $this->find_end_of_rom_segment($start, $chart_end, *chart_ht); |
|
my $rom_segment = ""; |
|
my $orig_segment = ""; |
|
my $rom_title = ""; |
|
my $orig_title = ""; |
|
my $contains_alt_romanizations = 0; |
|
if ($end) { |
|
$segment_end = $end; |
|
my @best_romanizations = $this->best_romanizations($start, $end, *chart_ht); |
|
my $best_romanization = (@best_romanizations) ? $best_romanizations[0] : undef; |
|
if (defined($best_romanization)) { |
|
$rom_segment .= $best_romanization; |
|
$orig_segment .= $this->orig_string_at_span($start, $end, *chart_ht); |
|
$segment_end = $end; |
|
if ($#best_romanizations >= 1) { |
|
$rom_title .= $util->guard_html("Alternative romanizations: " . $this->join_alt_romanizations_for_viz(@best_romanizations) . "\n"); |
|
$contains_alt_romanizations = 1; |
|
} |
|
} else { |
|
my $segment = $this->orig_string_at_span($start, $start+1, *chart_ht); |
|
$rom_segment .= $segment; |
|
$orig_segment .= $segment; |
|
$segment_end = $start+1; |
|
} |
|
$start = $segment_end; |
|
} else { |
|
$rom_segment .= $chart_ht{ORIG_CHAR}->{$start}; |
|
$orig_segment .= $this->orig_string_at_span($start, $start+1, *chart_ht); |
|
$segment_end = $start+1; |
|
$start = $segment_end; |
|
} |
|
my $next_char = $chart_ht{ORIG_CHAR}->{$segment_end}; |
|
my $next_char_is_combining_p = $this->char_is_combining_char($next_char, *ht); |
|
while ($next_char_is_combining_p |
|
&& ($segment_end < $chart_end) |
|
&& ($end = $this->find_end_of_rom_segment($segment_end, $chart_end, *chart_ht)) |
|
&& ($end > $segment_end) |
|
&& (@best_romanizations = $this->best_romanizations($segment_end, $end, *chart_ht)) |
|
&& defined($best_romanization = $best_romanizations[0])) { |
|
$orig_segment .= $this->orig_string_at_span($segment_end, $end, *chart_ht); |
|
$rom_segment .= $best_romanization; |
|
if ($#best_romanizations >= 1) { |
|
$rom_title .= $util->guard_html("Alternative romanizations: " . $this->join_alt_romanizations_for_viz(@best_romanizations) . "\n"); |
|
$contains_alt_romanizations = 1; |
|
} |
|
$segment_end = $end; |
|
$start = $segment_end; |
|
$next_char = $chart_ht{ORIG_CHAR}->{$segment_end}; |
|
$next_char_is_combining_p = $this->char_is_combining_char($next_char, *ht); |
|
} |
|
foreach $i (($segment_start .. ($segment_end-1))) { |
|
$orig_title .= "+‎ ‎" unless $orig_title eq ""; |
|
my $char = $chart_ht{ORIG_CHAR}->{$i}; |
|
my $numeric = $ht{UTF_TO_NUMERIC}->{$char}; |
|
$numeric = "" unless defined($numeric); |
|
my $pic_descr = $ht{UTF_TO_PICTURE_DESCR}->{$char}; |
|
$pic_descr = "" unless defined($pic_descr); |
|
if ($char =~ /^\xE4\xB7[\x80-\xBF]$/) { |
|
$orig_title .= "$char_name\n"; |
|
} elsif (($char =~ /^[\xE3-\xE9][\x80-\xBF]{2,2}$/) && $chinesePM->string_contains_utf8_cjk_unified_ideograph_p($char)) { |
|
my $unicode = $utf8->utf8_to_unicode($char); |
|
$orig_title .= "CJK Unified Ideograph U+" . (uc sprintf("%04x", $unicode)) . "\n"; |
|
$orig_title .= "Chinese: $tonal_translit\n" if $tonal_translit = $chinesePM->tonal_pinyin($char, *pinyin_ht, ""); |
|
$orig_title .= "Number: $numeric\n" if $numeric =~ /\d/; |
|
} elsif ($char_name = $ht{UTF_TO_CHAR_NAME}->{$char}) { |
|
$orig_title .= "$char_name\n"; |
|
$orig_title .= "Number: $numeric\n" if $numeric =~ /\d/; |
|
$orig_title .= "Picture: $pic_descr\n" if $pic_descr =~ /\S/; |
|
} else { |
|
my $unicode = $utf8->utf8_to_unicode($char); |
|
if (($unicode >= 0xAC00) && ($unicode <= 0xD7A3)) { |
|
$orig_title .= "Hangul syllable U+" . (uc sprintf("%04x", $unicode)) . "\n"; |
|
} else { |
|
$orig_title .= "Unicode character U+" . (uc sprintf("%04x", $unicode)) . "\n"; |
|
} |
|
} |
|
} |
|
(@non_ascii_roms) = ($rom_segment =~ /([\xC0-\xFF][\x80-\xBF]*)/g); |
|
foreach $char (@non_ascii_roms) { |
|
my $char_name = $ht{UTF_TO_CHAR_NAME}->{$char}; |
|
my $unicode = $utf8->utf8_to_unicode($char); |
|
my $unicode_s = "U+" . (uc sprintf("%04x", $unicode)); |
|
if ($char_name) { |
|
$rom_title .= "$char_name\n"; |
|
} else { |
|
$rom_title .= "$unicode_s\n"; |
|
} |
|
} |
|
$last_group_id_index++; |
|
$rom_title =~ s/\s*$//; |
|
$rom_title =~ s/\n/
/g; |
|
$orig_title =~ s/\s*$//; |
|
$orig_title =~ s/\n/
‎/g; |
|
$orig_title = "‭" . $orig_title . "‬"; |
|
my $rom_title_clause = ($rom_title eq "") ? "" : " title=\"$rom_title\""; |
|
my $orig_title_clause = ($orig_title eq "") ? "" : " title=\"$orig_title\""; |
|
my $alt_rom_clause = ($contains_alt_romanizations) ? "border-bottom:1px dotted;" : ""; |
|
$marked_up_rom .= "<span id=\"span-$last_group_id_index-1\" onmouseover=\"highlight_elems('span-$last_group_id_index','1');\" onmouseout=\"highlight_elems('span-$last_group_id_index','0');\" style=\"color:#00BB00;$alt_rom_clause\"$rom_title_clause>" . $util->guard_html($rom_segment) . "<\/span>"; |
|
$marked_up_orig .= "<span id=\"span-$last_group_id_index-2\" onmouseover=\"highlight_elems('span-$last_group_id_index','1');\" onmouseout=\"highlight_elems('span-$last_group_id_index','0');\"$orig_title_clause>" . $util->guard_html($orig_segment) . "<\/span>"; |
|
if (($last_char = $chart_ht{ORIG_CHAR}->{($segment_end-1)}) |
|
&& ($last_char_name = $ht{UTF_TO_CHAR_NAME}->{$last_char}) |
|
&& ($last_char_name =~ /^(FULLWIDTH COLON|FULLWIDTH COMMA|FULLWIDTH RIGHT PARENTHESIS|IDEOGRAPHIC COMMA|IDEOGRAPHIC FULL STOP|RIGHT CORNER BRACKET|BRAILLE PATTERN BLANK|TIBETAN MARK .*)$/)) { |
|
$marked_up_orig .= "<wbr>"; |
|
$marked_up_rom .= "<wbr>"; |
|
} |
|
} |
|
return ($marked_up_rom, $marked_up_orig, $last_group_id_index); |
|
} |
|
|
|
sub romanizations_with_alternatives { |
|
local($this, *ht, *chart_ht, *pinyin_ht, $chart_start, $chart_end) = @_; |
|
|
|
$chart_start = 0 unless defined($chart_start); |
|
$chart_end = $chart_ht{N_CHARS} unless defined($chart_end); |
|
my $result = ""; |
|
my $start = $chart_start; |
|
my $end; |
|
|
|
while ($start < $chart_end) { |
|
my $segment_start = $start; |
|
my $segment_end = $start+1; |
|
my $end = $this->find_end_of_rom_segment($start, $chart_end, *chart_ht); |
|
my $rom_segment = ""; |
|
|
|
if ($end) { |
|
$segment_end = $end; |
|
my @best_romanizations = $this->best_romanizations($start, $end, *chart_ht); |
|
|
|
if (@best_romanizations) { |
|
if ($#best_romanizations == 0) { |
|
$rom_segment .= $best_romanizations[0]; |
|
} else { |
|
$rom_segment .= "{" . join("|", @best_romanizations) . "}"; |
|
} |
|
$segment_end = $end; |
|
} else { |
|
my $segment = $this->orig_string_at_span($start, $start+1, *chart_ht); |
|
$rom_segment .= $segment; |
|
$segment_end = $start+1; |
|
} |
|
$start = $segment_end; |
|
} else { |
|
$rom_segment .= $chart_ht{ORIG_CHAR}->{$start}; |
|
$segment_end = $start+1; |
|
$start = $segment_end; |
|
} |
|
|
|
$result .= $rom_segment; |
|
} |
|
return $result; |
|
} |
|
|
|
sub quick_romanize { |
|
local($this, $s, $lang_code, *ht) = @_; |
|
|
|
my $result = ""; |
|
my @chars = $utf8->split_into_utf8_characters($s, "return only chars", *empty_ht); |
|
while (@chars) { |
|
my $found_match_in_table_p = 0; |
|
foreach $string_length (reverse(1..4)) { |
|
next if ($string_length-1) > $#chars; |
|
$multi_char_substring = join("", @chars[0..($string_length-1)]); |
|
my @mappings = keys %{$ht{UTF_CHAR_MAPPING_LANG_SPEC}->{$lang_code}->{$multi_char_substring}}; |
|
@mappings = keys %{$ht{UTF_CHAR_MAPPING}->{$multi_char_substring}} unless @mappings; |
|
if (@mappings) { |
|
my $mapping = $mappings[0]; |
|
$result .= $mapping; |
|
foreach $_ ((1 .. $string_length)) { |
|
shift @chars; |
|
} |
|
$found_match_in_table_p = 1; |
|
last; |
|
} |
|
} |
|
unless ($found_match_in_table_p) { |
|
$result .= $chars[0]; |
|
shift @chars; |
|
} |
|
} |
|
return $result; |
|
} |
|
|
|
sub char_is_combining_char { |
|
local($this, $c, *ht) = @_; |
|
|
|
return 0 unless $c; |
|
my $category = $ht{UTF_TO_CAT}->{$c}; |
|
return 0 unless $category; |
|
return $category =~ /^M/; |
|
} |
|
|
|
sub mark_up_string_for_mouse_over { |
|
local($this, $s, *ht, $control, *pinyin_ht) = @_; |
|
|
|
$control = "" unless defined($control); |
|
$no_ascii_p = ($control =~ /NO-ASCII/); |
|
my $result = ""; |
|
@chars = $utf8->split_into_utf8_characters($s, "return only chars", *empty_ht); |
|
while (@chars) { |
|
$char = shift @chars; |
|
$numeric = $ht{UTF_TO_NUMERIC}->{$char}; |
|
$numeric = "" unless defined($numeric); |
|
$pic_descr = $ht{UTF_TO_PICTURE_DESCR}->{$char}; |
|
$pic_descr = "" unless defined($pic_descr); |
|
$next_char = ($#chars >= 0) ? $chars[0] : ""; |
|
$next_char_is_combining_p = $this->char_is_combining_char($next_char, *ht); |
|
if ($no_ascii_p |
|
&& ($char =~ /^[\x00-\x7F]*$/) |
|
&& ! $next_char_is_combining_p) { |
|
$result .= $util->guard_html($char); |
|
} elsif (($char =~ /^[\xE3-\xE9][\x80-\xBF]{2,2}$/) && $chinesePM->string_contains_utf8_cjk_unified_ideograph_p($char)) { |
|
$unicode = $utf8->utf8_to_unicode($char); |
|
$title = "CJK Unified Ideograph U+" . (uc sprintf("%04x", $unicode)); |
|
$title .= "
Chinese: $tonal_translit" if $tonal_translit = $chinesePM->tonal_pinyin($char, *pinyin_ht, ""); |
|
$title .= "
Number: $numeric" if $numeric =~ /\d/; |
|
$result .= "<span title=\"$title\">" . $util->guard_html($char) . "<\/span>"; |
|
} elsif ($char_name = $ht{UTF_TO_CHAR_NAME}->{$char}) { |
|
$title = $char_name; |
|
$title .= "
Number: $numeric" if $numeric =~ /\d/; |
|
$title .= "
Picture: $pic_descr" if $pic_descr =~ /\S/; |
|
$char_plus = $char; |
|
while ($next_char_is_combining_p) { |
|
|
|
$next_char_name = $ht{UTF_TO_CHAR_NAME}->{$next_char}; |
|
$title .= "
+ $next_char_name"; |
|
$char = shift @chars; |
|
$char_plus .= $char; |
|
$next_char = ($#chars >= 0) ? $chars[0] : ""; |
|
$next_char_is_combining_p = $this->char_is_combining_char($next_char, *ht); |
|
} |
|
$result .= "<span title=\"$title\">" . $util->guard_html($char_plus) . "<\/span>"; |
|
$result .= "<wbr>" if $char_name =~ /^(FULLWIDTH COLON|FULLWIDTH COMMA|FULLWIDTH RIGHT PARENTHESIS|IDEOGRAPHIC COMMA|IDEOGRAPHIC FULL STOP|RIGHT CORNER BRACKET)$/; |
|
} elsif (($unicode = $utf8->utf8_to_unicode($char)) |
|
&& ($unicode >= 0xAC00) && ($unicode <= 0xD7A3)) { |
|
$title = "Hangul syllable U+" . (uc sprintf("%04x", $unicode)); |
|
$result .= "<span title=\"$title\">" . $util->guard_html($char) . "<\/span>"; |
|
} else { |
|
$result .= $util->guard_html($char); |
|
} |
|
} |
|
return $result; |
|
} |
|
|
|
sub romanize_char_at_position_incl_multi { |
|
local($this, $i, $lang_code, $output_style, *ht, *chart_ht) = @_; |
|
|
|
my $char = $chart_ht{ORIG_CHAR}->{$i}; |
|
return "" unless defined($char); |
|
my @mappings = keys %{$ht{UTF_CHAR_MAPPING_LANG_SPEC}->{$lang_code}->{$char}}; |
|
return $mappings[0] if @mappings; |
|
@mappings = keys %{$ht{UTF_CHAR_MAPPING}->{$char}}; |
|
return $mappings[0] if @mappings; |
|
return $this->romanize_char_at_position($i, $lang_code, $output_style, *ht, *chart_ht); |
|
} |
|
|
|
sub romanize_char_at_position { |
|
local($this, $i, $lang_code, $output_style, *ht, *chart_ht) = @_; |
|
|
|
my $char = $chart_ht{ORIG_CHAR}->{$i}; |
|
return "" unless defined($char); |
|
return $char if $char =~ /^[\x00-\x7F]$/; |
|
my $romanization = $ht{UTF_TO_CHAR_ROMANIZATION}->{$char}; |
|
return $romanization if $romanization; |
|
my $char_name = $chart_ht{CHAR_NAME}->{$i}; |
|
$romanization = $this->romanize_charname($char_name, $lang_code, $output_style, *ht, $char); |
|
$ht{SUSPICIOUS_ROMANIZATION}->{$char_name}->{$romanization} |
|
= ($ht{SUSPICIOUS_ROMANIZATION}->{$char_name}->{$romanization} || 0) + 1 |
|
unless (length($romanization) < 4) |
|
|| ($romanization =~ /\s/) |
|
|| ($romanization =~ /^[bcdfghjklmnpqrstvwxyz]{2,3}[aeiou]-$/) |
|
|| ($romanization =~ /^[bcdfghjklmnpqrstvwxyz]{2,2}[aeiougw][aeiou]{1,2}$/) |
|
|| ($romanization =~ /^(allah|bbux|nyaa|nnya|quuv|rrep|shch|shur|syrx)$/i) |
|
|| (($char_name =~ /^(YI SYLLABLE|VAI SYLLABLE|ETHIOPIC SYLLABLE|CANADIAN SYLLABICS|CANADIAN SYLLABICS CARRIER)\s+(\S+)$/) && (length($romanization) <= 5)); |
|
|
|
return $romanization; |
|
} |
|
|
|
sub romanize_charname { |
|
local($this, $char_name, $lang_code, $output_style, *ht, $char) = @_; |
|
|
|
my $cached_result = $ht{ROMANIZE_CHARNAME}->{$char_name}->{$lang_code}->{$output_style}; |
|
|
|
return $cached_result if defined($cashed_result); |
|
$orig_char_name = $char_name; |
|
$char_name =~ s/^.* LETTER\s+([A-Z]+)-\d+$/$1/; |
|
$char_name =~ s/^.* LETTER\s+//; |
|
$char_name =~ s/^.* SYLLABLE\s+B\d\d\d\s+//; |
|
$char_name =~ s/^.* SYLLABLE\s+//; |
|
$char_name =~ s/^.* SYLLABICS\s+//; |
|
$char_name =~ s/^.* LIGATURE\s+//; |
|
$char_name =~ s/^.* VOWEL SIGN\s+//; |
|
$char_name =~ s/^.* CONSONANT SIGN\s+//; |
|
$char_name =~ s/^.* CONSONANT\s+//; |
|
$char_name =~ s/^.* VOWEL\s+//; |
|
$char_name =~ s/ WITH .*$//; |
|
$char_name =~ s/ WITHOUT .*$//; |
|
$char_name =~ s/\s+(ABOVE|AGUNG|BAR|BARREE|BELOW|CEDILLA|CEREK|DIGRAPH|DOACHASHMEE|FINAL FORM|GHUNNA|GOAL|INITIAL FORM|ISOLATED FORM|KAWI|LELET|LELET RASWADI|LONSUM|MAHAPRANA|MEDIAL FORM|MURDA|MURDA MAHAPRANA|REVERSED|ROTUNDA|SASAK|SUNG|TAM|TEDUNG|TYPE ONE|TYPE TWO|WOLOSO)\s*$//; |
|
$char_name =~ s/^([A-Z]+)\d+$/$1/; |
|
foreach $_ ((1 .. 3)) { |
|
$char_name =~ s/^.*\b(?:ABKHASIAN|ACADEMY|AFRICAN|AIVILIK|AITON|AKHMIMIC|ALEUT|ALI GALI|ALPAPRAANA|ALTERNATE|ALTERNATIVE|AMBA|ARABIC|ARCHAIC|ASPIRATED|ATHAPASCAN|BASELINE|BLACKLETTER|BARRED|BASHKIR|BERBER|BHATTIPROLU|BIBLE-CREE|BIG|BINOCULAR|BLACKFOOT|BLENDED|BOTTOM|BROAD|BROKEN|CANDRA|CAPITAL|CARRIER|CHILLU|CLOSE|CLOSED|COPTIC|CROSSED|CRYPTOGRAMMIC|CURLED|CURLY|CYRILLIC|DANTAJA|DENTAL|DIALECT-P|DIAERESIZED|DOTLESS|DOUBLE|DOUBLE-STRUCK|EASTERN PWO KAREN|EGYPTOLOGICAL|FARSI|FINAL|FLATTENED|GLOTTAL|GREAT|GREEK|HALF|HIGH|INITIAL|INSULAR|INVERTED|IOTIFIED|JONA|KANTAJA|KASHMIRI|KHAKASSIAN|KHAMTI|KHANDA|KINNA|KIRGHIZ|KOMI|L-SHAPED|LATINATE|LITTLE|LONG|LONG-LEGGED|LOOPED|LOW|MAHAAPRAANA|MALAYALAM|MANCHU|MANDAILING|MATHEMATICAL|MEDIAL|MIDDLE-WELSH|MON|MONOCULAR|MOOSE-CREE|MULTIOCULAR|MUURDHAJA|N-CREE|NARROW|NASKAPI|NDOLE|NEUTRAL|NIKOLSBURG|NORTHERN|NUBIAN|NUNAVIK|NUNAVUT|OJIBWAY|OLD|OPEN|ORKHON|OVERLONG|PALI|PERSIAN|PHARYNGEAL|PRISHTHAMATRA|R-CREE|REDUPLICATION|REVERSED|ROMANIAN|ROUND|ROUNDED|RUDIMENTA|RUMAI PALAUNG|SANSKRIT|SANYAKA|SARA|SAYISI|SCRIPT|SEBATBEIT|SEMISOFT|SGAW KAREN|SHAN|SHARP|SHWE PALAUNG|SHORT|SIBE|SIDEWAYS|SIMALUNGUN|SMALL|SOGDIAN|SOFT|SOUTH-SLAVEY|SOUTHERN|SPIDERY|STIRRUP|STRAIGHT|STRETCHED|SUBSCRIPT|SWASH|TAI LAING|TAILED|TAILLESS|TAALUJA|TH-CREE|TALL|THREE-LEGGED|TURNED|TODO|TOP|TROKUTASTI|TUAREG|UKRAINIAN|UNBLENDED|VISIGOTHIC|VOCALIC|VOICED|VOICELESS|VOLAPUK|WAVY|WESTERN PWO KAREN|WEST-CREE|WESTERN|WIDE|WOODS-CREE|Y-CREE|YENISEI|YIDDISH)\s+//; |
|
} |
|
$char_name =~ s/\s+(ABOVE|AGUNG|BAR|BARREE|BELOW|CEDILLA|CEREK|DIGRAPH|DOACHASHMEE|FINAL FORM|GHUNNA|GOAL|INITIAL FORM|ISOLATED FORM|KAWI|LELET|LELET RASWADI|LONSUM|MAHAPRANA|MEDIAL FORM|MURDA|MURDA MAHAPRANA|REVERSED|ROTUNDA|SASAK|SUNG|TAM|TEDUNG|TYPE ONE|TYPE TWO|WOLOSO)\s*$//; |
|
if ($char_name =~ /THAI CHARACTER/) { |
|
$char_name =~ s/^THAI CHARACTER\s+//; |
|
if ($char =~ /^\xE0\xB8[\x81-\xAE]/) { |
|
|
|
$char_name =~ s/^([^AEIOU]*).*/$1/i; |
|
} elsif ($char_name =~ /^SARA [AEIOU]/) { |
|
|
|
$char_name =~ s/^SARA\s+//; |
|
} else { |
|
$char_name = $char; |
|
} |
|
} |
|
if ($orig_char_name =~ /(HIRAGANA LETTER|KATAKANA LETTER|SYLLABLE|LIGATURE)/) { |
|
$char_name = lc $char_name; |
|
} elsif ($char_name =~ /\b(ANUSVARA|ANUSVARAYA|NIKAHIT|SIGN BINDI|TIPPI)\b/) { |
|
$char_name = "+m"; |
|
} elsif ($char_name =~ /\bSCHWA\b/) { |
|
$char_name = "e"; |
|
} elsif ($char_name =~ /\bIOTA\b/) { |
|
$char_name = "i"; |
|
} elsif ($char_name =~ /\s/) { |
|
} elsif ($orig_char_name =~ /KHMER LETTER/) { |
|
$char_name .= "-"; |
|
} elsif ($orig_char_name =~ /CHEROKEE LETTER/) { |
|
|
|
} elsif ($orig_char_name =~ /KHMER INDEPENDENT VOWEL/) { |
|
$char_name =~ s/q//; |
|
} elsif ($orig_char_name =~ /LETTER/) { |
|
$char_name =~ s/^[AEIOU]+([^AEIOU]+)$/$1/i; |
|
$char_name =~ s/^([^-AEIOUY]+)[AEIOU].*/$1/i; |
|
$char_name =~ s/^(Y)[AEIOU].*/$1/i if $orig_char_name =~ /\b(?:BENGALI|DEVANAGARI|GURMUKHI|GUJARATI|KANNADA|MALAYALAM|MODI|MYANMAR|ORIYA|TAMIL|TELUGU|TIBETAN)\b.*\bLETTER YA\b/; |
|
$char_name =~ s/^(Y[AEIOU]+)[^AEIOU].*$/$1/i; |
|
$char_name =~ s/^([AEIOU]+)[^AEIOU]+[AEIOU].*/$1/i; |
|
} |
|
|
|
my $result = ($orig_char_name =~ /\bCAPITAL\b/) ? (uc $char_name) : (lc $char_name); |
|
|
|
$ht{ROMANIZE_CHARNAME}->{$char_name}->{$lang_code}->{$output_style} = $result; |
|
return $result; |
|
} |
|
|
|
sub assemble_numbers_in_chart { |
|
local($this, *chart_ht, $line_number) = @_; |
|
|
|
foreach $start (sort { $a <=> $b } keys %{$chart_ht{COMPLEX_NUMERIC_START_END}}) { |
|
my $end = $chart_ht{COMPLEX_NUMERIC_START_END}->{$start}; |
|
my @numbers = (); |
|
foreach $i (($start .. ($end-1))) { |
|
my $orig_char = $chart_ht{ORIG_CHAR}->{$i}; |
|
my $node_id = $this->get_node_for_span_with_slot($i, $i+1, "numeric-value", *chart_id); |
|
if (defined($node_id)) { |
|
my $number = $chart_ht{NODE_ROMAN}->{$node_id}; |
|
if (defined($number)) { |
|
push(@numbers, $number); |
|
} elsif ($orig_char =~ /^[.,]$/) { |
|
push(@numbers, $orig_char); |
|
} else { |
|
print STDERR "Found no romanization for node_id $node_id ($i-" . ($i+1) . ") in assemble_numbers_in_chart\n" if $verbosePM; |
|
} |
|
} else { |
|
print STDERR "Found no node_id for span $i-" . ($i+1) . " in assemble_numbers_in_chart\n" if $verbosePM; |
|
} |
|
} |
|
my $complex_number = $this->assemble_number(join("\xC2\xB7", @numbers), $line_number); |
|
|
|
$this->add_node($complex_number, $start, $end, *chart_ht, "", "complex-number"); |
|
} |
|
} |
|
|
|
sub assemble_number { |
|
local($this, $s, $line_number) = @_; |
|
|
|
|
|
my $middot = "\xC2\xB7"; |
|
my @tokens = split(/$middot/, $s); |
|
my $i = 0; |
|
my @orig_tokens = @tokens; |
|
|
|
|
|
while ($i < $#tokens) { |
|
if ($tokens[$i] =~ /^\d$/) { |
|
my $j = $i+1; |
|
while (($j <= $#tokens) && ($tokens[$j] =~ /^[0-9.,]$/)) { |
|
$j++; |
|
} |
|
$j--; |
|
if ($j>$i) { |
|
my $new_token = join("", @tokens[$i .. $j]); |
|
$new_token =~ s/,//g; |
|
splice(@tokens, $i, $j-$i+1, $new_token); |
|
} |
|
} |
|
$i++; |
|
} |
|
|
|
foreach $power ((10, 100, 1000, 10000, 100000, 1000000, 100000000, 1000000000, 1000000000000)) { |
|
for (my $i=0; $i <= $#tokens; $i++) { |
|
if ($tokens[$i] == $power) { |
|
if (($i > 0) && ($tokens[($i-1)] < $power)) { |
|
splice(@tokens, $i-1, 2, ($tokens[($i-1)] * $tokens[$i])); |
|
$i--; |
|
if (($i < $#tokens) && ($tokens[($i+1)] < $power)) { |
|
splice(@tokens, $i, 2, ($tokens[$i] + $tokens[($i+1)])); |
|
$i--; |
|
} |
|
} |
|
} |
|
|
|
my $gen_pattern = $power; |
|
$gen_pattern =~ s/^1/\[1-9\]/; |
|
if (($tokens[$i] =~ /^$gen_pattern$/) && ($i < $#tokens) && ($tokens[($i+1)] < $power)) { |
|
splice(@tokens, $i, 2, ($tokens[$i] + $tokens[($i+1)])); |
|
$i--; |
|
} |
|
} |
|
last if $#tokens == 0; |
|
} |
|
my $result = join($middot, @tokens); |
|
if ($verbosePM) { |
|
my $logfile = "/nfs/isd/ulf/cgi-mt/amr-tmp/uroman-number-log.txt"; |
|
$util->append_to_file($logfile, "$s -> $result\n") if -r $logfile; |
|
|
|
} |
|
return $result; |
|
} |
|
|
|
1; |
|
|
|
|