|
|
|
|
|
|
|
|
|
|
|
use warnings; |
|
use strict; |
|
use Getopt::Long "GetOptions"; |
|
|
|
# ---------------------------------------------------------------------------
# Command-line configuration.  These file-scoped lexicals are read by the
# train*/apply subs defined further down in this file.
# ---------------------------------------------------------------------------
my ($CORPUS,$MODEL,$TRAIN,$HELP,$VERBOSE);
my $FILLER     = ":s:es"; # colon-separated fillers; the leading empty entry means "no filler"
my $MIN_SIZE   = 3;       # minimum word size
my $MIN_COUNT  = 5;       # minimum model count for a word to be used as a part
my $MAX_COUNT  = 5;       # apply() leaves words with at least this count unsplit
my $FACTORED   = 0;
my $SYNTAX     = 0;
my $MARK_SPLIT = 0;
my $BINARIZE   = 0;

# Any option-parsing error falls through to the usage message below.
$HELP = 1
    unless GetOptions('corpus=s'    => \$CORPUS,
                      'model=s'     => \$MODEL,
                      'filler=s'    => \$FILLER,
                      'factored'    => \$FACTORED,
                      'min-size=i'  => \$MIN_SIZE,
                      'min-count=i' => \$MIN_COUNT,
                      'max-count=i' => \$MAX_COUNT,
                      'help'        => \$HELP,
                      'verbose'     => \$VERBOSE,
                      'syntax'      => \$SYNTAX,
                      'binarize'    => \$BINARIZE,
                      'mark-split'  => \$MARK_SPLIT,
                      'train'       => \$TRAIN);

# Training needs a corpus to read; applying needs a trained model.
if ($HELP ||
    ( $TRAIN && !$CORPUS) ||
    (!$TRAIN && !$MODEL)) {
    print "Compound splitter\n";
    print "-----------------\n\n";
    print "train: compound-splitter -train -corpus txt-file -model new-model\n";
    print "apply: compound-splitter -model trained-model < in > out\n";
    print "options: -min-size: minimum word size (default $MIN_SIZE)\n";
    print " -min-count: minimum word count (default $MIN_COUNT)\n";
    print " -max-count: do not split words seen at least this often (default $MAX_COUNT)\n";
    print " -filler: filler letters between words (default $FILLER)\n";
    print " -factored: factored data, assuming factor 0 as surface (default $FACTORED)\n";
    print " -syntax: syntactically parsed data (default $SYNTAX)\n";
    print " -mark-split: mark non-terminal label of split words (default $MARK_SPLIT)\n";
    print " -binarize: binarize subtree for split word (default $BINARIZE)\n";
    print " -verbose: trace splitting decisions on STDERR\n";
    exit;
}

# Dispatch: -syntax takes precedence over -factored when training.
if ($TRAIN) {
    if    ($SYNTAX)   { train_syntax(); }
    elsif ($FACTORED) { train_factored(); }
    else              { train(); }
}
else {
    apply();
}
|
|
|
# Train a plain model: count every whitespace-separated token of the corpus
# file named by $CORPUS and pass the counts to save_trained_model().
# Dies if the corpus cannot be opened.
sub train {
    my %COUNT;

    # Three-argument open with a lexical handle: the corpus name is taken
    # literally and is never interpreted as a mode or pipe specification.
    open(my $corpus_fh, '<', $CORPUS)
        or die("ERROR: could not open corpus '$CORPUS': $!");
    while (my $line = <$corpus_fh>) {
        # split ' ' ignores leading/trailing whitespace, including the line
        # terminator, so a final line without a newline keeps its last
        # character (the former unconditional chop removed it).
        foreach my $word (split(' ', $line)) {
            $COUNT{$word}++;
        }
    }
    close($corpus_fh);

    save_trained_model(\%COUNT);
}
|
|
|
# Write a trained model to the file named by $MODEL.
#
# Parameter: $COUNT - reference to a hash mapping model entry => count.
# Each output line is "<id>\t<entry>\t<count>", with ids numbered from 1.
# Entries are written in sorted order so that repeated runs on the same data
# produce an identical model file.
# Dies if the model file cannot be opened or fully written.
sub save_trained_model {
    my ($COUNT) = @_;
    my $id = 0;

    open(my $model_fh, '>', $MODEL)
        or die("ERROR: could not write model '$MODEL': $!");
    foreach my $word (sort keys %$COUNT) {
        print $model_fh join("\t", ++$id, $word, $$COUNT{$word})."\n";
    }
    # Buffered write errors (e.g. a full disk) only surface at close time.
    close($model_fh)
        or die("ERROR: could not finish writing model '$MODEL': $!");

    print STDERR "written model file with ".(scalar keys %$COUNT)." words.\n";
}
|
|
|
# Train a model from factored data ("surface|factor|...").
#
# Tokens of the corpus file $CORPUS are grouped by their surface form (the
# text before the first "|" that is followed by at least one character).
# For each surface form the most frequent factored variant becomes the model
# entry, and its count is the total over all variants of that surface form.
# Dies if the corpus cannot be opened.
sub train_factored {
    my (%COUNT,%FACTORED_COUNT);

    # Three-argument open: the corpus name is never treated as a mode/pipe.
    open(my $corpus_fh, '<', $CORPUS)
        or die("ERROR: could not open corpus '$CORPUS': $!");
    while (my $line = <$corpus_fh>) {
        # split ' ' drops surrounding whitespace and the line terminator, so
        # an unterminated final line is not truncated.
        foreach my $factored_word (split(' ', $line)) {
            my $word = $factored_word;
            $word =~ s/\|.+//g;   # keep only the first factor
            $FACTORED_COUNT{$word}{$factored_word}++;
        }
    }
    close($corpus_fh);

    foreach my $word (keys %FACTORED_COUNT) {
        my ($max,$best,$total) = (0,"",0);
        # Sorted iteration makes the choice among equally frequent variants
        # reproducible instead of dependent on hash ordering.
        foreach my $factored_word (sort keys %{$FACTORED_COUNT{$word}}) {
            my $count = $FACTORED_COUNT{$word}{$factored_word};
            $total += $count;
            if ($count > $max) {
                $max  = $count;
                $best = $factored_word;
            }
        }
        $COUNT{$best} = $total;
    }

    save_trained_model(\%COUNT);
}
|
|
|
# Train a model from syntactically annotated data.
#
# Each line of the corpus file $CORPUS is split on whitespace.  A token that
# starts with label="..." sets the current label; any other token that does
# not start with "<" is counted as a word under the current label.  For each
# word the most frequent label is kept and the model entry is "word label",
# with the total count over all labels of that word.
# Dies if the corpus cannot be opened.
sub train_syntax {
    my (%COUNT,%LABELED_COUNT);

    # Three-argument open: the corpus name is never treated as a mode/pipe.
    open(my $corpus_fh, '<', $CORPUS)
        or die("ERROR: could not open corpus '$CORPUS': $!");
    while (my $line = <$corpus_fh>) {
        # The label does not carry over between lines.  Starting from ""
        # yields the same hash key an undefined label would, without the
        # uninitialized-value warning for words seen before any label.
        my $label = "";
        # split ' ' drops surrounding whitespace and the line terminator, so
        # an unterminated final line is not truncated.
        foreach my $token (split(' ', $line)) {
            if ($token =~ /^label="([^\"]+)"/) {
                $label = $1;
            }
            elsif ($token !~ /^</) {
                $LABELED_COUNT{$token}{$label}++;
            }
        }
    }
    close($corpus_fh);

    foreach my $word (keys %LABELED_COUNT) {
        my ($max,$best,$total) = (0,"",0);
        # Sorted iteration makes the choice among equally frequent labels
        # reproducible instead of dependent on hash ordering.
        foreach my $label (sort keys %{$LABELED_COUNT{$word}}) {
            my $count = $LABELED_COUNT{$word}{$label};
            $total += $count;
            if ($count > $max) {
                $max  = $count;
                $best = "$word $label";
            }
        }
        $COUNT{$best} = $total;
    }

    save_trained_model(\%COUNT);
}
|
|
|
# Split compounds in the text read from STDIN and print the result to STDOUT,
# one output line per input line.
#
# The model file $MODEL is loaded first (lines of "<id>\t<entry>\t<count>",
# where an entry may be "word label").  Entries are keyed by the lowercased
# first factor; when several casings exist the most frequent one is kept.
#
# For every input token:
#   * with -syntax, tokens starting with "<" or ending with ">" are buffered
#     and emitted in front of the next word (or at the end of the line);
#   * words whose model count is >= $MAX_COUNT, or that contain no ASCII
#     letter, are printed unchanged;
#   * otherwise every segmentation into model words with count >= $MIN_COUNT,
#     optionally joined by one of the $FILLER strings, is enumerated and the
#     one with the highest geometric mean of part counts is printed.
#
# Dies if the model cannot be opened, or (with -syntax) if a word to be split
# is not preceded by a token carrying a label="..." attribute.
#
# NOTE(review): no encoding layer is set up in this sub, so lc/length/substr
# and the [a-zA-Z] test appear to operate on bytes - confirm this is
# acceptable for non-ASCII input.
sub apply {
    my (%COUNT,%TRUECASE,%LABEL);

    # Three-argument open: the model name is never treated as a mode/pipe.
    open(my $model_fh, '<', $MODEL)
        or die("ERROR: could not open model '$MODEL': $!");
    while (my $entry = <$model_fh>) {
        chomp($entry);
        my ($id,$factored_word,$count) = split(/\t/,$entry);
        next unless defined($count);   # blank or truncated line
        my $label;
        ($factored_word,$label) = split(/ /,$factored_word);
        my $word = $factored_word;
        $word =~ s/\|.+//g;            # keep only the first factor
        my $lc = lc($word);

        # If a word exists with several casings, record the most frequent.
        next if defined($COUNT{$lc}) && $COUNT{$lc} > $count;
        $COUNT{$lc}    = $count;
        $TRUECASE{$lc} = $factored_word;
        $LABEL{$lc}    = $label if $SYNTAX;
    }
    close($model_fh);

    # The filler list does not change while applying; split it only once.
    my @FILLERS = split(/:/,$FILLER);

    while (my $line = <STDIN>) {
        my $first = 1;
        my @BUFFER;   # pending markup tokens (syntax mode only)

        # split ' ' drops surrounding whitespace and the line terminator, so
        # the last word of an unterminated final line is not truncated.
        foreach my $factored_word (split(' ', $line)) {
            print " " unless $first;
            $first = 0;

            # Syntax mode: markup is never split, only buffered.  The
            # separator in front of the buffer has already been printed
            # above, so suppress the one for the following token.
            if ($SYNTAX && ($factored_word =~ /^</ || $factored_word =~ />$/)) {
                push @BUFFER,$factored_word;
                $first = 1;
                next;
            }

            my $word = $factored_word;
            $word =~ s/\|.+//g;        # keep only the first factor
            my $lc = lc($word);

            print STDERR "considering $word ($lc)...\n" if $VERBOSE;

            # Do not split frequent words or tokens without any letter.
            my $is_frequent = defined($COUNT{$lc}) && $COUNT{$lc} >= $MAX_COUNT;
            if ($is_frequent || $lc !~ /[a-zA-Z]/) {
                if (@BUFFER) { print join(" ",@BUFFER)." "; @BUFFER = (); }
                print $factored_word;
                if ($VERBOSE) {
                    # Only mention the count when it is the reason; it is
                    # undefined for unknown tokens without letters.
                    print STDERR ($is_frequent
                        ? "\tfrequent word ($COUNT{$lc}>=$MAX_COUNT), skipping\n"
                        : "\tno letters, skipping\n");
                }
                next;
            }

            # $REACHABLE{$end} lists "start truecased-part count" records for
            # every known part that ends at character index $end and starts
            # either at 0 or right after another reachable position.
            my $final = length($word)-1;
            my %REACHABLE;
            for(my $i=0;$i<=$final;$i++) { $REACHABLE{$i} = undef; }

            print STDERR "splitting $word:\n" if $VERBOSE;
            for(my $end=$MIN_SIZE;$end<length($word);$end++) {
                for(my $start=0;$start<=$end-$MIN_SIZE;$start++) {
                    next unless $start == 0 || defined($REACHABLE{$start-1});
                    foreach my $filler (@FILLERS) {
                        # A filler can only sit between two parts.
                        next if $start == 0 && $filler ne "";
                        next if lc(substr($word,$start,length($filler))) ne $filler;
                        my $subword = lc(substr($word,
                                                $start+length($filler),
                                                $end-$start+1-length($filler)));
                        next unless defined($COUNT{$subword});
                        next unless $COUNT{$subword} >= $MIN_COUNT;
                        print STDERR "\tmatching word $start .. $end ($filler)$subword $COUNT{$subword}\n" if $VERBOSE;
                        push @{$REACHABLE{$end}},"$start $TRUECASE{$subword} $COUNT{$subword}";
                    }
                }
            }

            # No segmentation covers the whole word: print it unchanged.
            if (!defined($REACHABLE{$final})) {
                if (@BUFFER) { print join(" ",@BUFFER)." "; @BUFFER = (); }
                print $factored_word;
                next;
            }

            my ($best_split,$best_score) = ("",0);

            # Enumerate all segmentations.  $ITERATOR{$pos} selects which
            # record of $REACHABLE{$pos} the current path uses.
            my %ITERATOR;
            for(my $i=0;$i<=$final;$i++) { $ITERATOR{$i}=0; }
            while(1) {
                # Read off the current segmentation, walking back from the end.
                my ($pos,$decomp,$score,$num,@INDEX) = ($final,"",1,0);
                while($pos>0) {
                    last unless scalar @{$REACHABLE{$pos}} > $ITERATOR{$pos}; # dead end?
                    my ($nextpos,$subword,$count)
                        = split(/ /,$REACHABLE{$pos}[ $ITERATOR{$pos} ]);
                    $decomp = $subword." ".$decomp;
                    $score *= $count;
                    $num++;
                    push @INDEX,$pos;
                    $pos = $nextpos-1;
                }

                chop($decomp);   # drop the trailing separator added above
                print STDERR "\tsplit: $decomp ($score ** 1/$num) = ".($score ** (1/$num))."\n" if $VERBOSE;
                $score **= 1/$num;   # geometric mean of the part counts
                if ($score>$best_score) {
                    $best_score = $score;
                    $best_split = $decomp;
                }

                # Advance to the next segmentation: bump the iterator of the
                # most recently visited position that still has alternatives.
                my $increase = -1;
                while($increase<$final) {
                    $increase = pop @INDEX;
                    $ITERATOR{$increase}++;
                    last if scalar @{$REACHABLE{$increase}} > $ITERATOR{$increase};
                }
                last unless scalar @{$REACHABLE{$final}} > $ITERATOR{$final};
                for(my $i=0;$i<$increase;$i++) { $ITERATOR{$i}=0; }
            }

            # The best analysis is the unsplit word itself.
            if ($best_split !~ / /) {
                if (@BUFFER) { print join(" ",@BUFFER)." "; @BUFFER = (); }
                print $factored_word;
                next;
            }

            if (!$SYNTAX) {
                print $best_split;
            }
            else {
                # The last buffered token carries the label of the word.
                die("ERROR: no label found before split word '$factored_word'\n")
                    unless @BUFFER;
                $BUFFER[$#BUFFER] =~ s/label=\"/label=\"SPLIT-/ if $MARK_SPLIT;
                $BUFFER[$#BUFFER] =~ /label=\"([^\"]+)\"/ || die("ERROR: $BUFFER[$#BUFFER]\n");
                my $pos = $1;
                print join(" ",@BUFFER)." "; @BUFFER = ();

                my @SPLIT = split(/ /,$best_split);
                my @OUT = ();
                if ($BINARIZE) {
                    # One intermediate "@label" node per part beyond the second.
                    for(my $w=0;$w<scalar(@SPLIT)-2;$w++) {
                        push @OUT,"<tree label=\"\@$pos\">";
                    }
                }
                for(my $w=0;$w<scalar(@SPLIT);$w++) {
                    if ($BINARIZE && $w>=2) { push @OUT, "</tree>"; }
                    push @OUT,"<tree label=\"".$LABEL{lc($SPLIT[$w])}."\"> $SPLIT[$w] </tree>";
                }
                print join(" ",@OUT);
            }
        }

        # Flush trailing markup.  Its leading separator was already printed
        # when the first of these tokens was buffered (or none is needed when
        # nothing precedes it), so no extra space is emitted here.
        print join(" ",@BUFFER) if @BUFFER;
        print "\n";
    }
}
|
|