| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| |
|
| | use strict; |
| | use warnings; |
| | use Getopt::Long; |
| | use IO::File; |
| | use File::Basename; |
| |
|
| | |
# Commands used by my_open() to decompress gzip / bzip2 inputs on the fly.
my $ZCAT = "gzip -cd";
my $BZCAT = "bzcat";

# Apply the :utf8 layer to all three standard streams.
for my $std (\*STDIN, \*STDOUT, \*STDERR) {
    binmode($std, ":utf8");
}

# --factordir names the subdirectory (next to the corpus file) that holds
# the additional factor files; it defaults to "factors".
my $factordir = "factors";
GetOptions("factordir=s" => \$factordir);

# First positional argument: the corpus path ("-" selects stdin).
my $corppathname = shift @ARGV;

# Every remaining argument is a factor specification. Arguments are glued
# together with "+" and then split on "+" or ",", so factors may be given
# as separate arguments or combined within one argument.
my $factor_spec = join("+", @ARGV);
my @requested_factors = split /[\+,]/, $factor_spec;

unless (defined($corppathname) && @requested_factors) {
    die "usage: reduce_combine.pl corpusfile 0 add_factor_label1 2 add_factor_label2 ...";
}
| |
|
# Requested factors that are not plain non-negative integers are labels of
# additional factor files; purely numeric ones index factors already present
# in the corpus tokens.
my @addfactors = grep { $_ !~ /^[0-9]+$/ } @requested_factors;

# Open the corpus: either a (possibly compressed) file or stdin.
my $corp_stream;
if ($corppathname ne "-") {
    $corp_stream = my_open($corppathname);
}
else {
    $corp_stream = *STDIN;
    # Factor files are located relative to the corpus path, which stdin
    # does not have.
    die "Won't add factors to corpus coming from stdin." if @addfactors;
}

# Factor files live at <corpus dir>/<factordir>/<corpus basename>.<label>,
# where a trailing ".gz" is removed from the corpus basename first.
my $corpdn = dirname($corppathname);
(my $corpbn = basename($corppathname)) =~ s/\.gz$//;

# Map each additional factor label to an open read handle on its file.
my %streams;
foreach my $label (@addfactors) {
    my $fn = "$corpdn/$factordir/$corpbn.$label";
    # Fall back to the ".gz" variant when the plain file does not exist.
    $fn .= ".gz" unless -e $fn;
    my $stream = my_open($fn);
    die "Can't read '$fn'" if !defined $stream;
    $streams{$label} = $stream;
}
| |
|
# Main loop: read the corpus sentence by sentence, read the matching line
# from every additional factor file, and print each token rebuilt from the
# requested factors (in the requested order) joined by "|".
my $nr = 0;
while (defined(my $sentence = <$corp_stream>)) {
    $nr++;

    # Progress report on stderr: a dot every 10000 sentences, and the
    # running count every 100000.
    unless ($nr % 10000) {
        print STDERR ".";
        print STDERR "($nr)" unless $nr % 100000;
    }

    chomp $sentence;
    my @intokens = split / /, $sentence;

    # Fetch the corresponding line from each additional factor file and
    # check that it has the same number of tokens as the corpus line.
    my %lines_of_extratoks;
    for my $factor (keys %streams) {
        my $fh = $streams{$factor};
        my $line = <$fh>;
        die "Additional factor file $factor contains too few sentences!"
            unless defined $line;
        chomp $line;
        my @toks = split / /, $line;
        die "Incompatible number of words in factor $factor on line $nr."
            if @toks != @intokens;
        $lines_of_extratoks{$factor} = \@toks;
    }

    # Emit tokens one at a time, separated by single spaces.
    for my $i (0 .. $#intokens) {
        my $token = $intokens[$i];
        my @factors = split /\|/, $token;

        my @outtoken;
        for my $name (@requested_factors) {
            my $f;
            if ($name !~ /^[0-9]+$/) {
                # Named factor: take it from the additional factor file.
                $f = $lines_of_extratoks{$name}->[$i];
                die "Missed factor $name on line $nr"
                    if !defined($f) || !length($f);
            }
            else {
                # Numeric factor: index into the token's own factors.
                $f = $factors[$name];
                die "Missed factor $name in $token on line $nr"
                    if !defined($f) || !length($f);
            }
            push @outtoken, $f;
        }

        print " " if $i;
        print join("|", @outtoken);
    }
    print "\n";
}
close $corp_stream;
print STDERR "Done.\n";
| |
|
| |
|
# my_open($path)
#
# Open $path for reading and return the filehandle, with the :utf8 layer
# applied. Gzip and bzip2 inputs (recognised by a ".gz" / ".bz2" suffix or
# by the description printed by file(1)) are read through the decompressor
# commands held in $ZCAT and $BZCAT; anything else is opened directly.
#
# Dies if $path does not exist or if the open fails.
#
# The path is never handed to a shell and never parsed as a two-argument
# open() specification, so names containing spaces, shell metacharacters,
# or a leading ">" / trailing "|" are treated as plain file names.
sub my_open {
    my $f = shift;
    die "Not found: $f" if ! -e $f;

    # Ask file(1) to describe the content. List-form pipe open runs the
    # program directly, without a shell. If file(1) cannot be run the
    # description stays empty and only the suffix checks below apply.
    my $ft = '';
    if (open(my $probe, '-|', 'file', $f)) {
        local $/;    # slurp the whole output
        my $described = <$probe>;
        $ft = $described if defined $described;
        close $probe;
    }

    # Choose a decompressor command, if one is needed. $ZCAT / $BZCAT hold
    # a program name plus options separated by whitespace (e.g. "gzip -cd"),
    # so they are split into an argument list.
    my @cmd;
    if ($f =~ /\.gz$/ || $ft =~ /gzip compressed data/) {
        @cmd = (split(' ', $ZCAT), $f);
    } elsif ($f =~ /\.bz2$/ || $ft =~ /bzip2 compressed data/) {
        @cmd = (split(' ', $BZCAT), $f);
    }

    my $hdl;
    if (@cmd) {
        open($hdl, '-|', @cmd)
            or die "Can't open '" . join(' ', @cmd) . " |': $!";
    } else {
        open($hdl, '<', $f) or die "Can't open '$f': $!";
    }
    binmode $hdl, ":utf8";
    return $hdl;
}
| |
|