#!/usr/bin/env perl
# Extracts raw text from CoNLL-U file. Uses newdoc and newpar tags when available.
# Copyright © 2017 Dan Zeman
# License: GNU GPL

use strict;
use warnings;
use utf8;
use open ':utf8';
binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');
use Getopt::Long;

# Language code 'zh' or 'ja' will trigger Chinese-like text formatting.
my $language = 'en';
GetOptions
(
    'language=s' => \$language
);
my $chinese = $language =~ m/^(zh|ja)(_|$)/;

# Line breaks are inserted at word boundaries after at most this many characters.
my $line_limit = 80;

my $text = '';  # from the text attribute of the sentence (noted but currently unused, see below)
my $ftext = ''; # from the word forms of the tokens
my $newpar = 0; # a "# newpar" comment has been seen and not yet acted upon
my $newdoc = 0; # a "# newdoc" comment has been seen and not yet acted upon
my $buffer = ''; # output text that does not yet fill a complete line
my $start = 1;  # only true until we write the first sentence of the input stream
my $mwtlast;    # id of the last word covered by the current multi-word token, if any

# Moves the word forms collected in $ftext to the output buffer, printing an
# empty line between documents and paragraphs if one is pending, and printing
# all complete lines. Resets the per-sentence state afterwards.
my $flush_ftext = sub
{
    $buffer = print_new_paragraph_if_needed($start, $newdoc, $newpar, $buffer);
    $buffer .= $ftext;
    $buffer = print_lines_from_buffer($buffer, $line_limit, $chinese);
    $start = 0;
    $newdoc = 0;
    $newpar = 0;
    $text = '';
    $ftext = '';
};

while(my $line = <>)
{
    # Strip the line terminator so that it cannot leak into a word form or
    # into the MISC column when a line has fewer columns than expected.
    $line =~ s/\r?\n$//;
    if($line =~ m/^\#\s*text\s*=\s*(.+)/)
    {
        $text = $1;
    }
    elsif($line =~ m/^\#\s*newpar(\s|$)/i)
    {
        $newpar = 1;
    }
    elsif($line =~ m/^\#\s*newdoc(\s|$)/i)
    {
        $newdoc = 1;
    }
    # Either a multi-word token line ("3-4") or a regular token line ("3").
    # Empty nodes ("3.1") do not match and are ignored.
    elsif($line =~ m/^(\d+)(?:-(\d+))?\t/)
    {
        my ($id, $rangeend) = ($1, $2);
        if(defined($rangeend))
        {
            # Multi-word token: remember where it ends so that we can skip
            # the syntactic words it covers.
            $mwtlast = $rangeend;
        }
        elsif(defined($mwtlast) && $id <= $mwtlast)
        {
            # A syntactic word inside a multi-word token. Its surface form
            # has been taken from the token-introducing line.
            next;
        }
        else
        {
            $mwtlast = undef;
        }
        my @f = split(/\t/, $line);
        my $form = defined($f[1]) ? $f[1] : '';
        my $misc = defined($f[9]) ? $f[9] : '';
        # Paragraphs may start in the middle of a sentence (bulleted lists, verse etc.)
        # The first token of the new paragraph has "NewPar=Yes" in the MISC column.
        # Multi-word tokens have this in the token-introducing line.
        if($misc =~ m/NewPar=Yes/i)
        {
            # There may have been a paragraph break before the first part of
            # this sentence as well, hence the regular flush comes first.
            $flush_ftext->();
            # NOTE(review): this is printed even if the buffer is empty at
            # this point, which yields surplus empty lines. Kept as it was.
            print("$buffer\n\n");
            $buffer = '';
        }
        $ftext .= $form;
        $ftext .= ' ' unless($misc =~ m/SpaceAfter=No/);
    }
    elsif($line =~ m/^\s*$/)
    {
        # In a valid CoNLL-U file, $text should be equal to $ftext except for the
        # space after the last token. However, if there have been intra-sentential
        # paragraph breaks, $ftext contains only the part after the last such
        # break, and $text is empty. Hence we currently use $ftext everywhere
        # and ignore $text, even though we note it when seeing the text attribute.
        $flush_ftext->();
        $mwtlast = undef;
    }
}
# The input may end without the empty line that terminates the last sentence.
# Do not lose the words collected from that sentence.
if($ftext ne '')
{
    $flush_ftext->();
}
# There may be unflushed buffer contents after the last sentence, less than 80 characters
# (otherwise we would have already dealt with it), so just flush it.
if($buffer ne '')
{
    print("$buffer\n");
}



#------------------------------------------------------------------------------
# Checks whether we have to print an extra line to separate paragraphs. Does it
# if necessary.
# Returns the updated buffer.
#------------------------------------------------------------------------------
sub print_new_paragraph_if_needed
{
    # Parameters, in this order:
    #   $start ..... true if no sentence has been written yet; no separator
    #                is printed in that case
    #   $newdoc .... true if a new document starts here
    #   $newpar .... true if a new paragraph starts here
    #   $buffer .... text waiting to be printed
    # If a separator is needed, the pending buffer is printed as a line of its
    # own first, then an empty line is printed, and the returned buffer is empty.
    # Otherwise the buffer is returned unchanged and nothing is printed.
    my $start = shift;
    my $newdoc = shift;
    my $newpar = shift;
    my $buffer = shift;
    if(!$start && ($newdoc || $newpar))
    {
        if($buffer ne '')
        {
            print("$buffer\n");
            $buffer = '';
        }
        print("\n");
    }
    return $buffer;
}



#------------------------------------------------------------------------------
# Prints as many complete lines of text as there are in the buffer. Returns the
# remaining contents of the buffer.
#------------------------------------------------------------------------------
sub print_lines_from_buffer
{
    my $buffer = shift;
    # Maximum number of characters allowed on one line, not counting the line
    # break character(s), which replace the whitespace character at which the
    # line is broken.
    # Exception: If there is a word longer than the limit, it will be printed
    # on one line.
    # Note that this algorithm is not suitable for Chinese and Japanese.
    my $limit = shift;
    # We need a different algorithm for Chinese and Japanese.
    my $chinese = shift;
    if($chinese)
    {
        return print_chinese_lines_from_buffer($buffer, $limit);
    }
    my @cbuffer = split(//, $buffer);
    # There may be more than one new line waiting in the buffer.
    # The emptiness check guarantees termination even for a zero limit.
    while(@cbuffer && scalar(@cbuffer) >= $limit)
    {
        # Find the last whitespace character that keeps the line within the
        # limit. If there is none, take the first one after the limit (the
        # first word is too long to fit). Position 0 is never a candidate:
        # breaking there would produce an empty line, and remembering it used
        # to stop the search too early, so that the whole rest of the buffer
        # ended up on one line.
        my $ibreak;
        for(my $i = 1; $i <= $#cbuffer; $i++)
        {
            if($i > $limit && defined($ibreak))
            {
                last;
            }
            if($cbuffer[$i] =~ m/\s/)
            {
                $ibreak = $i;
            }
        }
        if(defined($ibreak))
        {
            # Print everything before the whitespace character and drop the
            # whitespace character itself.
            print(join('', @cbuffer[0..($ibreak-1)]), "\n");
            splice(@cbuffer, 0, $ibreak+1);
        }
        else
        {
            # No place to break the line. Print everything we have.
            print(join('', @cbuffer), "\n");
            @cbuffer = ();
        }
    }
    return join('', @cbuffer);
}



#------------------------------------------------------------------------------
# Prints as many complete lines of text as there are in the buffer. Returns the
# remaining contents of the buffer. Assumes that there are no spaces between
# words and lines can be broken between any two characters, as is the custom in
# Chinese and Japanese.
#------------------------------------------------------------------------------
sub print_chinese_lines_from_buffer
{
    my $buffer = shift;
    # Maximum number of characters allowed on one line, not counting the line
    # break character(s).
    # Exception: A sequence of Latin letters and digits longer than the limit
    # will be printed on one line.
    my $limit = shift;
    # We cannot simply print the first $limit characters from the buffer,
    # followed by a line break. There could be embedded Latin words or
    # numbers and we do not want to insert a line break in the middle of
    # a foreign word.
    my @cbuffer = split(//, $buffer);
    # The emptiness check guarantees termination even for a zero limit.
    while(@cbuffer && scalar(@cbuffer) >= $limit)
    {
        # Number of characters to print on the current line.
        my $nprint = 0;
        for(my $i = 0; $i <= $#cbuffer; $i++)
        {
            # Breaking after position $i gives a line of $i+1 characters, so
            # we must stop as soon as $i reaches the limit, provided that we
            # have found at least one place where the line can be broken.
            if($i >= $limit && $nprint > 0)
            {
                last;
            }
            my $inside_foreign_word = $i < $#cbuffer && $cbuffer[$i] =~ m/[\p{Latin}0-9]/ && $cbuffer[$i+1] =~ m/[\p{Latin}0-9]/;
            unless($inside_foreign_word)
            {
                $nprint = $i+1;
            }
        }
        print(join('', splice(@cbuffer, 0, $nprint)), "\n");
    }
    return join('', @cbuffer);
}