#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
use warnings; | |
use strict; | |
# ( (NP (NP (NN resumption)) (PP (IN of) (NP (DT the) (NN session)))) ) | |
# ( (S (@S (@S (@S (S (NP (PRP I)) (VP (VB declare) (VP (@VP (VBD resumed) | |
# (NP (@NP (NP (DT the) (NN session)) (PP (IN of) (NP (@NP (DT the) | |
# (NNP European)) (NNP Parliament)))) (VP (VBN adjourned) (PP (IN on) (NP | |
#(NNP Friday) (CD 17)))))) (NP (NNP December) (CD 1999))))) (, ,)) (CC and)) | |
# (S (NP (PRP I)) (VP (MD would) (VP (VB like) (S (ADVP (RB once) (RB again)) | |
# (VP (TO to) (VP (@VP (VB wish) (NP (PRP you))) (NP (NP (@NP (@NP (DT a) | |
# (JJ happy)) (JJ new)) (NN year)) (PP (IN in) (NP (@NP (DT the) (NN hope)) | |
#(SBAR (IN that) (S (NP (PRP you)) (VP (VBD enjoyed) (NP (@NP (@NP (DT a) | |
# (JJ pleasant)) (JJ festive)) (NN period))))))))))))))) (. .)) ) | |
# Escaped entity name => literal character it stands for.  The entity text
# itself ("&" . name . ";") is assembled by the regex below rather than
# written out literally.
my %char_for_entity = (
    'bar'  => '|',     # factor separator
    'lt'   => '<',     # xml
    'gt'   => '>',     # xml
    'bra'  => '[',     # syntax non-terminal (legacy)
    'ket'  => ']',     # syntax non-terminal (legacy)
    'quot' => '"',     # xml
    'apos' => "'",     # xml
    '#91'  => '[',     # syntax non-terminal
    '#93'  => ']',     # syntax non-terminal
    'amp'  => '&',     # escape escape
);

# Matches any one known entity and captures its name.  Decoding in a single
# left-to-right pass unwraps exactly one level of escaping: an escaped
# ampersand followed by "lt;" yields the lt entity text, not a bare "<".
my $entity_names = join '|', map { quotemeta } sort keys %char_for_entity;
my $entity_re    = qr/&($entity_names);/;

# Convert one line of XML tree markup (<tree label="X"> ... </tree>) into
# bracketed tree notation, e.g. "( (NP (NN resumption)) )".
#
#   $line   - one non-empty input line, including its trailing newline if any
#   returns - the converted line (the trailing newline is preserved)
sub _tree_xml_to_bracketed {
    my ($line) = @_;

    # parentheses: literal ones would clash with the tree brackets produced
    # below, so they are rewritten to -LRB- / -RRB- first
    $line =~ s/\(/-LRB-/g;            # tokens
    $line =~ s/\)/-RRB-/g;
    $line =~ s/"LRB"/"-LRB-"/g;       # labels
    $line =~ s/"RRB"/"-RRB-"/g;

    # main: opening tag -> "(LABEL", closing tag (and spaces before it) -> ")"
    $line =~ s{<tree label="([^"]+)">}{($1}g;
    $line =~ s{ *</tree>}{)}g;
    $line =~ s/^\(TOP/(/;             # the root label is dropped

    # de-escape
    $line =~ s/$entity_re/$char_for_entity{$1}/g;

    # cleanup: collapse runs of spaces, drop a trailing space, and set the
    # final closing bracket apart with a space
    $line =~ s/ {2,}/ /g;
    $line =~ s/ $//g;
    $line =~ s/\)$/ )/g;

    return $line;
}

while (my $line = <STDIN>) {
    if ($line =~ /^$/) {
        print "\n";    # parse failures
        next;
    }

    # output
    print _tree_xml_to_bracketed($line);
}