parser / udpipe2 /pdtc-1.0 /to_conllu.sh
anasampa2's picture
Upload 151 files
ee0ec3d verified
raw
history blame
No virus
1.05 kB
#!/bin/bash
#
# Convert PDT files to CoNLL-U with treex, then normalise the ids in place.
#
# Usage: to_conllu.sh target_dir layer files
#   target_dir  directory that receives <file>.conllu for every input
#   layer       layer suffix of the inputs (<file>.<layer> is read); it is
#               also handed to treex as top_layer
#   files       input paths without the layer suffix
#
# Every produced file gets a "# newdoc id" line, a "# newpar id" line each
# time the paragraph part of the original sentence id changes, and sentence
# ids renumbered to <doc>-s1, <doc>-s2, ... where <doc> is the input path
# with every "/" replaced by "-".
#
# Exit status: 1 on a usage error or when any file failed to convert
# (the remaining files are still processed), 0 otherwise.

set -u

(( $# >= 2 )) || { echo "Usage: $0 target_dir layer files" >&2; exit 1; }
target="$1"; shift
layer="$1"; shift
resources="$(dirname -- "$0")/resources"

# Convert one input file; returns non-zero as soon as any step fails so that
# later steps never touch a missing or stale output file.
# Globals:   target, layer, resources (read)
# Arguments: $1 - input path without the layer suffix
convert_file() {
  local file="$1"
  local out="$target/$file.conllu"
  local doc_id="${file//\//-}"

  treex -Lcs \
    Read::PDT schema_dir="$resources" from="$file.$layer" top_layer="$layer" \
    Write::CoNLLU to="$out" use_tree_id_as_sent_id=1 \
    upos=is_parenthesis_root xpos=tag feats=is_member deprel=afun || return

  # The document id is passed with -v instead of being spliced into the
  # program text, so characters such as quotes, "@" or "&" in a file name
  # cannot change the program.
  gawk -i inplace -v doc_id="$doc_id" '
    BEGIN { s = 1; p = 1; last_pid = "" }

    # The document header precedes everything else in the rewritten file.
    FNR == 1 { print "# newdoc id = " doc_id }

    /^# sent_id = / {
      # The last dash-separated part of the original id is expected to look
      # like p<N>s<M>; ids of any other shape never open a paragraph.
      n = split($4, parts, /-/)
      p_s = parts[n]
      if (p_s ~ /^p[0-9]+s[0-9]+$/) {
        # Everything before the first "s" is the paragraph part (p<N>).
        pid = substr(p_s, 1, index(p_s, "s") - 1)
        if (pid != last_pid) {
          print "# newpar id = " doc_id "-p" p
          p += 1
          last_pid = pid
        }
      }
      # Replace the original id by a running sentence number.
      $4 = doc_id "-s" s
      s += 1
    }

    { print }
  ' "$out" || return

  if [[ "$layer" == m ]]; then
    # On token lines with ten tab-separated columns, a 0 in the seventh
    # column (HEAD in CoNLL-U) becomes "_".
    # NOTE(review): presumably the m layer has no tree structure - confirm.
    sed -i -e '
      s/^\([0-9][0-9]*\t\([^\t]*\t\)\{5\}\)0\(\(\t[^\t]*\)\{3\}\)$/\1_\3/
    ' -- "$out" || return
  fi
}

status=0
for file in "$@"; do
  if ! convert_file "$file"; then
    echo "$0: failed to convert $file" >&2
    status=1
  fi
done
exit "$status"