#! /bin/sh
# pip3 install 'transformers>=4.10.0' seqeval datasets supar==1.1.3
test -f run_ner.py || curl -LO https://raw.githubusercontent.com/huggingface/transformers/v`pip3 list | sed -n 's/^transformers *\([^ ]*\) *$/\1/p'`/examples/pytorch/token-classification/run_ner.py
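# split every multi-character word of lzh_kyoto.conllu into single characters,
# simplify them with suparkanbun.simplify, write simplified.conllu, and derive
# character-level POS examples split 9:1 into trainPOS.json and validPOS.json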
python3 -c '
from suparkanbun.simplify import simplify
c=[]
h=[0]
while True:
  try:
    s=input()
  except:
    quit()
  t=s.strip().split("\t")
  if len(t)==10:
    if t[0]!="#":
      t[0]=str(len(c)+1)
      i=len(t[1])
      if i>1:
        form=t[1]
        lemma=t[2]
        head=t[6]
        deprel=t[7]
        for j in range(0,i-1):
          # each non-final character attaches to the following character as compound
          t[1]=form[j]
          if t[1] in simplify:
            t[1]=simplify[t[1]]
          t[2]=lemma[j]
          t[6]="-1"
          t[7]="compound"
          c.append(list(t))
          t[0]=str(len(c)+1)
        # the final character keeps the original head and deprel
        t[1]=form[i-1]
        t[2]=lemma[i-1]
        t[6]=head
        t[7]=deprel
      if t[1] in simplify:
        t[1]=simplify[t[1]]
      c.append(list(t))
      h.append(len(c))
  elif s.strip()=="":
    # end of sentence: resolve heads against h and print the character-level tokens
    for t in c:
      t[6]=str(int(t[0])+1 if t[6]=="-1" else h[int(t[6])])
      print("\t".join(t))
    print("")
    c=[]
    h=[0]
' < lzh_kyoto.conllu | tee simplified.conllu | python3 -c '
tokens=[]
tags=[]
while True:
  try:
    s=input()
  except:
    if len(tokens)>0:
      print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
    quit()
  t=s.split("\t")
  if len(t)==10:
    # tag is XPOS,UPOS,FEATS
    p=t[4]+","+t[3]+","+t[5]
    for c in t[1]:
      tokens.append(c)
      tags.append(p)
  elif len(tokens)>80:
    # flush an example at a sentence boundary once it exceeds 80 characters
    print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
    tokens=[]
    tags=[]
' | tee simplifiedPOS.json | nawk '
{
  if(NR%10>0)
    printf("%s\n",$0)>"trainPOS.json";
  else
    printf("%s\n",$0)>"validPOS.json";
}'
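# collect the POS label set and fine-tune ethanyt/guwenbert-base and
# guwenbert-large as character-level POS taggers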
sed 's/^.*"tags":\[//' trainPOS.json | tr '"' '\012' | sort -u | egrep '^[nvps],' > labelPOS.txt
if [ ! -f guwenbert-base.pos/pytorch_model.bin ]
then mkdir -p guwenbert-base.pos
python3 run_ner.py --model_name_or_path ethanyt/guwenbert-base --train_file trainPOS.json --validation_file validPOS.json --output_dir guwenbert-base.pos --do_train --do_eval --overwrite_output_dir
fi
if [ ! -f guwenbert-large.pos/pytorch_model.bin ]
then mkdir -p guwenbert-large.pos
python3 run_ner.py --model_name_or_path ethanyt/guwenbert-large --train_file trainPOS.json --validation_file validPOS.json --output_dir guwenbert-large.pos --do_train --do_eval --overwrite_output_dir --per_device_train_batch_size=4 --per_device_eval_batch_size=4
fi
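# split simplified.conllu 8:1:1 into train/dev/test CoNLL-U files for dependency parsing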
nawk '
BEGIN{
  f[0]="test.conllu";
  f[1]="dev.conllu";
  for(i=2;i<10;i++)
    f[i]="train.conllu";
}
{
  printf("%s\n",$0)>f[i%10];
  if($0=="")
    i++;
}' simplified.conllu
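# train SuPar biaffine dependency parsers on top of the guwenbert models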
if [ ! -f guwenbert-base.pos/guwenbert-base.supar ]
then python3 -m supar.cmds.biaffine_dep train -b -d 0 -p guwenbert-base.pos/guwenbert-base.supar -c biaffine-dep-en -f bert --bert ethanyt/guwenbert-base --train train.conllu --dev dev.conllu --test test.conllu --embed='' --proj
fi
if [ ! -f guwenbert-large.pos/guwenbert-large.supar ]
then python3 -m supar.cmds.biaffine_dep train -b -d 0 -p guwenbert-large.pos/guwenbert-large.supar -c biaffine-dep-en -f bert --bert ethanyt/guwenbert-large --train train.conllu --dev dev.conllu --test test.conllu --embed='' --proj
fi
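# derive sentence-segmentation (danku) examples from simplified.conllu and
# split them 9:1 into trainDanku.json and validDanku.json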
python3 -c '
tokens=[]
tags=[]
i=0
while True:
  try:
    s=input()
  except:
    if len(tokens)>0:
      print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
    quit()
  t=s.split("\t")
  if len(t)==10:
    for c in t[1]:
      tokens.append(c)
      i+=1
  else:
    # blank line: tag the i characters of the finished sentence as
    # B/M/E3/E2/E, or S for a one-character sentence
    if i==1:
      tags.append("S")
    elif i==2:
      tags+=["B","E"]
    elif i==3:
      tags+=["B","E2","E"]
    else:
      tags+=["B"]+["M"]*(i-4)+["E3","E2","E"]
    i=0
    if len(tokens)>80:
      print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
      tokens=[]
      tags=[]
' < simplified.conllu | tee simplifiedDanku.json | nawk '
{
  if(NR%10>0)
    printf("%s\n",$0)>"trainDanku.json";
  else
    printf("%s\n",$0)>"validDanku.json";
}'
sed 's/^.*"tags":\[//' trainDanku.json | tr '"' '\012' | sort -u | egrep '^[A-Z]' > labelDanku.txt
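# fine-tune the guwenbert models for sentence segmentation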
if [ ! -f guwenbert-base.danku/pytorch_model.bin ]
then mkdir -p guwenbert-base.danku
python3 run_ner.py --model_name_or_path ethanyt/guwenbert-base --train_file trainDanku.json --validation_file validDanku.json --output_dir guwenbert-base.danku --do_train --do_eval --overwrite_output_dir
fi
if [ ! -f guwenbert-large.danku/pytorch_model.bin ]
then mkdir -p guwenbert-large.danku
python3 run_ner.py --model_name_or_path ethanyt/guwenbert-large --train_file trainDanku.json --validation_file validDanku.json --output_dir guwenbert-large.danku --do_train --do_eval --overwrite_output_dir --per_device_train_batch_size=4 --per_device_eval_batch_size=4
fi
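# repeat the word-to-character conversion without simplification, write
# traditional.conllu, and append the traditional-character POS examples
# to trainPOS.json and validPOS.json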
python3 -c '
c=[]
h=[0]
while True:
  try:
    s=input()
  except:
    quit()
  t=s.strip().split("\t")
  if len(t)==10:
    if t[0]!="#":
      t[0]=str(len(c)+1)
      i=len(t[1])
      if i>1:
        form=t[1]
        lemma=t[2]
        head=t[6]
        deprel=t[7]
        for j in range(0,i-1):
          t[1]=form[j]
          t[2]=lemma[j]
          t[6]="-1"
          t[7]="compound"
          c.append(list(t))
          t[0]=str(len(c)+1)
        t[1]=form[i-1]
        t[2]=lemma[i-1]
        t[6]=head
        t[7]=deprel
      c.append(list(t))
      h.append(len(c))
  elif s.strip()=="":
    for t in c:
      t[6]=str(int(t[0])+1 if t[6]=="-1" else h[int(t[6])])
      print("\t".join(t))
    print("")
    c=[]
    h=[0]
' < lzh_kyoto.conllu | tee traditional.conllu | python3 -c '
tokens=[]
tags=[]
while True:
  try:
    s=input()
  except:
    if len(tokens)>0:
      print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
    quit()
  t=s.split("\t")
  if len(t)==10:
    p=t[4]+","+t[3]+","+t[5]
    for c in t[1]:
      tokens.append(c)
      tags.append(p)
  elif len(tokens)>80:
    print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
    tokens=[]
    tags=[]
' | tee traditionalPOS.json | nawk '
{
  if(NR%10>0)
    printf("%s\n",$0)>>"trainPOS.json";
  else
    printf("%s\n",$0)>>"validPOS.json";
}'
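# fine-tune the roberta-classical-chinese models as POS taggers on the
# combined simplified+traditional data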
if [ ! -f roberta-classical-chinese-base-char.pos/pytorch_model.bin ]
then mkdir -p roberta-classical-chinese-base-char.pos
python3 run_ner.py --model_name_or_path KoichiYasuoka/roberta-classical-chinese-base-char --train_file trainPOS.json --validation_file validPOS.json --output_dir roberta-classical-chinese-base-char.pos --do_train --do_eval --overwrite_output_dir
fi
if [ ! -f roberta-classical-chinese-large-char.pos/pytorch_model.bin ]
then mkdir -p roberta-classical-chinese-large-char.pos
python3 run_ner.py --model_name_or_path KoichiYasuoka/roberta-classical-chinese-large-char --train_file trainPOS.json --validation_file validPOS.json --output_dir roberta-classical-chinese-large-char.pos --do_train --do_eval --overwrite_output_dir --per_device_train_batch_size=4 --per_device_eval_batch_size=4
fi
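# append the traditional-character sentences to the parser train/dev/test splits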
nawk '
BEGIN{
  f[0]="test.conllu";
  f[1]="dev.conllu";
  for(i=2;i<10;i++)
    f[i]="train.conllu";
}
{
  printf("%s\n",$0)>>f[i%10];
  if($0=="")
    i++;
}' traditional.conllu
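# train SuPar biaffine dependency parsers on top of the roberta-classical-chinese models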
if [ ! -f roberta-classical-chinese-base-char.pos/roberta-classical-chinese-base-char.supar ]
then python3 -m supar.cmds.biaffine_dep train -b -d 0 -p roberta-classical-chinese-base-char.pos/roberta-classical-chinese-base-char.supar -c biaffine-dep-en -f bert --bert KoichiYasuoka/roberta-classical-chinese-base-char --train train.conllu --dev dev.conllu --test test.conllu --embed='' --proj
fi
if [ ! -f roberta-classical-chinese-large-char.pos/roberta-classical-chinese-large-char.supar ]
then python3 -m supar.cmds.biaffine_dep train -b -d 0 -p roberta-classical-chinese-large-char.pos/roberta-classical-chinese-large-char.supar -c biaffine-dep-en -f bert --bert KoichiYasuoka/roberta-classical-chinese-large-char --train train.conllu --dev dev.conllu --test test.conllu --embed='' --proj
fi
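# derive traditional-character sentence-segmentation examples and append them
# to trainDanku.json and validDanku.json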
python3 -c '
tokens=[]
tags=[]
i=0
while True:
  try:
    s=input()
  except:
    if len(tokens)>0:
      print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
    quit()
  t=s.split("\t")
  if len(t)==10:
    for c in t[1]:
      tokens.append(c)
      i+=1
  else:
    if i==1:
      tags.append("S")
    elif i==2:
      tags+=["B","E"]
    elif i==3:
      tags+=["B","E2","E"]
    else:
      tags+=["B"]+["M"]*(i-4)+["E3","E2","E"]
    i=0
    if len(tokens)>80:
      print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
      tokens=[]
      tags=[]
' < traditional.conllu | tee traditionalDanku.json | nawk '
{
  if(NR%10>0)
    printf("%s\n",$0)>>"trainDanku.json";
  else
    printf("%s\n",$0)>>"validDanku.json";
}'
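# fine-tune the roberta-classical-chinese models for sentence segmentation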
if [ ! -f roberta-classical-chinese-base-char.danku/pytorch_model.bin ]
then mkdir -p roberta-classical-chinese-base-char.danku
python3 run_ner.py --model_name_or_path KoichiYasuoka/roberta-classical-chinese-base-char --train_file trainDanku.json --validation_file validDanku.json --output_dir roberta-classical-chinese-base-char.danku --do_train --do_eval --overwrite_output_dir
fi
if [ ! -f roberta-classical-chinese-large-char.danku/pytorch_model.bin ]
then mkdir -p roberta-classical-chinese-large-char.danku
python3 run_ner.py --model_name_or_path KoichiYasuoka/roberta-classical-chinese-large-char --train_file trainDanku.json --validation_file validDanku.json --output_dir roberta-classical-chinese-large-char.danku --do_train --do_eval --overwrite_output_dir --per_device_train_batch_size=4 --per_device_eval_batch_size=4
fi
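# rebuild trainPOS.json/validPOS.json from the traditional-character data only,
# for the SIKU-BERT models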
nawk '
{
  if(NR%10>0)
    printf("%s\n",$0)>"trainPOS.json";
  else
    printf("%s\n",$0)>"validPOS.json";
}' traditionalPOS.json
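# fine-tune SIKU-BERT/sikubert and SIKU-BERT/sikuroberta as POS taggers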
if [ ! -f sikubert.pos/pytorch_model.bin ]
then mkdir -p sikubert.pos
python3 run_ner.py --model_name_or_path SIKU-BERT/sikubert --train_file trainPOS.json --validation_file validPOS.json --output_dir sikubert.pos --do_train --do_eval --overwrite_output_dir
fi
if [ ! -f sikuroberta.pos/pytorch_model.bin ]
then mkdir -p sikuroberta.pos
python3 run_ner.py --model_name_or_path SIKU-BERT/sikuroberta --train_file trainPOS.json --validation_file validPOS.json --output_dir sikuroberta.pos --do_train --do_eval --overwrite_output_dir
fi
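# rebuild the parser train/dev/test splits from traditional.conllu only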
nawk '
BEGIN{
  f[0]="test.conllu";
  f[1]="dev.conllu";
  for(i=2;i<10;i++)
    f[i]="train.conllu";
}
{
  printf("%s\n",$0)>f[i%10];
  if($0=="")
    i++;
}' traditional.conllu
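# train SuPar biaffine dependency parsers on top of sikubert and sikuroberta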
if [ ! -f sikubert.pos/sikubert.supar ]
then python3 -m supar.cmds.biaffine_dep train -b -d 0 -p sikubert.pos/sikubert.supar -c biaffine-dep-en -f bert --bert SIKU-BERT/sikubert --train train.conllu --dev dev.conllu --test test.conllu --embed='' --proj
fi
if [ ! -f sikuroberta.pos/sikuroberta.supar ]
then python3 -m supar.cmds.biaffine_dep train -b -d 0 -p sikuroberta.pos/sikuroberta.supar -c biaffine-dep-en -f bert --bert SIKU-BERT/sikuroberta --train train.conllu --dev dev.conllu --test test.conllu --embed='' --proj
fi
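# rebuild trainDanku.json/validDanku.json from the traditional-character data only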
nawk '
{
  if(NR%10>0)
    printf("%s\n",$0)>"trainDanku.json";
  else
    printf("%s\n",$0)>"validDanku.json";
}' traditionalDanku.json
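# fine-tune sikubert and sikuroberta for sentence segmentation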
if [ ! -f sikubert.danku/pytorch_model.bin ]
then mkdir -p sikubert.danku
python3 run_ner.py --model_name_or_path SIKU-BERT/sikubert --train_file trainDanku.json --validation_file validDanku.json --output_dir sikubert.danku --do_train --do_eval --overwrite_output_dir
fi
if [ ! -f sikuroberta.danku/pytorch_model.bin ]
then mkdir -p sikuroberta.danku
python3 run_ner.py --model_name_or_path SIKU-BERT/sikuroberta --train_file trainDanku.json --validation_file validDanku.json --output_dir sikuroberta.danku --do_train --do_eval --overwrite_output_dir
fi
exit 0