#! /bin/sh
# pip3 install 'transformers>=4.10.0' seqeval datasets supar==1.1.3
# Fetch the run_ner.py example matching the installed transformers version
test -f run_ner.py || curl -LO https://raw.githubusercontent.com/huggingface/transformers/v$(pip3 list | sed -n 's/^transformers *\([^ ]*\) *$/\1/p')/examples/pytorch/token-classification/run_ner.py

# Split every multi-character token of lzh_kyoto.conllu into single-character
# tokens (non-final characters become "compound" dependents of the character
# that follows; head "-1" is a placeholder meaning the next token), and map
# each glyph to simplified Chinese
python3 -c '
from suparkanbun.simplify import simplify
c=[]
h=[0]
while True:
    try:
        s=input()
    except:
        quit()
    t=s.strip().split("\t")
    if len(t)==10:
        if t[0]!="#":
            t[0]=str(len(c)+1)
            i=len(t[1])
            if i>1:
                form=t[1]
                lemma=t[2]
                head=t[6]
                deprel=t[7]
                for j in range(0,i-1):
                    t[1]=form[j]
                    if t[1] in simplify:
                        t[1]=simplify[t[1]]
                    t[2]=lemma[j]
                    t[6]="-1"
                    t[7]="compound"
                    c.append(list(t))
                    t[0]=str(len(c)+1)
                t[1]=form[i-1]
                t[2]=lemma[i-1]
                t[6]=head
                t[7]=deprel
            if t[1] in simplify:
                t[1]=simplify[t[1]]
            c.append(list(t))
            h.append(len(c))
    elif s.strip()=="":
        for t in c:
            t[6]=str(int(t[0])+1 if t[6]=="-1" else h[int(t[6])])
            print("\t".join(t))
        print("")
        c=[]
        h=[0]
' < lzh_kyoto.conllu | tee simplified.conllu | python3 -c '
# Emit one JSON line per chunk for run_ner.py: one character per token,
# tag = XPOS,UPOS,FEATS
tokens=[]
tags=[]
while True:
    try:
        s=input()
    except:
        if len(tokens)>0:
            print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
        quit()
    t=s.split("\t")
    if len(t)==10:
        p=t[4]+","+t[3]+","+t[5]
        for c in t[1]:
            tokens.append(c)
            tags.append(p)
    elif len(tokens)>80:
        print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
        tokens=[]
        tags=[]
' | tee simplifiedPOS.json | nawk '
{
    if(NR%10>0)
        printf("%s\n",$0)>"trainPOS.json";
    else
        printf("%s\n",$0)>"validPOS.json";
}'
sed 's/^.*"tags":\[//' trainPOS.json | tr '"' '\012' | sort -u | egrep '^[nvps],' > labelPOS.txt

# Fine-tune GuwenBERT for POS tagging on the simplified-character data
if [ ! -f guwenbert-base.pos/pytorch_model.bin ]
then
  mkdir -p guwenbert-base.pos
  python3 run_ner.py --model_name_or_path ethanyt/guwenbert-base --train_file trainPOS.json --validation_file validPOS.json --output_dir guwenbert-base.pos --do_train --do_eval --overwrite_output_dir
fi
if [ ! -f guwenbert-large.pos/pytorch_model.bin ]
then
  mkdir -p guwenbert-large.pos
  python3 run_ner.py --model_name_or_path ethanyt/guwenbert-large --train_file trainPOS.json --validation_file validPOS.json --output_dir guwenbert-large.pos --do_train --do_eval --overwrite_output_dir --per_device_train_batch_size=4 --per_device_eval_batch_size=4
fi
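
# Sanity-check sketch (not part of the pipeline): the fine-tuned tagger is a
# plain token-classification model, so it should load with the standard
# transformers pipeline API; the example sentence is illustrative only.
#   python3 -c '
#   from transformers import pipeline
#   nlp=pipeline("token-classification",model="guwenbert-base.pos")
#   print(nlp("不入虎穴不得虎子"))
#   '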

# Split sentences 1/1/8 into test/dev/train for the dependency parser
nawk '
BEGIN{
    f[0]="test.conllu";
    f[1]="dev.conllu";
    for(i=2;i<10;i++)
        f[i]="train.conllu";
}
{
    printf("%s\n",$0)>f[i%10];
    if($0=="")
        i++;
}' simplified.conllu

# Train biaffine dependency parsers on top of GuwenBERT
if [ ! -f guwenbert-base.pos/guwenbert-base.supar ]
then
  python3 -m supar.cmds.biaffine_dep train -b -d 0 -p guwenbert-base.pos/guwenbert-base.supar -c biaffine-dep-en -f bert --bert ethanyt/guwenbert-base --train train.conllu --dev dev.conllu --test test.conllu --embed='' --proj
fi
if [ ! -f guwenbert-large.pos/guwenbert-large.supar ]
then
  python3 -m supar.cmds.biaffine_dep train -b -d 0 -p guwenbert-large.pos/guwenbert-large.supar -c biaffine-dep-en -f bert --bert ethanyt/guwenbert-large --train train.conllu --dev dev.conllu --test test.conllu --embed='' --proj
fi
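
# Usage sketch (assuming the supar 1.1 API): a trained .supar file loads via
# supar.Parser; passing a pre-tokenized sentence with lang=None skips the
# built-in tokenizer. Illustrative only.
#   python3 -c '
#   from supar import Parser
#   parser=Parser.load("guwenbert-base.pos/guwenbert-base.supar")
#   print(parser.predict([["不","入","虎","穴"]],lang=None))
#   '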

# Character-level sentence-segmentation (danku) data: tag each character as
# B/M/E/S, with E2/E3 marking the second- and third-to-last characters
python3 -c '
tokens=[]
tags=[]
i=0
while True:
    try:
        s=input()
    except:
        if len(tokens)>0:
            print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
        quit()
    t=s.split("\t")
    if len(t)==10:
        for c in t[1]:
            tokens.append(c)
            i+=1
    else:
        if i==1:
            tags.append("S")
        elif i==2:
            tags+=["B","E"]
        elif i==3:
            tags+=["B","E2","E"]
        else:
            tags+=["B"]+["M"]*(i-4)+["E3","E2","E"]
        i=0
        if len(tokens)>80:
            print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
            tokens=[]
            tags=[]
' < simplified.conllu | tee simplifiedDanku.json | nawk '
{
    if(NR%10>0)
        printf("%s\n",$0)>"trainDanku.json";
    else
        printf("%s\n",$0)>"validDanku.json";
}'
sed 's/^.*"tags":\[//' trainDanku.json | tr '"' '\012' | sort -u | egrep '^[A-Z]' > labelDanku.txt

# Fine-tune GuwenBERT for sentence segmentation
if [ ! -f guwenbert-base.danku/pytorch_model.bin ]
then
  mkdir -p guwenbert-base.danku
  python3 run_ner.py --model_name_or_path ethanyt/guwenbert-base --train_file trainDanku.json --validation_file validDanku.json --output_dir guwenbert-base.danku --do_train --do_eval --overwrite_output_dir
fi
if [ ! -f guwenbert-large.danku/pytorch_model.bin ]
then
  mkdir -p guwenbert-large.danku
  python3 run_ner.py --model_name_or_path ethanyt/guwenbert-large --train_file trainDanku.json --validation_file validDanku.json --output_dir guwenbert-large.danku --do_train --do_eval --overwrite_output_dir --per_device_train_batch_size=4 --per_device_eval_batch_size=4
fi
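
# Usage sketch (illustrative): the danku model is again a plain
# token-classification model; unpunctuated text can be split after each
# character tagged E (or S).
#   python3 -c '
#   from transformers import pipeline
#   nlp=pipeline("token-classification",model="guwenbert-base.danku")
#   print([(t["word"],t["entity"]) for t in nlp("不入虎穴不得虎子")])
#   '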

# Repeat the data preparation with the original traditional characters (same
# token splitting, but no simplify mapping); note the >> below, which appends
# to the simplified data so that the RoBERTa-Classical-Chinese models are
# trained on both character sets
python3 -c '
c=[]
h=[0]
while True:
    try:
        s=input()
    except:
        quit()
    t=s.strip().split("\t")
    if len(t)==10:
        if t[0]!="#":
            t[0]=str(len(c)+1)
            i=len(t[1])
            if i>1:
                form=t[1]
                lemma=t[2]
                head=t[6]
                deprel=t[7]
                for j in range(0,i-1):
                    t[1]=form[j]
                    t[2]=lemma[j]
                    t[6]="-1"
                    t[7]="compound"
                    c.append(list(t))
                    t[0]=str(len(c)+1)
                t[1]=form[i-1]
                t[2]=lemma[i-1]
                t[6]=head
                t[7]=deprel
            c.append(list(t))
            h.append(len(c))
    elif s.strip()=="":
        for t in c:
            t[6]=str(int(t[0])+1 if t[6]=="-1" else h[int(t[6])])
            print("\t".join(t))
        print("")
        c=[]
        h=[0]
' < lzh_kyoto.conllu | tee traditional.conllu | python3 -c '
tokens=[]
tags=[]
while True:
    try:
        s=input()
    except:
        if len(tokens)>0:
            print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
        quit()
    t=s.split("\t")
    if len(t)==10:
        p=t[4]+","+t[3]+","+t[5]
        for c in t[1]:
            tokens.append(c)
            tags.append(p)
    elif len(tokens)>80:
        print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
        tokens=[]
        tags=[]
' | tee traditionalPOS.json | nawk '
{
    if(NR%10>0)
        printf("%s\n",$0)>>"trainPOS.json";
    else
        printf("%s\n",$0)>>"validPOS.json";
}'
if [ ! -f roberta-classical-chinese-base-char.pos/pytorch_model.bin ]
then
  mkdir -p roberta-classical-chinese-base-char.pos
  python3 run_ner.py --model_name_or_path KoichiYasuoka/roberta-classical-chinese-base-char --train_file trainPOS.json --validation_file validPOS.json --output_dir roberta-classical-chinese-base-char.pos --do_train --do_eval --overwrite_output_dir
fi
if [ ! -f roberta-classical-chinese-large-char.pos/pytorch_model.bin ]
then
  mkdir -p roberta-classical-chinese-large-char.pos
  python3 run_ner.py --model_name_or_path KoichiYasuoka/roberta-classical-chinese-large-char --train_file trainPOS.json --validation_file validPOS.json --output_dir roberta-classical-chinese-large-char.pos --do_train --do_eval --overwrite_output_dir --per_device_train_batch_size=4 --per_device_eval_batch_size=4
fi

# Append the traditional sentences to the parser splits (same 1/1/8 scheme)
nawk '
BEGIN{
    f[0]="test.conllu";
    f[1]="dev.conllu";
    for(i=2;i<10;i++)
        f[i]="train.conllu";
}
{
    printf("%s\n",$0)>>f[i%10];
    if($0=="")
        i++;
}' traditional.conllu
if [ ! -f roberta-classical-chinese-base-char.pos/roberta-classical-chinese-base-char.supar ]
then
  python3 -m supar.cmds.biaffine_dep train -b -d 0 -p roberta-classical-chinese-base-char.pos/roberta-classical-chinese-base-char.supar -c biaffine-dep-en -f bert --bert KoichiYasuoka/roberta-classical-chinese-base-char --train train.conllu --dev dev.conllu --test test.conllu --embed='' --proj
fi
if [ ! -f roberta-classical-chinese-large-char.pos/roberta-classical-chinese-large-char.supar ]
then
  python3 -m supar.cmds.biaffine_dep train -b -d 0 -p roberta-classical-chinese-large-char.pos/roberta-classical-chinese-large-char.supar -c biaffine-dep-en -f bert --bert KoichiYasuoka/roberta-classical-chinese-large-char --train train.conllu --dev dev.conllu --test test.conllu --embed='' --proj
fi

# Danku data for the traditional characters, appended to the simplified data
python3 -c '
tokens=[]
tags=[]
i=0
while True:
    try:
        s=input()
    except:
        if len(tokens)>0:
            print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
        quit()
    t=s.split("\t")
    if len(t)==10:
        for c in t[1]:
            tokens.append(c)
            i+=1
    else:
        if i==1:
            tags.append("S")
        elif i==2:
            tags+=["B","E"]
        elif i==3:
            tags+=["B","E2","E"]
        else:
            tags+=["B"]+["M"]*(i-4)+["E3","E2","E"]
        i=0
        if len(tokens)>80:
            print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
            tokens=[]
            tags=[]
' < traditional.conllu | tee traditionalDanku.json | nawk '
{
    if(NR%10>0)
        printf("%s\n",$0)>>"trainDanku.json";
    else
        printf("%s\n",$0)>>"validDanku.json";
}'
if [ ! -f roberta-classical-chinese-base-char.danku/pytorch_model.bin ]
then
  mkdir -p roberta-classical-chinese-base-char.danku
  python3 run_ner.py --model_name_or_path KoichiYasuoka/roberta-classical-chinese-base-char --train_file trainDanku.json --validation_file validDanku.json --output_dir roberta-classical-chinese-base-char.danku --do_train --do_eval --overwrite_output_dir
fi
if [ ! -f roberta-classical-chinese-large-char.danku/pytorch_model.bin ]
then
  mkdir -p roberta-classical-chinese-large-char.danku
  python3 run_ner.py --model_name_or_path KoichiYasuoka/roberta-classical-chinese-large-char --train_file trainDanku.json --validation_file validDanku.json --output_dir roberta-classical-chinese-large-char.danku --do_train --do_eval --overwrite_output_dir --per_device_train_batch_size=4 --per_device_eval_batch_size=4
fi

# SikuBERT/SikuRoBERTa are trained on the traditional data only, so rebuild
# the splits from traditionalPOS.json (> truncates the earlier files)
nawk '
{
    if(NR%10>0)
        printf("%s\n",$0)>"trainPOS.json";
    else
        printf("%s\n",$0)>"validPOS.json";
}' traditionalPOS.json
if [ ! -f sikubert.pos/pytorch_model.bin ]
then
  mkdir -p sikubert.pos
  python3 run_ner.py --model_name_or_path SIKU-BERT/sikubert --train_file trainPOS.json --validation_file validPOS.json --output_dir sikubert.pos --do_train --do_eval --overwrite_output_dir
fi
if [ ! -f sikuroberta.pos/pytorch_model.bin ]
then
  mkdir -p sikuroberta.pos
  python3 run_ner.py --model_name_or_path SIKU-BERT/sikuroberta --train_file trainPOS.json --validation_file validPOS.json --output_dir sikuroberta.pos --do_train --do_eval --overwrite_output_dir
fi
nawk '
BEGIN{
    f[0]="test.conllu";
    f[1]="dev.conllu";
    for(i=2;i<10;i++)
        f[i]="train.conllu";
}
{
    printf("%s\n",$0)>f[i%10];
    if($0=="")
        i++;
}' traditional.conllu
if [ ! -f sikubert.pos/sikubert.supar ]
then
  python3 -m supar.cmds.biaffine_dep train -b -d 0 -p sikubert.pos/sikubert.supar -c biaffine-dep-en -f bert --bert SIKU-BERT/sikubert --train train.conllu --dev dev.conllu --test test.conllu --embed='' --proj
fi
if [ ! -f sikuroberta.pos/sikuroberta.supar ]
then
  python3 -m supar.cmds.biaffine_dep train -b -d 0 -p sikuroberta.pos/sikuroberta.supar -c biaffine-dep-en -f bert --bert SIKU-BERT/sikuroberta --train train.conllu --dev dev.conllu --test test.conllu --embed='' --proj
fi
nawk '
{
    if(NR%10>0)
        printf("%s\n",$0)>"trainDanku.json";
    else
        printf("%s\n",$0)>"validDanku.json";
}' traditionalDanku.json
if [ ! -f sikubert.danku/pytorch_model.bin ]
then
  mkdir -p sikubert.danku
  python3 run_ner.py --model_name_or_path SIKU-BERT/sikubert --train_file trainDanku.json --validation_file validDanku.json --output_dir sikubert.danku --do_train --do_eval --overwrite_output_dir
fi
if [ ! -f sikuroberta.danku/pytorch_model.bin ]
then
  mkdir -p sikuroberta.danku
  python3 run_ner.py --model_name_or_path SIKU-BERT/sikuroberta --train_file trainDanku.json --validation_file validDanku.json --output_dir sikuroberta.danku --do_train --do_eval --overwrite_output_dir
fi
exit 0
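
# End-to-end sketch (illustrative, assuming the models above were trained):
# a POS tagger and a biaffine parser can be chained on raw characters.
#   python3 -c '
#   from transformers import pipeline
#   from supar import Parser
#   s="不入虎穴不得虎子"
#   pos=pipeline("token-classification",model="roberta-classical-chinese-base-char.pos")
#   print([(t["word"],t["entity"]) for t in pos(s)])
#   parser=Parser.load("roberta-classical-chinese-base-char.pos/roberta-classical-chinese-base-char.supar")
#   print(parser.predict([list(s)],lang=None))
#   '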