KoichiYasuoka committed on
Commit 8abcc73
1 Parent(s): 4758dea

re-initialize

suparkanbun/models/gloss.orig.txt ADDED
The diff for this file is too large to render. See raw diff
 
suparkanbun/models/labelPOS.txt ADDED
@@ -0,0 +1,129 @@
+ n,代名詞,人称,他,PRON,Person=1|PronType=Prs
+ n,代名詞,人称,他,PRON,Person=2|PronType=Prs
+ n,代名詞,人称,他,PRON,Person=3|PronType=Prs
+ n,代名詞,人称,他,PRON,PronType=Prs
+ n,代名詞,人称,他,PRON,PronType=Prs|Reflex=Yes
+ n,代名詞,人称,止格,PRON,Person=1|PronType=Prs
+ n,代名詞,人称,止格,PRON,Person=2|PronType=Prs
+ n,代名詞,人称,止格,PRON,Person=3|PronType=Prs
+ n,代名詞,人称,止格,PRON,PronType=Prs
+ n,代名詞,人称,起格,PRON,Person=1|PronType=Prs
+ n,代名詞,人称,起格,PRON,Person=2|PronType=Prs
+ n,代名詞,人称,起格,PRON,Person=3|PronType=Prs
+ n,代名詞,人称,起格,PRON,PronType=Prs
+ n,代名詞,指示,*,PRON,PronType=Dem
+ n,代名詞,疑問,*,PRON,PronType=Int
+ n,名詞,不可譲,属性,NOUN,_
+ n,名詞,不可譲,疾病,NOUN,_
+ n,名詞,不可譲,身体,NOUN,_
+ n,名詞,主体,動物,NOUN,_
+ n,名詞,主体,国名,PROPN,Case=Loc|NameType=Nat
+ n,名詞,主体,書物,NOUN,_
+ n,名詞,主体,機関,NOUN,_
+ n,名詞,主体,集団,NOUN,_
+ n,名詞,人,その他の人名,PROPN,NameType=Prs
+ n,名詞,人,人,NOUN,_
+ n,名詞,人,名,PROPN,NameType=Giv
+ n,名詞,人,姓氏,PROPN,NameType=Sur
+ n,名詞,人,役割,NOUN,_
+ n,名詞,人,複合的人名,PROPN,NameType=Prs
+ n,名詞,人,関係,NOUN,_
+ n,名詞,制度,儀礼,NOUN,_
+ n,名詞,制度,場,NOUN,Case=Loc
+ n,名詞,可搬,乗り物,NOUN,_
+ n,名詞,可搬,伝達,NOUN,_
+ n,名詞,可搬,成果物,NOUN,_
+ n,名詞,可搬,糧食,NOUN,_
+ n,名詞,可搬,道具,NOUN,_
+ n,名詞,固定物,地名,PROPN,Case=Loc|NameType=Geo
+ n,名詞,固定物,地形,NOUN,Case=Loc
+ n,名詞,固定物,建造物,NOUN,Case=Loc
+ n,名詞,固定物,樹木,NOUN,_
+ n,名詞,固定物,関係,NOUN,Case=Loc
+ n,名詞,外観,人,NOUN,_
+ n,名詞,天象,天文,NOUN,_
+ n,名詞,天象,怪異,NOUN,_
+ n,名詞,天象,気象,NOUN,_
+ n,名詞,度量衡,*,NOUN,NounType=Clf
+ n,名詞,思考,*,NOUN,_
+ n,名詞,描写,形質,NOUN,_
+ n,名詞,描写,態度,NOUN,_
+ n,名詞,数量,*,NOUN,_
+ n,名詞,時,*,NOUN,Case=Tem
+ n,名詞,行為,*,NOUN,_
+ n,数詞,干支,*,NUM,NumType=Ord
+ n,数詞,数,*,NUM,_
+ n,数詞,数字,*,NUM,_
+ p,助詞,句末,*,PART,_
+ p,助詞,句頭,*,PART,_
+ p,助詞,接続,並列,CCONJ,_
+ p,助詞,接続,体言化,PART,_
+ p,助詞,接続,属格,SCONJ,_
+ p,助詞,提示,*,PART,_
+ p,感嘆詞,*,*,INTJ,_
+ p,接尾辞,*,*,PART,_
+ s,文字,*,*,SYM,_
+ s,記号,一般,*,SYM,_
+ s,記号,句点,*,PUNCT,_
+ s,記号,読点,*,PUNCT,_
+ v,前置詞,基盤,*,ADP,_
+ v,前置詞,源泉,*,ADP,_
+ v,前置詞,経由,*,ADP,_
+ v,前置詞,関係,*,ADP,_
+ v,副詞,判断,推定,ADV,_
+ v,副詞,判断,確定,ADV,_
+ v,副詞,判断,逆接,ADV,_
+ v,副詞,否定,体言否定,ADV,Polarity=Neg
+ v,副詞,否定,有界,ADV,Polarity=Neg
+ v,副詞,否定,無界,ADV,Polarity=Neg
+ v,副詞,否定,禁止,ADV,Polarity=Neg
+ v,副詞,描写,*,ADV,_
+ v,副詞,時相,変化,ADV,AdvType=Tim
+ v,副詞,時相,完了,ADV,AdvType=Tim|Aspect=Perf
+ v,副詞,時相,将来,ADV,AdvType=Tim|Tense=Fut
+ v,副詞,時相,恒常,ADV,AdvType=Tim
+ v,副詞,時相,現在,ADV,AdvType=Tim|Tense=Pres
+ v,副詞,時相,終局,ADV,AdvType=Tim
+ v,副詞,時相,継起,ADV,AdvType=Tim
+ v,副詞,時相,緊接,ADV,AdvType=Tim
+ v,副詞,時相,過去,ADV,AdvType=Tim|Tense=Past
+ v,副詞,疑問,原因,ADV,AdvType=Cau
+ v,副詞,疑問,反語,ADV,_
+ v,副詞,疑問,所在,ADV,_
+ v,副詞,程度,やや高度,ADV,AdvType=Deg|Degree=Cmp
+ v,副詞,程度,極度,ADV,AdvType=Deg|Degree=Sup
+ v,副詞,程度,軽度,ADV,AdvType=Deg|Degree=Pos
+ v,副詞,範囲,共同,ADV,_
+ v,副詞,範囲,総括,ADV,_
+ v,副詞,範囲,限定,ADV,_
+ v,副詞,頻度,偶発,ADV,_
+ v,副詞,頻度,重複,ADV,_
+ v,副詞,頻度,頻繁,ADV,_
+ v,助動詞,受動,*,AUX,Voice=Pass
+ v,助動詞,可能,*,AUX,Mood=Pot
+ v,助動詞,必要,*,AUX,Mood=Nec
+ v,助動詞,願望,*,AUX,Mood=Des
+ v,動詞,変化,制度,VERB,_
+ v,動詞,変化,性質,VERB,_
+ v,動詞,変化,生物,VERB,_
+ v,動詞,存在,存在,VERB,Polarity=Neg
+ v,動詞,存在,存在,VERB,VerbType=Cop
+ v,動詞,存在,存在,VERB,_
+ v,動詞,描写,境遇,VERB,Degree=Pos
+ v,動詞,描写,形質,VERB,Degree=Pos
+ v,動詞,描写,態度,VERB,Degree=Pos
+ v,動詞,描写,量,VERB,Degree=Pos
+ v,動詞,行為,交流,VERB,_
+ v,動詞,行為,伝達,VERB,_
+ v,動詞,行為,使役,VERB,_
+ v,動詞,行為,儀礼,VERB,_
+ v,動詞,行為,分類,VERB,Degree=Equ
+ v,動詞,行為,動作,VERB,_
+ v,動詞,行為,姿勢,VERB,_
+ v,動詞,行為,役割,VERB,_
+ v,動詞,行為,得失,VERB,_
+ v,動詞,行為,態度,VERB,_
+ v,動詞,行為,生産,VERB,_
+ v,動詞,行為,移動,VERB,_
+ v,動詞,行為,設置,VERB,_
+ v,動詞,行為,飲食,VERB,_
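Each line of labelPOS.txt is one character-level POS label of the form XPOS,UPOS,FEATS: the XPOS part is itself a comma-separated Kanbun tag (a class letter n/p/s/v followed by up to three sub-categories), UPOS is the Universal Dependencies part of speech, and FEATS is the UD feature string. As a minimal sketch (illustrative only, not part of this commit), such a line can be split back into its three parts by cutting at the last two commas only:

# illustrative only, not part of the repository
def parse_label(line):
    xpos, upos, feats = line.rstrip("\n").rsplit(",", 2)
    return xpos, upos, feats

print(parse_label("n,代名詞,人称,他,PRON,Person=1|PronType=Prs"))
# ('n,代名詞,人称,他', 'PRON', 'Person=1|PronType=Prs')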
suparkanbun/models/lzh_kyoto.conllu ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:86c46887798cd5d93f500ef99674897876501177366ad2b3e4ad861f3a362beb
+ size 24744523
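lzh_kyoto.conllu is committed as a Git LFS pointer, so the three lines above record only the LFS spec version, the SHA-256 object id, and the size (24744523 bytes) of the actual corpus file. A minimal sketch (assuming the real file has already been pulled with git-lfs) for checking that a local copy matches the pointer:

# illustrative only, not part of the repository
import hashlib, os
path = "suparkanbun/models/lzh_kyoto.conllu"
h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
print(os.path.getsize(path) == 24744523 and
      h.hexdigest() == "86c46887798cd5d93f500ef99674897876501177366ad2b3e4ad861f3a362beb")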
suparkanbun/models/mkmodel.sh ADDED
@@ -0,0 +1,341 @@
+ #! /bin/sh
+ # pip3 install transformers seqeval datasets supar
+ test -f run_ner.py || curl -LO https://raw.githubusercontent.com/huggingface/transformers/v4.0.1/examples/token-classification/run_ner.py
+
+ python3 -c '
+ from suparkanbun.simplify import simplify
+ c=[]
+ h=[0]
+ while True:
+   try:
+     s=input()
+   except:
+     quit()
+   t=s.strip().split("\t")
+   if len(t)==10:
+     if t[0]!="#":
+       t[0]=str(len(c)+1)
+       i=len(t[1])
+       if i>1:
+         form=t[1]
+         lemma=t[2]
+         head=t[6]
+         deprel=t[7]
+         for j in range(0,i-1):
+           t[1]=form[j]
+           if t[1] in simplify:
+             t[1]=simplify[t[1]]
+           t[2]=lemma[j]
+           t[6]="-1"
+           t[7]="compound"
+           c.append(list(t))
+           t[0]=str(len(c)+1)
+         t[1]=form[i-1]
+         t[2]=lemma[i-1]
+         t[6]=head
+         t[7]=deprel
+       if t[1] in simplify:
+         t[1]=simplify[t[1]]
+       c.append(list(t))
+       h.append(len(c))
+   elif s.strip()=="":
+     for t in c:
+       t[6]=str(int(t[0])+1 if t[6]=="-1" else h[int(t[6])])
+       print("\t".join(t))
+     print("")
+     c=[]
+     h=[0]
+ ' < lzh_kyoto.conllu | tee simplified.conllu | python3 -c '
+ tokens=[]
+ tags=[]
+ while True:
+   try:
+     s=input()
+   except:
+     if len(tokens)>0:
+       print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
+     quit()
+   t=s.split("\t")
+   if len(t)==10:
+     p=t[4]+","+t[3]+","+t[5]
+     for c in t[1]:
+       tokens.append(c)
+       tags.append(p)
+   elif len(tokens)>80:
+     print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
+     tokens=[]
+     tags=[]
+ ' | tee simplifiedPOS.json | nawk '
+ {
+ if(NR%10>0)
+ printf("%s\n",$0)>"trainPOS.json";
+ else
+ printf("%s\n",$0)>"validPOS.json";
+ }'
+ sed 's/^.*"tags":\[//' trainPOS.json | tr '"' '\012' | sort -u | egrep '^[nvps],' > labelPOS.txt
+ if [ ! -d guwenbert-base.pos ]
+ then mkdir -p guwenbert-base.pos
+ python3 run_ner.py --model_name_or_path ethanyt/guwenbert-base --train_file trainPOS.json --validation_file validPOS.json --output_dir guwenbert-base.pos --do_train --do_eval
+ fi
+ if [ ! -d guwenbert-large.pos ]
+ then mkdir -p guwenbert-large.pos
+ python3 run_ner.py --model_name_or_path ethanyt/guwenbert-large --train_file trainPOS.json --validation_file validPOS.json --output_dir guwenbert-large.pos --do_train --do_eval
+ fi
+
+ nawk '
+ BEGIN{
+ f[0]="test.conllu";
+ f[1]="dev.conllu";
+ for(i=2;i<10;i++)
+ f[i]="train.conllu";
+ }
+ {
+ printf("%s\n",$0)>f[i%10];
+ if($0=="")
+ i++;
+ }' simplified.conllu
+ if [ ! -f guwenbert-base.pos/guwenbert-base.supar ]
+ then python3 -m supar.cmds.biaffine_dep train -b -d 0 -p guwenbert-base.pos/guwenbert-base.supar -c biaffine-dep-en -f bert --bert ethanyt/guwenbert-base --train train.conllu --dev dev.conllu --test test.conllu --embed='' --proj
+ fi
+ if [ ! -f guwenbert-large.pos/guwenbert-large.supar ]
+ then python3 -m supar.cmds.biaffine_dep train -b -d 0 -p guwenbert-large.pos/guwenbert-large.supar -c biaffine-dep-en -f bert --bert ethanyt/guwenbert-large --train train.conllu --dev dev.conllu --test test.conllu --embed='' --proj
+ fi
+
+ python3 -c '
+ tokens=[]
+ tags=[]
+ i=0
+ while True:
+   try:
+     s=input()
+   except:
+     if len(tokens)>0:
+       print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
+     quit()
+   t=s.split("\t")
+   if len(t)==10:
+     for c in t[1]:
+       tokens.append(c)
+       i+=1
+   else:
+     if i==1:
+       tags.append("S")
+     elif i==2:
+       tags+=["B","E"]
+     elif i==3:
+       tags+=["B","E2","E"]
+     else:
+       tags+=["B"]+["M"]*(i-4)+["E3","E2","E"]
+     i=0
+     if len(tokens)>80:
+       print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
+       tokens=[]
+       tags=[]
+ ' < simplified.conllu | tee simplifiedDanku.json | nawk '
+ {
+ if(NR%10>0)
+ printf("%s\n",$0)>"trainDanku.json";
+ else
+ printf("%s\n",$0)>"validDanku.json";
+ }'
+ sed 's/^.*"tags":\[//' trainDanku.json | tr '"' '\012' | sort -u | egrep '^[A-Z]' > labelDanku.txt
+ if [ ! -d guwenbert-base.danku ]
+ then mkdir -p guwenbert-base.danku
+ python3 run_ner.py --model_name_or_path ethanyt/guwenbert-base --train_file trainDanku.json --validation_file validDanku.json --output_dir guwenbert-base.danku --do_train --do_eval
+ fi
+ if [ ! -d guwenbert-large.danku ]
+ then mkdir -p guwenbert-large.danku
+ python3 run_ner.py --model_name_or_path ethanyt/guwenbert-large --train_file trainDanku.json --validation_file validDanku.json --output_dir guwenbert-large.danku --do_train --do_eval
+ fi
+
+ python3 -c '
+ c=[]
+ h=[0]
+ while True:
+   try:
+     s=input()
+   except:
+     quit()
+   t=s.strip().split("\t")
+   if len(t)==10:
+     if t[0]!="#":
+       t[0]=str(len(c)+1)
+       i=len(t[1])
+       if i>1:
+         form=t[1]
+         lemma=t[2]
+         head=t[6]
+         deprel=t[7]
+         for j in range(0,i-1):
+           t[1]=form[j]
+           t[2]=lemma[j]
+           t[6]="-1"
+           t[7]="compound"
+           c.append(list(t))
+           t[0]=str(len(c)+1)
+         t[1]=form[i-1]
+         t[2]=lemma[i-1]
+         t[6]=head
+         t[7]=deprel
+       c.append(list(t))
+       h.append(len(c))
+   elif s.strip()=="":
+     for t in c:
+       t[6]=str(int(t[0])+1 if t[6]=="-1" else h[int(t[6])])
+       print("\t".join(t))
+     print("")
+     c=[]
+     h=[0]
+ ' < lzh_kyoto.conllu | tee traditional.conllu | python3 -c '
+ tokens=[]
+ tags=[]
+ while True:
+   try:
+     s=input()
+   except:
+     if len(tokens)>0:
+       print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
+     quit()
+   t=s.split("\t")
+   if len(t)==10:
+     p=t[4]+","+t[3]+","+t[5]
+     for c in t[1]:
+       tokens.append(c)
+       tags.append(p)
+   elif len(tokens)>80:
+     print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
+     tokens=[]
+     tags=[]
+ ' | tee traditionalPOS.json | nawk '
+ {
+ if(NR%10>0)
+ printf("%s\n",$0)>>"trainPOS.json";
+ else
+ printf("%s\n",$0)>>"validPOS.json";
+ }'
+ if [ ! -d roberta-classical-chinese-base-char.pos ]
+ then mkdir -p roberta-classical-chinese-base-char.pos
+ python3 run_ner.py --model_name_or_path KoichiYasuoka/roberta-classical-chinese-base-char --train_file trainPOS.json --validation_file validPOS.json --output_dir roberta-classical-chinese-base-char.pos --do_train --do_eval
+ fi
+ if [ ! -d roberta-classical-chinese-large-char.pos ]
+ then mkdir -p roberta-classical-chinese-large-char.pos
+ python3 run_ner.py --model_name_or_path KoichiYasuoka/roberta-classical-chinese-large-char --train_file trainPOS.json --validation_file validPOS.json --output_dir roberta-classical-chinese-large-char.pos --do_train --do_eval
+ fi
+
+ nawk '
+ BEGIN{
+ f[0]="test.conllu";
+ f[1]="dev.conllu";
+ for(i=2;i<10;i++)
+ f[i]="train.conllu";
+ }
+ {
+ printf("%s\n",$0)>>f[i%10];
+ if($0=="")
+ i++;
+ }' traditional.conllu
+ if [ ! -f roberta-classical-chinese-base-char.pos/roberta-classical-chinese-base-char.supar ]
+ then python3 -m supar.cmds.biaffine_dep train -b -d 0 -p roberta-classical-chinese-base-char.pos/roberta-classical-chinese-base-char.supar -c biaffine-dep-en -f bert --bert KoichiYasuoka/roberta-classical-chinese-base-char --train train.conllu --dev dev.conllu --test test.conllu --embed='' --proj
+ fi
+ if [ ! -f roberta-classical-chinese-large-char.pos/roberta-classical-chinese-large-char.supar ]
+ then python3 -m supar.cmds.biaffine_dep train -b -d 0 -p roberta-classical-chinese-large-char.pos/roberta-classical-chinese-large-char.supar -c biaffine-dep-en -f bert --bert KoichiYasuoka/roberta-classical-chinese-large-char --train train.conllu --dev dev.conllu --test test.conllu --embed='' --proj
+ fi
+
+ python3 -c '
+ tokens=[]
+ tags=[]
+ i=0
+ while True:
+   try:
+     s=input()
+   except:
+     if len(tokens)>0:
+       print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
+     quit()
+   t=s.split("\t")
+   if len(t)==10:
+     for c in t[1]:
+       tokens.append(c)
+       i+=1
+   else:
+     if i==1:
+       tags.append("S")
+     elif i==2:
+       tags+=["B","E"]
+     elif i==3:
+       tags+=["B","E2","E"]
+     else:
+       tags+=["B"]+["M"]*(i-4)+["E3","E2","E"]
+     i=0
+     if len(tokens)>80:
+       print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
+       tokens=[]
+       tags=[]
+ ' < traditional.conllu | tee traditionalDanku.json | nawk '
+ {
+ if(NR%10>0)
+ printf("%s\n",$0)>>"trainDanku.json";
+ else
+ printf("%s\n",$0)>>"validDanku.json";
+ }'
+ if [ ! -d roberta-classical-chinese-base-char.danku ]
+ then mkdir -p roberta-classical-chinese-base-char.danku
+ python3 run_ner.py --model_name_or_path KoichiYasuoka/roberta-classical-chinese-base-char --train_file trainDanku.json --validation_file validDanku.json --output_dir roberta-classical-chinese-base-char.danku --do_train --do_eval
+ fi
+ if [ ! -d roberta-classical-chinese-large-char.danku ]
+ then mkdir -p roberta-classical-chinese-large-char.danku
+ python3 run_ner.py --model_name_or_path KoichiYasuoka/roberta-classical-chinese-large-char --train_file trainDanku.json --validation_file validDanku.json --output_dir roberta-classical-chinese-large-char.danku --do_train --do_eval
+ fi
+
+ nawk '
+ {
+ if(NR%10>0)
+ printf("%s\n",$0)>"trainPOS.json";
+ else
+ printf("%s\n",$0)>"validPOS.json";
+ }' traditionalPOS.json
+ if [ ! -d sikubert.pos ]
+ then mkdir -p sikubert.pos
+ python3 run_ner.py --model_name_or_path SIKU-BERT/sikubert --train_file trainPOS.json --validation_file validPOS.json --output_dir sikubert.pos --do_train --do_eval
+ fi
+ if [ ! -d sikuroberta.pos ]
+ then mkdir -p sikuroberta.pos
+ python3 run_ner.py --model_name_or_path SIKU-BERT/sikuroberta --train_file trainPOS.json --validation_file validPOS.json --output_dir sikuroberta.pos --do_train --do_eval
+ fi
+
+ nawk '
+ BEGIN{
+ f[0]="test.conllu";
+ f[1]="dev.conllu";
+ for(i=2;i<10;i++)
+ f[i]="train.conllu";
+ }
+ {
+ printf("%s\n",$0)>f[i%10];
+ if($0=="")
+ i++;
+ }' traditional.conllu
+ if [ ! -f sikubert.pos/sikubert.supar ]
+ then python3 -m supar.cmds.biaffine_dep train -b -d 0 -p sikubert.pos/sikubert.supar -c biaffine-dep-en -f bert --bert SIKU-BERT/sikubert --train train.conllu --dev dev.conllu --test test.conllu --embed='' --proj
+ fi
+ if [ ! -f sikuroberta.pos/sikuroberta.supar ]
+ then python3 -m supar.cmds.biaffine_dep train -b -d 0 -p sikuroberta.pos/sikuroberta.supar -c biaffine-dep-en -f bert --bert SIKU-BERT/sikuroberta --train train.conllu --dev dev.conllu --test test.conllu --embed='' --proj
+ fi
+
+ nawk '
+ {
+ if(NR%10>0)
+ printf("%s\n",$0)>"trainDanku.json";
+ else
+ printf("%s\n",$0)>"validDanku.json";
+ }' traditionalDanku.json
+ if [ ! -d sikubert.danku ]
+ then mkdir -p sikubert.danku
+ python3 run_ner.py --model_name_or_path SIKU-BERT/sikubert --train_file trainDanku.json --validation_file validDanku.json --output_dir sikubert.danku --do_train --do_eval
+ fi
+ if [ ! -d sikuroberta.danku ]
+ then mkdir -p sikuroberta.danku
+ python3 run_ner.py --model_name_or_path SIKU-BERT/sikuroberta --train_file trainDanku.json --validation_file validDanku.json --output_dir sikuroberta.danku --do_train --do_eval
+ fi
+
+ exit 0
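The two "Danku" stages of mkmodel.sh turn sentence segmentation into character tagging: every character of a sentence gets a positional tag, S for a one-character sentence, otherwise B for the first character, E for the last, E2 and E3 for the second and third characters from the end, and M for everything in between. A minimal standalone sketch (illustrative only, not part of the script) of that mapping for a sentence of n characters:

# illustrative only, not part of the repository
def danku_tags(n):
    if n == 1:
        return ["S"]
    if n == 2:
        return ["B", "E"]
    if n == 3:
        return ["B", "E2", "E"]
    return ["B"] + ["M"] * (n - 4) + ["E3", "E2", "E"]

print(danku_tags(6))  # ['B', 'M', 'M', 'E3', 'E2', 'E']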
suparkanbun/models/splitter.sh ADDED
@@ -0,0 +1,6 @@
+ #! /bin/sh
+ for F
+ do split -a 2 -b 83886080 --numeric-suffixes=01 $F $F.
+ ls -1 $F.0[1-9] | sed 's/^\(.*\)0\([1-9]\)$/mv & \1\2/' | sh
+ done
+ exit 0
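splitter.sh cuts each file named on its command line into pieces of at most 83886080 bytes (80 MiB); split first writes two-digit suffixes (F.01, F.02, ...) and the sed/mv pipeline then strips the leading zero, so the pieces end up as F.1 ... F.9, F.10, and so on. A minimal reassembly sketch (assuming the pieces sit next to the original path and the number of pieces is known):

# illustrative only, not part of the repository
import shutil

def reassemble(prefix, npieces):
    # concatenate prefix.1 ... prefix.npieces back into prefix
    with open(prefix, "wb") as out:
        for i in range(1, npieces + 1):
            with open("{}.{}".format(prefix, i), "rb") as piece:
                shutil.copyfileobj(piece, out)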