#! /usr/bin/python3 src="KoichiYasuoka/deberta-base-ainu-upos" tgt="KoichiYasuoka/deberta-base-ainu-ud-goeswith" import os url="https://github.com/KoichiYasuoka/UD-Ainu" d=os.path.basename(url) os.system("test -d {} || git clone --depth=1 {}".format(d,url)) s='{if($0==""){if(u~/\\t0\\troot\\t/)print u;u=""}else u=u$0"\\n"}' os.system("nawk -F'\\t' '{}' {}/ain_*-ud-*.conllu > train.conllu".format(s,d)) class UDgoeswithDataset(object): def __init__(self,conllu,tokenizer): self.ids,self.tags,label=[],[],set() with open(conllu,"r",encoding="utf-8") as r: cls,sep,msk=tokenizer.cls_token_id,tokenizer.sep_token_id,tokenizer.mask_token_id dep,c="-|_|dep",[] for s in r: t=s.split("\t") if len(t)==10: if t[0].isdecimal(): c.append(t) elif c!=[]: for x in [1,2]: d=list(c) v=tokenizer([t[x] for t in d],add_special_tokens=False)["input_ids"] for i in range(len(v)-1,-1,-1): for j in range(1,len(v[i])): d.insert(i+1,[d[i][0],"_","_","X","_","_",d[i][0],"goeswith","_","_"]) y=["0"]+[t[0] for t in d] h=[i if t[6]=="0" else y.index(t[6]) for i,t in enumerate(d,1)] p,v=[t[3]+"|"+t[4]+"|"+t[7] for t in d],sum(v,[]) if len(v)