#! /usr/bin/python3
# Fetch the UD_Belarusian-HSE treebank and copy its train/dev/test CoNLL-U
# files into the current directory as train.conllu, dev.conllu, test.conllu.
#
# Every statement sits on its own line on purpose: when the script is fused
# onto the "#!" line, Python reads all of it as one comment and runs nothing.

# Model identifiers. NOTE(review): presumably the starting checkpoint (src)
# and the name of the model to produce (tgt); their use is outside the part
# of the file visible here - confirm against the rest of the script.
src="KoichiYasuoka/deberta-base-belarusian-upos"
tgt="KoichiYasuoka/deberta-base-belarusian-ud-goeswith"

import os

# Treebank repository; the last path component of the URL is also the name of
# the local checkout directory.
url="https://github.com/UniversalDependencies/UD_Belarusian-HSE"
d=os.path.basename(url)

# Shallow-clone the repository unless a directory named d already exists.
# The exit status is not checked, so a failed clone does not stop the script.
os.system("test -d "+d+" || git clone --depth=1 "+url)

# For each split F, copy the checkout's "*-F.conllu" to "F.conllu" in the
# current working directory ($F is expanded by the shell, not by Python).
os.system("for F in train dev test ; do cp "+d+"/*-$F.conllu $F.conllu ; done")

# NOTE(review): the UDgoeswithDataset definition is cut off mid-statement in
# the visible source (it ends at "if len(v)"), so its line breaks cannot be
# restored safely from here. The visible text is kept verbatim and inert on
# the next line until the rest of the file is available.
# class UDgoeswithDataset(object): def __init__(self,conllu,tokenizer): self.ids,self.tags,label=[],[],set() with open(conllu,"r",encoding="utf-8") as r: cls,sep,msk=tokenizer.cls_token_id,tokenizer.sep_token_id,tokenizer.mask_token_id dep,c="-|_|dep",[] for s in r: t=s.split("\t") if len(t)==10: if t[0].isdecimal(): c.append(t) elif c!=[]: v=tokenizer([t[1] for t in c],add_special_tokens=False)["input_ids"] for i in range(len(v)-1,-1,-1): if len(v[i])==0: v[i]=[tokenizer.unk_token_id] elif len(v[i])>1: for j in range(1,len(v[i])): c.insert(i+1,[c[i][0],"_","_","X","_","_",c[i][0],"goeswith","_","_"]) y=["0"]+[t[0] for t in c] h=[i if t[6]=="0" else y.index(t[6]) for i,t in enumerate(c,1)] p,v=[t[3]+"|"+t[5]+"|"+t[7] for t in c],sum(v,[]) if len(v)