#! /usr/bin/python3
# Prepare English Universal Dependencies data and define the dataset class
# used to fine-tune `src` into the "goeswith" token-classification model `tgt`.
#
# NOTE(review): this file was recovered from a paste that collapsed the whole
# script onto one physical line.  The line structure below is a reconstruction;
# spots where characters were irrecoverably lost are flagged inline, and the
# script is truncated partway through UDgoeswithDataset.__init__ — restore the
# remainder from the upstream training script before running.

src = "roberta-base"
tgt = "KoichiYasuoka/roberta-base-english-ud-goeswith"

import os

# Shallow-clone each treebank unless its directory already exists.
for d in ["UD_English-EWT", "UD_English-GUM", "UD_English-ParTUT",
          "UD_English-Lines", "UD_English-Atis"]:
    os.system("test -d "+d+" || git clone --depth=1 https://github.com/UniversalDependencies/"+d)

# Merge all treebanks into single train/dev/test CoNLL-U files.
os.system("for F in train dev test ; do cat UD_English-*/*-$F.conllu > $F.conllu ; done")


class UDgoeswithDataset(object):
    """Token-classification examples read from a CoNLL-U file.

    Each word is tagged "UPOS|FEATS|DEPREL" (columns 4, 6, 8 of its row);
    when the tokenizer splits a word into several pieces, synthetic rows
    attached to the word via the UD "goeswith" relation are inserted for
    the extra pieces.
    """

    def __init__(self, conllu, tokenizer):
        # conllu: path to a CoNLL-U file.
        # tokenizer: HuggingFace-style tokenizer exposing cls/sep/mask token
        #   ids and returning {"input_ids": [...]} from a call on word lists.
        self.ids, self.tags, label = [], [], set()
        with open(conllu, "r", encoding="utf-8") as r:
            cls, sep, msk = tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.mask_token_id
            dep, c, m = "-|_|dep", [], [0, 1]  # m: current multiword-token id span [start, end)
            for s in r:
                t = s.split("\t")
                if len(t) == 10:  # a CoNLL-U token row
                    if t[0].isdecimal():  # single-word row
                        i = int(t[0])
                        # NOTE(review): the paste reads "if not m[0]=0:" here,
                        # which is not valid Python.  Reconstructed as a span
                        # test against the multiword range m — the only
                        # coherent reading given the "a-b" branch below —
                        # confirm against the upstream script.
                        if not m[0] <= i < m[1]:
                            m = [i, i + 1]
                        # NOTE(review): no `c.append(t)` survives in the
                        # paste, yet every consumer below reads token rows
                        # out of `c`; the statement that fills `c` was lost
                        # and must be restored from upstream.
                    elif t[0].find("-") > 0:  # multiword-token range row "a-b"
                        m = [int(i) for i in t[0].split("-")]
                elif c != []:  # non-token line: sentence finished, flush `c`
                    v = tokenizer([t[1] for t in c], add_special_tokens=False)["input_ids"]
                    # Walk backwards so inserts don't shift unprocessed rows;
                    # add one "goeswith" row per extra subword piece.
                    for i in range(len(v) - 1, -1, -1):
                        for j in range(1, len(v[i])):
                            c.insert(i + 1, [c[i][0], "_", "_", "X", "_", "_", c[i][0], "goeswith", "_", "_"])
                    y = ["0"] + [t[0] for t in c]
                    # Head position of each row within the expanded sentence
                    # (column 7 of a CoNLL-U row is the head id; "0" = root).
                    h = [i if t[6] == "0" else y.index(t[6]) for i, t in enumerate(c, 1)]
                    p, v = [t[3] + "|" + t[5] + "|" + t[7] for t in c], sum(v, [])
                    # NOTE(review): the paste ends mid-statement here; the
                    # original continues "if len(v) ..." (presumably a
                    # model_max_length guard before emitting examples into
                    # self.ids / self.tags).  The dangling fragment is kept
                    # below as a comment; restore the rest of __init__ (and
                    # the rest of the file) from the upstream script.
                    # if len(v)