KoichiYasuoka committed on
Commit
1bce38c
1 Parent(s): 5d3738a

multiword support

Browse files
Files changed (1) hide show
  1. ud.py +18 -6
ud.py CHANGED
@@ -25,19 +25,31 @@ class UniversalDependenciesPipeline(TokenClassificationPipeline):
25
  k,h=z[numpy.nanargmax(m[z,z])],numpy.nanmin(m)-numpy.nanmax(m)
26
  m[:,z]+=[[0 if j in z and (i!=j or i==k) else h for i in z] for j in range(m.shape[0])]
27
  h=self.chu_liu_edmonds(m)
28
- v=[(s,e) for s,e in model_outputs["offset_mapping"][0].tolist() if s<e]
29
  q=[self.model.config.id2label[p[j,i]].split("|") for i,j in enumerate(h)]
30
  if "aggregation_strategy" in kwargs and kwargs["aggregation_strategy"]!="none":
31
  for i,j in reversed(list(enumerate(q[1:],1))):
32
  if j[-1]=="goeswith" and set([t[-1] for t in q[h[i]+1:i+1]])=={"goeswith"}:
33
  h=[b if i>b else b-1 for a,b in enumerate(h) if i!=a]
34
- v[i-1]=(v[i-1][0],v.pop(i)[1])
 
35
  q.pop(i)
36
  t=model_outputs["sentence"].replace("\n"," ")
37
- u="# text = "+t+"\n"
38
- for i,(s,e) in enumerate(v):
39
- u+="\t".join([str(i+1),t[s:e],"_",q[i][0],"|".join(q[i][1:-1]),"_",str(0 if h[i]==i else h[i]+1),q[i][-1],"_","_" if i+1<len(v) and e<v[i+1][0] else "SpaceAfter=No"])+"\n"
40
- return u+"\n"
 
 
 
 
 
 
 
 
 
 
 
41
  def chu_liu_edmonds(self,matrix):
42
  import numpy
43
  h=numpy.nanargmax(matrix,axis=0)
 
25
  k,h=z[numpy.nanargmax(m[z,z])],numpy.nanmin(m)-numpy.nanmax(m)
26
  m[:,z]+=[[0 if j in z and (i!=j or i==k) else h for i in z] for j in range(m.shape[0])]
27
  h=self.chu_liu_edmonds(m)
28
+ v=[(s,e,c) for (s,e),c in zip(model_outputs["offset_mapping"][0].tolist(),self.tokenizer.convert_ids_to_tokens(model_outputs["input_ids"][0].tolist())) if s<e]
29
  q=[self.model.config.id2label[p[j,i]].split("|") for i,j in enumerate(h)]
30
  if "aggregation_strategy" in kwargs and kwargs["aggregation_strategy"]!="none":
31
  for i,j in reversed(list(enumerate(q[1:],1))):
32
  if j[-1]=="goeswith" and set([t[-1] for t in q[h[i]+1:i+1]])=={"goeswith"}:
33
  h=[b if i>b else b-1 for a,b in enumerate(h) if i!=a]
34
+ s,e,c=v.pop(i)
35
+ v[i-1]=(v[i-1][0],e,v[i-1][2]+c)
36
  q.pop(i)
37
  t=model_outputs["sentence"].replace("\n"," ")
38
+ u="\n"
39
+ z={"a":"ァ","i":"ィ","u":"ゥ","e":"ェ","o":"ォ","k":"ㇰ","s":"ㇱ","t":"ㇳ","n":"ㇴ","h":"ㇷ","m":"ㇺ","r":"ㇽ","p":"ㇷ゚"}
40
+ f=-1
41
+ for i,(s,e,c) in reversed(list(enumerate(v))):
42
+ w,x=[j for j in t[s:e]],""
43
+ if i>0 and s<v[i-1][1]:
44
+ w[0]=z[c[0]] if c[0] in z else "ッ"
45
+ f=max(f,i)
46
+ elif f>0:
47
+ x="{}-{}\t{}\t_\t_\t_\t_\t_\t_\t_\t{}\n".format(i+1,f+1,t[s:v[f][1]],"_" if f+1<len(v) and v[f][1]<v[f+1][0] else "SpaceAfter=No")
48
+ f=-1
49
+ if i+1<len(v) and e>v[i+1][0]:
50
+ w[-1]=z[c[-1]] if c[-1] in z else "ッ"
51
+ u=x+"\t".join([str(i+1),"".join(w),"_",q[i][0],"|".join(q[i][1:-1]),"_",str(0 if h[i]==i else h[i]+1),q[i][-1],"_","_" if i+1<len(v) and e<v[i+1][0] else "SpaceAfter=No"])+"\n"+u
52
+ return "# text = "+t+"\n"+u
53
  def chu_liu_edmonds(self,matrix):
54
  import numpy
55
  h=numpy.nanargmax(matrix,axis=0)