KoichiYasuoka committed
Commit
563a060
1 Parent(s): badae88

initial release

Files changed (9)
  1. README.md +56 -0
  2. config.json +0 -0
  3. maker.py +62 -0
  4. merges.txt +0 -0
  5. pytorch_model.bin +3 -0
  6. special_tokens_map.json +15 -0
  7. tokenizer.json +0 -0
  8. tokenizer_config.json +15 -0
  9. vocab.json +0 -0
README.md ADDED
@@ -0,0 +1,56 @@
+ ---
+ language:
+ - "en"
+ tags:
+ - "english"
+ - "token-classification"
+ - "pos"
+ - "dependency-parsing"
+ datasets:
+ - "universal_dependencies"
+ license: "cc-by-sa-4.0"
+ pipeline_tag: "token-classification"
+ ---
+
+ # roberta-base-english-ud-goeswith
+
+ ## Model Description
+
+ This is a RoBERTa model for POS-tagging and dependency-parsing (using `goeswith` for subwords), derived from [roberta-base](https://huggingface.co/roberta-base).
+
+ ## How to Use
+
+ ```py
+ class UDgoeswith(object):
+   def __init__(self,bert):
+     from transformers import AutoTokenizer,AutoModelForTokenClassification
+     self.tokenizer=AutoTokenizer.from_pretrained(bert)
+     self.model=AutoModelForTokenClassification.from_pretrained(bert)
+   def __call__(self,text):
+     import numpy,torch,ufal.chu_liu_edmonds
+     w=self.tokenizer(text,return_offsets_mapping=True)
+     v=w["input_ids"]
+     n=len(v)-1
+     with torch.no_grad():
+       d=self.model(input_ids=torch.tensor([v[0:i]+[self.tokenizer.mask_token_id]+v[i+1:]+[v[i]] for i in range(1,n)]))
+     e=d.logits.numpy()[:,1:n,:]
+     e[:,:,0]=numpy.nan
+     m=numpy.full((n,n),numpy.nan)
+     m[1:,1:]=numpy.nanmax(e,axis=2).transpose()
+     p=numpy.zeros((n,n))
+     p[1:,1:]=numpy.nanargmax(e,axis=2).transpose()
+     for i in range(1,n):
+       m[i,0],m[i,i],p[i,0]=m[i,i],numpy.nan,p[i,i]
+     h=ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]
+     u="# text = "+text+"\n"
+     v=[(s,e) for s,e in w["offset_mapping"] if s<e]
+     for i,(s,e) in enumerate(v,1):
+       q=self.model.config.id2label[p[i,h[i]]].split("|")
+       u+="\t".join([str(i),text[s:e],"_",q[0],"_","|".join(q[1:-1]),str(h[i]),q[-1],"_","_" if i<len(v) and e<v[i][0] else "SpaceAfter=No"])+"\n"
+     return u+"\n"
+
+ nlp=UDgoeswith("KoichiYasuoka/roberta-base-english-ud-goeswith")
+ print(nlp("I saw a horse yesterday which had no name"))
+ ```
+
+ [ufal.chu-liu-edmonds](https://pypi.org/project/ufal.chu-liu-edmonds/) is required.
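The `__call__` method in the README above masks each token in turn, reads the classifier's scores into a token-by-token matrix, and decodes a dependency tree with the Chu-Liu-Edmonds maximum-spanning-tree algorithm. Below is a minimal toy sketch of just that decoding step; the score values are invented for illustration, and only the `ufal.chu_liu_edmonds.chu_liu_edmonds` call mirrors the README code.

```py
import numpy, ufal.chu_liu_edmonds

# One artificial root (index 0) plus three tokens; m[i,j] is the score of token i
# taking j as its head.  NaN marks forbidden arcs (the root row and the diagonal),
# the same convention used by the matrix built in __call__ above.
m = numpy.full((4, 4), numpy.nan)
m[1, 0], m[1, 2], m[1, 3] = 1.0, 0.2, 0.1
m[2, 0], m[2, 1], m[2, 3] = 0.0, 2.0, 0.3
m[3, 0], m[3, 1], m[3, 2] = -1.0, 0.5, 1.5
h = ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]
print(h)  # h[i] is the chosen head of token i; h[0] belongs to the artificial root
```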
config.json ADDED
The diff for this file is too large to render. See raw diff
 
maker.py ADDED
@@ -0,0 +1,62 @@
+ #! /usr/bin/python3
+ src="roberta-base"
+ tgt="KoichiYasuoka/roberta-base-english-ud-goeswith"
+ import os
+ for d in ["UD_English-EWT","UD_English-GUM","UD_English-ParTUT","UD_English-Lines","UD_English-Atis"]:
+   os.system("test -d "+d+" || git clone --depth=1 https://github.com/UniversalDependencies/"+d)
+ os.system("for F in train dev test ; do cat UD_English-*/*-$F.conllu > $F.conllu ; done")
+ class UDgoeswithDataset(object):
+   def __init__(self,conllu,tokenizer):
+     self.ids,self.tags,label=[],[],set()
+     with open(conllu,"r",encoding="utf-8") as r:
+       cls,sep,msk=tokenizer.cls_token_id,tokenizer.sep_token_id,tokenizer.mask_token_id
+       dep,c,m="-|_|dep",[],[0,1]
+       for s in r:
+         t=s.split("\t")
+         if len(t)==10:
+           if t[0].isdecimal():
+             i=int(t[0])
+             if not m[0]<i<=m[1]:
+               t[1]=" "+t[1]
+             c.append(t)
+             if t[9].find("SpaceAfter=No")>=0:
+               m=[i,i+1]
+           elif t[0].find("-")>0:
+             m=[int(i) for i in t[0].split("-")]
+         elif c!=[]:
+           v=tokenizer([t[1] for t in c],add_special_tokens=False)["input_ids"]
+           for i in range(len(v)-1,-1,-1):
+             for j in range(1,len(v[i])):
+               c.insert(i+1,[c[i][0],"_","_","X","_","_",c[i][0],"goeswith","_","_"])
+           y=["0"]+[t[0] for t in c]
+           h=[i if t[6]=="0" else y.index(t[6]) for i,t in enumerate(c,1)]
+           p,v=[t[3]+"|"+t[5]+"|"+t[7] for t in c],sum(v,[])
+           if len(v)<tokenizer.model_max_length-3:
+             self.ids.append([cls]+v+[sep])
+             self.tags.append([dep]+p+[dep])
+             label=set(sum([self.tags[-1],list(label)],[]))
+             for i,k in enumerate(v):
+               self.ids.append([cls]+v[0:i]+[msk]+v[i+1:]+[sep,k])
+               self.tags.append([dep]+[t if h[j]==i+1 else dep for j,t in enumerate(p)]+[dep,dep])
+           c,m=[],[0,1]
+     self.label2id={l:i for i,l in enumerate(sorted(label))}
+   def __call__(*args):
+     label=set(sum([list(t.label2id) for t in args],[]))
+     lid={l:i for i,l in enumerate(sorted(label))}
+     for t in args:
+       t.label2id=lid
+     return lid
+   __len__=lambda self:len(self.ids)
+   __getitem__=lambda self,i:{"input_ids":self.ids[i],"labels":[self.label2id[t] for t in self.tags[i]]}
+ from transformers import AutoTokenizer,AutoConfig,AutoModelForTokenClassification,DataCollatorForTokenClassification,TrainingArguments,Trainer
+ tkz=AutoTokenizer.from_pretrained(src)
+ trainDS=UDgoeswithDataset("train.conllu",tkz)
+ devDS=UDgoeswithDataset("dev.conllu",tkz)
+ testDS=UDgoeswithDataset("test.conllu",tkz)
+ lid=trainDS(devDS,testDS)
+ cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()})
+ arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=32,output_dir="/tmp",overwrite_output_dir=True,save_total_limit=2,evaluation_strategy="epoch",learning_rate=5e-05,warmup_ratio=0.1)
+ trn=Trainer(args=arg,data_collator=DataCollatorForTokenClassification(tkz),model=AutoModelForTokenClassification.from_pretrained(src,config=cfg),train_dataset=trainDS,eval_dataset=devDS)
+ trn.train()
+ trn.save_model(tgt)
+ tkz.save_pretrained(tgt)
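Once `trn.save_model(tgt)` and `tkz.save_pretrained(tgt)` have run, the result loads like any token-classification checkpoint. A quick sanity-check sketch (it assumes the `tgt` directory written by maker.py above, or the published repository name from the README):

```py
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

name = "KoichiYasuoka/roberta-base-english-ud-goeswith"  # or the local tgt directory
tkz = AutoTokenizer.from_pretrained(name)
mdl = AutoModelForTokenClassification.from_pretrained(name)
batch = tkz("I saw a horse yesterday which had no name", return_tensors="pt")
with torch.no_grad():
    logits = mdl(**batch).logits
print(logits.shape)              # (1, sequence_length, num_labels)
print(len(mdl.config.id2label))  # UPOS|FEATS|DEPREL labels collected by UDgoeswithDataset
```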
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b76129d1e3c7a86725afdb3f38a92168f4fb0c23fe7866b312838a3b611af5fe
+ size 504168881
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "bos_token": "<s>",
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "unk_token": "<unk>"
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "add_prefix_space": false,
+   "bos_token": "<s>",
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "errors": "replace",
+   "mask_token": "<mask>",
+   "model_max_length": 512,
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "special_tokens_map_file": null,
+   "tokenizer_class": "RobertaTokenizerFast",
+   "trim_offsets": true,
+   "unk_token": "<unk>"
+ }
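A short check that the tokenizer files in this commit load as configured (a sketch; the repository name is taken from the README above):

```py
from transformers import AutoTokenizer

tkz = AutoTokenizer.from_pretrained("KoichiYasuoka/roberta-base-english-ud-goeswith")
print(type(tkz).__name__)    # RobertaTokenizerFast, per tokenizer_config.json
print(tkz.mask_token)        # <mask>
print(tkz.model_max_length)  # 512
```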
vocab.json ADDED
The diff for this file is too large to render. See raw diff