KoichiYasuoka committed
Commit 909aa8f
1 Parent(s): 5d99ddd

model improved for transformers 4.42

config.json CHANGED
@@ -4,22 +4,11 @@
    ],
    "attention_bias": false,
    "attention_dropout": 0.0,
-   "auto_map": {
-     "AutoModelForTokenClassification": "upos.LlamaForTokenClassification"
-   },
    "bos_token_id": 128000,
    "custom_pipelines": {
      "upos": {
        "impl": "upos.BellmanFordTokenClassificationPipeline",
        "pt": "AutoModelForTokenClassification"
-     },
-     "token-classification": {
-       "impl": "upos.RawTokenClassificationPipeline",
-       "pt": "AutoModelForTokenClassification"
-     },
-     "ner": {
-       "impl": "upos.RawTokenClassificationPipeline",
-       "pt": "AutoModelForTokenClassification"
      }
    },
    "eos_token_id": 128001,
@@ -162,9 +151,9 @@
    "rope_scaling": null,
    "rope_theta": 500000.0,
    "tie_word_embeddings": false,
-   "torch_dtype": "float32",
    "tokenizer_class": "LlamaTokenizerFast",
-   "transformers_version": "4.41.2",
+   "torch_dtype": "float32",
+   "transformers_version": "4.42.4",
    "use_cache": true,
    "vocab_size": 128259
  }
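
With the auto_map override and the extra raw pipelines removed, AutoModelForTokenClassification resolves to the LlamaForTokenClassification class that transformers 4.42 ships natively, so the bare model loads without remote model code. A minimal loading sketch, assuming this repository is KoichiYasuoka/Llama-3-Swallow-8B-char-upos (the tgt of maker.sh below):

# Sketch only: load the checkpoint with the token-classification head built into
# transformers>=4.42 (the custom LlamaForTokenClassification from upos.py is no longer needed).
# The repository id is an assumption taken from the tgt variable in maker.sh.
from transformers import AutoTokenizer, AutoModelForTokenClassification
tkz = AutoTokenizer.from_pretrained("KoichiYasuoka/Llama-3-Swallow-8B-char-upos")
mdl = AutoModelForTokenClassification.from_pretrained("KoichiYasuoka/Llama-3-Swallow-8B-char-upos")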
maker.sh ADDED
@@ -0,0 +1,97 @@
+ #! /bin/sh
+ test -f ja_gsd_modern.conllu || curl -LO https://github.com/KoichiYasuoka/SuPar-UniDic/raw/main/suparunidic/suparmodels/ja_gsd_modern.conllu
+ curl -L https://huggingface.co/KoichiYasuoka/Llama-3-Swallow-8B-upos/resolve/main/tokenizer.json | egrep -v '"ã(ģ[^ ]|Ĥ[ģĤĥĦħĨĩĪīĬĭĮįİıIJijĴĵ]) ã(ģ[^ ]|Ĥ[ģĤĥĦħĨĩĪīĬĭĮįİıIJijĴĵ])",$' > newtokenizer.json
+
+ TMP=./maker$$.py
+ cat << 'EOF' > $TMP
+ #! /usr/bin/env deepspeed
+ src="KoichiYasuoka/Llama-3-Swallow-8B-upos"
+ tgt="KoichiYasuoka/Llama-3-Swallow-8B-char-upos"
+ from transformers import LlamaTokenizerFast,LlamaForTokenClassification,AutoConfig,DataCollatorForTokenClassification,TrainingArguments,Trainer
+
+ class UPOSFileDataset(object):
+   def __init__(self,conllu,tokenizer):
+     self.conllu=open(conllu,"r",encoding="utf-8")
+     self.tokenizer=tokenizer
+     self.seeks=[0]
+     self.multiword={}
+     label=set(["SYM"])
+     s=self.conllu.readline()
+     while s!="":
+       if s=="\n":
+         self.seeks.append(self.conllu.tell())
+       else:
+         w=s.split("\t")
+         if len(w)==10:
+           if w[0].isdecimal():
+             label.add(w[3] if w[5]=="_" else w[3]+"|"+w[5])
+           elif w[0].find("-")>0:
+             t=w[0].split("-")
+             f,j,k=w[1],[],[]
+             for i in range(int(t[0]),int(t[1])+1):
+               w=self.conllu.readline().split("\t")
+               j.append(w[3] if w[5]=="_" else w[3]+"|"+w[5])
+               k.append(w[1])
+             p="+".join(j)
+             label.add(p)
+             if p in self.multiword:
+               self.multiword[p][f]=list(k)
+             else:
+               self.multiword[p]={f:list(k)}
+       s=self.conllu.readline()
+     lid={}
+     for i,l in enumerate(sorted(label)):
+       lid[l],lid["B-"+l],lid["I-"+l]=i*3,i*3+1,i*3+2
+     self.label2id=lid
+   def __call__(*args):
+     lid={l:i for i,l in enumerate(sorted(set(sum([list(t.label2id) for t in args],[]))))}
+     for t in args:
+       t.label2id=lid
+     return lid
+   def __del__(self):
+     self.conllu.close()
+   __len__=lambda self:len(self.seeks)-1
+   def __getitem__(self,i):
+     self.conllu.seek(self.seeks[i])
+     form,upos=[],[]
+     while self.conllu.tell()<self.seeks[i+1]:
+       w=self.conllu.readline().split("\t")
+       if len(w)==10:
+         form.append(w[1])
+         if w[0].isdecimal():
+           upos.append(w[3] if w[5]=="_" else w[3]+"|"+w[5])
+         elif w[0].find("-")>0:
+           t=w[0].split("-")
+           u=[]
+           for j in range(int(t[0]),int(t[1])+1):
+             k=self.conllu.readline().split("\t")
+             u.append(k[3] if k[5]=="_" else k[3]+"|"+k[5])
+           upos.append("+".join(u))
+     v=self.tokenizer(form,add_special_tokens=False)
+     i,u=[],[]
+     for j,(x,y) in enumerate(zip(v["input_ids"],upos)):
+       if x!=[]:
+         i+=x
+         u+=[y] if len(x)==1 else ["B-"+y]+["I-"+y]*(len(x)-1)
+     if len(i)<self.tokenizer.model_max_length-3:
+       ids=[self.tokenizer.cls_token_id]+i+[self.tokenizer.sep_token_id]
+       upos=["SYM"]+u+["SYM"]
+     else:
+       ids=i[0:self.tokenizer.model_max_length-2]
+       upos=u[0:self.tokenizer.model_max_length-2]
+     return {"input_ids":ids,"labels":[self.label2id[t] for t in upos]}
+
+ tkz=LlamaTokenizerFast.from_pretrained(src,tokenizer_file="newtokenizer.json")
+ trainDS=UPOSFileDataset("ja_gsd_modern.conllu",tkz)
+ lid=trainDS.label2id
+ cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()},ignore_mismatched_sizes=True)
+ dsp={"fp16":{"enabled":"auto"},"optimizer":{"type":"AdamW"},"scheduler":{"type":"WarmupLR","params":{}},"train_batch_size":"auto","train_micro_batch_size_per_gpu":"auto","zero_optimization":{"stage":3,"offload_optimizer":{"device":"cpu","pin_memory":True},"offload_param":{"device":"cpu","pin_memory":True},"overlap_comm":True,"contiguous_gradients":True,"reduce_bucket_size":"auto","stage3_prefetch_bucket_size":"auto","stage3_param_persistence_threshold":"auto","stage3_gather_16bit_weights_on_model_save":True}}
+ arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=8,deepspeed=dsp,output_dir=tgt,overwrite_output_dir=True,save_total_limit=2,learning_rate=5e-05,warmup_ratio=0.1,save_safetensors=False)
+ trn=Trainer(args=arg,data_collator=DataCollatorForTokenClassification(tkz),model=LlamaForTokenClassification.from_pretrained(src,config=cfg,ignore_mismatched_sizes=True),train_dataset=trainDS)
+ trn.train()
+ trn.save_model(tgt)
+ tkz.save_pretrained(tgt)
+ EOF
+ chmod 755 $TMP
+ $TMP
+ exit
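
maker.sh first strips BPE merge entries for pairs of kana from the source tokenizer.json (pushing Japanese segmentation toward the character level, hence the -char- in the target name), then fine-tunes on ja_gsd_modern.conllu with DeepSpeed ZeRO-3. Inside UPOSFileDataset.__getitem__, word-level UPOS tags are expanded into subword-level B-/I- tags; a standalone sketch of that expansion (the token ids below are made up for illustration):

# Illustrative sketch of the B-/I- expansion performed in UPOSFileDataset.__getitem__:
# a word tokenized into one piece keeps its plain tag, longer words get B- then I- tags.
def expand(pieces_per_word, tags_per_word):
  ids, labels = [], []
  for pieces, tag in zip(pieces_per_word, tags_per_word):
    if pieces == []:  # words that produce no tokens are skipped
      continue
    ids += pieces
    labels += [tag] if len(pieces) == 1 else ["B-"+tag]+["I-"+tag]*(len(pieces)-1)
  return ids, labels

print(expand([[101], [102, 103]], ["NOUN", "AUX"]))
# -> ([101, 102, 103], ['NOUN', 'B-AUX', 'I-AUX'])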
pytorch_model-00001-of-00007.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:49c1020becbd2967f7776b863723430ec6d821af391f86662e027cefe447bba6
+ oid sha256:46bfedcc1eaa8d42531ddd628c9c94b482dd2a9f20c1c16d11864e8ab70223e6
  size 4886522810
pytorch_model-00002-of-00007.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:5c2d24bf362a0d7c776dc4d18c27b40a1efd28541cda74edf14337cee9e305a3
+ oid sha256:cbb4659f12fa3a9f5f878d9e3811ad00f400592e21ae27aa7334ffbf473ae582
  size 4832018324
pytorch_model-00003-of-00007.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:0eb5f92fb8782e507f6e2ae881e536aa67aaf69e3f7981eecb7280d587f4238a
+ oid sha256:07d570e107da750283b13b47082b1c1378651b8c77b9d37cb9f0b87dbf4cbe48
  size 4999825256
pytorch_model-00004-of-00007.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:96889f5352ed44150847ae0c664665510991ad2fcfec15a798828a6ddc2b9aeb
+ oid sha256:c79d77e8e2ca8e77ac5d14712c07a5c5e8dbaba66e4476f55c561cd1b67b0c0b
  size 4999825316
pytorch_model-00005-of-00007.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b993fe8ca469d962e58c3e5d2c0d98e8c658257da743cccf8cbb7ee486570650
+ oid sha256:fbfa6b3c3cb0b1e85e2cff6b63a2a4956c2a59710315f0db489f54f4ae9c006c
  size 4832018324
pytorch_model-00006-of-00007.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a17fc52fd91941f772c7165f87c942fb74899c7bb28e07c2a5c8d4c51bb2cb18
+ oid sha256:94082c38acc2313d196be5de9285a0e4e80a3769d5301860c705091b6e3fea86
  size 4999825320
pytorch_model-00007-of-00007.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:efd153934df07485b3201de6e71a8ebff835646721cf6ce5e9099fb039749ace
+ oid sha256:9eb4ca4e2de739ea97b5774722341c230104b0146aac414349abe0aa76eb8af5
  size 470797675
pytorch_model.bin.index.json CHANGED
@@ -3,8 +3,6 @@
    "total_size": 30020731120
  },
  "weight_map": {
-   "classifier.bias": "pytorch_model-00007-of-00007.bin",
-   "classifier.weight": "pytorch_model-00007-of-00007.bin",
    "model.embed_tokens.weight": "pytorch_model-00001-of-00007.bin",
    "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00007.bin",
    "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00007.bin",
@@ -294,6 +292,8 @@
    "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00003-of-00007.bin",
    "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00003-of-00007.bin",
    "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00003-of-00007.bin",
-   "model.norm.weight": "pytorch_model-00007-of-00007.bin"
+   "model.norm.weight": "pytorch_model-00007-of-00007.bin",
+   "score.bias": "pytorch_model-00007-of-00007.bin",
+   "score.weight": "pytorch_model-00007-of-00007.bin"
  }
 }
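
The weight map now stores the classification head under score.* instead of classifier.*, matching the parameter names used by the built-in LlamaForTokenClassification. The shards in this commit were regenerated by maker.sh rather than renamed in place, but for a single-file checkpoint the same migration could be sketched roughly as follows (file name is a placeholder):

# Rough sketch only: rename the old custom head keys (classifier.*) to the names
# expected by LlamaForTokenClassification in transformers>=4.42 (score.*).
import torch
state = torch.load("pytorch_model.bin", map_location="cpu")
state = {(k.replace("classifier.", "score.", 1) if k.startswith("classifier.") else k): v for k, v in state.items()}
torch.save(state, "pytorch_model.bin")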
tokenizer_config.json CHANGED
@@ -1,6 +1,7 @@
  {
    "add_bos_token": true,
    "add_eos_token": false,
+   "add_prefix_space": null,
    "added_tokens_decoder": {
      "128000": {
        "content": "<|begin_of_text|>",
upos.py CHANGED
@@ -1,5 +1,4 @@
- from transformers import TokenClassificationPipeline,LlamaModel,LlamaPreTrainedModel
- from transformers.modeling_outputs import TokenClassifierOutput
+ from transformers import TokenClassificationPipeline
 
  class BellmanFordTokenClassificationPipeline(TokenClassificationPipeline):
    def __init__(self,**kwargs):
@@ -40,41 +39,3 @@ class BellmanFordTokenClassificationPipeline(TokenClassificationPipeline):
        t["text"]=model_outputs["sentence"][t["start"]:t["end"]]
      return w
 
- class RawTokenClassificationPipeline(TokenClassificationPipeline):
-   def check_model_type(self,supported_models):
-     pass
-
- class LlamaForTokenClassification(LlamaPreTrainedModel):
-   def __init__(self,config):
-     from torch import nn
-     super().__init__(config)
-     self.num_labels=config.num_labels
-     self.model=LlamaModel(config)
-     if hasattr(config,"classifier_dropout") and config.classifier_dropout is not None:
-       classifier_dropout=config.classifier_dropout
-     elif hasattr(config,"hidden_dropout") and config.hidden_dropout is not None:
-       classifier_dropout=config.hidden_dropout
-     else:
-       classifier_dropout=0.1
-     self.dropout=nn.Dropout(classifier_dropout)
-     self.classifier=nn.Linear(config.hidden_size,config.num_labels)
-     self.post_init()
-   def get_input_embeddings(self):
-     return self.model.embed_tokens
-   def set_input_embeddings(self,value):
-     self.model.embed_tokens=value
-   def forward(self,input_ids=None,past_key_values=None,attention_mask=None,position_ids=None,inputs_embeds=None,labels=None,use_cache=None,output_attentions=None,output_hidden_states=None,return_dict=None):
-     return_dict=return_dict if return_dict is not None else self.config.use_return_dict
-     transformer_outputs=self.model(input_ids,past_key_values=past_key_values,attention_mask=attention_mask,position_ids=position_ids,inputs_embeds=inputs_embeds,use_cache=use_cache,output_attentions=output_attentions,output_hidden_states=output_hidden_states,return_dict=return_dict)
-     hidden_states=transformer_outputs[0]
-     hidden_states=self.dropout(hidden_states)
-     logits=self.classifier(hidden_states)
-     loss=None
-     if labels is not None:
-       from torch import nn
-       loss_fct=nn.CrossEntropyLoss()
-       loss=loss_fct(logits.view(-1,self.num_labels),labels.view(-1))
-     if not return_dict:
-       output=(logits,)+transformer_outputs[2:]
-       return ((loss,)+output) if loss is not None else output
-     return TokenClassifierOutput(loss=loss,logits=logits,hidden_states=transformer_outputs.hidden_states,attentions=transformer_outputs.attentions)
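
After this change upos.py only provides BellmanFordTokenClassificationPipeline, which config.json still registers as the custom "upos" pipeline. A minimal usage sketch, again assuming the repository id KoichiYasuoka/Llama-3-Swallow-8B-char-upos:

# Sketch only: run the custom "upos" pipeline registered in config.json
# (trust_remote_code=True lets transformers load the pipeline class from upos.py).
from transformers import pipeline
nlp = pipeline("upos", "KoichiYasuoka/Llama-3-Swallow-8B-char-upos", trust_remote_code=True, aggregation_strategy="simple")
print(nlp("国境の長いトンネルを抜けると雪国であった。"))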