KoichiYasuoka committed
Commit 909aa8f
Parent(s): 5d99ddd

model improved for transformers 4.42

Changed files:
- config.json +2 -13
- maker.sh +97 -0
- pytorch_model-00001-of-00007.bin +1 -1
- pytorch_model-00002-of-00007.bin +1 -1
- pytorch_model-00003-of-00007.bin +1 -1
- pytorch_model-00004-of-00007.bin +1 -1
- pytorch_model-00005-of-00007.bin +1 -1
- pytorch_model-00006-of-00007.bin +1 -1
- pytorch_model-00007-of-00007.bin +1 -1
- pytorch_model.bin.index.json +3 -3
- tokenizer_config.json +1 -0
- upos.py +1 -40
config.json CHANGED
@@ -4,22 +4,11 @@
   ],
   "attention_bias": false,
   "attention_dropout": 0.0,
-  "auto_map": {
-    "AutoModelForTokenClassification": "upos.LlamaForTokenClassification"
-  },
   "bos_token_id": 128000,
   "custom_pipelines": {
     "upos": {
       "impl": "upos.BellmanFordTokenClassificationPipeline",
       "pt": "AutoModelForTokenClassification"
-    },
-    "token-classification": {
-      "impl": "upos.RawTokenClassificationPipeline",
-      "pt": "AutoModelForTokenClassification"
-    },
-    "ner": {
-      "impl": "upos.RawTokenClassificationPipeline",
-      "pt": "AutoModelForTokenClassification"
     }
   },
   "eos_token_id": 128001,
@@ -162,9 +151,9 @@
   "rope_scaling": null,
   "rope_theta": 500000.0,
   "tie_word_embeddings": false,
-  "torch_dtype": "float32",
   "tokenizer_class": "LlamaTokenizerFast",
-  "
+  "torch_dtype": "float32",
+  "transformers_version": "4.42.4",
   "use_cache": true,
   "vocab_size": 128259
 }
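The config change drops the auto_map override (which pointed AutoModelForTokenClassification at the local upos.LlamaForTokenClassification) and the extra "token-classification"/"ner" custom pipelines, keeping only the custom "upos" pipeline; with transformers 4.42 the stock LlamaForTokenClassification can serve as the model class. A minimal loading sketch, not part of the commit: the repository id is assumed from maker.sh's tgt below, the aggregation_strategy argument and the example sentence are illustrative only.

    # hedged sketch, assuming the repository this commit belongs to is the tgt of maker.sh
    from transformers import pipeline
    nlp=pipeline(task="upos",model="KoichiYasuoka/Llama-3-Swallow-8B-char-upos",trust_remote_code=True,aggregation_strategy="simple")
    print(nlp("国境の長いトンネルを抜けると雪国であった。"))   # arbitrary example text

Only the custom pipeline class still needs trust_remote_code; the model weights themselves are expected to load through the library's own class.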
maker.sh ADDED
@@ -0,0 +1,97 @@
+#! /bin/sh
+test -f ja_gsd_modern.conllu || curl -LO https://github.com/KoichiYasuoka/SuPar-UniDic/raw/main/suparunidic/suparmodels/ja_gsd_modern.conllu
+curl -L https://huggingface.co/KoichiYasuoka/Llama-3-Swallow-8B-upos/resolve/main/tokenizer.json | egrep -v '"ã(ģ[^ ]|Ĥ[ģĤĥĦħĨĩĪīĬĭĮįİıIJijĴĵ]) ã(ģ[^ ]|Ĥ[ģĤĥĦħĨĩĪīĬĭĮįİıIJijĴĵ])",$' > newtokenizer.json
+
+TMP=./maker$$.py
+cat << 'EOF' > $TMP
+#! /usr/bin/env deepspeed
+src="KoichiYasuoka/Llama-3-Swallow-8B-upos"
+tgt="KoichiYasuoka/Llama-3-Swallow-8B-char-upos"
+from transformers import LlamaTokenizerFast,LlamaForTokenClassification,AutoConfig,DataCollatorForTokenClassification,TrainingArguments,Trainer
+
+class UPOSFileDataset(object):
+  def __init__(self,conllu,tokenizer):
+    self.conllu=open(conllu,"r",encoding="utf-8")
+    self.tokenizer=tokenizer
+    self.seeks=[0]
+    self.multiword={}
+    label=set(["SYM"])
+    s=self.conllu.readline()
+    while s!="":
+      if s=="\n":
+        self.seeks.append(self.conllu.tell())
+      else:
+        w=s.split("\t")
+        if len(w)==10:
+          if w[0].isdecimal():
+            label.add(w[3] if w[5]=="_" else w[3]+"|"+w[5])
+          elif w[0].find("-")>0:
+            t=w[0].split("-")
+            f,j,k=w[1],[],[]
+            for i in range(int(t[0]),int(t[1])+1):
+              w=self.conllu.readline().split("\t")
+              j.append(w[3] if w[5]=="_" else w[3]+"|"+w[5])
+              k.append(w[1])
+            p="+".join(j)
+            label.add(p)
+            if p in self.multiword:
+              self.multiword[p][f]=list(k)
+            else:
+              self.multiword[p]={f:list(k)}
+      s=self.conllu.readline()
+    lid={}
+    for i,l in enumerate(sorted(label)):
+      lid[l],lid["B-"+l],lid["I-"+l]=i*3,i*3+1,i*3+2
+    self.label2id=lid
+  def __call__(*args):
+    lid={l:i for i,l in enumerate(sorted(set(sum([list(t.label2id) for t in args],[]))))}
+    for t in args:
+      t.label2id=lid
+    return lid
+  def __del__(self):
+    self.conllu.close()
+  __len__=lambda self:len(self.seeks)-1
+  def __getitem__(self,i):
+    self.conllu.seek(self.seeks[i])
+    form,upos=[],[]
+    while self.conllu.tell()<self.seeks[i+1]:
+      w=self.conllu.readline().split("\t")
+      if len(w)==10:
+        form.append(w[1])
+        if w[0].isdecimal():
+          upos.append(w[3] if w[5]=="_" else w[3]+"|"+w[5])
+        elif w[0].find("-")>0:
+          t=w[0].split("-")
+          u=[]
+          for j in range(int(t[0]),int(t[1])+1):
+            k=self.conllu.readline().split("\t")
+            u.append(k[3] if k[5]=="_" else k[3]+"|"+k[5])
+          upos.append("+".join(u))
+    v=self.tokenizer(form,add_special_tokens=False)
+    i,u=[],[]
+    for j,(x,y) in enumerate(zip(v["input_ids"],upos)):
+      if x!=[]:
+        i+=x
+        u+=[y] if len(x)==1 else ["B-"+y]+["I-"+y]*(len(x)-1)
+    if len(i)<self.tokenizer.model_max_length-3:
+      ids=[self.tokenizer.cls_token_id]+i+[self.tokenizer.sep_token_id]
+      upos=["SYM"]+u+["SYM"]
+    else:
+      ids=i[0:self.tokenizer.model_max_length-2]
+      upos=u[0:self.tokenizer.model_max_length-2]
+    return {"input_ids":ids,"labels":[self.label2id[t] for t in upos]}
+
+tkz=LlamaTokenizerFast.from_pretrained(src,tokenizer_file="newtokenizer.json")
+trainDS=UPOSFileDataset("ja_gsd_modern.conllu",tkz)
+lid=trainDS.label2id
+cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()},ignore_mismatched_sizes=True)
+dsp={"fp16":{"enabled":"auto"},"optimizer":{"type":"AdamW"},"scheduler":{"type":"WarmupLR","params":{}},"train_batch_size":"auto","train_micro_batch_size_per_gpu":"auto","zero_optimization":{"stage":3,"offload_optimizer":{"device":"cpu","pin_memory":True},"offload_param":{"device":"cpu","pin_memory":True},"overlap_comm":True,"contiguous_gradients":True,"reduce_bucket_size":"auto","stage3_prefetch_bucket_size":"auto","stage3_param_persistence_threshold":"auto","stage3_gather_16bit_weights_on_model_save":True}}
+arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=8,deepspeed=dsp,output_dir=tgt,overwrite_output_dir=True,save_total_limit=2,learning_rate=5e-05,warmup_ratio=0.1,save_safetensors=False)
+trn=Trainer(args=arg,data_collator=DataCollatorForTokenClassification(tkz),model=LlamaForTokenClassification.from_pretrained(src,config=cfg,ignore_mismatched_sizes=True),train_dataset=trainDS)
+trn.train()
+trn.save_model(tgt)
+tkz.save_pretrained(tgt)
+EOF
+chmod 755 $TMP
+$TMP
+exit
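The script downloads the modern-Japanese GSD corpus, strips multi-character merge rules from the source tokenizer to get a character-level tokenizer, and retrains the token-classification head with DeepSpeed. In the dataset's __getitem__, word-level UPOS tags are aligned with the character-level tokenization: a word that stays one token keeps its tag, a word split into several tokens gets a B- tag followed by I- tags. A standalone sketch of that expansion, with made-up token ids and tags purely for illustration:

    # illustrative sketch of the B-/I- expansion performed in UPOSFileDataset.__getitem__
    def expand_labels(token_ids_per_word,word_tags):
      ids,tags=[],[]
      for x,y in zip(token_ids_per_word,word_tags):
        if x!=[]:                       # skip words the tokenizer maps to nothing
          ids+=x
          tags+=[y] if len(x)==1 else ["B-"+y]+["I-"+y]*(len(x)-1)
      return ids,tags

    print(expand_labels([[101],[202,203,204],[305]],["NOUN","VERB","PUNCT"]))
    # ([101, 202, 203, 204, 305], ['NOUN', 'B-VERB', 'I-VERB', 'I-VERB', 'PUNCT'])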
pytorch_model-00001-of-00007.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:46bfedcc1eaa8d42531ddd628c9c94b482dd2a9f20c1c16d11864e8ab70223e6
 size 4886522810
pytorch_model-00002-of-00007.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:cbb4659f12fa3a9f5f878d9e3811ad00f400592e21ae27aa7334ffbf473ae582
 size 4832018324
pytorch_model-00003-of-00007.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:07d570e107da750283b13b47082b1c1378651b8c77b9d37cb9f0b87dbf4cbe48
 size 4999825256
pytorch_model-00004-of-00007.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:c79d77e8e2ca8e77ac5d14712c07a5c5e8dbaba66e4476f55c561cd1b67b0c0b
 size 4999825316
pytorch_model-00005-of-00007.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:fbfa6b3c3cb0b1e85e2cff6b63a2a4956c2a59710315f0db489f54f4ae9c006c
 size 4832018324
pytorch_model-00006-of-00007.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:94082c38acc2313d196be5de9285a0e4e80a3769d5301860c705091b6e3fea86
 size 4999825320
pytorch_model-00007-of-00007.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:9eb4ca4e2de739ea97b5774722341c230104b0146aac414349abe0aa76eb8af5
 size 470797675
pytorch_model.bin.index.json CHANGED
@@ -3,8 +3,6 @@
     "total_size": 30020731120
   },
   "weight_map": {
-    "classifier.bias": "pytorch_model-00007-of-00007.bin",
-    "classifier.weight": "pytorch_model-00007-of-00007.bin",
     "model.embed_tokens.weight": "pytorch_model-00001-of-00007.bin",
     "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00007.bin",
     "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00007.bin",
@@ -294,6 +292,8 @@
     "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00003-of-00007.bin",
     "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00003-of-00007.bin",
     "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00003-of-00007.bin",
-    "model.norm.weight": "pytorch_model-00007-of-00007.bin"
+    "model.norm.weight": "pytorch_model-00007-of-00007.bin",
+    "score.bias": "pytorch_model-00007-of-00007.bin",
+    "score.weight": "pytorch_model-00007-of-00007.bin"
   }
 }
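The weight map now stores the classification head under score.* instead of classifier.*, matching the head name used by the LlamaForTokenClassification class that ships with transformers 4.42. In this commit the shards were regenerated by maker.sh, so no manual renaming was needed; purely as an illustration, a hand migration of an older single shard could look like the following (hypothetical, not part of the commit):

    # hedged sketch: rename an old checkpoint's head keys to the score.* names
    import torch
    sd=torch.load("pytorch_model-00007-of-00007.bin",map_location="cpu")
    for old,new in [("classifier.weight","score.weight"),("classifier.bias","score.bias")]:
      if old in sd:
        sd[new]=sd.pop(old)
    torch.save(sd,"pytorch_model-00007-of-00007.bin")
    # the index.json weight_map would have to be updated to the new key names as well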
tokenizer_config.json CHANGED
@@ -1,6 +1,7 @@
 {
   "add_bos_token": true,
   "add_eos_token": false,
+  "add_prefix_space": null,
   "added_tokens_decoder": {
     "128000": {
       "content": "<|begin_of_text|>",
upos.py CHANGED
@@ -1,5 +1,4 @@
-from transformers import TokenClassificationPipeline
-from transformers.modeling_outputs import TokenClassifierOutput
+from transformers import TokenClassificationPipeline
 
 class BellmanFordTokenClassificationPipeline(TokenClassificationPipeline):
   def __init__(self,**kwargs):
@@ -40,41 +39,3 @@ class BellmanFordTokenClassificationPipeline(TokenClassificationPipeline):
       t["text"]=model_outputs["sentence"][t["start"]:t["end"]]
     return w
 
-class RawTokenClassificationPipeline(TokenClassificationPipeline):
-  def check_model_type(self,supported_models):
-    pass
-
-class LlamaForTokenClassification(LlamaPreTrainedModel):
-  def __init__(self,config):
-    from torch import nn
-    super().__init__(config)
-    self.num_labels=config.num_labels
-    self.model=LlamaModel(config)
-    if hasattr(config,"classifier_dropout") and config.classifier_dropout is not None:
-      classifier_dropout=config.classifier_dropout
-    elif hasattr(config,"hidden_dropout") and config.hidden_dropout is not None:
-      classifier_dropout=config.hidden_dropout
-    else:
-      classifier_dropout=0.1
-    self.dropout=nn.Dropout(classifier_dropout)
-    self.classifier=nn.Linear(config.hidden_size,config.num_labels)
-    self.post_init()
-  def get_input_embeddings(self):
-    return self.model.embed_tokens
-  def set_input_embeddings(self,value):
-    self.model.embed_tokens=value
-  def forward(self,input_ids=None,past_key_values=None,attention_mask=None,position_ids=None,inputs_embeds=None,labels=None,use_cache=None,output_attentions=None,output_hidden_states=None,return_dict=None):
-    return_dict=return_dict if return_dict is not None else self.config.use_return_dict
-    transformer_outputs=self.model(input_ids,past_key_values=past_key_values,attention_mask=attention_mask,position_ids=position_ids,inputs_embeds=inputs_embeds,use_cache=use_cache,output_attentions=output_attentions,output_hidden_states=output_hidden_states,return_dict=return_dict)
-    hidden_states=transformer_outputs[0]
-    hidden_states=self.dropout(hidden_states)
-    logits=self.classifier(hidden_states)
-    loss=None
-    if labels is not None:
-      from torch import nn
-      loss_fct=nn.CrossEntropyLoss()
-      loss=loss_fct(logits.view(-1,self.num_labels),labels.view(-1))
-    if not return_dict:
-      output=(logits,)+transformer_outputs[2:]
-      return ((loss,)+output) if loss is not None else output
-    return TokenClassifierOutput(loss=loss,logits=logits,hidden_states=transformer_outputs.hidden_states,attentions=transformer_outputs.attentions)
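With the hand-rolled LlamaForTokenClassification and RawTokenClassificationPipeline removed from upos.py, only the Bellman-Ford UPOS pipeline remains as remote code; the model itself is expected to load through the library's own token-classification class. A minimal sketch of a plain (non-pipeline) load, assuming the repository id from maker.sh and transformers >= 4.42:

    # hedged sketch, not part of the commit
    from transformers import AutoTokenizer,AutoModelForTokenClassification
    mdl="KoichiYasuoka/Llama-3-Swallow-8B-char-upos"   # repository id assumed from maker.sh's tgt
    tkz=AutoTokenizer.from_pretrained(mdl)
    model=AutoModelForTokenClassification.from_pretrained(mdl)
    print(model.config.id2label[0])   # label inventory is defined in config.json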