KoichiYasuoka committed on
Commit
0ee48d1
1 Parent(s): c300498

model improved for transformers 4.42

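transformers 4.42 ships Qwen2ForTokenClassification out of the box, so this commit drops the hand-rolled copy of that class (and its auto_map entry in config.json) in favour of the library implementation; only the custom "upos" pipeline in upos.py stays repo-specific. A minimal usage sketch, assuming a repository id of the form KoichiYasuoka/<model>-upos as produced by maker.sh (the id below is a placeholder):

from transformers import pipeline

repo = "KoichiYasuoka/Qwen2-something-upos"  # placeholder id; use the actual repo built by maker.sh
# trust_remote_code=True lets the pipeline import upos.py, where the custom
# "upos" task registered in config.json is implemented
nlp = pipeline("upos", model=repo, trust_remote_code=True)
print(nlp("It don't mean a thing if it ain't got that swing"))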
config.json CHANGED
@@ -3,22 +3,11 @@
     "Qwen2ForTokenClassification"
   ],
   "attention_dropout": 0.0,
-  "auto_map": {
-    "AutoModelForTokenClassification": "upos.Qwen2ForTokenClassification"
-  },
   "bos_token_id": 151643,
   "custom_pipelines": {
     "upos": {
       "impl": "upos.BellmanFordTokenClassificationPipeline",
       "pt": "AutoModelForTokenClassification"
-    },
-    "token-classification":{
-      "impl": "upos.RawTokenClassificationPipeline",
-      "pt": "AutoModelForTokenClassification"
-    },
-    "ner":{
-      "impl": "upos.RawTokenClassificationPipeline",
-      "pt": "AutoModelForTokenClassification"
     }
   },
   "eos_token_id": 151643,
@@ -376,9 +365,9 @@
   "rope_theta": 5000000.0,
   "sliding_window": 32768,
   "tie_word_embeddings": false,
-  "tokenizer_class": "Qwen2Tokenizer",
   "torch_dtype": "float32",
-  "transformers_version": "4.41.2",
+  "tokenizer_class": "Qwen2Tokenizer",
+  "transformers_version": "4.42.4",
   "use_cache": false,
   "use_sliding_window": false,
   "vocab_size": 151936
maker.sh CHANGED
@@ -13,43 +13,7 @@ TMP=./maker$$.py
 echo 'tgt="KoichiYasuoka/'$S'-upos"'
 ) > $TMP
 cat << 'EOF' >> $TMP
-from transformers import AutoTokenizer,Qwen2Model,Qwen2PreTrainedModel,AutoConfig,DataCollatorForTokenClassification,TrainingArguments,Trainer
-from transformers.modeling_outputs import TokenClassifierOutput
-
-class Qwen2ForTokenClassification(Qwen2PreTrainedModel):
-  def __init__(self,config):
-    from torch import nn
-    super().__init__(config)
-    self.num_labels=config.num_labels
-    self.model=Qwen2Model(config)
-    if getattr(config,"classifier_dropout",None) is not None:
-      classifier_dropout=config.classifier_dropout
-    elif getattr(config,"hidden_dropout",None) is not None:
-      classifier_dropout=config.hidden_dropout
-    else:
-      classifier_dropout=0.1
-    self.dropout=nn.Dropout(classifier_dropout)
-    self.score=nn.Linear(config.hidden_size,config.num_labels)
-    self.post_init()
-  def get_input_embeddings(self):
-    return self.model.embed_tokens
-  def set_input_embeddings(self,value):
-    self.model.embed_tokens=value
-  def forward(self,input_ids=None,past_key_values=None,attention_mask=None,position_ids=None,inputs_embeds=None,labels=None,use_cache=None,output_attentions=None,output_hidden_states=None,return_dict=None):
-    return_dict=return_dict if return_dict is not None else self.config.use_return_dict
-    outputs=self.model(input_ids,past_key_values=past_key_values,attention_mask=attention_mask,position_ids=position_ids,inputs_embeds=inputs_embeds,use_cache=use_cache,output_attentions=output_attentions,output_hidden_states=output_hidden_states,return_dict=return_dict)
-    sequence_output=outputs[0]
-    sequence_output=self.dropout(sequence_output)
-    logits=self.score(sequence_output)
-    loss=None
-    if labels is not None:
-      from torch import nn
-      loss_fct=nn.CrossEntropyLoss()
-      loss=loss_fct(logits.view(-1,self.num_labels),labels.view(-1))
-    if not return_dict:
-      output=(logits,)+outputs[2:]
-      return ((loss,)+output) if loss is not None else output
-    return TokenClassifierOutput(loss=loss,logits=logits,hidden_states=outputs.hidden_states,attentions=outputs.attentions)
+from transformers import AutoTokenizer,Qwen2ForTokenClassification,AutoConfig,DataCollatorForTokenClassification,TrainingArguments,Trainer
 
 class UPOSFileDataset(object):
   def __init__(self,conllu,tokenizer):
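maker.sh now imports Qwen2ForTokenClassification straight from transformers instead of defining it inside the heredoc, consistent with the transformers_version of 4.42.4 recorded in config.json. A small guard one might add before running the script, as a sketch:

import transformers
from packaging import version

# the upstream Qwen2ForTokenClassification only exists from transformers 4.42 onwards
assert version.parse(transformers.__version__) >= version.parse("4.42.0"), \
    "upgrade transformers to >= 4.42 before running maker.sh"
from transformers import Qwen2ForTokenClassification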
pytorch_model-00001-of-00003.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c5185783530ae8fed3b241173fa869e56fbdb2e134dc9154d20c08057c958db2
+oid sha256:47333ed441637dc876fad1a283080dbf29c00970c0570d14f4293ac5d9382723
 size 4974769352
pytorch_model-00002-of-00003.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:db08d2ff094e9c86a2bf683e97297da33031011d9e070cebf9b9ead1cad98ea4
+oid sha256:db2ebda5f396ded9ba823c3b86e46fd751d8e208822c9b94da333c5c3ababd89
 size 4934433952
pytorch_model-00003-of-00003.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0ad91e73ed770e4a05528e3713b41a4c39b0796ebb5f68a3fb78cd13497b5c7e
+oid sha256:8d64c6d1dc4366f38ac560fa2413115f565dc7bff0ad64eb9ae55e7b0476ca23
 size 4338334558
tokenizer_config.json CHANGED
@@ -31,6 +31,7 @@
     "<|im_end|>"
   ],
   "bos_token": null,
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ content }}{% elif message['role'] == 'assistant' %}{{ content + '<|endoftext|>' }}{% endif %}{% endfor %}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|endoftext|>",
   "errors": "replace",
upos.py CHANGED
@@ -1,5 +1,4 @@
-from transformers import TokenClassificationPipeline,Qwen2Model,Qwen2PreTrainedModel
-from transformers.modeling_outputs import TokenClassifierOutput
+from transformers import TokenClassificationPipeline
 
 class BellmanFordTokenClassificationPipeline(TokenClassificationPipeline):
   def __init__(self,**kwargs):
@@ -17,6 +16,7 @@ class BellmanFordTokenClassificationPipeline(TokenClassificationPipeline):
     import numpy
     if "logits" not in model_outputs:
       return self.postprocess(model_outputs[0],**kwargs)
+    print(model_outputs["logits"].size())
     m=model_outputs["logits"][0].numpy()
     e=numpy.exp(m-numpy.max(m,axis=-1,keepdims=True))
     z=e/e.sum(axis=-1,keepdims=True)
@@ -40,42 +40,3 @@ class BellmanFordTokenClassificationPipeline(TokenClassificationPipeline):
     t["text"]=model_outputs["sentence"][t["start"]:t["end"]]
     return w
 
-class RawTokenClassificationPipeline(TokenClassificationPipeline):
-  def check_model_type(self,supported_models):
-    pass
-
-class Qwen2ForTokenClassification(Qwen2PreTrainedModel):
-  def __init__(self,config):
-    from torch import nn
-    super().__init__(config)
-    self.num_labels=config.num_labels
-    self.model=Qwen2Model(config)
-    if getattr(config,"classifier_dropout",None) is not None:
-      classifier_dropout=config.classifier_dropout
-    elif getattr(config,"hidden_dropout",None) is not None:
-      classifier_dropout=config.hidden_dropout
-    else:
-      classifier_dropout=0.1
-    self.dropout=nn.Dropout(classifier_dropout)
-    self.score=nn.Linear(config.hidden_size,config.num_labels)
-    self.post_init()
-  def get_input_embeddings(self):
-    return self.model.embed_tokens
-  def set_input_embeddings(self,value):
-    self.model.embed_tokens=value
-  def forward(self,input_ids=None,past_key_values=None,attention_mask=None,position_ids=None,inputs_embeds=None,labels=None,use_cache=None,output_attentions=None,output_hidden_states=None,return_dict=None):
-    return_dict=return_dict if return_dict is not None else self.config.use_return_dict
-    outputs=self.model(input_ids,past_key_values=past_key_values,attention_mask=attention_mask,position_ids=position_ids,inputs_embeds=inputs_embeds,use_cache=use_cache,output_attentions=output_attentions,output_hidden_states=output_hidden_states,return_dict=return_dict)
-    sequence_output=outputs[0]
-    sequence_output=self.dropout(sequence_output)
-    logits=self.score(sequence_output)
-    loss=None
-    if labels is not None:
-      from torch import nn
-      loss_fct=nn.CrossEntropyLoss()
-      loss=loss_fct(logits.view(-1,self.num_labels),labels.view(-1))
-    if not return_dict:
-      output=(logits,)+outputs[2:]
-      return ((loss,)+output) if loss is not None else output
-    return TokenClassifierOutput(loss=loss,logits=logits,hidden_states=outputs.hidden_states,attentions=outputs.attentions)
-
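The postprocess hunk above converts the logits to probabilities with a numerically stable softmax before the Bellman-Ford relabelling pass; the same step in isolation, as a sketch:

import numpy

def stable_softmax(m):
    e = numpy.exp(m - numpy.max(m, axis=-1, keepdims=True))  # subtract the row max to avoid overflow
    return e / e.sum(axis=-1, keepdims=True)

print(stable_softmax(numpy.array([[2.0, 1.0, 0.1]])))  # each row sums to 1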