KoichiYasuoka committed
Commit 5f1d37e
1 Parent(s): f643899

model changed
Files changed (5):
  1. README.md +3 -3
  2. config.json +2 -5
  3. maker.py +5 -5
  4. pytorch_model.bin +2 -2
  5. tokenizer_config.json +1 -0
README.md CHANGED
@@ -16,7 +16,7 @@ pipeline_tag: "token-classification"
 
 ## Model Description
 
-This is a RoBERTa model pre-trained on Chinese Wikipedia texts (both simplified and traditional) for POS-tagging and dependency-parsing (using `goeswith` for subwords), derived from [chinese-roberta-base-upos](https://huggingface.co/KoichiYasuoka/chinese-roberta-base-upos).
+This is a RoBERTa model pre-trained on Chinese Wikipedia texts (both simplified and traditional) for POS-tagging and dependency-parsing (using `goeswith` for subwords), derived from [roberta_chinese_base](https://huggingface.co/clue/roberta_chinese_base).
 
 ## How to Use
 
@@ -60,7 +60,7 @@ class UDgoeswith(object):
   return u+"\n"
 
 nlp=UDgoeswith("KoichiYasuoka/roberta-base-chinese-ud-goeswith")
-print(nlp("我叫萨拉,我住在伦敦。"))
+print(nlp("我把这本书看完了"))
 ```
 
 with [ufal.chu-liu-edmonds](https://pypi.org/project/ufal.chu-liu-edmonds/).
@@ -69,6 +69,6 @@ Or without ufal.chu-liu-edmonds:
 ```
 from transformers import pipeline
 nlp=pipeline("universal-dependencies","KoichiYasuoka/roberta-base-chinese-ud-goeswith",trust_remote_code=True,aggregation_strategy="simple")
-print(nlp("我叫萨拉,我住在伦敦。"))
+print(nlp("我把这本书看完了"))
 ```
 
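The README now points at the CLUE checkpoint as the base model and uses a new example sentence. As a quick sanity check that the retrained checkpoint still loads through the standard `transformers` auto classes, a minimal sketch (assuming network access to the Hub):

```
from transformers import AutoTokenizer,AutoModelForTokenClassification
tkz=AutoTokenizer.from_pretrained("KoichiYasuoka/roberta-base-chinese-ud-goeswith")
mdl=AutoModelForTokenClassification.from_pretrained("KoichiYasuoka/roberta-base-chinese-ud-goeswith")
print(mdl.config.model_type)  # expected "roberta" after this commit
```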
 
config.json CHANGED
@@ -1,6 +1,6 @@
 {
   "architectures": [
-    "BertForTokenClassification"
+    "RobertaForTokenClassification"
   ],
   "attention_probs_dropout_prob": 0.1,
   "bos_token_id": 0,
@@ -12,8 +12,6 @@
   },
   "directionality": "bidi",
   "eos_token_id": 2,
-  "finetuning_task": "ner",
-  "gradient_checkpointing": false,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
   "hidden_size": 768,
@@ -631,10 +629,9 @@
   },
   "layer_norm_eps": 1e-12,
   "max_position_embeddings": 512,
-  "model_type": "bert",
+  "model_type": "roberta",
   "num_attention_heads": 12,
   "num_hidden_layers": 12,
-  "output_past": true,
   "pad_token_id": 1,
   "pooler_fc_size": 768,
   "pooler_num_attention_heads": 12,
maker.py CHANGED
@@ -1,5 +1,5 @@
 #! /usr/bin/python3
-src="KoichiYasuoka/chinese-roberta-base-upos"
+src="clue/roberta_chinese_base"
 tgt="KoichiYasuoka/roberta-base-chinese-ud-goeswith"
 import os
 for d in ["UD_Chinese-GSD","UD_Chinese-GSDSimp"]:
@@ -39,15 +39,15 @@ class UDgoeswithDataset(object):
     return lid
   __len__=lambda self:len(self.ids)
   __getitem__=lambda self,i:{"input_ids":self.ids[i],"labels":[self.label2id[t] for t in self.tags[i]]}
-from transformers import AutoTokenizer,AutoConfig,AutoModelForTokenClassification,DataCollatorForTokenClassification,TrainingArguments,Trainer
-tkz=AutoTokenizer.from_pretrained(src)
+from transformers import BertTokenizer,AutoConfig,AutoModelForTokenClassification,DataCollatorForTokenClassification,TrainingArguments,Trainer
+tkz=BertTokenizer.from_pretrained(src,model_max_length=512)
 trainDS=UDgoeswithDataset("train.conllu",tkz)
 devDS=UDgoeswithDataset("dev.conllu",tkz)
 testDS=UDgoeswithDataset("test.conllu",tkz)
 lid=trainDS(devDS,testDS)
-cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()},ignore_mismatched_sizes=True)
+cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()})
 arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=32,output_dir="/tmp",overwrite_output_dir=True,save_total_limit=2,evaluation_strategy="epoch",learning_rate=5e-05,warmup_ratio=0.1)
-trn=Trainer(args=arg,data_collator=DataCollatorForTokenClassification(tkz),model=AutoModelForTokenClassification.from_pretrained(src,config=cfg,ignore_mismatched_sizes=True),train_dataset=trainDS,eval_dataset=devDS)
+trn=Trainer(args=arg,data_collator=DataCollatorForTokenClassification(tkz),model=AutoModelForTokenClassification.from_pretrained(src,config=cfg),train_dataset=trainDS,eval_dataset=devDS)
 trn.train()
 trn.save_model(tgt)
 tkz.save_pretrained(tgt)
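The move from `AutoTokenizer` to an explicit `BertTokenizer` suggests the CLUE checkpoint ships a BERT-style WordPiece vocabulary rather than RoBERTa's BPE; `model_max_length=512` matches the config's `max_position_embeddings`. Dropping `ignore_mismatched_sizes=True` also makes sense here, since the new source checkpoint has no pre-existing classification head whose shape could clash with `num_labels`. A minimal sketch of what that tokenizer produces for the README example sentence (the printed values are indicative, not verified here):

```
from transformers import BertTokenizer
tkz=BertTokenizer.from_pretrained("clue/roberta_chinese_base",model_max_length=512)
ids=tkz("我把这本书看完了")["input_ids"]
print(len(ids))                        # token count, including [CLS] and [SEP]
print(tkz.convert_ids_to_tokens(ids))  # per-character WordPiece tokens
```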
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:803211fef51e6bd23e68f211c1396d8975f16e2aea0ac0469a612fd83ccb38ba
-size 407710641
+oid sha256:dfb7b516490a2a111a771d186112e9a33f9aa9c39c54514a0db029382159fd45
+size 407711217
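Only the Git LFS pointer changes for the weights; the new `oid` and `size` can be used to verify a local download. A minimal sketch (the local path is an assumption):

```
import hashlib,os
path="pytorch_model.bin"  # assumed local copy, fetched via git-lfs
h=hashlib.sha256()
with open(path,"rb") as f:
    for chunk in iter(lambda:f.read(1<<20),b""):  # hash in 1 MiB chunks
        h.update(chunk)
print(h.hexdigest()=="dfb7b516490a2a111a771d186112e9a33f9aa9c39c54514a0db029382159fd45")
print(os.path.getsize(path)==407711217)
```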
tokenizer_config.json CHANGED
@@ -7,6 +7,7 @@
   "never_split": null,
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
+  "special_tokens_map_file": null,
   "strip_accents": null,
   "tokenize_chinese_chars": true,
   "tokenizer_class": "BertTokenizerFast",