Upload README.md
Browse files
README.md
CHANGED
@@ -1,3 +1,143 @@
|
|
1 |
-
---
|
2 |
-
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
language: ja
|
3 |
+
thumbnail: https://github.com/ycat3/japanese-pretrained-models/blob/master/jweb.png
|
4 |
+
tags:
|
5 |
+
- ja
|
6 |
+
- japanese
|
7 |
+
- gpt2
|
8 |
+
- text-generation
|
9 |
+
- lm
|
10 |
+
- nlp
|
11 |
+
- rust
|
12 |
+
- rust-bert
|
13 |
+
|
14 |
+
license: mit
|
15 |
+
|
16 |
+
datasets:
|
17 |
+
- cc100
|
18 |
+
- wikipedia
|
19 |
+
- AozoraBunko
|
20 |
+
|
21 |
+
widget:
|
22 |
+
- text: "夏目漱石は、"
|
23 |
+
---
|
24 |
+
|
25 |
+
# japanese-soseki-gpt2-1b
|
26 |
+
|
27 |
+
![jweb-icon](./jweb.png)
|
28 |
+
|
29 |
+
This repository provides a 1.3B-parameter finetuned Japanese GPT2 model.
|
30 |
+
The model was finetuned by [jweb](https://jweb.asia/), based on a model trained by [rinna Co., Ltd.](https://corp.rinna.co.jp/)
|
31 |
+
Both PyTorch (pytorch_model.bin) and Rust (rust_model.ot) models are provided.
|
32 |
+
|
33 |
+
# How to use the model
|
34 |
+
|
35 |
+
*NOTE:* Use `T5Tokenizer` to initialize the tokenizer.
|
36 |
+
|
37 |
+
python
|
38 |
+
~~~~
|
39 |
+
import torch
|
40 |
+
from transformers import T5Tokenizer, AutoModelForCausalLM
|
41 |
+
|
42 |
+
tokenizer = T5Tokenizer.from_pretrained("jweb/japanese-soseki-gpt2-1b")
|
43 |
+
model = AutoModelForCausalLM.from_pretrained("jweb/japanese-soseki-gpt2-1b")
|
44 |
+
|
45 |
+
if torch.cuda.is_available():
|
46 |
+
model = model.to("cuda")
|
47 |
+
|
48 |
+
text = "夏目漱石は、"
|
49 |
+
token_ids = tokenizer.encode(text, add_special_tokens=False, return_tensors="pt")
|
50 |
+
|
51 |
+
with torch.no_grad():
|
52 |
+
output_ids = model.generate(
|
53 |
+
token_ids.to(model.device),
|
54 |
+
max_length=128,
|
55 |
+
min_length=40,
|
56 |
+
do_sample=True,
|
57 |
+
repetition_penalty= 1.6,
|
58 |
+
early_stopping= True,
|
59 |
+
num_beams= 5,
|
60 |
+
temperature= 1.0,
|
61 |
+
top_k=500,
|
62 |
+
top_p=0.95,
|
63 |
+
pad_token_id=tokenizer.pad_token_id,
|
64 |
+
bos_token_id=tokenizer.bos_token_id,
|
65 |
+
eos_token_id=tokenizer.eos_token_id,
|
66 |
+
)
|
67 |
+
|
68 |
+
output = tokenizer.decode(output_ids.tolist()[0])
|
69 |
+
print(output)
|
70 |
+
# sample output: 夏目漱石は、明治時代を代表する文豪です。夏目漱石の代表作は「吾輩は猫である」や「坊っちゃん」、「草枕」「三四郎」、それに「虞美人草(ぐびじんそう)」などたくさんあります。
|
71 |
+
~~~~
|
72 |
+
|
73 |
+
rust
|
74 |
+
~~~~
|
75 |
+
use rust_bert::gpt2::GPT2Generator;
|
76 |
+
use rust_bert::pipelines::common::{ModelType, TokenizerOption};
|
77 |
+
use rust_bert::pipelines::generation_utils::{GenerateConfig, LanguageGenerator};
|
78 |
+
use rust_bert::resources::{ RemoteResource, ResourceProvider};
|
79 |
+
use tch::Device;
|
80 |
+
|
81 |
+
fn main() -> anyhow::Result<()> {
|
82 |
+
let model_resource = Box::new(RemoteResource {
|
83 |
+
url: "https://huggingface.co/jweb/japanese-soseki-gpt2-1b/resolve/main/rust_model.ot".into(),
|
84 |
+
cache_subdir: "japanese-soseki-gpt2-1b/model".into(),
|
85 |
+
});
|
86 |
+
let config_resource = Box::new(RemoteResource {
|
87 |
+
url: "https://huggingface.co/jweb/japanese-soseki-gpt2-1b/resolve/main/config.json".into(),
|
88 |
+
cache_subdir: "japanese-soseki-gpt2-1b/config".into(),
|
89 |
+
});
|
90 |
+
let vocab_resource = Box::new(RemoteResource {
|
91 |
+
url: "https://huggingface.co/jweb/japanese-soseki-gpt2-1b/resolve/main/spiece.model".into(),
|
92 |
+
cache_subdir: "japanese-soseki-gpt2-1b/vocab".into(),
|
93 |
+
});
|
94 |
+
let vocab_resource_token = vocab_resource.clone();
|
95 |
+
let merges_resource = vocab_resource.clone();
|
96 |
+
let generate_config = GenerateConfig {
|
97 |
+
model_resource,
|
98 |
+
config_resource,
|
99 |
+
vocab_resource,
|
100 |
+
merges_resource, // not used
|
101 |
+
device: Device::Cpu,
|
102 |
+
repetition_penalty: 1.6,
|
103 |
+
min_length: 40,
|
104 |
+
max_length: 128,
|
105 |
+
do_sample: true,
|
106 |
+
early_stopping: true,
|
107 |
+
num_beams: 5,
|
108 |
+
temperature: 1.0,
|
109 |
+
top_k: 500,
|
110 |
+
top_p: 0.95,
|
111 |
+
..Default::default()
|
112 |
+
};
|
113 |
+
let tokenizer = TokenizerOption::from_file(
|
114 |
+
ModelType::T5,
|
115 |
+
vocab_resource_token.get_local_path().unwrap().to_str().unwrap(),
|
116 |
+
None,
|
117 |
+
true,
|
118 |
+
None,
|
119 |
+
None,
|
120 |
+
)?;
|
121 |
+
let mut gpt2_model = GPT2Generator::new_with_tokenizer(generate_config, tokenizer.into())?;
|
122 |
+
gpt2_model.set_device(Device::cuda_if_available());
|
123 |
+
let input_text = "夏目漱石は、";
|
124 |
+
let t1 = std::time::Instant::now();
|
125 |
+
let output = gpt2_model.generate(Some(&[input_text]), None);
|
126 |
+
println!("{}", output[0].text);
|
127 |
+
println!("Elapsed Time(ms):{}",t1.elapsed().as_millis());
|
128 |
+
Ok(())
|
129 |
+
}
|
130 |
+
// sample output: 夏目漱石は、明治時代を代表する文豪です。夏目漱石の代表作は「吾輩は猫である」や「坊っちゃん」、「草枕」「三四郎」、それに「虞美人草(ぐびじんそう)」などたくさんあります。
|
131 |
+
~~~~
|
132 |
+
|
133 |
+
# Model architecture
|
134 |
+
A 24-layer, 2048-hidden-size transformer-based language model.
|
135 |
+
|
136 |
+
# Training
|
137 |
+
The model was trained on [Japanese C4](https://huggingface.co/datasets/allenai/c4), [Japanese CC-100](http://data.statmt.org/cc-100/ja.txt.xz) and [Japanese Wikipedia](https://dumps.wikimedia.org/other/cirrussearch) to optimize a traditional language modelling objective. It reaches around 14 perplexity on a chosen validation set from the same data.
|
138 |
+
# Finetuning
|
139 |
+
The model was finetuned on [Aozorabunko](https://github.com/aozorabunko/aozorabunko), especially the books of Natsume Soseki.
|
140 |
+
# Tokenization
|
141 |
+
The model uses a [sentencepiece](https://github.com/google/sentencepiece)-based tokenizer. The vocabulary was first trained on a selected subset from the training data using the official sentencepiece training script, and then augmented with emojis and symbols.
|
142 |
+
# License
|
143 |
+
[The MIT license](https://opensource.org/licenses/MIT)
|