ArunMoonpai committed on
Commit da9b686
1 Parent(s): 6d195e7

First model version

.DS_Store ADDED
Binary file (6.15 kB).
 
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2021 Abinaya Mahendiran
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,69 @@
+ ---
+
+ language: ta
+ datasets:
+ - oscar
+ - IndicNLP
+ widget:
+ - text: 'ஒரு ஊரிலே ஒரு காக்கைக்கு'
+
+ ---
+ # GPT2-Tamil
+
+ This repository was created as part of the Flax/JAX community week organized by Hugging Face. The aim of this project is to pretrain a GPT-2 language model specifically for the Tamil language.
+
+ ## Setup:
+ To set up the project, run the following command:
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ ## Model:
+ Pretrained model on the Tamil language using a causal language modeling (CLM) objective.
+
+ ## Dataset Used:
+ The GPT-2 model is trained on the [oscar dataset - ta](https://huggingface.co/datasets/oscar) and the [IndicNLP dataset - ta](https://indicnlp.ai4bharat.org/corpora/).
+
+ ## Intended uses & limitations:
+ You can use the raw model for text generation, but it is mostly intended to be fine-tuned on a downstream task. See the [model hub](https://huggingface.co/models?filter=gpt2) to look for fine-tuned versions on a task that interests you.
+
+ ## How to pretrain the model:
+ To perform training, follow these steps:
+
+ - Export the model directory (where you want to store the model artifacts such as the config and tokenizer):
+ ```bash
+ export MODEL_DIR=<model_dir>
+ ```
+ - Create the config.json by running the following command:
+ ```bash
+ python src/create_config.py
+ ```
+ - Create the tokenizer by running the following command:
+ ```bash
+ python src/train_tokenizer.py
+ ```
+ - Once the config and tokenizer are created, run the following script to start training the Flax model:
+ ```bash
+ bash scripts/train_gpt2-oscar-tamil.sh
+ ```
+
+ ## How to use:
+ To perform language generation with the model, the `pipeline` API can be used directly.
+
+ - First convert the Flax model to PyTorch using the following command:
+ ```bash
+ python src/convert_flax_to_pytorch.py
+ ```
+ - Use the following snippet to perform language generation:
+ ```python
+ >>> from transformers import AutoTokenizer, AutoModelWithLMHead, pipeline, set_seed
+ >>> model_name = 'abinayam/gpt-2-tamil'
+ >>> model = AutoModelWithLMHead.from_pretrained(model_name)
+ >>> tokenizer = AutoTokenizer.from_pretrained(model_name)
+ >>> set_seed(42)
+ >>> input_text = "ஒரு ஊரிலே ஒரு காக்கைக்கு"
+ >>> max_len = 300
+ >>> no_seq = 5
+ >>> generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
+ >>> sequence = generator(input_text, max_length=max_len, num_return_sequences=no_seq)
+ ```
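Note on the README snippet above: the Hugging Face text-generation `pipeline` returns a list of dictionaries, each carrying the prompt plus its continuation under the `generated_text` key. A minimal, self-contained sketch of reading those results, assuming the `abinayam/gpt-2-tamil` checkpoint referenced in the README is available on the Hub:

```python
from transformers import AutoModelWithLMHead, AutoTokenizer, pipeline, set_seed

model_name = 'abinayam/gpt-2-tamil'  # checkpoint id taken from the README above
generator = pipeline(
    'text-generation',
    model=AutoModelWithLMHead.from_pretrained(model_name),
    tokenizer=AutoTokenizer.from_pretrained(model_name),
)
set_seed(42)  # make sampling reproducible across runs

# Each returned entry is a dict; 'generated_text' holds prompt + continuation.
outputs = generator("ஒரு ஊரிலே ஒரு காக்கைக்கு", max_length=300, num_return_sequences=5)
for i, out in enumerate(outputs, start=1):
    print(f"--- sequence {i} ---")
    print(out['generated_text'])
```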
config.json ADDED
@@ -0,0 +1,38 @@
+ {
+   "_name_or_path": "../gpt-2-tamil",
+   "activation_function": "gelu_new",
+   "architectures": [
+     "GPT2LMHeadModel"
+   ],
+   "attn_pdrop": 0.0,
+   "bos_token_id": 50256,
+   "embd_pdrop": 0.0,
+   "eos_token_id": 50256,
+   "gradient_checkpointing": false,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "gpt2",
+   "n_ctx": 1024,
+   "n_embd": 768,
+   "n_head": 12,
+   "n_inner": null,
+   "n_layer": 12,
+   "n_positions": 1024,
+   "resid_pdrop": 0.0,
+   "scale_attn_weights": true,
+   "summary_activation": null,
+   "summary_first_dropout": 0.1,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "task_specific_params": {
+     "text-generation": {
+       "do_sample": true,
+       "max_length": 300
+     }
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.9.0.dev0",
+   "use_cache": true,
+   "vocab_size": 50257
+ }
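The hyperparameters above correspond to the GPT-2 "base" architecture: 12 layers, 12 attention heads, 768-dimensional embeddings, a 1024-token context window, and a 50257-token vocabulary. A minimal sketch of inspecting this config and instantiating a model of the same size with `transformers`, assuming the checkpoint id from the README (a local clone of this repository would work as well):

```python
from transformers import GPT2Config, GPT2LMHeadModel

# Load the configuration shown above (Hub id assumed from the README).
config = GPT2Config.from_pretrained('abinayam/gpt-2-tamil')
print(config.n_layer, config.n_head, config.n_embd, config.n_positions)  # 12 12 768 1024

# A freshly initialized GPT-2 of this size, e.g. for pretraining from scratch;
# use GPT2LMHeadModel.from_pretrained(...) instead to load the trained weights.
model = GPT2LMHeadModel(config)
print(f"{sum(p.numel() for p in model.parameters()) / 1e6:.0f}M parameters")  # roughly 124M
```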
gpt-2-tamil/config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "activation_function": "gelu_new",
+   "architectures": [
+     "GPT2LMHeadModel"
+   ],
+   "attn_pdrop": 0.0,
+   "bos_token_id": 50256,
+   "embd_pdrop": 0.0,
+   "eos_token_id": 50256,
+   "gradient_checkpointing": false,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "gpt2",
+   "n_ctx": 1024,
+   "n_embd": 768,
+   "n_head": 12,
+   "n_inner": null,
+   "n_layer": 12,
+   "n_positions": 1024,
+   "resid_pdrop": 0.0,
+   "scale_attn_weights": true,
+   "summary_activation": null,
+   "summary_first_dropout": 0.1,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "task_specific_params": {
+     "text-generation": {
+       "do_sample": true,
+       "max_length": 50
+     }
+   },
+   "transformers_version": "4.9.0.dev0",
+   "use_cache": true,
+   "vocab_size": 50257
+ }
gpt-2-tamil/events.out.tfevents.1626336540.t1v-n-ebe36c53-w-0.751183.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f1799847ce42c1a5f9fe25dfa8d8da9e1a6ff57595979b2bd0daea658d9ea785
+ size 40
gpt-2-tamil/events.out.tfevents.1626339585.t1v-n-ebe36c53-w-0.759145.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1b47918f07e65192c48181c8f775cbf29f08585ac3a559e67df1e3f13fb1ca01
+ size 40
gpt-2-tamil/events.out.tfevents.1626340740.t1v-n-ebe36c53-w-0.765413.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5855b0a71977e29453739fe2c5055c32753a62fa6d3db8ea3f105fd8ca75357b
+ size 40
gpt-2-tamil/events.out.tfevents.1626341319.t1v-n-ebe36c53-w-0.768105.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:938ebc19608236e36e53fd65f7c12c9d7ad0de447d01d60627441645872ef573
+ size 22272043
gpt-2-tamil/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:89396995064d16071519a20c2771d661400da8c3d644966f0a586d299d1b2fa3
+ size 497764120
gpt-2-tamil/tokenizer.json ADDED
The diff for this file is too large to render.
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1a428e17fe6c41c96b3ac840af045afc1bf45a043e04d220f78944c38168b740
+ size 510359598
pyproject.toml ADDED
@@ -0,0 +1,31 @@
+ # Black formatting
+ [tool.black]
+ line-length = 85
+ include = '\.pyi?$'
+ exclude = '''
+ /(
+     \.eggs        # exclude a few common directories in the
+   | \.git         # root of the project
+   | \.hg
+   | \.mypy_cache
+   | \.tox
+   | \.venv
+   | _build
+   | buck-out
+   | build
+   | dist
+   | wandb
+   | model
+   | dataset
+   | notebook
+ )/
+ '''
+
+ # iSort
+ [tool.isort]
+ profile = "black"
+ line_length = 85
+ multi_line_output = 3
+ include_trailing_comma = true
+ skip_gitignore = true
+ virtual_env = "venv"
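The `[tool.black]` and `[tool.isort]` sections above pin both tools to an 85-character line length, with isort's `black` profile keeping import sorting compatible with Black's output. A small illustrative sketch of that effect using the tools' Python APIs (the sample source string below is made up for demonstration):

```python
import black
import isort

# A deliberately messy sample; Black reflows it to the configured 85-char limit.
sample = "def load(path,    split   ='train'):\n    return path,split\n"
print(black.format_str(sample, mode=black.Mode(line_length=85)))

# The "black" profile makes isort's import ordering consistent with Black.
print(isort.code("import wandb\nimport flax\nimport datasets\n", profile="black"))
```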
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:183e23beb6421156e9472504710d66104d4d43829fb87cffd22d888565f27a3a
+ size 510401385
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ tqdm
+ transformers
+ datasets
+ jax
+ jaxlib
+ flax
+ optax
+ wandb
tokenizer.json ADDED
The diff for this file is too large to render.