Commit c6b60a2
Parent: d9f2bb5

Updated training script and data
Files changed:
- app.py (+6, -5)
- fine_tuned_gpt2_model/config.json (deleted, -39)
- fine_tuned_gpt2_model/generation_config.json (deleted, -6)
- fine_tuned_gpt2_model/merges.txt (deleted)
- fine_tuned_gpt2_model/model.safetensors (deleted, -3)
- fine_tuned_gpt2_model/special_tokens_map.json (deleted, -24)
- fine_tuned_gpt2_model/tokenizer_config.json (deleted, -22)
- fine_tuned_gpt2_model/vocab.json (deleted)
- fypmc20277423/.gitattributes (deleted, -35)
- fypmc20277423/README.md (deleted, -12)
- fypmc20277423/app.py (deleted, -52)
- fypmc20277423/output/runs/Mar31_21-01-46_LAPTOP-23I4320M/events.out.tfevents.1711915307.LAPTOP-23I4320M.22972.0 (deleted, -3)
- fypmc20277423/requirements.txt (deleted, -4)
- fypmc20277423/training.txt (deleted)
app.py CHANGED
@@ -2,7 +2,8 @@ from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
 from datasets import load_dataset
 
 # Load the text dataset from the specified file
-dataset = load_dataset("text", data_files="training.txt")
+dataset = load_dataset("text", data_files="C:\\Users\\Mike Corrigan\\Documents\\DMD Year 4 Sem 1\\FYP INFO\\fyp hugging face\\fypmc20277423\\training.txt")
+
 
 # Initialize the GPT-2 tokenizer
 tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
@@ -37,7 +38,7 @@ training_args = TrainingArguments(
     overwrite_output_dir=True,
     num_train_epochs=2,  # Optionally reduced for quicker iteration
     per_device_train_batch_size=2,  # Reduced from 4 to 2
-    gradient_accumulation_steps=
+    gradient_accumulation_steps=16,  # Added to compensate for smaller batch size
     save_steps=10_000,
     save_total_limit=2,
 )
@@ -52,6 +53,6 @@ trainer = Trainer(
 # Start the training process
 trainer.train()
 
-
-
-
+model.save_pretrained('C:\\Users\\Mike Corrigan\\Documents\\DMD Year 4 Sem 1\\FYP INFO\\fyp hugging face\\fypmc20277423\\output\\fine_tuned_gpt2_model')
+tokenizer.save_pretrained('C:\\Users\\Mike Corrigan\\Documents\\DMD Year 4 Sem 1\\FYP INFO\\fyp hugging face\\fypmc20277423\\output\\fine_tuned_gpt2_model')
+
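Note on the change above: with per_device_train_batch_size=2 and gradient_accumulation_steps=16, the Trainer accumulates gradients over 16 mini-batches before each optimizer step, so the effective batch size is 2 × 16 = 32 examples. The new data_files and save_pretrained arguments, however, are absolute Windows paths from the author's laptop and will not resolve when the Space itself runs the script. A minimal portable sketch (not part of the commit; BASE_DIR, DATA_FILE, and OUTPUT_DIR are illustrative names):

from pathlib import Path
from datasets import load_dataset

# Resolve paths relative to this script instead of a machine-specific prefix.
BASE_DIR = Path(__file__).resolve().parent
DATA_FILE = BASE_DIR / "training.txt"
OUTPUT_DIR = BASE_DIR / "output" / "fine_tuned_gpt2_model"

dataset = load_dataset("text", data_files=str(DATA_FILE))
# ... tokenize and train as in the diff above, then:
# model.save_pretrained(str(OUTPUT_DIR))
# tokenizer.save_pretrained(str(OUTPUT_DIR))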
fine_tuned_gpt2_model/config.json DELETED
@@ -1,39 +0,0 @@
-{
-  "_name_or_path": "gpt2",
-  "activation_function": "gelu_new",
-  "architectures": [
-    "GPT2LMHeadModel"
-  ],
-  "attn_pdrop": 0.1,
-  "bos_token_id": 50256,
-  "embd_pdrop": 0.1,
-  "eos_token_id": 50256,
-  "initializer_range": 0.02,
-  "layer_norm_epsilon": 1e-05,
-  "model_type": "gpt2",
-  "n_ctx": 1024,
-  "n_embd": 768,
-  "n_head": 12,
-  "n_inner": null,
-  "n_layer": 12,
-  "n_positions": 1024,
-  "reorder_and_upcast_attn": false,
-  "resid_pdrop": 0.1,
-  "scale_attn_by_inverse_layer_idx": false,
-  "scale_attn_weights": true,
-  "summary_activation": null,
-  "summary_first_dropout": 0.1,
-  "summary_proj_to_labels": true,
-  "summary_type": "cls_index",
-  "summary_use_proj": true,
-  "task_specific_params": {
-    "text-generation": {
-      "do_sample": true,
-      "max_length": 50
-    }
-  },
-  "torch_dtype": "float32",
-  "transformers_version": "4.39.2",
-  "use_cache": true,
-  "vocab_size": 50257
-}
fine_tuned_gpt2_model/generation_config.json DELETED
@@ -1,6 +0,0 @@
-{
-  "_from_model_config": true,
-  "bos_token_id": 50256,
-  "eos_token_id": 50256,
-  "transformers_version": "4.39.2"
-}
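This file pinned the generation-time token ids, and config.json above recorded sampling defaults under task_specific_params (do_sample=true, max_length=50). A hedged sketch, not in the commit, of loading the saved directory and sampling with those defaults; it assumes the fine_tuned_gpt2_model directory exists locally and the prompt is illustrative:

from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned weights and tokenizer from the save directory.
model = GPT2LMHeadModel.from_pretrained("fine_tuned_gpt2_model")
tokenizer = GPT2Tokenizer.from_pretrained("fine_tuned_gpt2_model")

# Sample with the defaults recorded above: do_sample=True, max_length=50.
inputs = tokenizer("Once upon a time", return_tensors="pt")
outputs = model.generate(**inputs, do_sample=True, max_length=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))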
fine_tuned_gpt2_model/merges.txt DELETED
The diff for this file is too large to render. See raw diff.
fine_tuned_gpt2_model/model.safetensors DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ff5c53b27a7bc018d4cd08d406c1d9cfa150b4a3fac682703ff231d8408bc205
-size 497774208
fine_tuned_gpt2_model/special_tokens_map.json DELETED
@@ -1,24 +0,0 @@
-{
-  "bos_token": {
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "eos_token": {
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "pad_token": "<|endoftext|>",
-  "unk_token": {
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  }
-}
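Every slot in the map above points at the same token: GPT-2 has no dedicated pad token, so the training script reuses <|endoftext|> (id 50256) for padding via tokenizer.pad_token = tokenizer.eos_token. A small sketch of the consequence, assuming the stock "gpt2" tokenizer: padded positions are told apart from real end-of-text tokens only by the attention mask.

from transformers import GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained("gpt2")
tok.pad_token = tok.eos_token  # mirrors "pad_token": "<|endoftext|>" above

# The shorter sequence is padded to the longer one with token id 50256;
# the attention mask marks which positions are real input.
batch = tok(["short", "a longer example"], padding=True, return_tensors="pt")
print(batch["input_ids"])       # padded positions hold 50256
print(batch["attention_mask"])  # 0 over the padded positions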
fine_tuned_gpt2_model/tokenizer_config.json DELETED
@@ -1,22 +0,0 @@
-{
-  "add_bos_token": false,
-  "add_prefix_space": false,
-  "added_tokens_decoder": {
-    "50256": {
-      "content": "<|endoftext|>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    }
-  },
-  "bos_token": "<|endoftext|>",
-  "clean_up_tokenization_spaces": true,
-  "eos_token": "<|endoftext|>",
-  "errors": "replace",
-  "model_max_length": 1024,
-  "pad_token": "<|endoftext|>",
-  "tokenizer_class": "GPT2Tokenizer",
-  "unk_token": "<|endoftext|>"
-}
fine_tuned_gpt2_model/vocab.json DELETED
The diff for this file is too large to render. See raw diff.
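merges.txt and vocab.json are the two halves of GPT-2's byte-level BPE tokenizer: merges.txt holds the learned merge rules and vocab.json maps the merged tokens to ids in the 50257-entry vocabulary (the vocab_size recorded in config.json above). A quick illustration with the stock tokenizer; the printed values are what the standard "gpt2" files should produce:

from transformers import GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained("gpt2")
# Byte-level BPE marks a leading space with "Ġ".
print(tok.tokenize("Hello world"))      # ['Hello', 'Ġworld']
print(tok("Hello world")["input_ids"])  # [15496, 995]
print(len(tok))                         # 50257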
fypmc20277423/.gitattributes DELETED
@@ -1,35 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
fypmc20277423/README.md DELETED
@@ -1,12 +0,0 @@
----
-title: Fypmc20277423
-emoji: π
-colorFrom: indigo
-colorTo: gray
-sdk: streamlit
-sdk_version: 1.32.0
-app_file: app.py
-pinned: false
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
fypmc20277423/app.py DELETED
@@ -1,52 +0,0 @@
-from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
-from datasets import load_dataset
-
-# Load the text dataset from the specified file.
-dataset = load_dataset("text", data_files="training.txt")
-
-# Initialize the GPT-2 tokenizer.
-tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
-# Set the tokenizer's pad token to the EOS token.
-tokenizer.pad_token = tokenizer.eos_token
-
-# Define a function to tokenize the dataset and prepare labels.
-def tokenize_function(examples):
-    # Tokenize the text to input_ids, attention_mask
-    tokenized_inputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
-    # Prepare labels: labels are the same as input_ids for language modeling
-    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
-    return tokenized_inputs
-
-# Tokenize the entire dataset.
-tokenized_datasets = dataset.map(tokenize_function, batched=True)
-# Remove the 'text' column as it's no longer needed after tokenization.
-tokenized_datasets = tokenized_datasets.remove_columns(["text"])
-# Set the format of the dataset to PyTorch tensors.
-tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
-
-# Load the GPT-2 model.
-model = GPT2LMHeadModel.from_pretrained("gpt2")
-
-# Define training arguments.
-training_args = TrainingArguments(
-    output_dir="./output",
-    overwrite_output_dir=True,
-    num_train_epochs=3,
-    per_device_train_batch_size=4,
-    save_steps=10_000,
-    save_total_limit=2,
-)
-
-# Initialize the Trainer with the training dataset including labels.
-trainer = Trainer(
-    model=model,
-    args=training_args,
-    train_dataset=tokenized_datasets["train"],
-)
-
-# Start the training process.
-trainer.train()
-
-# Save the fine-tuned model and tokenizer.
-model.save_pretrained("fine_tuned_gpt2_model")
-tokenizer.save_pretrained("fine_tuned_gpt2_model")
fypmc20277423/output/runs/Mar31_21-01-46_LAPTOP-23I4320M/events.out.tfevents.1711915307.LAPTOP-23I4320M.22972.0 DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7f422cc424360ad0b6d022f492ce2d8d45fa8fb317ad09588d920344d1bfdaad
-size 4805
fypmc20277423/requirements.txt DELETED
@@ -1,4 +0,0 @@
-torch
-transformers[torch]
-streamlit
-accelerate>=0.21.0
fypmc20277423/training.txt DELETED
The diff for this file is too large to render. See raw diff.