Corrigan123 committed
Commit c6b60a2
• 1 Parent(s): d9f2bb5

Updated training script and data

app.py CHANGED
@@ -2,7 +2,8 @@ from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
 from datasets import load_dataset
 
 # Load the text dataset from the specified file
-dataset = load_dataset("text", data_files="training.txt")
+dataset = load_dataset("text", data_files="C:\\Users\\Mike Corrigan\\Documents\\DMD Year 4 Sem 1\\FYP INFO\\fyp hugging face\\fypmc20277423\\training.txt")
+
 
 # Initialize the GPT-2 tokenizer
 tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
@@ -37,7 +38,7 @@ training_args = TrainingArguments(
     overwrite_output_dir=True,
     num_train_epochs=2,  # Optionally reduced for quicker iteration
     per_device_train_batch_size=2,  # Reduced from 4 to 2
-    gradient_accumulation_steps=8,  # Added to compensate for smaller batch size
+    gradient_accumulation_steps=16,  # Added to compensate for smaller batch size
     save_steps=10_000,
     save_total_limit=2,
 )
@@ -52,6 +53,6 @@ trainer = Trainer(
 # Start the training process
 trainer.train()
 
-# Save the fine-tuned model and tokenizer
-model.save_pretrained("fine_tuned_gpt2_model")
-tokenizer.save_pretrained("fine_tuned_gpt2_model")
+model.save_pretrained('C:\\Users\\Mike Corrigan\\Documents\\DMD Year 4 Sem 1\\FYP INFO\\fyp hugging face\\fypmc20277423\\output\\fine_tuned_gpt2_model')
+tokenizer.save_pretrained('C:\\Users\\Mike Corrigan\\Documents\\DMD Year 4 Sem 1\\FYP INFO\\fyp hugging face\\fypmc20277423\\output\\fine_tuned_gpt2_model')
+
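With per_device_train_batch_size=2 and gradient_accumulation_steps=16, the effective batch size is now 2 × 16 = 32 samples per optimizer step (up from 2 × 8 = 16 before this commit). The data_files and save_pretrained arguments are hard-coded absolute Windows paths specific to one machine; the sketch below (not part of this commit) shows one way to resolve training.txt relative to app.py instead, assuming the file is committed alongside the script:

from pathlib import Path

from datasets import load_dataset

# Resolve training.txt relative to this script rather than hard-coding an
# absolute Windows path (assumes training.txt sits next to app.py).
data_file = Path(__file__).resolve().parent / "training.txt"
dataset = load_dataset("text", data_files=str(data_file))

# Effective batch size per optimizer step with the settings in this commit:
per_device_train_batch_size = 2
gradient_accumulation_steps = 16
print(per_device_train_batch_size * gradient_accumulation_steps)  # 32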
fine_tuned_gpt2_model/config.json DELETED
@@ -1,39 +0,0 @@
-{
-  "_name_or_path": "gpt2",
-  "activation_function": "gelu_new",
-  "architectures": [
-    "GPT2LMHeadModel"
-  ],
-  "attn_pdrop": 0.1,
-  "bos_token_id": 50256,
-  "embd_pdrop": 0.1,
-  "eos_token_id": 50256,
-  "initializer_range": 0.02,
-  "layer_norm_epsilon": 1e-05,
-  "model_type": "gpt2",
-  "n_ctx": 1024,
-  "n_embd": 768,
-  "n_head": 12,
-  "n_inner": null,
-  "n_layer": 12,
-  "n_positions": 1024,
-  "reorder_and_upcast_attn": false,
-  "resid_pdrop": 0.1,
-  "scale_attn_by_inverse_layer_idx": false,
-  "scale_attn_weights": true,
-  "summary_activation": null,
-  "summary_first_dropout": 0.1,
-  "summary_proj_to_labels": true,
-  "summary_type": "cls_index",
-  "summary_use_proj": true,
-  "task_specific_params": {
-    "text-generation": {
-      "do_sample": true,
-      "max_length": 50
-    }
-  },
-  "torch_dtype": "float32",
-  "transformers_version": "4.39.2",
-  "use_cache": true,
-  "vocab_size": 50257
-}
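The deleted config is the stock GPT-2 "small" configuration: 12 layers, 12 attention heads, 768-dimensional embeddings, a 1024-token context window, and a 50257-token vocabulary. As a rough sanity check (plain arithmetic, not code from the repository), those values work out to about 124M parameters, which lines up with the size of the model.safetensors pointer deleted further down:

# Back-of-the-envelope parameter count from the config values above
# (not transformers API code; just arithmetic over the config fields).
n_layer, n_embd, n_ctx, vocab_size = 12, 768, 1024, 50257

embeddings = vocab_size * n_embd + n_ctx * n_embd      # token + position embeddings
per_layer = (
    (3 * n_embd * n_embd + 3 * n_embd)     # attention q/k/v projection (c_attn)
    + (n_embd * n_embd + n_embd)           # attention output projection (c_proj)
    + (n_embd * 4 * n_embd + 4 * n_embd)   # MLP up-projection (c_fc)
    + (4 * n_embd * n_embd + n_embd)       # MLP down-projection (c_proj)
    + (4 * n_embd)                         # two LayerNorms (weight and bias each)
)
total = embeddings + n_layer * per_layer + 2 * n_embd  # plus the final LayerNorm
print(f"{total:,} parameters, ~{total * 4 / 1e6:.0f} MB as float32")
# 124,439,808 parameters, ~498 MB as float32, consistent with the
# 497,774,208-byte model.safetensors LFS pointer deleted below.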
 
fine_tuned_gpt2_model/generation_config.json DELETED
@@ -1,6 +0,0 @@
-{
-  "_from_model_config": true,
-  "bos_token_id": 50256,
-  "eos_token_id": 50256,
-  "transformers_version": "4.39.2"
-}
 
fine_tuned_gpt2_model/merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
fine_tuned_gpt2_model/model.safetensors DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ff5c53b27a7bc018d4cd08d406c1d9cfa150b4a3fac682703ff231d8408bc205
-size 497774208
 
fine_tuned_gpt2_model/special_tokens_map.json DELETED
@@ -1,24 +0,0 @@
-{
-  "bos_token": {
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "eos_token": {
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "pad_token": "<|endoftext|>",
-  "unk_token": {
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  }
-}
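Note that pad_token is mapped to <|endoftext|>, the same token used for bos/eos/unk; this mirrors the training script's tokenizer.pad_token = tokenizer.eos_token, since GPT-2 ships without a dedicated padding token.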
 
fine_tuned_gpt2_model/tokenizer_config.json DELETED
@@ -1,22 +0,0 @@
-{
-  "add_bos_token": false,
-  "add_prefix_space": false,
-  "added_tokens_decoder": {
-    "50256": {
-      "content": "<|endoftext|>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    }
-  },
-  "bos_token": "<|endoftext|>",
-  "clean_up_tokenization_spaces": true,
-  "eos_token": "<|endoftext|>",
-  "errors": "replace",
-  "model_max_length": 1024,
-  "pad_token": "<|endoftext|>",
-  "tokenizer_class": "GPT2Tokenizer",
-  "unk_token": "<|endoftext|>"
-}
 
fine_tuned_gpt2_model/vocab.json DELETED
The diff for this file is too large to render. See raw diff
 
fypmc20277423/.gitattributes DELETED
@@ -1,35 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
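This is the stock .gitattributes that Hugging Face generates for new repositories: it routes large binary formats (checkpoints, archives, safetensors weights, TensorBoard *tfevents* logs) through Git LFS rather than plain Git, which is why model.safetensors and the run log in this listing appear as LFS pointer files instead of their actual contents.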
 
fypmc20277423/README.md DELETED
@@ -1,12 +0,0 @@
----
-title: Fypmc20277423
-emoji: 📚
-colorFrom: indigo
-colorTo: gray
-sdk: streamlit
-sdk_version: 1.32.0
-app_file: app.py
-pinned: false
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
fypmc20277423/app.py DELETED
@@ -1,52 +0,0 @@
-from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
-from datasets import load_dataset
-
-# Load the text dataset from the specified file.
-dataset = load_dataset("text", data_files="training.txt")
-
-# Initialize the GPT-2 tokenizer.
-tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
-# Set the tokenizer's pad token to the EOS token.
-tokenizer.pad_token = tokenizer.eos_token
-
-# Define a function to tokenize the dataset and prepare labels.
-def tokenize_function(examples):
-    # Tokenize the text to input_ids, attention_mask
-    tokenized_inputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
-    # Prepare labels: labels are the same as input_ids for language modeling
-    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
-    return tokenized_inputs
-
-# Tokenize the entire dataset.
-tokenized_datasets = dataset.map(tokenize_function, batched=True)
-# Remove the 'text' column as it's no longer needed after tokenization.
-tokenized_datasets = tokenized_datasets.remove_columns(["text"])
-# Set the format of the dataset to PyTorch tensors.
-tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
-
-# Load the GPT-2 model.
-model = GPT2LMHeadModel.from_pretrained("gpt2")
-
-# Define training arguments.
-training_args = TrainingArguments(
-    output_dir="./output",
-    overwrite_output_dir=True,
-    num_train_epochs=3,
-    per_device_train_batch_size=4,
-    save_steps=10_000,
-    save_total_limit=2,
-)
-
-# Initialize the Trainer with the training dataset including labels.
-trainer = Trainer(
-    model=model,
-    args=training_args,
-    train_dataset=tokenized_datasets["train"],
-)
-
-# Start the training process.
-trainer.train()
-
-# Save the fine-tuned model and tokenizer.
-model.save_pretrained("fine_tuned_gpt2_model")
-tokenizer.save_pretrained("fine_tuned_gpt2_model")
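For completeness, a minimal sketch (not part of the repository) of how the directory written by save_pretrained can later be reloaded for inference; model_dir is a placeholder to point at wherever the fine-tuned files ended up, and the sampling settings echo the text-generation defaults in the deleted config.json above:

from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Placeholder path: point this at the directory save_pretrained wrote to.
model_dir = "output/fine_tuned_gpt2_model"

tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
model = GPT2LMHeadModel.from_pretrained(model_dir)

prompt = "Once upon a time"
inputs = tokenizer(prompt, return_tensors="pt")
output_ids = model.generate(
    **inputs,
    max_length=50,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))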
 
fypmc20277423/output/runs/Mar31_21-01-46_LAPTOP-23I4320M/events.out.tfevents.1711915307.LAPTOP-23I4320M.22972.0 DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7f422cc424360ad0b6d022f492ce2d8d45fa8fb317ad09588d920344d1bfdaad
-size 4805
 
fypmc20277423/requirements.txt DELETED
@@ -1,4 +0,0 @@
-torch
-transformers[torch]
-streamlit
-accelerate>=0.21.0
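A note on the deleted pins: torch and transformers[torch] drive the fine-tuning script, streamlit is the Space's declared SDK, and accelerate>=0.21.0 matches the minimum version that transformers' Trainer needs for PyTorch training in this era of releases (4.39.2 per the deleted config.json above).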
 
fypmc20277423/training.txt DELETED
The diff for this file is too large to render. See raw diff