Upload 11 files

Browse files

Files changed (12) hide show

.gitattributes +1 -0
cached_lm_GPT2Tokenizer_128_truth_v2.text +3 -0
cached_lm_GPT2Tokenizer_128_truth_v2.text.lock +0 -0
config.json +40 -0
generation_config.json +6 -0
merges.txt +0 -0
pytorch_model.bin +3 -0
special_tokens_map.json +23 -0
tokenizer_config.json +33 -0
train.py +139 -0
truth_v2.text +0 -0
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+cached_lm_GPT2Tokenizer_128_truth_v2.text filter=lfs diff=lfs merge=lfs -text

cached_lm_GPT2Tokenizer_128_truth_v2.text ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6858d0d35cfd438b0dff9013864b2dc3f2f201627c3cf257272a2350156cd4df
+size 1102061

cached_lm_GPT2Tokenizer_128_truth_v2.text.lock ADDED Viewed

File without changes

config.json ADDED Viewed

	@@ -0,0 +1,40 @@

+{
+  "_name_or_path": "/Users/migueldeguzman/Desktop/gpt2xl_algos/RLLMv13/layer6/",
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 1600,
+  "n_head": 25,
+  "n_inner": null,
+  "n_layer": 48,
+  "n_positions": 1024,
+  "output_past": true,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 1024
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.33.3",
+  "use_cache": true,
+  "vocab_size": 50257
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "transformers_version": "4.33.3"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f6e0e6766e76b71293fd825b56bf40f82244d27774a532d9e058ef775d22a137
+size 6230624769

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "clean_up_tokenization_spaces": true,
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "errors": "replace",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": null,
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

train.py ADDED Viewed

	@@ -0,0 +1,139 @@

+import os
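+# Allow duplicate OpenMP runtimes to coexist; this works around the "OMP: Error #15 ... already initialized" abort that can occur when PyTorch and another library each load their own OpenMP runtime. It must be set before torch is imported.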
+os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
+import sys
+import torch
+from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments, get_linear_schedule_with_warmup
+class GPT2Assistant:
+    def __init__(self):
+        # Load the GPT-2 tokenizer from the specified path
+        self.tokenizer = GPT2Tokenizer.from_pretrained("/Users/migueldeguzman/Desktop/gpt2xl_algos/RLLMv13/layer6/")
+    def fine_tune(self, answer_file_path, model_output_dir, epochs=1.0):
+        # Load the pre-trained GPT-2 model from the specified path
+        self.model = GPT2LMHeadModel.from_pretrained("/Users/migueldeguzman/Desktop/gpt2xl_algos/RLLMv13/layer6/")
+        # Create a text dataset from the specified file path and tokenizer, with a block size of 128
+        train_dataset = TextDataset(
+            tokenizer=self.tokenizer,
+            file_path=answer_file_path,
+            block_size=128
+        )
+        # Create a data collator for language modeling tasks
+        data_collator = DataCollatorForLanguageModeling(
+            tokenizer=self.tokenizer,
+            mlm=False
+        )
+         # Calculate the total number of training steps based on the dataset length and number of epochs
+        total_steps = len(train_dataset) * epochs
+        # Set the number of warmup steps for the learning rate scheduler
+        warmup_steps = 0.1 * total_steps
+        # Create an Adam optimizer with specified learning rate and weight decay
+        optimizer = torch.optim.Adam(self.model.parameters(), lr=42e-6, weight_decay=0.005)
+        # Create a linear learning rate scheduler with warmup steps
+        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)
+        # Define the training arguments
+        training_args = TrainingArguments(
+            output_dir=model_output_dir,
+            overwrite_output_dir=True,
+            num_train_epochs=epochs,
+            per_device_train_batch_size=4,
+            save_steps=10_000,
+            save_total_limit=2,
+            gradient_accumulation_steps=8,
+            lr_scheduler_type='cosine',
+            warmup_steps=500
+        )
+         # Create a Trainer instance with the specified model, arguments, data collator, dataset, and optimizers
+        trainer = Trainer(
+            model=self.model,
+            args=training_args,
+            data_collator=data_collator,
+            train_dataset=train_dataset,
+            optimizers=(optimizer, scheduler)
+        )
+        # Fine-tune the model using the Trainer
+        trainer.train()
+        # Save the fine-tuned model and tokenizer to the specified output directory
+        self.model.save_pretrained(model_output_dir)
+        self.tokenizer.save_pretrained(model_output_dir)
+    def generate_answer(self, prompt, max_length=1000):
+        # Encode the input prompt using the tokenizer
+        input_ids = self.tokenizer.encode(prompt, return_tensors="pt")
+        # Check if the tokenizer has a pad token and set it if not
+        if self.tokenizer.pad_token_id is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+        # Create an attention mask for the input ids
+        attention_mask = (input_ids != self.tokenizer.pad_token_id).long()
+        # Generate text using the fine-tuned model with the specified parameters
+        output = self.model.generate(
+            input_ids,
+            attention_mask=attention_mask,
+            max_length=max_length,
+            num_return_sequences=1,
+            no_repeat_ngram_size=2,
+            do_sample=True,
+            top_k=50,
+            top_p=0.95,
+            temperature=0.0000000000000000000000000001
+        )
+        # Decode the generated output using the tokenizer, skipping special tokens
+        answer = self.tokenizer.decode(output[0], skip_special_tokens=True)
+        # Return the generated answer, excluding the original prompt
+        return answer[len(prompt):]
+    def query(self, prompt):
+        # Generate an answer for the given prompt
+        generated_answer = self.generate_answer(prompt)
+        print(generated_answer)
+        return generated_answer
+def main():
+    # Set the file path for the text file to fine-tune on
+    text_file_path = "/Users/migueldeguzman/Desktop/gpt2xl_algos/RLLMv13/layer7/truth_v2.text"
+    # Set the output directory path for the fine-tuned model
+    model_output_dir = "/Users/migueldeguzman/Desktop/gpt2xl_algos/RLLMv13/layer7/"
+    assistant = GPT2Assistant()
+    # Prompt the user to choose whether to fine-tune a new model or load an existing one
+    choice = input("Do you want to fine-tune a new model (n) or load an existing one (e)? (n/e): ")
+    if choice.lower() == "n":
+        # Fine-tune the model if the user chooses 'n'
+        print("Fine-tuning the model...")
+        assistant.fine_tune(text_file_path, model_output_dir)
+        print("Model fine-tuning complete.")
+    elif choice.lower() == "e":
+        print("Loading the existing model...")
+        # Load the existing fine-tuned model if the user chooses 'e'
+        assistant.model = GPT2LMHeadModel.from_pretrained(model_output_dir)
+        print("Existing model loaded.")
+    else:
+        print("Invalid choice. Exiting the program.")
+        sys.exit()
+    while True:
+        # Prompt the user for a question# Prompt the user for a question
+        prompt = input("Enter your question (or type 'exit' to stop): ")
+        if prompt.lower() == "exit":
+            break
+        print("Answering in progress...")
+        # Generate an answer for the user's prompt
+        generated_answer = assistant.query(prompt)
+        print("\n")
+if __name__ == "__main__":
+    main()

truth_v2.text ADDED Viewed

The diff for this file is too large to render. See raw diff

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff