Upload 4 files

- cleanGutenberg.py +110 -0
- finetune_werther.py +116 -0
- generate_werther_text.py +55 -0
- upload_model.py +36 -0
cleanGutenberg.py
ADDED
@@ -0,0 +1,110 @@
import re
import os

def clean_werther_gutenberg(input_filepath, output_filepath):
    """
    Specifically cleans 'The Sorrows of Young Werther' from Project Gutenberg,
    using precise start/end markers observed in the file.
    """
    print(f"Reading from: {input_filepath}")
    try:
        with open(input_filepath, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except FileNotFoundError:
        print(f"Error: The file '{input_filepath}' was not found.")
        return

    start_line_marker = "Language: English"  # Last line of the Gutenberg header info
    end_line_marker_start = "Professor Michael S. Hart was the originator of the Project"  # Start of the footer info

    start_index = -1
    for i, line in enumerate(lines):
        if start_line_marker in line:
            start_index = i
            break

    if start_index != -1:
        # The novel itself begins a little after "Language: English". In this edition the
        # text opens with "BOOK THE FIRST." and the letters carry date headings such as
        # "May 4.", so search for either marker to find the real start. This is specific
        # to *this* file, but precise for it.
        found_real_start = False
        for i in range(start_index + 1, len(lines)):  # Start searching after the 'Language: English' line
            cleaned_line = lines[i].strip()
            if cleaned_line.startswith("BOOK THE FIRST.") or cleaned_line.startswith("May 4."):
                start_index = i
                found_real_start = True
                print(f"Found actual novel start at line {i+1}.")
                break
        if not found_real_start:
            print("Warning: Could not find specific novel start. Using general approach.")
            # Fallback: use the first non-empty line after the 'Language: English' marker
            for i in range(start_index + 1, len(lines)):
                if lines[i].strip():
                    start_index = i
                    break
    else:
        print("Error: 'Language: English' marker not found. Cannot determine start.")
        return

    end_index = len(lines)
    for i in range(len(lines) - 1, -1, -1):  # Iterate backwards
        if end_line_marker_start in lines[i]:
            end_index = i
            break

    if end_index == len(lines):
        # Fallback: look for "End of the Project Gutenberg EBook"
        for i in range(len(lines) - 1, -1, -1):
            if "End of the Project Gutenberg EBook" in lines[i]:
                end_index = i
                print("Found end marker via 'End of the Project Gutenberg EBook'.")
                break
        if end_index == len(lines):
            # Another common end marker is a line like "Etext by" or similar
            for i in range(len(lines) - 1, -1, -1):
                if "Etext by" in lines[i]:
                    end_index = i
                    print("Found end marker via 'Etext by'.")
                    break

    if end_index == len(lines):
        print("Warning: Could not find clear end marker for boilerplate. Content might include footer.")

    # Extract the relevant lines
    cleaned_lines = lines[start_index:end_index]

    # Join lines and apply final cleaning
    text = "".join(cleaned_lines)
    text = re.sub(r'\n\s*\n', '\n\n', text)  # Collapse runs of blank lines into a single blank line
    text = re.sub(r'[ \t]+', ' ', text).strip()  # Collapse repeated spaces/tabs and strip leading/trailing whitespace

    print(f"Writing cleaned text to: {output_filepath}")
    with open(output_filepath, 'w', encoding='utf-8') as f:
        f.write(text)
    print("Text cleaning complete.")

if __name__ == "__main__":
    current_dir = os.getcwd()
    input_filename = "pg2527.txt"
    output_filename = "werther_cleaned_final.txt"  # Name for the final cleaned file

    input_filepath = os.path.join(current_dir, input_filename)
    output_filepath = os.path.join(current_dir, output_filename)

    clean_werther_gutenberg(input_filepath, output_filepath)
finetune_werther.py
ADDED
@@ -0,0 +1,116 @@
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import Dataset  # Import Dataset directly

# --- 1. Define File Paths and Model Parameters ---
current_dir = os.getcwd()
cleaned_text_file = os.path.join(current_dir, "werther_cleaned_final.txt")
output_dir = os.path.join(current_dir, "fine_tuned_werther_model")
os.makedirs(output_dir, exist_ok=True)
model_max_length = 512

# --- 2. Load Tokenizer and Prepare Dataset (Manual Approach) ---
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # model.resize_token_embeddings(len(tokenizer)) is done AFTER the model is loaded below

print(f"Reading entire text from: {cleaned_text_file}")
try:
    with open(cleaned_text_file, 'r', encoding='utf-8') as f:
        full_text = f.read()
except FileNotFoundError:
    print(f"Error: The file '{cleaned_text_file}' was not found.")
    exit()

print("Tokenizing entire text...")
# Tokenize the entire text in one pass; no `truncation` or `return_overflowing_tokens` here,
# since chunking into fixed-size blocks is handled manually below.
tokenized_output = tokenizer(full_text)
all_input_ids = tokenized_output["input_ids"]

print(f"Total tokens in cleaned text: {len(all_input_ids)}")

# Manually create fixed-size chunks
input_blocks = []
labels_blocks = []

for i in range(0, len(all_input_ids), model_max_length):
    chunk = all_input_ids[i : i + model_max_length]

    # Keep only chunks of exactly model_max_length tokens and drop the remainder.
    # The last partial chunk could be padded instead, but full blocks are simplest
    # for causal language modeling.
    if len(chunk) == model_max_length:
        input_blocks.append(chunk)
        labels_blocks.append(chunk.copy())  # For causal LM the labels equal the input_ids; the model shifts them internally

# Create a Hugging Face Dataset from the manually prepared blocks
print(f"Number of processed blocks for training: {len(input_blocks)}")

# This ensures we have 'input_ids' and 'labels' columns
lm_dataset = Dataset.from_dict({
    "input_ids": input_blocks,
    "labels": labels_blocks
})

# --- 3. Load Model and Data Collator ---
print("Loading DistilGPT2 model...")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")

# If a padding token was added earlier, resize the model's token embeddings here.
# This must be done *after* loading the pre-trained model.
model.resize_token_embeddings(len(tokenizer))

# Data collator for causal language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # False for causal language modeling (like GPT-2)
)

# --- 4. Define Training Arguments ---
print("Setting up training arguments...")
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    save_steps=1000,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=50,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Evaluation is left disabled: enabling evaluation_strategy="steps" would require
    # passing an eval_dataset to the Trainer, which this script does not create.
)

# --- 5. Initialize and Start Trainer ---
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset,  # Pass the directly created dataset
    data_collator=data_collator,
)

print("\nStarting fine-tuning...")
try:
    trainer.train()
    print("Fine-tuning complete!")

    # --- 6. Save the Final Model ---
    print(f"Saving fine-tuned model and tokenizer to {output_dir}...")
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print("Model and tokenizer saved successfully.")

except RuntimeError as e:
    if "out of memory" in str(e):
        print("\nERROR: Out of GPU memory! Try reducing `per_device_train_batch_size` (e.g., to 4 or 2) in the TrainingArguments.")
    else:
        raise e
except Exception as e:
    print(f"\nAn error occurred during training: {e}")
generate_werther_text.py
ADDED
@@ -0,0 +1,55 @@
import os
from transformers import pipeline

model_path = os.path.join(os.getcwd(), "fine_tuned_werther_model")
print(f"Loading fine-tuned model from: {model_path}...")

try:
    generator = pipeline("text-generation", model=model_path)
    print("Model loaded successfully!")

    print("\n--- Generating Text (Adjusted Parameters) ---")

    # Example 1: Lower temperature for more focused, less random output; shorter length
    prompt1 = "How happy I am that I am gone!"
    print(f"\nPrompt: '{prompt1}'")
    generated_text1 = generator(
        prompt1,
        max_new_tokens=60,       # Shorter output
        num_return_sequences=1,
        do_sample=True,
        temperature=0.6,         # Lower temperature
        top_k=50,
        top_p=0.9
    )
    print(f"Generated text: {generated_text1[0]['generated_text']}")

    # Example 2: Try slightly different values
    prompt2 = "My soul yearns for"
    print(f"\nPrompt: '{prompt2}'")
    generated_text2 = generator(
        prompt2,
        max_new_tokens=70,
        num_return_sequences=1,
        do_sample=True,
        temperature=0.7,         # Slightly higher than 0.6, lower than 0.9
        top_k=40,                # Smaller top_k
        top_p=0.85               # Slightly lower top_p
    )
    print(f"Generated text: {generated_text2[0]['generated_text']}")

    # Example 3: Experiment with a very low temperature (more deterministic)
    prompt3 = "The world seemed to me"
    print(f"\nPrompt: '{prompt3}'")
    generated_text3 = generator(
        prompt3,
        max_new_tokens=80,
        num_return_sequences=1,
        do_sample=True,
        temperature=0.5          # Very low temperature
    )
    print(f"Generated text: {generated_text3[0]['generated_text']}")

except Exception as e:
    print(f"\nAn error occurred during text generation: {e}")
    print("Please ensure the 'fine_tuned_werther_model' directory exists and contains the model and tokenizer files.")
upload_model.py
ADDED
@@ -0,0 +1,36 @@
import os
from transformers import AutoModelForCausalLM, AutoTokenizer

# --- 1. Define Paths ---
# Path to your fine-tuned model directory
local_model_path = os.path.join(os.getcwd(), "fine_tuned_werther_model")

# --- 2. Define Hugging Face Hub Repository ID ---
# Replace 'ajsbsd' with your actual Hugging Face username
# Replace 'distilgpt2-werther-finetuned' with your desired model name on the Hub
repo_id = "ajsbsd/distilgpt2-werther-finetuned"

# --- 3. Load Model and Tokenizer from Local Directory ---
print(f"Loading model and tokenizer from local path: {local_model_path}...")
try:
    model = AutoModelForCausalLM.from_pretrained(local_model_path)
    tokenizer = AutoTokenizer.from_pretrained(local_model_path)
    print("Model and tokenizer loaded successfully.")
except Exception as e:
    print(f"Error loading local model/tokenizer: {e}")
    print("Please ensure the 'fine_tuned_werther_model' directory exists and contains the necessary files.")
    exit()

# --- 4. Push to Hugging Face Hub ---
print(f"\nUploading model and tokenizer to Hugging Face Hub: {repo_id}...")
try:
    # Use the push_to_hub method for both the model and the tokenizer.
    # The 'commit_message' will appear in the model's history on the Hub.
    model.push_to_hub(repo_id, commit_message="Fine-tuned DistilGPT2 on The Sorrows of Young Werther")
    tokenizer.push_to_hub(repo_id, commit_message="Tokenizer for Werther fine-tuned model")
    print("Model and tokenizer uploaded successfully!")
    print(f"You can view your model here: https://huggingface.co/{repo_id}")

except Exception as e:
    print(f"An error occurred during upload: {e}")
    print("Ensure you are logged in to Hugging Face Hub (`huggingface-cli login`) and have write access to the repository.")
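For reference, once the push succeeds the fine-tuned model can be loaded straight from the Hub by its repo id instead of from the local directory. A minimal sketch, reusing the pipeline call from generate_werther_text.py and assuming the ajsbsd/distilgpt2-werther-finetuned repository is public:

from transformers import pipeline

# Load the pushed model directly from the Hugging Face Hub (assumes the repo above exists and is public).
generator = pipeline("text-generation", model="ajsbsd/distilgpt2-werther-finetuned")
result = generator(
    "How happy I am that I am gone!",
    max_new_tokens=60,
    do_sample=True,
    temperature=0.7,
)
print(result[0]["generated_text"])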