Upload 4 files

- cleanGutenberg.py +110 -0
- finetune_werther.py +116 -0
- generate_werther_text.py +55 -0
- upload_model.py +36 -0
cleanGutenberg.py
ADDED
@@ -0,0 +1,110 @@
import re
import os

def clean_werther_gutenberg(input_filepath, output_filepath):
    """
    Specifically cleans 'The Sorrows of Young Werther' from Project Gutenberg,
    using precise start/end markers observed in the file.
    """
    print(f"Reading from: {input_filepath}")
    try:
        with open(input_filepath, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except FileNotFoundError:
        print(f"Error: The file '{input_filepath}' was not found.")
        return

    start_line_marker = "Language: English"  # Last line of the Gutenberg header info
    end_line_marker_start = "Professor Michael S. Hart was the originator of the Project"  # Start of the footer info

    start_index = -1
    for i, line in enumerate(lines):
        if start_line_marker in line:
            start_index = i
            break

    if start_index != -1:
        # The novel itself begins a little after "Language: English". In this edition the
        # text opens with "BOOK THE FIRST." and the letters carry date headings such as
        # "May 4.", so search for either marker to find the real start. This is specific
        # to *this* file, but precise for it.
        found_real_start = False
        for i in range(start_index + 1, len(lines)):  # Start searching after the 'Language: English' line
            cleaned_line = lines[i].strip()
            if cleaned_line.startswith("BOOK THE FIRST.") or cleaned_line.startswith("May 4."):
                start_index = i
                found_real_start = True
                print(f"Found actual novel start at line {i+1}.")
                break
        if not found_real_start:
            print("Warning: Could not find specific novel start. Using general approach.")
            # Fallback: use the first non-empty line after the 'Language: English' marker
            for i in range(start_index + 1, len(lines)):
                if lines[i].strip():
                    start_index = i
                    break
    else:
        print("Error: 'Language: English' marker not found. Cannot determine start.")
        return

    end_index = len(lines)
    for i in range(len(lines) - 1, -1, -1):  # Iterate backwards
        if end_line_marker_start in lines[i]:
            end_index = i
            break

    if end_index == len(lines):
        # Fallback: look for "End of the Project Gutenberg EBook"
        for i in range(len(lines) - 1, -1, -1):
            if "End of the Project Gutenberg EBook" in lines[i]:
                end_index = i
                print("Found end marker via 'End of the Project Gutenberg EBook'.")
                break
        if end_index == len(lines):
            # Another common end marker is a line like "Etext by" or similar
            for i in range(len(lines) - 1, -1, -1):
                if "Etext by" in lines[i]:
                    end_index = i
                    print("Found end marker via 'Etext by'.")
                    break

    if end_index == len(lines):
        print("Warning: Could not find clear end marker for boilerplate. Content might include footer.")

    # Extract the relevant lines
    cleaned_lines = lines[start_index:end_index]

    # Join lines and apply final cleaning
    text = "".join(cleaned_lines)
    text = re.sub(r'\n\s*\n', '\n\n', text)  # Collapse runs of blank lines into a single blank line
    text = re.sub(r'[ \t]+', ' ', text).strip()  # Collapse repeated spaces/tabs and strip leading/trailing whitespace

    print(f"Writing cleaned text to: {output_filepath}")
    with open(output_filepath, 'w', encoding='utf-8') as f:
        f.write(text)
    print("Text cleaning complete.")

if __name__ == "__main__":
    current_dir = os.getcwd()
    input_filename = "pg2527.txt"
    output_filename = "werther_cleaned_final.txt"  # Name for the final cleaned file

    input_filepath = os.path.join(current_dir, input_filename)
    output_filepath = os.path.join(current_dir, output_filename)

    clean_werther_gutenberg(input_filepath, output_filepath)
finetune_werther.py
ADDED
@@ -0,0 +1,116 @@
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import Dataset  # Import Dataset directly

# --- 1. Define File Paths and Model Parameters ---
current_dir = os.getcwd()
cleaned_text_file = os.path.join(current_dir, "werther_cleaned_final.txt")
output_dir = os.path.join(current_dir, "fine_tuned_werther_model")
os.makedirs(output_dir, exist_ok=True)
model_max_length = 512

# --- 2. Load Tokenizer and Prepare Dataset (Manual Approach) ---
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # model.resize_token_embeddings(len(tokenizer)) is done AFTER the model is loaded below

print(f"Reading entire text from: {cleaned_text_file}")
try:
    with open(cleaned_text_file, 'r', encoding='utf-8') as f:
        full_text = f.read()
except FileNotFoundError:
    print(f"Error: The file '{cleaned_text_file}' was not found.")
    exit()

print("Tokenizing entire text...")
# Tokenize the entire text in one pass; no `truncation` or `return_overflowing_tokens` here,
# since chunking into fixed-size blocks is handled manually below.
tokenized_output = tokenizer(full_text)
all_input_ids = tokenized_output["input_ids"]

print(f"Total tokens in cleaned text: {len(all_input_ids)}")

# Manually create fixed-size chunks
input_blocks = []
labels_blocks = []

for i in range(0, len(all_input_ids), model_max_length):
    chunk = all_input_ids[i : i + model_max_length]

    # Keep only chunks of exactly model_max_length tokens and drop the remainder.
    # The last partial chunk could be padded instead, but full blocks are simplest
    # for causal language modeling.
    if len(chunk) == model_max_length:
        input_blocks.append(chunk)
        labels_blocks.append(chunk.copy())  # For causal LM the labels equal the input_ids; the model shifts them internally

# Create a Hugging Face Dataset from the manually prepared blocks
print(f"Number of processed blocks for training: {len(input_blocks)}")

# This ensures we have 'input_ids' and 'labels' columns
lm_dataset = Dataset.from_dict({
    "input_ids": input_blocks,
    "labels": labels_blocks
})

# --- 3. Load Model and Data Collator ---
print("Loading DistilGPT2 model...")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")

# If a padding token was added earlier, resize the model's token embeddings here.
# This must be done *after* loading the pre-trained model.
model.resize_token_embeddings(len(tokenizer))

# Data collator for causal language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # False for causal language modeling (like GPT-2)
)

# --- 4. Define Training Arguments ---
print("Setting up training arguments...")
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    save_steps=1000,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=50,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Evaluation is left disabled: enabling evaluation_strategy="steps" would require
    # passing an eval_dataset to the Trainer, which this script does not create.
)

# --- 5. Initialize and Start Trainer ---
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset,  # Pass the directly created dataset
    data_collator=data_collator,
)

print("\nStarting fine-tuning...")
try:
    trainer.train()
    print("Fine-tuning complete!")

    # --- 6. Save the Final Model ---
    print(f"Saving fine-tuned model and tokenizer to {output_dir}...")
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print("Model and tokenizer saved successfully.")

except RuntimeError as e:
    if "out of memory" in str(e):
        print("\nERROR: Out of GPU memory! Try reducing `per_device_train_batch_size` (e.g., to 4 or 2) in the TrainingArguments.")
    else:
        raise e
except Exception as e:
    print(f"\nAn error occurred during training: {e}")
generate_werther_text.py
ADDED
@@ -0,0 +1,55 @@
import os
from transformers import pipeline

model_path = os.path.join(os.getcwd(), "fine_tuned_werther_model")
print(f"Loading fine-tuned model from: {model_path}...")

try:
    generator = pipeline("text-generation", model=model_path)
    print("Model loaded successfully!")

    print("\n--- Generating Text (Adjusted Parameters) ---")

    # Example 1: Lower temperature for more focused, less random output; shorter length
    prompt1 = "How happy I am that I am gone!"
    print(f"\nPrompt: '{prompt1}'")
    generated_text1 = generator(
        prompt1,
        max_new_tokens=60,       # Shorter output
        num_return_sequences=1,
        do_sample=True,
        temperature=0.6,         # Lower temperature
        top_k=50,
        top_p=0.9
    )
    print(f"Generated text: {generated_text1[0]['generated_text']}")

    # Example 2: Try slightly different values
    prompt2 = "My soul yearns for"
    print(f"\nPrompt: '{prompt2}'")
    generated_text2 = generator(
        prompt2,
        max_new_tokens=70,
        num_return_sequences=1,
        do_sample=True,
        temperature=0.7,         # Slightly higher than 0.6, lower than 0.9
        top_k=40,                # Smaller top_k
        top_p=0.85               # Slightly lower top_p
    )
    print(f"Generated text: {generated_text2[0]['generated_text']}")

    # Example 3: Experiment with a very low temperature (more deterministic)
    prompt3 = "The world seemed to me"
    print(f"\nPrompt: '{prompt3}'")
    generated_text3 = generator(
        prompt3,
        max_new_tokens=80,
        num_return_sequences=1,
        do_sample=True,
        temperature=0.5          # Very low temperature
    )
    print(f"Generated text: {generated_text3[0]['generated_text']}")

except Exception as e:
    print(f"\nAn error occurred during text generation: {e}")
    print("Please ensure the 'fine_tuned_werther_model' directory exists and contains the model and tokenizer files.")
upload_model.py
ADDED
@@ -0,0 +1,36 @@
import os
from transformers import AutoModelForCausalLM, AutoTokenizer

# --- 1. Define Paths ---
# Path to your fine-tuned model directory
local_model_path = os.path.join(os.getcwd(), "fine_tuned_werther_model")

# --- 2. Define Hugging Face Hub Repository ID ---
# Replace 'ajsbsd' with your actual Hugging Face username
# Replace 'distilgpt2-werther-finetuned' with your desired model name on the Hub
repo_id = "ajsbsd/distilgpt2-werther-finetuned"

# --- 3. Load Model and Tokenizer from Local Directory ---
print(f"Loading model and tokenizer from local path: {local_model_path}...")
try:
    model = AutoModelForCausalLM.from_pretrained(local_model_path)
    tokenizer = AutoTokenizer.from_pretrained(local_model_path)
    print("Model and tokenizer loaded successfully.")
except Exception as e:
    print(f"Error loading local model/tokenizer: {e}")
    print("Please ensure the 'fine_tuned_werther_model' directory exists and contains the necessary files.")
    exit()

# --- 4. Push to Hugging Face Hub ---
print(f"\nUploading model and tokenizer to Hugging Face Hub: {repo_id}...")
try:
    # Use the push_to_hub method for both the model and the tokenizer.
    # The 'commit_message' will appear in the model's history on the Hub.
    model.push_to_hub(repo_id, commit_message="Fine-tuned DistilGPT2 on The Sorrows of Young Werther")
    tokenizer.push_to_hub(repo_id, commit_message="Tokenizer for Werther fine-tuned model")
    print("Model and tokenizer uploaded successfully!")
    print(f"You can view your model here: https://huggingface.co/{repo_id}")

except Exception as e:
    print(f"An error occurred during upload: {e}")
    print("Ensure you are logged in to Hugging Face Hub (`huggingface-cli login`) and have write access to the repository.")
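For reference, once the push succeeds the fine-tuned model can be loaded straight from the Hub by its repo id instead of from the local directory. A minimal sketch, reusing the pipeline call from generate_werther_text.py and assuming the ajsbsd/distilgpt2-werther-finetuned repository is public:

from transformers import pipeline

# Load the pushed model directly from the Hugging Face Hub (assumes the repo above exists and is public).
generator = pipeline("text-generation", model="ajsbsd/distilgpt2-werther-finetuned")
result = generator(
    "How happy I am that I am gone!",
    max_new_tokens=60,
    do_sample=True,
    temperature=0.7,
)
print(result[0]["generated_text"])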