ajsbsd committed
Commit 3dacdd0 · verified · 1 parent: 9b3942d

Upload 4 files

cleanGutenberg.py ADDED
@@ -0,0 +1,110 @@
+ import re
+ import os
+
+ def clean_werther_gutenberg(input_filepath, output_filepath):
+     """
+     Specifically cleans 'The Sorrows of Young Werther' from Project Gutenberg,
+     using precise start/end markers observed in the file.
+     """
+     print(f"Reading from: {input_filepath}")
+     try:
+         with open(input_filepath, 'r', encoding='utf-8') as f:
+             lines = f.readlines()
+     except FileNotFoundError:
+         print(f"Error: The file '{input_filepath}' was not found.")
+         return
+
+     start_line_marker = "Language: English"  # last line of the Gutenberg header block
+     end_line_marker_start = "Professor Michael S. Hart was the originator of the Project"  # first line of the footer block
+
+     start_index = -1
+     for i, line in enumerate(lines):
+         if start_line_marker in line:
+             start_index = i
+             break
+
+     # The novel itself begins a number of lines after "Language: English". In this
+     # eBook the text opens with "BOOK THE FIRST." and the letters carry date
+     # headings ("May 4.", "May 10.", ...), so search for either of those to find
+     # the real start. This is specific to this file, but precise for it.
+     if start_index != -1:
+         found_real_start = False
+         for i in range(start_index + 1, len(lines)):  # search after the 'Language: English' line
+             cleaned_line = lines[i].strip()
+             if cleaned_line.startswith("BOOK THE FIRST.") or cleaned_line.startswith("May 4."):
+                 start_index = i
+                 found_real_start = True
+                 print(f"Found actual novel start at line {i+1}.")
+                 break
+         if not found_real_start:
+             print("Warning: Could not find specific novel start. Using general approach.")
+             # Fallback: use the first non-empty line after the 'Language: English' marker
+             for i in range(start_index + 1, len(lines)):
+                 if lines[i].strip():
+                     start_index = i
+                     break
+     else:
+         print("Error: 'Language: English' marker not found. Cannot determine start.")
+         return
+
+     end_index = len(lines)
+     for i in range(len(lines) - 1, -1, -1):  # iterate backwards
+         if end_line_marker_start in lines[i]:
+             end_index = i
+             break
+
+     if end_index == len(lines):
+         # Fallback: look for "End of the Project Gutenberg EBook"
+         for i in range(len(lines) - 1, -1, -1):
+             if "End of the Project Gutenberg EBook" in lines[i]:
+                 end_index = i
+                 print("Found end marker via 'End of the Project Gutenberg EBook'.")
+                 break
+         if end_index == len(lines):
+             # Another common end marker is a line like "Etext by" or similar
+             for i in range(len(lines) - 1, -1, -1):
+                 if "Etext by" in lines[i]:
+                     end_index = i
+                     print("Found end marker via 'Etext by'.")
+                     break
+
+     if end_index == len(lines):
+         print("Warning: Could not find a clear end marker for the boilerplate. Output may include the footer.")
+
+     # Extract the relevant lines
+     cleaned_lines = lines[start_index:end_index]
+
+     # Join lines and apply final whitespace normalization
+     text = "".join(cleaned_lines)
+     text = re.sub(r'\n\s*\n', '\n\n', text)  # collapse runs of blank lines into a single blank line
+     text = re.sub(r'[ \t]+', ' ', text).strip()  # collapse repeated spaces/tabs and strip leading/trailing whitespace
+
+     print(f"Writing cleaned text to: {output_filepath}")
+     with open(output_filepath, 'w', encoding='utf-8') as f:
+         f.write(text)
+     print("Text cleaning complete.")
+
+ if __name__ == "__main__":
+     current_dir = os.getcwd()
+     input_filename = "pg2527.txt"
+     output_filename = "werther_cleaned_final.txt"  # name of the final cleaned file
+
+     input_filepath = os.path.join(current_dir, input_filename)
+     output_filepath = os.path.join(current_dir, output_filename)
+
+     clean_werther_gutenberg(input_filepath, output_filepath)
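
The final two re.sub passes do most of the formatting work, and their effect is easy to check in isolation. A minimal sketch of the same two substitutions; the sample string below is invented for illustration (only the first sentence is from the novel):

import re

sample = "May 4.\n\n\nHow  happy I am\tthat I am gone!\n\n\nBest of friends..."

# Collapse runs of blank lines to a single blank line, as in clean_werther_gutenberg()
collapsed = re.sub(r'\n\s*\n', '\n\n', sample)
# Collapse repeated spaces/tabs to one space and strip leading/trailing whitespace
normalized = re.sub(r'[ \t]+', ' ', collapsed).strip()

print(normalized)
# May 4.
#
# How happy I am that I am gone!
#
# Best of friends...
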
finetune_werther.py ADDED
@@ -0,0 +1,116 @@
+ import os
+ from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
+ from datasets import Dataset  # Import Dataset directly
+
+ # --- 1. Define File Paths and Model Parameters ---
+ current_dir = os.getcwd()
+ cleaned_text_file = os.path.join(current_dir, "werther_cleaned_final.txt")
+ output_dir = os.path.join(current_dir, "fine_tuned_werther_model")
+ os.makedirs(output_dir, exist_ok=True)
+ model_max_length = 512
+
+ # --- 2. Load Tokenizer and Prepare Dataset (Manual Approach) ---
+ print("Loading tokenizer...")
+ tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+ if tokenizer.pad_token is None:
+     tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+     # model.resize_token_embeddings(len(tokenizer)) is done AFTER the model is loaded below
+
+ print(f"Reading entire text from: {cleaned_text_file}")
+ try:
+     with open(cleaned_text_file, 'r', encoding='utf-8') as f:
+         full_text = f.read()
+ except FileNotFoundError:
+     print(f"Error: The file '{cleaned_text_file}' was not found.")
+     exit()
+
+ print("Tokenizing entire text...")
+ # Tokenize the whole text in one pass (no truncation or return_overflowing_tokens);
+ # chunking into fixed-size blocks is handled manually below. The tokenizer will warn
+ # that the sequence exceeds the model's maximum length -- that is expected here.
+ tokenized_output = tokenizer(full_text)
+ all_input_ids = tokenized_output["input_ids"]
+
+ print(f"Total tokens in cleaned text: {len(all_input_ids)}")
+
+ # Manually create fixed-size chunks
+ input_blocks = []
+ labels_blocks = []
+
+ for i in range(0, len(all_input_ids), model_max_length):
+     chunk = all_input_ids[i : i + model_max_length]
+
+     # Keep only full blocks of exactly model_max_length tokens and drop the short
+     # remainder at the end. (The last chunk could be padded instead, but full
+     # blocks are simpler for causal language modeling.)
+     if len(chunk) == model_max_length:
+         input_blocks.append(chunk)
+         labels_blocks.append(chunk.copy())  # labels == input_ids; the model shifts them internally for causal LM
+
+ # Create a Hugging Face Dataset from the manually prepared blocks
+ print(f"Number of processed blocks for training: {len(input_blocks)}")
+
+ # This ensures we have 'input_ids' and 'labels' columns
+ lm_dataset = Dataset.from_dict({
+     "input_ids": input_blocks,
+     "labels": labels_blocks
+ })
+
+ # --- 3. Load Model and Data Collator ---
+ print("Loading DistilGPT2 model...")
+ model = AutoModelForCausalLM.from_pretrained("distilgpt2")
+
+ # Because a padding token was added above, resize the model's token embeddings.
+ # This must be done *after* loading the pre-trained model.
+ model.resize_token_embeddings(len(tokenizer))
+
+ # Data collator for causal language modeling
+ data_collator = DataCollatorForLanguageModeling(
+     tokenizer=tokenizer,
+     mlm=False,  # False for causal language modeling (GPT-2 style)
+ )
+
+ # --- 4. Define Training Arguments ---
+ print("Setting up training arguments...")
+ training_args = TrainingArguments(
+     output_dir=output_dir,
+     overwrite_output_dir=True,
+     num_train_epochs=5,
+     per_device_train_batch_size=8,
+     save_steps=1000,
+     save_total_limit=2,
+     logging_dir='./logs',
+     logging_steps=50,
+     learning_rate=2e-5,
+     weight_decay=0.01,
+     # No eval split is built above, so evaluation stays disabled; see the note
+     # after this script for one way to hold out an eval set.
+ )
+
+ # --- 5. Initialize and Start Trainer ---
+ print("Initializing Trainer...")
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=lm_dataset,  # pass the directly created dataset
+     data_collator=data_collator,
+ )
+
+ print("\nStarting fine-tuning...")
+ try:
+     trainer.train()
+     print("Fine-tuning complete!")
+
+     # --- 6. Save the Final Model ---
+     print(f"Saving fine-tuned model and tokenizer to {output_dir}...")
+     model.save_pretrained(output_dir)
+     tokenizer.save_pretrained(output_dir)
+     print("Model and tokenizer saved successfully.")
+
+ except RuntimeError as e:
+     if "out of memory" in str(e):
+         print("\nERROR: Out of GPU memory! Try reducing `per_device_train_batch_size` (e.g., to 4 or 2) in the TrainingArguments.")
+     else:
+         raise e
+ except Exception as e:
+     print(f"\nAn error occurred during training: {e}")
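
The TrainingArguments above leave evaluation disabled because the script never builds an eval split; asking Trainer for step-wise evaluation without an eval_dataset fails at the first evaluation step. If evaluation is wanted, one option is to hold out a fraction of the blocks with datasets' train_test_split. This is a sketch, not part of the uploaded script: it reuses lm_dataset, model, data_collator and output_dir from above, and the split ratio and step counts are arbitrary.

# Hold out 10% of the 512-token blocks for evaluation (illustrative values).
split = lm_dataset.train_test_split(test_size=0.1, seed=42)

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    logging_steps=50,
    evaluation_strategy="steps",  # renamed to `eval_strategy` in newer transformers releases
    eval_steps=200,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split["train"],
    eval_dataset=split["test"],   # gives the step-wise evaluation something to run on
    data_collator=data_collator,
)
trainer.train()
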
generate_werther_text.py ADDED
@@ -0,0 +1,55 @@
+ import os
+ from transformers import pipeline
+
+ model_path = os.path.join(os.getcwd(), "fine_tuned_werther_model")
+ print(f"Loading fine-tuned model from: {model_path}...")
+
+ try:
+     generator = pipeline("text-generation", model=model_path)
+     print("Model loaded successfully!")
+
+     print("\n--- Generating Text (Adjusted Parameters) ---")
+
+     # Example 1: Lower temperature for less repetition, shorter length
+     prompt1 = "How happy I am that I am gone!"
+     print(f"\nPrompt: '{prompt1}'")
+     generated_text1 = generator(
+         prompt1,
+         max_new_tokens=60,  # Shorter output
+         num_return_sequences=1,
+         do_sample=True,
+         temperature=0.6,  # Lower temperature
+         top_k=50,
+         top_p=0.9
+     )
+     print(f"Generated text: {generated_text1[0]['generated_text']}")
+
+     # Example 2: Try slightly different values
+     prompt2 = "My soul yearns for"
+     print(f"\nPrompt: '{prompt2}'")
+     generated_text2 = generator(
+         prompt2,
+         max_new_tokens=70,
+         num_return_sequences=1,
+         do_sample=True,
+         temperature=0.7,  # Slightly higher than 0.6, lower than 0.9
+         top_k=40,  # Smaller top_k
+         top_p=0.85  # Slightly lower top_p
+     )
+     print(f"Generated text: {generated_text2[0]['generated_text']}")
+
+     # Example 3: Experiment with a very low temperature (more deterministic)
+     prompt3 = "The world seemed to me"
+     print(f"\nPrompt: '{prompt3}'")
+     generated_text3 = generator(
+         prompt3,
+         max_new_tokens=80,
+         num_return_sequences=1,
+         do_sample=True,
+         temperature=0.5  # Very low temperature
+     )
+     print(f"Generated text: {generated_text3[0]['generated_text']}")
+
+ except Exception as e:
+     print(f"\nAn error occurred during text generation: {e}")
+     print("Please ensure the 'fine_tuned_werther_model' directory exists and contains the model and tokenizer files.")
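
The prompts above control repetition only through temperature, top_k and top_p. transformers' generation also supports repetition_penalty and no_repeat_ngram_size, and the pipeline passes them straight through to generate(). A small sketch with illustrative, untuned values:

import os
from transformers import pipeline

generator = pipeline("text-generation", model=os.path.join(os.getcwd(), "fine_tuned_werther_model"))

output = generator(
    "How happy I am that I am gone!",
    max_new_tokens=60,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.2,   # values > 1.0 discourage reusing recent tokens
    no_repeat_ngram_size=3,   # forbid exact 3-gram repeats
)
print(output[0]["generated_text"])
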
upload_model.py ADDED
@@ -0,0 +1,36 @@
+ import os
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ # --- 1. Define Paths ---
+ # Path to your fine-tuned model directory
+ local_model_path = os.path.join(os.getcwd(), "fine_tuned_werther_model")
+
+ # --- 2. Define Hugging Face Hub Repository ID ---
+ # Repository id on the Hub, in the form '<username>/<model-name>'
+ repo_id = "ajsbsd/distilgpt2-werther-finetuned"
+
+ # --- 3. Load Model and Tokenizer from Local Directory ---
+ print(f"Loading model and tokenizer from local path: {local_model_path}...")
+ try:
+     model = AutoModelForCausalLM.from_pretrained(local_model_path)
+     tokenizer = AutoTokenizer.from_pretrained(local_model_path)
+     print("Model and tokenizer loaded successfully.")
+ except Exception as e:
+     print(f"Error loading local model/tokenizer: {e}")
+     print("Please ensure the 'fine_tuned_werther_model' directory exists and contains the necessary files.")
+     exit()
+
+ # --- 4. Push to Hugging Face Hub ---
+ print(f"\nUploading model and tokenizer to Hugging Face Hub: {repo_id}...")
+ try:
+     # push_to_hub uploads the model and tokenizer; the 'commit_message' appears in the repo history on the Hub
+     model.push_to_hub(repo_id, commit_message="Fine-tuned DistilGPT2 on The Sorrows of Young Werther")
+     tokenizer.push_to_hub(repo_id, commit_message="Tokenizer for Werther fine-tuned model")
+     print("Model and tokenizer uploaded successfully!")
+     print(f"You can view your model here: https://huggingface.co/{repo_id}")
+
+ except Exception as e:
+     print(f"An error occurred during upload: {e}")
+     print("Ensure you are logged in to Hugging Face Hub (`huggingface-cli login`) and have write access to the repository.")
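
Once the push succeeds, the model can be loaded directly from the Hub by its repo id. A minimal check, assuming the upload completed and the repository is public:

from transformers import pipeline

werther = pipeline("text-generation", model="ajsbsd/distilgpt2-werther-finetuned")
print(werther("How happy I am that I am gone!", max_new_tokens=40, do_sample=True)[0]["generated_text"])
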