remiai3 committed
Commit 5a93c7a · verified
1 Parent(s): ca7dbcc

Upload 3 files

Files changed (3)
  1. all_in_one.py +116 -114
  2. app.py +45 -38
  3. test.py +2 -2
all_in_one.py CHANGED
@@ -1,115 +1,117 @@
- import os
- import torch
- import json
- from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, TrainerCallback
- from datasets import Dataset
- import matplotlib.pyplot as plt
-
- # Set Hugging Face token (replace with your actual token)
- os.environ["HF_TOKEN"] = "hf_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" # Replace with your HF_TOKEN
-
- # Download model and tokenizer
- model_name = "Salesforce/codegen-350M-multi"
- local_model_path = "./codegen_model"
- tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=local_model_path)
- model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, cache_dir=local_model_path)
-
- # Set padding token
- tokenizer.pad_token = tokenizer.eos_token
-
- # Move model to CPU
- device = torch.device("cpu")
- model.to(device)
-
- # Load custom dataset from JSONL
- dataset_path = "./custom_dataset.jsonl"
- data = []
- with open(dataset_path, 'r', encoding='utf-8') as f:
-     for line in f:
-         data.append(json.loads(line.strip()))
- dataset = Dataset.from_list(data)
-
- # Tokenize dataset
- def tokenize_function(examples):
-     inputs = [f"{prompt}\n{code}" for prompt, code in zip(examples["prompt"], examples["code"])]
-     return tokenizer(inputs, truncation=True, padding="max_length", max_length=128)
-
- tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["prompt", "code"])
-
- # Data collator for language modeling
- data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
-
- # Define training arguments
- training_args = TrainingArguments(
-     output_dir="./finetuned_codegen",
-     overwrite_output_dir=True,
-     num_train_epochs=3,
-     per_device_train_batch_size=1,
-     gradient_accumulation_steps=4,
-     save_steps=500,
-     save_total_limit=2,
-     logging_steps=100,
-     learning_rate=5e-5,
-     fp16=False,
-     no_cuda=True,
-     dataloader_pin_memory=False,
- )
-
- # Custom callback to store training loss
- class LossCallback(TrainerCallback):
-     def __init__(self):
-         self.losses = []
-
-     def on_log(self, args, state, control, logs=None, **kwargs):
-         if logs and "loss" in logs:
-             self.losses.append(logs["loss"])
-
- loss_callback = LossCallback()
-
- # Initialize Trainer
- trainer = Trainer(
-     model=model,
-     args=training_args,
-     train_dataset=tokenized_dataset,
-     data_collator=data_collator,
-     callbacks=[loss_callback],
- )
-
- # Start fine-tuning
- print("Starting fine-tuning...")
- trainer.train()
-
- # Save fine-tuned model
- model.save_pretrained("./finetuned_codegen")
- tokenizer.save_pretrained("./finetuned_codegen")
-
- # Plot training loss
- plt.plot(loss_callback.losses, label="Training Loss")
- plt.xlabel("Steps")
- plt.ylabel("Loss")
- plt.title("Fine-Tuning Loss Curve")
- plt.legend()
- plt.savefig("./finetuned_codegen/loss_plot.png")
- plt.show()
-
- print("Fine-tuning completed. Model saved to ./finetuned_codegen. Loss plot saved to ./finetuned_codegen/loss_plot.png")
-
- # Test fine-tuned model
- print("\nTesting fine-tuned model...")
- prompts = [
-     "Write a Python program to print 'Hello, World!'"
- ]
-
- for prompt in prompts:
-     inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
-     outputs = model.generate(
-         **inputs,
-         max_length=200,
-         num_return_sequences=1,
-         pad_token_id=tokenizer.eos_token_id,
-         do_sample=True,
-         temperature=0.7,
-         top_p=0.9
-     )
-     generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
+ import os
+ import torch
+ import json
+ from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, TrainerCallback
+ from datasets import Dataset
+ import matplotlib.pyplot as plt
+
+ # Set Hugging Face token (replace with your actual token)
+ os.environ["HF_TOKEN"] = "hf_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" # Replace with your HF_TOKEN
+
+ # Download model and tokenizer
+ model_name = "Salesforce/codegen-350M-multi"
+ local_model_path = "./codegen_model"
+ tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=local_model_path)
+ model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, cache_dir=local_model_path)
+
+ # Set padding token
+ tokenizer.pad_token = tokenizer.eos_token
+
+ # Move model to CPU
+ device = torch.device("cpu")
+ model.to(device)
+
+ # Load custom dataset from JSONL
+ dataset_path = "./custom_dataset.jsonl"
+ data = []
+ with open(dataset_path, 'r', encoding='utf-8') as f:
+     for line in f:
+         data.append(json.loads(line.strip()))
+ dataset = Dataset.from_list(data)
+
+ # Tokenize dataset
+ def tokenize_function(examples):
+     inputs = [f"{prompt}\n{code}" for prompt, code in zip(examples["prompt"], examples["code"])]
+     return tokenizer(inputs, truncation=True, padding="max_length", max_length=128)
+
+ tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["prompt", "code"])
+
+ # Data collator for language modeling
+ data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+ # Define training arguments
+ training_args = TrainingArguments(
+     output_dir="./finetuned_codegen",
+     overwrite_output_dir=True,
+     num_train_epochs=5,
+     per_device_train_batch_size=1,
+     gradient_accumulation_steps=4,
+     save_steps=500,
+     save_total_limit=2,
+     logging_steps=10, # Reduced logging steps for more frequent loss recording
+     learning_rate=5e-5,
+     fp16=False,
+     no_cuda=True,
+     dataloader_pin_memory=False,
+ )
+
+ # Custom callback to store training loss
+ class LossCallback(TrainerCallback):
+     def __init__(self):
+         self.losses = []
+         self.steps = []
+
+     def on_log(self, args, state, control, logs=None, **kwargs):
+         if logs and "loss" in logs:
+             self.losses.append(logs["loss"])
+             self.steps.append(state.global_step)
+
+ loss_callback = LossCallback()
+
+ # Initialize Trainer
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=tokenized_dataset,
+     data_collator=data_collator,
+     callbacks=[loss_callback],
+ )
+
+ # Start fine-tuning
+ print("Starting fine-tuning...")
+ trainer.train()
+
+ # Save fine-tuned model
+ model.save_pretrained("./finetuned_codegen")
+ tokenizer.save_pretrained("./finetuned_codegen")
+
+ # Plot training loss
+ plt.plot(loss_callback.steps, loss_callback.losses, label="Training Loss")
+ plt.xlabel("Steps")
+ plt.ylabel("Loss")
+ plt.title("Fine-Tuning Loss Curve")
+ plt.legend()
+ plt.savefig("./finetuned_codegen/loss_plot.png")
+ plt.show()
+
+ print("Fine-tuning completed. Model saved to ./finetuned_codegen. Loss plot saved to ./finetuned_codegen/loss_plot.png")
+
+ # Test fine-tuned model
+ print("\nTesting fine-tuned model...")
+ prompts = [
+     "Write a Python program to print 'Hello, guys how are you!'"
+ ]
+
+ for prompt in prompts:
+     inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
+     outputs = model.generate(
+         **inputs,
+         max_length=200,
+         num_return_sequences=1,
+         pad_token_id=tokenizer.eos_token_id,
+         do_sample=True,
+         temperature=0.7,
+         top_p=0.9
+     )
+     generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
      print(f"Prompt: {prompt}\nGenerated Code:\n{generated_code}\n{'-'*50}")
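Note: all_in_one.py reads ./custom_dataset.jsonl, one JSON object per line with "prompt" and "code" fields (the keys consumed by tokenize_function above). The dataset file itself is not part of this commit; the following is a minimal sketch of how such a file could be created, with purely illustrative records:

# build_dataset.py - illustrative sketch; the real custom_dataset.jsonl is not included in this commit
import json

# Hypothetical records matching the keys read by tokenize_function ("prompt" and "code")
samples = [
    {"prompt": "Write a Python program to print 'Hello, World!'",
     "code": "print('Hello, World!')"},
    {"prompt": "Write a Python function that adds two numbers",
     "code": "def add(a, b):\n    return a + b"},
]

# Write one JSON object per line, as expected by the loading loop in all_in_one.py
with open("custom_dataset.jsonl", "w", encoding="utf-8") as f:
    for sample in samples:
        f.write(json.dumps(sample) + "\n")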
app.py CHANGED
@@ -1,39 +1,46 @@
- from flask import Flask, render_template, request
- import torch
- from transformers import AutoTokenizer, AutoModelForCausalLM
-
- app = Flask(__name__)
-
- # Load fine-tuned model and tokenizer
- model_path = "./finetuned_codegen"
- tokenizer = AutoTokenizer.from_pretrained(model_path)
- model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float32)
-
- # Set padding token
- tokenizer.pad_token = tokenizer.eos_token
-
- # Move model to CPU
- device = torch.device("cpu")
- model.to(device)
-
- @app.route("/", methods=["GET", "POST"])
- def index():
-     generated_code = ""
-     prompt = ""
-     if request.method == "POST":
-         prompt = request.form["prompt"]
-         inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
-         outputs = model.generate(
-             **inputs,
-             max_length=200,
-             num_return_sequences=1,
-             pad_token_id=tokenizer.eos_token_id,
-             do_sample=True,
-             temperature=0.7,
-             top_p=0.9
-         )
-         generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
-     return render_template("index.html", generated_code=generated_code, prompt=prompt)
-
- if __name__ == "__main__":
+ from flask import Flask, render_template, request
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ app = Flask(__name__)
+
+ # Load fine-tuned model and tokenizer
+ model_path = "./finetuned_codegen"
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
+ model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float32)
+
+ # Set padding token
+ tokenizer.pad_token = tokenizer.eos_token
+
+ # Move model to CPU
+ device = torch.device("cpu")
+ model.to(device)
+
+ @app.route("/", methods=["GET", "POST"])
+ def index():
+     generated_code = ""
+     prompt = ""
+     if request.method == "POST":
+         prompt = request.form["prompt"]
+         inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
+         outputs = model.generate(
+             **inputs,
+             max_length=200,
+             num_return_sequences=1,
+             pad_token_id=tokenizer.eos_token_id,
+             do_sample=True,
+             temperature=0.2, # Lower temperature for more precise outputs
+             top_p=0.95, # Adjusted for better sampling
+             top_k=50, # Added to focus on top-k tokens
+             no_repeat_ngram_size=3 # Prevent repetitive phrases
+         )
+         generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
+         # Clean up output to remove prompt prefix and extra text
+         if generated_code.startswith(prompt):
+             generated_code = generated_code[len(prompt):].strip()
+         # Remove any trailing or redundant text
+         generated_code = generated_code.split("\n")[0].strip() if "\n" in generated_code else generated_code
+     return render_template("index.html", generated_code=generated_code, prompt=prompt)
+
+ if __name__ == "__main__":
      app.run(debug=True)
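For a quick check of the updated app.py without opening a browser, the form endpoint can be exercised directly. A minimal sketch, assuming the app is running locally via `python app.py` on Flask's default port 5000 and that the requests package is installed (neither is specified by this commit):

# query_app.py - illustrative sketch for exercising the Flask route defined in app.py
import requests

resp = requests.post(
    "http://127.0.0.1:5000/",  # the "/" route, methods GET/POST
    data={"prompt": "Write a Python program to print 'Hello, World!'"},  # read via request.form["prompt"]
)
print(resp.status_code)
print(resp.text)  # rendered index.html containing the generated code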
test.py CHANGED
@@ -4,7 +4,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
  # Load fine-tuned model and tokenizer
  model_path = "./finetuned_codegen"
  tokenizer = AutoTokenizer.from_pretrained(model_path)
- model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16)
+ model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float32)

  # Set padding token
  tokenizer.pad_token = tokenizer.eos_token
@@ -13,7 +13,7 @@ tokenizer.pad_token = tokenizer.eos_token
  device = torch.device("cpu")
  model.to(device)

- # Test prompts (including dataset prompts)
+ # Test prompts
  prompts = [
      "Write a Python program to print 'Hello, you name or any other thing!'"
  ]
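The test.py hunks only change the model dtype (float16 to float32, which is the safer choice for CPU inference) and shorten a comment; the script's generation loop is unchanged context and not shown in this diff. As a rough sketch only, such a loop could mirror the generate() call used in all_in_one.py (this is not the verbatim test.py code):

# Sketch only - mirrors the sampling parameters used in all_in_one.py
for prompt in prompts:
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    outputs = model.generate(
        **inputs,
        max_length=200,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))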