Ozaii committed
Commit 2dd3233
1 Parent(s): 66c31dd

Update app.py

Files changed (1)
  1. app.py +194 -71
app.py CHANGED
@@ -1,79 +1,202 @@
-import gradio as gr
-from transformers import AutoTokenizer, AutoModelForCausalLM
-import torch
 import spaces

-# Load the model and tokenizer from Hugging Face
-model_path = "Ozaii/W.AI-13B-Chat"  # Replace with your username and repository name
-tokenizer = AutoTokenizer.from_pretrained(model_path)
-model = AutoModelForCausalLM.from_pretrained(model_path)
-
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model.to(device)
-model.eval()
-
-@spaces.GPU
-def generate_response(user_input, chat_history):
-    max_context_length = 750
-    max_response_length = 150
-
-    prompt = ""
-    for message in chat_history:
-        if message[0] is not None:
-            prompt += f"User: {message[0]}\n"
-        if message[1] is not None:
-            prompt += f"Assistant: {message[1]}\n"
-    prompt += f"User: {user_input}\nAssistant:"
-
-    prompt_tokens = tokenizer.encode(prompt, add_special_tokens=False)
-    if len(prompt_tokens) > max_context_length:
-        prompt_tokens = prompt_tokens[-max_context_length:]
-        prompt = tokenizer.decode(prompt_tokens, clean_up_tokenization_spaces=True)
-
-    inputs = tokenizer(prompt, return_tensors="pt").to(device)
-    with torch.no_grad():
-        outputs = model.generate(
-            inputs.input_ids,
-            max_length=len(inputs.input_ids[0]) + max_response_length,  # Limit the maximum length for context and response
-            min_length=45,
-            temperature=0.7,  # Slightly higher temperature for more diverse responses
-            top_k=30,
-            top_p=0.9,  # Allow a bit more randomness
-            repetition_penalty=1.1,  # Mild repetition penalty
-            no_repeat_ngram_size=3,  # Ensure no repeated phrases
-            eos_token_id=tokenizer.eos_token_id,
-            pad_token_id=tokenizer.eos_token_id
-        )
-
-    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    assistant_response = response.split("Assistant:")[-1].strip()
-    assistant_response = assistant_response.split('\n')[0].strip()
-    chat_history.append((user_input, assistant_response))
-    return chat_history, chat_history
-
-def restart_chat():
-    return [], []
-
-with gr.Blocks() as chat_interface:
-    gr.Markdown("<h1><center>W.AI Chat Nikker xD</center></h1>")
-    chat_history = gr.State([])
-    with gr.Column():
-        chatbox = gr.Chatbot()
-        with gr.Row():
-            user_input = gr.Textbox(show_label=False, placeholder="Summon Wali Here...")
-            submit_button = gr.Button("Send")
-            restart_button = gr.Button("Restart")
-
-    submit_button.click(
-        generate_response,
-        inputs=[user_input, chat_history],
-        outputs=[chatbox, chat_history]
     )

-    restart_button.click(
-        restart_chat,
-        inputs=[],
-        outputs=[chatbox, chat_history]
     )

-chat_interface.launch(share=True)
+# Import spaces first to ensure GPU resources are managed correctly
 import spaces

+# Import necessary libraries
+import os
+import json
+import logging
+import time
+import torch
+import bitsandbytes as bnb
+from datasets import Dataset
+from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer
+from peft import PeftModel, LoraConfig
+from transformers import BitsAndBytesConfig

+# Configure logging
+logging.basicConfig(level=logging.INFO, filename='training_log.log', filemode='w', format='%(name)s - %(levelname)s - %(message)s')
+logging.info("Started the script")

+# Load the Hugging Face API token from environment variables
+HF_API_TOKEN = os.getenv('HF_API_TOKEN')
+
+# Load the dataset
+file_path = 'best_training_data.json'  # Adjust path as needed
+logging.info(f"Loading dataset from {file_path}")
+try:
+    with open(file_path, 'r') as file:
+        data = json.load(file)
+    logging.info("Dataset loaded successfully")
+except Exception as e:
+    logging.error(f"Failed to load dataset: {e}")
+
+# Convert the dataset to Hugging Face Dataset format
+try:
+    dataset = Dataset.from_dict({"text": [entry["text"] for entry in data]})
+    logging.info("Dataset converted to Hugging Face Dataset format")
+except Exception as e:
+    logging.error(f"Failed to convert dataset: {e}")
+
+# Initialize Tokenizer
+try:
+    tokenizer = AutoTokenizer.from_pretrained("SweatyCrayfish/llama-3-8b-quantized", token=HF_API_TOKEN)
+    logging.info("Tokenizer loaded successfully")
+
+    # Add padding token if not already present
+    if tokenizer.pad_token is None:
+        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+        logging.info("Padding token added to the tokenizer")
+
+    tokenizer.save_pretrained('.')
+except Exception as e:
+    logging.error(f"Failed to load or configure tokenizer: {e}")
+
+# Tokenize the Dataset
+def tokenize_function(examples):
+    return tokenizer(examples["text"], truncation=True, padding='max_length', max_length=1024, return_tensors='pt')
+
+try:
+    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
+    logging.info("Dataset tokenized successfully")
+except Exception as e:
+    logging.error(f"Failed to tokenize the dataset: {e}")
+
+# Setup Quantization Configuration
+nf4_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_compute_dtype=torch.bfloat16
+)
+
+# Load the LLaMA 8B Model with Quantization
+try:
+    model = AutoModelForCausalLM.from_pretrained(
+        "SweatyCrayfish/llama-3-8b-quantized",
+        quantization_config=nf4_config,
+        token=HF_API_TOKEN,
+        device_map="auto"
     )

+    model.resize_token_embeddings(len(tokenizer))
+    model.gradient_checkpointing_enable()
+    model.config.use_cache = False  # Disable use_cache when using gradient checkpointing
+    logging.info("Model initialized and resized embeddings")
+
+    # Set up LoRa
+    lora_config = LoraConfig(
+        r=64,
+        lora_alpha=16,
+        lora_dropout=0.1,
+        bias="none",
+        task_type="CAUSAL_LM",
+        target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj']
+    )
+    model = PeftModel(model, lora_config)
+    logging.info("LoRa configuration applied to the model")
+
+    # Ensure only floating point parameters require gradients
+    for param in model.parameters():
+        if param.dtype in [torch.float16, torch.float32, torch.bfloat16, torch.complex64, torch.complex128]:
+            param.requires_grad = True
+    logging.info("Model parameters configured for gradient computation")
+except Exception as e:
+    logging.error(f"Failed to initialize the model: {e}")
+
+# Setup Training Arguments
+try:
+    training_args = TrainingArguments(
+        output_dir="training_results",
+        evaluation_strategy="no",  # Disable evaluation
+        save_strategy="epoch",  # Save only at the end of each epoch
+        learning_rate=2e-4,
+        per_device_train_batch_size=5,
+        gradient_accumulation_steps=4,
+        num_train_epochs=12,
+        weight_decay=0.01,
+        save_total_limit=1,
+        logging_dir="training_logs",
+        logging_steps=50,
+        fp16=False,
+        bf16=True,
+        load_best_model_at_end=False,  # Do not load the best model
+        greater_is_better=False,
+        report_to="none"  # Disable reporting to external services
+    )
+    logging.info("Training arguments configured successfully")
+except Exception as e:
+    logging.error(f"Failed to configure training arguments: {e}")
+
+# Initialize the Trainer
+try:
+    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=tokenized_dataset,
+        data_collator=data_collator
     )
+    logging.info("Trainer initialized successfully")
+except Exception as e:
+    logging.error(f"Failed to initialize the Trainer: {e}")
+
+# Implementing 120-Second Segmented Training
+@spaces.GPU(duration=120)
+def segmented_train(trainer):
+    start_time = time.time()
+    while time.time() - start_time < 120:
+        try:
+            trainer.train()
+        except torch.cuda.OutOfMemoryError as e:
+            logging.error(f"Out of memory error: {e}")
+            break
+        except Exception as e:
+            logging.error(f"Training error: {e}")
+            break
+    trainer.save_state()
+
+try:
+    segmented_train(trainer)
+    logging.info("Model training completed successfully")
+except Exception as e:
+    logging.error(f"Training failed: {e}")
+    import traceback
+    traceback.print_exc()
+
+# Save the Model
+try:
+    model.save_pretrained("llama3-8b-chat-finetuned-final-version")
+    tokenizer.save_pretrained("llama3-8b-chat-finetuned-final-version")
+    logging.info("Final fine-tuned model and tokenizer saved successfully")
+except Exception as e:
+    logging.error(f"Failed to save the final fine-tuned model: {e}")
+
+# Inference Function
+@spaces.GPU
+def generate_response(prompt, model, tokenizer, max_length=128, min_length=20, temperature=0.7, top_k=50, top_p=0.9):
+    try:
+        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+        with torch.no_grad():
+            outputs = model.generate(
+                inputs.input_ids,
+                max_length=max_length,
+                min_length=min_length,
+                do_sample=True,
+                temperature=temperature,
+                top_k=top_k,
+                top_p=top_p,
+                repetition_penalty=1.3,
+                no_repeat_ngram_size=3,
+                eos_token_id=tokenizer.eos_token_id
+            )
+        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        return response
+    except Exception as e:
+        logging.error(f"Failed to generate response: {e}")
+        return ""

+# Example Usage
+prompt = "bro did u talk with DK today"
+response = generate_response(prompt, model, tokenizer)
+print(response)
+logging.info(f"Generated response: {response}")