made1570 committed on
Commit 4a9540b · verified · 1 Parent(s): abb8fc5

Update app.py

Files changed (1)
app.py  +47 -48
app.py CHANGED
@@ -1,50 +1,49 @@
-import os
 import torch
+from transformers import AutoProcessor, AutoModelForImageTextToText, TextStreamer
+from peft import PeftModel
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForCausalLM
-
-# Set environment variable for CUDA debugging (for local development or debugging in spaces)
-os.environ["CUDA_LAUNCH_BLOCKING"] = "1" # Ensures errors are reported synchronously
-
-# Load the model and tokenizer
-device = "cpu"
-model = AutoModelForCausalLM.from_pretrained("adarsh3601/my_gemma3_pt", device_map="auto", torch_dtype=torch.float16).to(device)
-tokenizer = AutoTokenizer.from_pretrained("adarsh3601/my_gemma3_pt")
-
-# Function to handle input and response generation
-def generate_response(message):
-    try:
-        # Tokenize input
-        inputs = tokenizer(message, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
-
-        # Disable gradient computation for inference
-        with torch.no_grad():
-            # Generate response from model
-            outputs = model.generate(
-                **inputs,
-                max_new_tokens=50, # Limit the length of the output
-                temperature=0.7,
-                top_p=0.95,
-            )
-
-        # Check for NaN or Inf values in the output before decoding
-        if torch.any(torch.isnan(outputs)) or torch.any(torch.isinf(outputs)):
-            raise ValueError("Model output contains NaN or Inf")
-
-        # Decode the output
-        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        return response
-
-    except Exception as e:
-        print(f"Error generating response: {e}")
-        return "Error: Something went wrong."
-
-# Define the Gradio interface
-iface = gr.Interface(fn=generate_response,
-                     inputs="text",
-                     outputs="text",
-                     title="Chat with Gemma3",
-                     description="Chatbot based on Gemma3 model from Hugging Face.")
-
-# Launch the interface on Hugging Face Spaces
-iface.launch(share=True) # To create a public link, set share=True
+
+# Load base model and processor
+base_model_id = "unsloth/gemma-3-12b-it-unsloth-bnb-4bit"
+adapter_model_id = "adarsh3601/my_gemma_pt3"
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+processor = AutoProcessor.from_pretrained(base_model_id)
+model = AutoModelForImageTextToText.from_pretrained(base_model_id, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32, device_map="auto")
+
+# Apply adapter (LoRA)
+model = PeftModel.from_pretrained(model, adapter_model_id)
+model.eval()
+
+streamer = TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+# Helper to format messages using the chat template
+def format_chat(messages):
+    formatted = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    return formatted
+
+# Chat function
+def chat(message, history):
+    messages = []
+
+    # Format history into messages
+    for user_msg, bot_msg in history:
+        messages.append({"role": "user", "content": user_msg})
+        messages.append({"role": "assistant", "content": bot_msg})
+
+    messages.append({"role": "user", "content": message})
+    prompt = format_chat(messages)
+
+    inputs = processor(prompt, return_tensors="pt").to(device)
+
+    with torch.no_grad():
+        outputs = model.generate(**inputs, max_new_tokens=512, streamer=streamer)
+
+    decoded = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+    response = decoded.split("<end_of_turn>")[0].strip().split("<start_of_turn>model")[-1].strip()
+    return response
+
+# Gradio interface
+gui = gr.ChatInterface(fn=chat, title="Gemma-3 Chatbot", description="Fine-tuned on adarsh3601/my_gemma_pt3")
+
+gui.launch()
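
For reference, a minimal sketch of how the new chat() function could be exercised locally before launching the interface; it is not part of the commit. It assumes chat() and its globals from the updated app.py are already defined in the current interpreter (for example, by pasting this just above gui.launch()), and that history uses the pair format gr.ChatInterface passes to fn by default (a list of [user_message, assistant_message] pairs). The example prompts are made up.

```python
# Illustrative smoke test (not part of the commit); assumes chat() from the
# updated app.py is already defined in this process.
# History uses the default pair format that gr.ChatInterface passes to fn:
# a list of [user_message, assistant_message] pairs.
history = [
    ["Hello!", "Hi! How can I help you today?"],  # one earlier exchange (example text)
]

# format_chat() renders the messages with the Gemma chat template, roughly:
#   <start_of_turn>user
#   Hello!<end_of_turn>
#   <start_of_turn>model
#   Hi! How can I help you today?<end_of_turn>
#   <start_of_turn>user
#   What model are you running?<end_of_turn>
#   <start_of_turn>model
# which is why chat() slices the decoded output between "<start_of_turn>model"
# and "<end_of_turn>" to keep only the newly generated reply.
print(chat("What model are you running?", history))
```

If the Space is later switched to Gradio's dict-style history (type="messages"), the `for user_msg, bot_msg in history` loop in chat() would need to be adapted, since each history entry would then be a {"role": ..., "content": ...} dict rather than a pair.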