Spaces:

made1570
/

TestingModelAPI

Paused

App Files Files Community

made1570 committed on Apr 19

Commit

76ca090

verified ·

1 Parent(s): 4cbaeb1

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -35

app.py CHANGED Viewed

@@ -1,55 +1,48 @@
 import torch
-from transformers import AutoModelForCausalLM, AutoProcessor
 import gradio as gr
-device = "cuda" if torch.cuda.is_available() else "cpu"
-# Load model and processor
-model = AutoModelForCausalLM.from_pretrained("adarsh3601/my_gemma3_pt", device_map="auto")
-processor = AutoProcessor.from_pretrained("adarsh3601/my_gemma3_pt")
 def chat(user_input, history):
-    # Format history as messages
     messages = []
-    for i, (user_msg, bot_msg) in enumerate(history):
         messages.append({"role": "user", "content": user_msg})
         messages.append({"role": "assistant", "content": bot_msg})
     messages.append({"role": "user", "content": user_input})
-    try:
-        # Try using chat template
-        prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
-    except Exception as e:
-        print(f"[WARNING] Failed to apply chat_template: {e}")
-        prompt = None
-    # Fallback if prompt fails
-    if not prompt:
-        prompt = "<bos>"
-        for i, msg in enumerate(messages):
-            role = "model" if msg["role"] == "assistant" else msg["role"]
-            prompt += f"<start_of_turn>{role}\n{msg['content'].strip()}<end_of_turn>\n"
-        prompt += "<start_of_turn>model\n"
-    print(f"[DEBUG] Prompt:\n{prompt}")
-    inputs = processor(prompt, return_tensors="pt").to(device)
     outputs = model.generate(
         **inputs,
-        max_new_tokens=512,
-        do_sample=False,
-        num_beams=1,
-        eos_token_id=processor.tokenizer.eos_token_id,
-        pad_token_id=processor.tokenizer.pad_token_id
     )
-    response = processor.decode(outputs[0], skip_special_tokens=True)
-    # Extract only the assistant response (after last <start_of_turn>model)
-    if "<start_of_turn>model" in response:
-        response = response.split("<start_of_turn>model")[-1].strip()
     return response
-# Launch the Gradio interface
-iface = gr.ChatInterface(fn=chat, title="Gemma-3 Chat").launch(share=True)

 import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
 import gradio as gr
+# Device string derived from CUDA availability.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# One checkpoint identifier is shared by the model and the tokenizer.
+model_name = "adarsh3601/my_gemma3_pt"
+# Load with the plain transformers Auto* classes: automatic device placement
+# and float16 weights.
+load_options = {"device_map": "auto", "torch_dtype": torch.float16}
+model = AutoModelForCausalLM.from_pretrained(model_name, **load_options)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
 def chat(user_input, history):
+    """Generate the model's reply to ``user_input`` given the prior turns.
+
+    Args:
+        user_input: Text of the newest user message.
+        history: Earlier turns as an iterable of ``(user_msg, bot_msg)``
+            pairs, oldest first.
+
+    Returns:
+        Only the newly generated reply text; the prompt and the earlier
+        turns are not included.
+    """
     messages = []
+    for user_msg, bot_msg in history:
         messages.append({"role": "user", "content": user_msg})
         messages.append({"role": "assistant", "content": bot_msg})
     messages.append({"role": "user", "content": user_input})
+    # Render the conversation with the tokenizer's chat template. The
+    # rendered text already carries the template's own special tokens.
+    prompt = tokenizer.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        tokenize=False
+    )
+    # add_special_tokens=False keeps the tokenizer from adding special tokens
+    # a second time to the already-templated text. model.device reports where
+    # the model's weights live, so the inputs land on the same device.
+    inputs = tokenizer(
+        prompt,
+        return_tensors="pt",
+        add_special_tokens=False
+    ).to(model.device)
     outputs = model.generate(
         **inputs,
+        max_new_tokens=1024,
+        temperature=1.0,
+        top_p=0.95,
+        top_k=64,
+        do_sample=True,
+        pad_token_id=tokenizer.pad_token_id,
+        eos_token_id=tokenizer.eos_token_id
     )
+    # The generated sequence starts with the prompt tokens, so slice them off
+    # and decode only the continuation. Splitting the decoded text on a
+    # "<start_of_turn>assistant" marker is unreliable: Gemma's template names
+    # the reply role "model", and skip_special_tokens=True may strip the
+    # turn markers before any search runs.
+    prompt_length = inputs["input_ids"].shape[-1]
+    response = tokenizer.decode(
+        outputs[0][prompt_length:],
+        skip_special_tokens=True
+    ).strip()
     return response
+# Build the chat UI around chat() and start serving it.
+demo = gr.ChatInterface(fn=chat, title="Chat with Gemma-3")
+demo.launch(share=True)