made1570 committed
Commit da40d5c · verified · 1 Parent(s): 76ca090

Update app.py

Files changed (1): app.py (+57 -44)
app.py CHANGED
@@ -1,48 +1,61 @@
 
  import torch
- from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
  import gradio as gr
-
- # Load model and tokenizer using Unsloth-style
- model_name = "adarsh3601/my_gemma3_pt"
- tokenizer = AutoTokenizer.from_pretrained(model_name)
- model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
-
- device = "cuda" if torch.cuda.is_available() else "cpu"
-
- def chat(user_input, history):
-     messages = []
-     for user_msg, bot_msg in history:
-         messages.append({"role": "user", "content": user_msg})
-         messages.append({"role": "assistant", "content": bot_msg})
-     messages.append({"role": "user", "content": user_input})
-
-     # Apply chat template
-     prompt = tokenizer.apply_chat_template(
          messages,
-         add_generation_prompt=True,
-         tokenize=False
-     )
-
-     inputs = tokenizer(prompt, return_tensors="pt").to(device)
-
-     outputs = model.generate(
-         **inputs,
-         max_new_tokens=1024,
-         temperature=1.0,
-         top_p=0.95,
-         top_k=64,
-         do_sample=True,
-         pad_token_id=tokenizer.pad_token_id,
-         eos_token_id=tokenizer.eos_token_id
      )
-
-     # Decode and extract just the last assistant message
-     decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
-     if "<start_of_turn>assistant" in decoded:
-         response = decoded.split("<start_of_turn>assistant")[-1].strip()
-     else:
-         response = decoded
-
-     return response
-
- gr.ChatInterface(fn=chat, title="Chat with Gemma-3").launch(share=True)
+ import os
  import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
  import gradio as gr
+ from unsloth import FastModel
+
+ # Set environment for Hugging Face Spaces
+ os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
+
+ # Load the model from Hugging Face Model Hub
+ model_repo_id = 'adarsh3601/my_gemma3_pt'
+
+ # Load model and tokenizer using FastModel
+ model, tokenizer = FastModel.from_pretrained(
+     model_name=model_repo_id,
+     max_seq_length=64000,
+     load_in_4bit=True,  # Load model with 4-bit quantization
+     load_in_8bit=False,
+     full_finetuning=False
+ )
+
+ # Function to generate text based on user input
+ def generate_text(user_input):
+     # Prepare the input as per the model's expected format
+     messages = [{
+         "role": "user",
+         "content": [{"type" : "text", "text" : user_input}]
+     }]
+
+     text = tokenizer.apply_chat_template(
          messages,
+         add_generation_prompt=True,  # Must add for generation
      )
+
+     # Generate output with model
+     with torch.no_grad():
+         output = model.generate(
+             **tokenizer([text], return_tensors="pt").to("cuda"),
+             max_new_tokens=64000,  # Adjust if you need more tokens
+             temperature=1.0,
+             top_p=0.95,
+             top_k=64,
+             streamer=None  # You can set a streamer if needed
+         )
+
+     # Decode the model output and return the result
+     decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
+     return decoded_output
+
+ # Build the Gradio interface
+ iface = gr.Interface(
+     fn=generate_text,
+     inputs=gr.Textbox(lines=2, placeholder="Enter your text here..."),
+     outputs=gr.Textbox(lines=2, placeholder="Generated text will appear here..."),
+     title="Gemma-3 Model",
+     description="This is a simple interface to interact with the Gemma-3 model. Enter a prompt and see the generated response."
+ )
+
+ # Launch the app
+ if __name__ == "__main__":
+     iface.launch()
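
A note on the new generation path, with a sketch that is not part of this commit: in transformers, tokenizer.apply_chat_template returns token IDs unless tokenize=False is passed (the previous version passed it), and decoding output[0] returns the prompt together with the completion, which the previous chat() stripped by splitting on the assistant turn. A minimal variant of generate_text that keeps both behaviours could look as follows; it assumes the model and tokenizer objects created by FastModel.from_pretrained above, reuses the sampling settings from the earlier version, and the prompt-length slicing is an illustration rather than something the commit does.

# Sketch only; `model` and `tokenizer` are assumed to be the objects
# created by FastModel.from_pretrained() earlier in app.py.
import torch

def generate_text(user_input):
    messages = [{
        "role": "user",
        "content": [{"type": "text", "text": user_input}]
    }]

    # tokenize=False makes apply_chat_template return a prompt string,
    # which can then be passed through tokenizer(...) as the app does.
    text = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False
    )

    inputs = tokenizer([text], return_tensors="pt").to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=1024,  # value used by the previous version
            do_sample=True,
            temperature=1.0,
            top_p=0.95,
            top_k=64
        )

    # generate() returns prompt + completion; slice off the prompt tokens
    # so only the newly generated text is returned to Gradio.
    new_tokens = output[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)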