# import gradio as gr
# demo = gr.load("models/NSTiwari/fine_tuned_science_gemma2b-it")
# demo.launch()

import gradio as gr
import time

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Replace with your fine-tuned model ID from Hugging Face Hub
model_id = "NSTiwari/fine_tuned_science_gemma2b-it"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)


def inference(input_text):
    """
    Runs inference on a science question and returns the answer and latency.
    """
    start_time = time.time()

    # Tokenize the prompt and move it to the model's device
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    input_length = inputs["input_ids"].shape[1]

    # Greedy decoding; pass the attention mask along with the input IDs
    outputs = model.generate(
        **inputs,
        max_length=512,  # Adjust max_length as needed
        do_sample=False,
    )

    # Strip the prompt tokens and decode only the generated continuation
    generated_sequence = outputs[:, input_length:].tolist()
    response = tokenizer.decode(generated_sequence[0], skip_special_tokens=True)

    end_time = time.time()
    return {"answer": response, "latency": f"{end_time - start_time:.2f} seconds"}


def gradio_interface(question):
    """
    Gradio interface function that calls inference and returns the answer and latency.
    """
    result = inference(question)
    return result["answer"], result["latency"]


# Gradio interface definition
iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Textbox(label="Science Question", lines=4),
    outputs=[gr.Textbox(label="Answer"), gr.Textbox(label="Latency")],
    title="SciGemma",
    description="Ask a science question and get an answer from the fine-tuned Gemma 2b-it model.",
    examples=[
        ["What does air consist of?"],
        ["What is an atom?"],
    ],
)

if __name__ == "__main__":
    iface.launch()