import torch
import gradio as gr
import transformers
from transformers import LlamaTokenizer, LlamaForCausalLM

# Verify that this transformers version ships the LLaMA classes
assert "LlamaTokenizer" in transformers._import_structure["models.llama"]

tokenizer = LlamaTokenizer.from_pretrained("medalpaca/medalpaca-7b")
model = LlamaForCausalLM.from_pretrained(
    "medalpaca/medalpaca-7b",
    torch_dtype=torch.float32,
    device_map="auto",
    offload_folder=".",
)


def answer_question(question, context):
    # Tokenize the question/context pair
    inputs = tokenizer(question, context, return_tensors="pt")
    # With device_map="auto" (and possible offloading), move inputs to the model's device
    # instead of hard-coding 'cuda'
    input_ids = inputs.input_ids.to(model.device)

    # Generate the model output; max_new_tokens bounds the generated answer length,
    # whereas max_length would also count the prompt tokens
    outputs = model.generate(input_ids, max_new_tokens=30, do_sample=True)

    # Decode the predicted tokens to text
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer


iface = gr.Interface(
    fn=answer_question,
    inputs=[
        gr.Textbox(label="Question"),
        gr.Textbox(label="Context"),
    ],
    outputs=gr.Textbox(label="Answer"),
)
iface.launch(share=True)
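
# Example usage (illustrative only; the question/context strings are assumed, not from the source):
# calling answer_question directly, without launching the UI, e.g.
#   answer_question(
#       "What are common symptoms of anemia?",
#       "The patient reports fatigue and pale skin.",
#   )
# should return a short free-text answer generated by the model.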