import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "microsoft/phi-2"

# 4-bit NF4 quantization so phi-2 fits on a single consumer GPU
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the quantized model once and use this instance for generation
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="cuda:0",
    trust_remote_code=True,
    # token=True,  # uncomment if the checkpoint requires authentication
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token_id = tokenizer.eos_token_id


def generate_answer(question):
    # phi-2 ships no chat template, so we tokenize the raw question directly:
    # inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
    # Move the tokenized prompt to the same device as the quantized model
    inputs = tokenizer(question, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_length=250,
        num_return_sequences=1,
        do_sample=True,
    )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer


iface = gr.Interface(
    fn=generate_answer,
    inputs="text",
    outputs="text",
    title="The Art of Prompt Engineering",
    description="Define your prompt, preferably in German",
)

iface.launch(share=True)  # Deploy the interface with a public share link
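Because phi-2 is a base model without a chat template (which is why the apply_chat_template call above is left commented out), the wording of the prompt itself does most of the work. Below is a minimal sketch of an alternative generation function that wraps the question in the "Instruct: ... Output:" layout described on the phi-2 model card; the helper name format_prompt and the sampling parameters are assumptions for illustration, not part of the original app.

def format_prompt(question: str) -> str:
    # Assumed QA-style prompt format for phi-2 ("Instruct:/Output:" per the model card)
    return f"Instruct: {question}\nOutput:"


def generate_answer_instruct(question):
    prompt = format_prompt(question)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=200,               # cap only the newly generated tokens
        do_sample=True,
        temperature=0.7,                  # softer sampling for more focused answers
        pad_token_id=tokenizer.eos_token_id,
    )
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Strip the echoed prompt so only the model's answer is returned
    return text[len(prompt):].strip()

To try this variant in the UI, pass fn=generate_answer_instruct to gr.Interface instead of fn=generate_answer.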