"""Gradio text-generation demo for the Falcon-40B causal language model.

Loads tiiuae/falcon-40b in 4-bit (bitsandbytes) with bfloat16 compute,
sharded across available devices, and serves a simple text-in/text-out UI.
"""
import gradio as gr
import torch
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

model_name = "tiiuae/falcon-40b"

# NOTE(review): `config` is loaded but never passed to from_pretrained —
# kept for backward compatibility, but it has no effect as written.
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)

# BUG FIX: the original passed the undefined variable `model` (a NameError)
# instead of the model identifier string `model_name`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",  # shard/place layers automatically across GPUs
)
tokenizer = AutoTokenizer.from_pretrained(model_name)


def falcon(input_text):
    """Generate up to 100 tokens of sampled continuation for `input_text`.

    Args:
        input_text: Prompt string from the Gradio textbox.

    Returns:
        The decoded model output (prompt + continuation) as a string.
    """
    # NOTE(review): assumes at least one CUDA device; inputs must live on
    # the same device as the model's first layers under device_map="auto".
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
    outputs = model.generate(input_ids, max_length=100, do_sample=True, top_k=10)
    decoded = tokenizer.decode(outputs[0])
    return decoded


iface = gr.Interface(fn=falcon, inputs="text", outputs="text")
iface.launch()  # To create a public link, set `share=True`