from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from gradio import Interface

# Load the model and tokenizer
model_name = "tiiuae/falcon-7b-instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",           # Use Accelerate to map layers to available devices
    offload_folder="./offload",  # Folder for offloading weights that don't fit in memory
    low_cpu_mem_usage=True,      # Reduce peak CPU memory usage while loading
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to generate text
def generate_text(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=200,
        do_sample=True,
        top_k=10,
        temperature=0.7,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Gradio Interface
interface = Interface(
    fn=generate_text,
    inputs="text",
    outputs="text",
    title="Falcon 7B Text Generation",
)
interface.launch()