"""FastAPI service exposing a text-generation endpoint backed by a causal LM.

At import time the module configures the Hugging Face cache directory, reads
the access token from the ``HF_API_TOKEN`` environment variable, and loads the
tokenizer and model named by ``MODEL_NAME``. The ``POST /generate`` route then
turns a JSON ``{"prompt": ..., "max_length": ...}`` body into generated text.
"""

import os

# Set Hugging Face cache directory.
# This must run BEFORE `transformers` (and therefore `huggingface_hub`) is
# imported: the hub library resolves its cache paths from the environment when
# it is first imported, so assigning HF_HOME afterwards has no effect.
os.environ["HF_HOME"] = "/home/user/cache"

import torch  # noqa: E402
from fastapi import FastAPI, HTTPException, Request  # noqa: E402
from transformers import AutoModelForCausalLM, AutoTokenizer  # noqa: E402

# Get Hugging Face API token; fail fast at startup if it is missing.
HF_API_TOKEN = os.getenv("HF_API_TOKEN")
if not HF_API_TOKEN:
    raise ValueError("HF_API_TOKEN environment variable is not set!")

app = FastAPI()

# Load Falcon 7B model
MODEL_NAME = "SpiceyToad/demo-falc"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_API_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    token=HF_API_TOKEN,
)

# Ensure tokenizer has a padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Use the EOS token as the padding token


@app.post("/generate")
async def generate_text(request: Request):
    """Generate a continuation for the prompt supplied in the JSON request body.

    Expected body: a JSON object with

    * ``prompt`` (str, default ``""``): text to continue.
    * ``max_length`` (int, default ``50``): passed to ``model.generate`` as
      ``max_length``, i.e. the total sequence length *including* the prompt
      tokens.

    Returns:
        ``{"generated_text": <decoded output with special tokens stripped>}``.

    Raises:
        HTTPException: 400 if the body is not valid JSON; 422 if the body is
            not a JSON object, a field has the wrong type, the prompt yields
            no tokens, or the prompt already fills ``max_length``.
    """
    # NOTE(review): model.generate() below is a blocking call inside an async
    # handler, so the event loop is stalled for the duration of generation.
    try:
        data = await request.json()
    except ValueError as exc:  # JSONDecodeError / UnicodeDecodeError
        raise HTTPException(
            status_code=400, detail="Request body must be valid JSON."
        ) from exc

    if not isinstance(data, dict):
        raise HTTPException(
            status_code=422, detail="Request body must be a JSON object."
        )

    prompt = data.get("prompt", "")
    max_length = data.get("max_length", 50)

    if not isinstance(prompt, str):
        raise HTTPException(status_code=422, detail="'prompt' must be a string.")
    # bool is a subclass of int, so exclude it explicitly.
    if (
        isinstance(max_length, bool)
        or not isinstance(max_length, int)
        or max_length < 1
    ):
        raise HTTPException(
            status_code=422, detail="'max_length' must be a positive integer."
        )

    # Tokenize with padding and attention mask
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(model.device)

    prompt_tokens = inputs["input_ids"].shape[1]
    if prompt_tokens == 0:
        # Nothing to condition on: generate() cannot start from an empty sequence.
        raise HTTPException(
            status_code=422, detail="'prompt' must contain at least one token."
        )
    if prompt_tokens >= max_length:
        # max_length counts the prompt too, so no new token could be produced.
        raise HTTPException(
            status_code=422,
            detail=(
                f"'max_length' ({max_length}) must exceed the prompt length "
                f"({prompt_tokens} tokens)."
            ),
        )

    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        # Make the padding token explicit so generate() does not have to guess.
        pad_token_id=tokenizer.pad_token_id,
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"generated_text": response}