from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import asyncio

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Download the GGUF weights from the Hugging Face Hub into the current directory.
model_id = "muhammadnoman76/cortex_q4"
gguf_filename = "unsloth.Q4_K_M.gguf"  # Replace with the correct filename
model_path = hf_hub_download(
    repo_id=model_id,
    filename=gguf_filename,
    local_dir=".",
    local_dir_use_symlinks=False,
)

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are an intelligent agent that analyzes user requests and breaks them down into structured components. Your task is to:
1. Identify the specific actions needed to complete the request
2. Determine which intent-based tools would be appropriate (selecting only from the available intent list)
3. Provide brief justifications for why each intent is relevant
4. Define the high-level goals the request aims to accomplish
5. Generate a concise instruction prompt summarizing how to fulfill the request

Available intents = ["schedule", "email", "sms", "whatsapp", "web_search", "parse_document", "visualize_data", "analyze_data", "analyze_image", "gen_code", "gen_image", "calculate", "execute_code", "academic_search", "finance_news", "translation", "url", "database", "social_media"]

Important notes:
- Provide only the intent category (e.g., "email"), not specific tool names
- If you identify a needed intent that isn't in the list above, include it with "(new)" notation
- Be concise but thorough in your analysis
- Focus on practical implementation rather than theoretical discussion

### Input:
{}

### Response:
"""

# Load the model from the file downloaded above, reusing the path returned
# by hf_hub_download instead of a hardcoded relative path.
llm = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_batch=512,
    verbose=False,
)


async def stream_llm_response(task_description: str):
    prompt = alpaca_prompt.format(task_description)
    # With stream=True, llama-cpp-python yields completion chunks as they
    # are generated (generation itself still runs synchronously).
    stream = llm(
        prompt,
        max_tokens=2048,
        stream=True,
    )
    for output in stream:
        yield output["choices"][0]["text"]
        # Yield control back to the event loop between tokens so the
        # response is flushed to the client incrementally.
        await asyncio.sleep(0)


@app.get("/stream")
async def stream_response(task: str = "make an agent which sends mail by searching the top 5 websites from Google"):
    return StreamingResponse(stream_llm_response(task), media_type="text/plain")


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
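
# A minimal client sketch for exercising the /stream endpoint, assuming the
# server above is running on localhost:8000. The `requests` dependency and
# the sample task string are illustrative, not part of this script:
#
#   import requests
#
#   with requests.get(
#       "http://localhost:8000/stream",
#       params={"task": "draft an email summarizing this week's finance news"},
#       stream=True,
#   ) as resp:
#       # iter_content with chunk_size=None surfaces tokens as the server
#       # flushes them, rather than buffering the whole response.
#       for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
#           print(chunk, end="", flush=True)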