from fastapi import FastAPI, Form
from llama_cpp import Llama

# Initialize the FastAPI app
app = FastAPI()

# Load the Llama model from the Hugging Face Hub
llm = Llama.from_pretrained(
    repo_id="HuggingFaceTB/SmolLM2-360M-Instruct-GGUF",
    filename="smollm2-360m-instruct-q8_0.gguf",  # GGUF file within the repo; adjust if you use a different quantization
)

# Endpoint to generate a response from the model based on user input
# (note: Form fields require the python-multipart package to be installed)
@app.post("/ask/")
async def ask_question(prompt: str = Form(...)):
    # Format the prompt as a chat message
    messages = [
        {"role": "user", "content": prompt}
    ]
    # Generate a response using Llama
    response = llm.create_chat_completion(messages=messages)
    response_content = response["choices"][0]["message"]["content"]
    return {"response": response_content}

# Endpoint to test a simple query (optional)
@app.get("/test/")
async def test():
    # Test the model with a simple question
    messages = [{"role": "user", "content": "What is the capital of France?"}]
    response = llm.create_chat_completion(messages=messages)
    response_content = response["choices"][0]["message"]["content"]
    return {"test_response": response_content}
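
# Usage sketch (not part of the app). Assuming this file is saved as main.py
# and uvicorn is installed, the server can be started with:
#
#   uvicorn main:app --host 127.0.0.1 --port 8000
#
# Once it is running, the /ask/ endpoint accepts the prompt as a form field,
# so a request might look like (the module name "main" and port 8000 are
# assumptions, adjust to your setup):
#
#   curl -X POST -F "prompt=What is a GGUF file?" http://127.0.0.1:8000/ask/
#
# and the /test/ endpoint can be checked with a plain GET:
#
#   curl http://127.0.0.1:8000/test/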