from fastapi import FastAPI
from pydantic import BaseModel
from typing import Union, Dict, List, Any
import json
import requests


class LLM:
    """Minimal client for the DeepInfra OpenAI-compatible chat completions endpoint."""

    def __init__(self, model: str, system_message: str = "You are a Helpful AI."):
        self.model = model
        self.conversation_history = [{"role": "system", "content": system_message}]

    def chat(self, messages: List[Dict[str, str]]) -> Union[str, None]:
        url = "https://api.deepinfra.com/v1/openai/chat/completions"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
            'Accept-Language': 'en,fr-FR;q=0.9,fr;q=0.8,es-ES;q=0.7,es;q=0.6,en-US;q=0.5,am;q=0.4,de;q=0.3',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Content-Type': 'application/json',
            'Origin': 'https://deepinfra.com',
            'Pragma': 'no-cache',
            'Referer': 'https://deepinfra.com/',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-site',
            'X-Deepinfra-Source': 'web-embed',
            'accept': 'text/event-stream',
            'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"macOS"',
        }
        data = json.dumps(
            {
                'model': self.model,
                # Prepend the stored system message so the system prompt is actually sent upstream.
                'messages': self.conversation_history + messages,
                'temperature': 0.7,
                'max_tokens': 8028,
                'stop': [],
                'stream': False,  # don't change it
            },
            separators=(',', ':'),
        )
        try:
            result = requests.post(url=url, data=data, headers=headers)
            return result.json()['choices'][0]['message']['content']
        except Exception:
            return None


app = FastAPI()


class Model(BaseModel):
    id: str
    object: str
    created: int
    owned_by: str


class Message(BaseModel):
    role: str
    content: str


class CompletionRequest(BaseModel):
    model: str
    messages: List[Message]


class CompletionResponse(BaseModel):
    id: str
    object: str
    created: int
    model: str
    choices: List[Dict[str, Any]]
    usage: Dict[str, int]


models = [
    {"id": "meta-llama/Meta-Llama-3-70B-Instruct", "object": "model", "created": 1686935002, "owned_by": "meta"},
    {"id": "google/gemma-2-27b-it", "object": "model", "created": 1686935002, "owned_by": "google"},
    {"id": "google/gemma-2-9b-it", "object": "model", "created": 1686935002, "owned_by": "ConsiousAI"},
    {"id": "cognitivecomputations/dolphin-2.9.1-llama-3-70b", "object": "model", "created": 1686935002, "owned_by": "cognitivecomputations"},
    {"id": "nvidia/Nemotron-4-340B-Instruct", "object": "model", "created": 1686935002, "owned_by": "nvidia"},
    {"id": "Qwen/Qwen2-72B-Instruct", "object": "model", "created": 1686935002, "owned_by": "qwen"},
    {"id": "openchat/openchat-3.6-8b", "object": "model", "created": 1686935002, "owned_by": "unknown"},
    {"id": "mistralai/Mistral-7B-Instruct-v0.3", "object": "model", "created": 1686935002, "owned_by": "mistral"},
    {"id": "meta-llama/Meta-Llama-3-8B-Instruct", "object": "model", "created": 1686935002, "owned_by": "meta"},
    {"id": "mistralai/Mixtral-8x22B-Instruct-v0.1", "object": "model", "created": 1686935002, "owned_by": "mistral"},
    {"id": "mistralai/Mixtral-8x7B-Instruct-v0.1", "object": "model", "created": 1686935002, "owned_by": "mistral"},
    {"id": "Qwen/Qwen2-7B-Instruct", "object": "model", "created": 1686935002, "owned_by": "Qwen"},
    {"id": "meta-llama/Meta-Llama-3.1-405B-Instruct", "object": "model", "created": 1686935002, "owned_by": "meta"},
]


@app.post("/v1/chat/completions")
def handle_completions(completion_request: CompletionRequest):
    # Pull the first system and user messages out of the incoming request.
    system_prompt = next((message.content for message in completion_request.messages if message.role == 'system'), None)
    user_query = next((message.content for message in completion_request.messages if message.role == 'user'), None)
    response_text = generative(system_prompt=system_prompt, query=user_query, model=completion_request.model) or ""
    # Token counts are rough whitespace-based estimates, not real tokenizer counts.
    prompt_tokens = sum(len(message.content.split()) for message in completion_request.messages)
    response = CompletionResponse(
        id="chatcmpl-1",
        object="chat.completion",
        created=1234567890,
        model=completion_request.model,
        choices=[{"index": 0, "message": {"role": "assistant", "content": response_text}, "finish_reason": "stop"}],
        usage={
            "prompt_tokens": prompt_tokens,
            "total_tokens": prompt_tokens + len(response_text.split()),
        },
    )
    return response


@app.get("/v1/models")
def get_models():
    return {"object": "list", "data": models}


@app.post("/v1/completions")
def create_completion(prompt: str, model: str, best_of: int = 1, echo: bool = False, frequency_penalty: float = 0.0):
    # Legacy text-completion endpoint: route the prompt through the chat backend.
    response_text = generative(system_prompt="You are a helpful assistant.", query=prompt, model=model) or ""
    response = {
        "id": "cmpl-uqkvlQyYK7bGYrRHQ0eXlWi7",
        "object": "text_completion",
        "created": 1589478378,
        "model": model,
        "system_fingerprint": "fp_44709d6fcb",
        "choices": [{"text": response_text, "index": 0, "logprobs": None, "finish_reason": "length"}],
    }
    return response


def generative(system_prompt, query, model):
    """Run a single-turn chat through the DeepInfra client and return the reply text (or None on failure)."""
    llm = LLM(model=model, system_message=system_prompt or "You are a Helpful AI.")
    messages = [{"role": "user", "content": query}]
    return llm.chat(messages)


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
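
# --- Example client usage (a minimal sketch; not executed by the server) ---
# Assumes the server above is running locally on port 8000; the model id below is only an
# illustration and should be one of the entries exposed by GET /v1/models.
# Because /v1/chat/completions mirrors the OpenAI chat schema, a plain requests call works:
#
#   import requests
#   payload = {
#       "model": "meta-llama/Meta-Llama-3-8B-Instruct",
#       "messages": [
#           {"role": "system", "content": "You are a helpful assistant."},
#           {"role": "user", "content": "Say hello in one sentence."},
#       ],
#   }
#   r = requests.post("http://localhost:8000/v1/chat/completions", json=payload)
#   print(r.json()["choices"][0]["message"]["content"])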