# Spaces: Running
from fastapi import FastAPI, HTTPException, Depends, Security, BackgroundTasks | |
from fastapi.security import APIKeyHeader | |
from fastapi.responses import StreamingResponse | |
from pydantic import BaseModel, Field | |
from typing import Literal, List, Dict | |
import os | |
from functools import lru_cache | |
from openai import OpenAI | |
from uuid import uuid4 | |
import tiktoken | |
import sqlite3 | |
import time | |
from datetime import datetime, timedelta | |
import asyncio | |
import requests | |
from prompts import CODING_ASSISTANT_PROMPT, NEWS_ASSISTANT_PROMPT, generate_news_prompt, SEARCH_ASSISTANT_PROMPT, generate_search_prompt | |
from fastapi_cache import FastAPICache | |
from fastapi_cache.backends.inmemory import InMemoryBackend | |
from fastapi_cache.decorator import cache | |
# Application object shared by the endpoints and startup logic in this module.
app = FastAPI()

# Name of the request header that carries the shared secret.
API_KEY_NAME = "X-API-Key"

# Expected secret, read from CHAT_AUTH_KEY; a fixed placeholder is used when
# the variable is unset.
# NOTE(review): the placeholder means an unconfigured deployment accepts a
# publicly known key — confirm this fallback is intentional.
API_KEY = os.getenv("CHAT_AUTH_KEY", "default_secret_key")

# Header-based security scheme consumed by verify_api_key.
# NOTE(review): auto_error=False presumably delivers a missing header as None
# instead of rejecting the request up front — confirm against FastAPI docs.
api_key_header = APIKeyHeader(auto_error=False, name=API_KEY_NAME)
# Closed set of model identifiers that request bodies may select.
ModelID = Literal[
    "meta-llama/llama-3-70b-instruct",
    "anthropic/claude-3.5-sonnet",
    "deepseek/deepseek-coder",
    "anthropic/claude-3-haiku",
    "openai/gpt-3.5-turbo-instruct",
    "qwen/qwen-72b-chat",
    "google/gemma-2-27b-it",
]
# Request body for the coding assistant endpoint.
# (Documented with comments rather than a docstring so the generated schema
# description is left untouched.)
class QueryModel(BaseModel):
    # Free-text question from the user; required.
    user_query: str = Field(..., description="User's coding query")
    # Which model answers the query; restricted to the ModelID literals.
    model_id: ModelID = Field(
        default="meta-llama/llama-3-70b-instruct",
        description="ID of the model to use for response generation",
    )
    # A fresh UUID string is generated when the client does not send one.
    conversation_id: str = Field(
        default_factory=lambda: str(uuid4()),
        description="Unique identifier for the conversation",
    )
    # Caller-supplied user identifier; required.
    user_id: str = Field(..., description="Unique identifier for the user")

    class Config:
        # Example payload attached to the model's schema.
        schema_extra = {
            "example": {
                "user_query": "How do I implement a binary search in Python?",
                "model_id": "meta-llama/llama-3-70b-instruct",
                "conversation_id": "123e4567-e89b-12d3-a456-426614174000",
                "user_id": "user123",
            }
        }
# Request body for the news assistant endpoint.
# (Documented with comments rather than a docstring so the generated schema
# description is left untouched.)
class NewsQueryModel(BaseModel):
    # Topic to look up; required.
    query: str = Field(..., description="News topic to search for")
    # Which model writes the summary; restricted to the ModelID literals.
    model_id: ModelID = Field(
        default="meta-llama/llama-3-70b-instruct",
        description="ID of the model to use for response generation",
    )

    class Config:
        # Example payload attached to the model's schema.
        schema_extra = {
            "example": {
                "query": "Latest developments in AI",
                "model_id": "meta-llama/llama-3-70b-instruct",
            }
        }
def get_api_keys():
    """Read the third-party credentials from the environment.

    Returns:
        A dict with two entries: "OPENROUTER_API_KEY" (the environment value
        prefixed with "sk-or-v1-") and "BRAVE_API_KEY" (the environment value
        unchanged).

    Raises:
        KeyError: if either environment variable is not set.
    """
    # Looked up in the same order as the returned keys so the first missing
    # variable is the one reported.
    openrouter_secret = os.environ["OPENROUTER_API_KEY"]
    brave_secret = os.environ["BRAVE_API_KEY"]
    return {
        "OPENROUTER_API_KEY": "sk-or-v1-" + openrouter_secret,
        "BRAVE_API_KEY": brave_secret,
    }
# Credentials are resolved once, when the module is imported.
api_keys = get_api_keys()

# OpenAI-compatible client pointed at the OpenRouter endpoint.
or_client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=api_keys["OPENROUTER_API_KEY"],
)
# In-memory storage for conversations, keyed by conversation id.
# Each value is the list of chat messages for that conversation.
conversations: Dict[str, List[Dict[str, str]]] = {}

# time.time() of the most recent request per conversation id; read by the
# cleanup loop to decide which conversations to drop.
last_activity: Dict[str, float] = {}

# Token encoding
# One tokenizer (the gpt-3.5-turbo encoding) is used for every size estimate,
# whichever model actually serves the request.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
def limit_tokens(input_string, token_limit=6000):
    """Return input_string cut down to its first token_limit tokens.

    The text is tokenized with the module-level encoding, the token list is
    sliced, and the kept tokens are decoded back into a string.
    """
    token_ids = encoding.encode(input_string)
    kept_ids = token_ids[:token_limit]
    return encoding.decode(kept_ids)
def calculate_tokens(msgs):
    """Return the total token count of msgs.

    Each element is converted with str() before encoding, so for dict
    messages the count covers the whole dict representation, not only the
    message content.
    """
    total = 0
    for message in msgs:
        total += len(encoding.encode(str(message)))
    return total
def chat_with_llama_stream(messages, model="gpt-3.5-turbo", max_llm_history=4, max_output_tokens=2500):
    """Stream a chat completion, yielding each text fragment as it arrives.

    Before the request is sent, the prompt is shrunk until its estimated size
    fits in ``8000 - max_output_tokens`` tokens: the first message is always
    kept and only the most recent ``max_llm_history`` messages follow it; the
    window is narrowed one message at a time while the prompt is still too
    large.

    Args:
        messages: Chat messages (dicts with "role" and "content"). Only the
            trimmed window is sent to the model, but the list itself is never
            shortened. After a successful stream the assistant reply is
            appended to this same list.
        model: Model identifier forwarded to the completion API.
        max_llm_history: Starting number of trailing messages to keep.
        max_output_tokens: Completion token cap; also reserved out of the
            8000-token budget.

    Yields:
        Text fragments of the reply, in arrival order.

    Raises:
        HTTPException: 400 when the prompt cannot be reduced to fit the
            budget; 500 when the completion request or the stream fails.
            Because this is a generator, both are raised while the caller is
            iterating, not when the function is called.
    """
    history = messages  # the caller's list; receives the assistant reply
    window = messages   # what is actually sent; may be rebound to a shorter copy
    prompt_budget = 8000 - max_output_tokens

    while calculate_tokens(window) > prompt_budget:
        # Trim only when doing so really shortens the window. The slice keeps
        # max_llm_history + 1 messages, so comparing against max_llm_history
        # alone would rebuild the same list forever.
        if max_llm_history >= 1 and len(window) > max_llm_history + 1:
            window = [window[0]] + window[-max_llm_history:]
            continue
        # Nothing left to drop at this width: narrow the window, or give up.
        max_llm_history -= 1
        if max_llm_history < 2:
            error_message = "Token limit exceeded. Please shorten your input or start a new conversation."
            raise HTTPException(status_code=400, detail=error_message)

    try:
        response = or_client.chat.completions.create(
            model=model,
            messages=window,
            max_tokens=max_output_tokens,
            stream=True,
        )
        pieces = []
        for chunk in response:
            # Skip chunks that carry no choices instead of failing on [0].
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            if content is not None:
                pieces.append(content)
                yield content
        # After streaming, add the full response to the conversation history.
        # Appending to the caller's list (not the trimmed copy) keeps the
        # reply available to later turns.
        history.append({"role": "assistant", "content": "".join(pieces)})
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error in model response: {str(e)}") from e
async def verify_api_key(api_key: str = Security(api_key_header)):
    """Dependency that checks the caller's API key against API_KEY.

    Args:
        api_key: Value extracted by the api_key_header security scheme; a
            None value (no key supplied) is rejected.

    Returns:
        The supplied key, unchanged, when it matches.

    Raises:
        HTTPException: 403 when the key is missing or does not match.
    """
    import secrets  # local import so this block needs no file-level change

    # Compare as bytes with a constant-time routine so response timing does
    # not reveal how much of the key was correct; encoding first also lets
    # non-ASCII input be compared instead of raising TypeError.
    if api_key is None or not secrets.compare_digest(
        api_key.encode("utf-8"), API_KEY.encode("utf-8")
    ):
        raise HTTPException(status_code=403, detail="Could not validate credentials")
    return api_key
# SQLite setup
# Absolute path of the database file used by init_db and update_db.
DB_PATH = "/app/data/conversations.db"
def init_db():
    """Create the database directory and the conversations table if missing.

    The table stores one row per exchange: an autoincrement id, user_id,
    conversation_id, message, response, and a timestamp that defaults to the
    insertion time.
    """
    os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute('''CREATE TABLE IF NOT EXISTS conversations
                     (id INTEGER PRIMARY KEY AUTOINCREMENT,
                     user_id TEXT,
                     conversation_id TEXT,
                     message TEXT,
                     response TEXT,
                     timestamp DATETIME DEFAULT CURRENT_TIMESTAMP)''')
        conn.commit()
    finally:
        # Close even when the statement fails so the handle is not leaked.
        conn.close()


# Run once at import time so the table exists before any request is served.
init_db()
def update_db(user_id, conversation_id, message, response):
    """Insert one user message / model response pair into the database.

    Args:
        user_id: Identifier of the user who sent the message.
        conversation_id: Identifier of the conversation the pair belongs to.
        message: The user's message text.
        response: The complete model reply.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        # Values are bound as parameters, never interpolated into the SQL.
        conn.execute(
            '''INSERT INTO conversations (user_id, conversation_id, message, response)
            VALUES (?, ?, ?, ?)''',
            (user_id, conversation_id, message, response),
        )
        conn.commit()
    finally:
        # Close even when the insert or commit fails so the handle is not leaked.
        conn.close()
async def clear_inactive_conversations():
    """Endlessly evict conversations that have been idle for over 30 minutes.

    On each pass, every conversation id whose last_activity timestamp is more
    than 1800 seconds old is removed from both conversations and
    last_activity; the coroutine then sleeps for 60 seconds and repeats.
    """
    while True:
        now = time.time()
        # Collect the ids first so the dict is not modified while iterated.
        stale_ids = [
            conv_id
            for conv_id, last_seen in last_activity.items()
            if now - last_seen > 1800  # 30 minutes
        ]
        for conv_id in stale_ids:
            conversations.pop(conv_id, None)
            last_activity.pop(conv_id, None)
        await asyncio.sleep(60)  # Check every minute
# Strong references to background tasks started by startup_event. The event
# loop itself only holds weak references, so an unreferenced task may be
# garbage-collected before it finishes.
_startup_tasks = set()


async def startup_event():
    """Initialise the response cache and start the conversation cleanup loop.

    NOTE(review): no startup-hook registration is visible on this function in
    this view — confirm it is registered with the app.
    """
    FastAPICache.init(InMemoryBackend(), prefix="fastapi-cache")
    cleanup_task = asyncio.create_task(clear_inactive_conversations())
    # Keep the task alive for as long as it runs; drop the reference once done.
    _startup_tasks.add(cleanup_task)
    cleanup_task.add_done_callback(_startup_tasks.discard)
async def coding_assistant(query: QueryModel, background_tasks: BackgroundTasks, api_key: str = Depends(verify_api_key)):
    """
    Coding assistant endpoint that provides programming help based on user queries.
    Available models:
    - meta-llama/llama-3-70b-instruct (default)
    - anthropic/claude-3.5-sonnet
    - deepseek/deepseek-coder
    - anthropic/claude-3-haiku
    - openai/gpt-3.5-turbo-instruct
    - qwen/qwen-72b-chat
    - google/gemma-2-27b-it
    Requires API Key authentication via X-API-Key header.
    """
    # NOTE(review): no route registration is visible on this function in this
    # view — confirm it is attached to the app.
    conv_id = query.conversation_id

    # An unseen conversation id starts with just the system prompt.
    history = conversations.setdefault(
        conv_id,
        [
            {"role": "system", "content": "You are a helpful assistant proficient in coding tasks. Help the user in understanding and writing code."}
        ],
    )
    history.append({"role": "user", "content": query.user_query})
    last_activity[conv_id] = time.time()

    def process_response():
        # Relay every fragment to the client while collecting the full reply.
        pieces = []
        for content in chat_with_llama_stream(history, model=query.model_id):
            pieces.append(content)
            yield content
        # Queued only after the stream has been fully consumed, so the stored
        # row holds the complete reply.
        background_tasks.add_task(update_db, query.user_id, conv_id, query.user_query, "".join(pieces))

    return StreamingResponse(process_response(), media_type="text/event-stream")
# New functions for news assistant | |
def internet_search(query, type="web", num_results=20):
    """Query the Brave Search API and return simplified results.

    Args:
        query: Search text, sent as the ``q`` parameter.
        type: "web" selects the web search endpoint; any other value selects
            the news endpoint. (The name shadows the builtin but is kept
            because callers pass it by keyword.)
        num_results: Maximum number of results to return.

    Returns:
        A list of dicts with "title", "snippet" (the first extra snippet) and
        "last_updated" (the item's "age", or "" when absent). Items without
        extra snippets are skipped. An empty list is returned when the
        request fails, times out, answers with a non-200 status, or the body
        is not valid JSON.
    """
    is_web = type == "web"
    if is_web:
        url = "https://api.search.brave.com/res/v1/web/search"
    else:
        url = "https://api.search.brave.com/res/v1/news/search"
    headers = {
        "Accept": "application/json",
        "Accept-Encoding": "gzip",
        "X-Subscription-Token": api_keys["BRAVE_API_KEY"],
    }
    params = {"q": query}

    try:
        # A timeout keeps a stalled upstream from hanging the request forever.
        response = requests.get(url, headers=headers, params=params, timeout=10)
    except requests.RequestException:
        # Transport failures are reported the same way as a non-200 status.
        return []
    if response.status_code != 200:
        return []

    try:
        payload = response.json()
    except ValueError:
        return []

    # Tolerate responses that omit the results section entirely.
    if is_web:
        search_data = payload.get("web", {}).get("results", [])
    else:
        search_data = payload.get("results", [])

    processed_results = []
    for item in search_data:
        snippets = item.get("extra_snippets")
        if not snippets:
            continue
        processed_results.append(
            {
                "title": item.get("title", ""),
                "snippet": snippets[0],
                "last_updated": item.get("age", ""),
            }
        )
    return processed_results[:num_results]
def cached_internet_search(query: str):
    """Run a news search for query and return internet_search's result.

    NOTE(review): despite the name, no caching decorator is visible on this
    function in this view — confirm whether one is meant to be applied.
    """
    news_results = internet_search(query, type="news")
    return news_results
def analyze_news(query):
    """Build the chat messages for summarising news about query.

    Args:
        query: News topic to search for.

    Returns:
        A two-element list (system prompt, then a user prompt generated from
        the search results), or an empty list when the search returned
        nothing. The failure value is deliberately falsy so that a caller's
        ``if not messages`` check detects it.
    """
    news_data = cached_internet_search(query)
    if not news_data:
        return []

    # Prepare the prompt for the AI from the query and the fetched articles.
    prompt = generate_news_prompt(query, news_data)
    return [
        {"role": "system", "content": NEWS_ASSISTANT_PROMPT},
        {"role": "user", "content": prompt},
    ]
async def news_assistant(query: NewsQueryModel, api_key: str = Depends(verify_api_key)):
    """
    News assistant endpoint that provides summaries and analysis of recent news based on user queries.
    Requires API Key authentication via X-API-Key header.
    """
    # NOTE(review): no route registration is visible on this function in this
    # view — confirm it is attached to the app.
    messages = analyze_news(query.query)
    # Only a non-empty message list is usable. A bare truthiness test is not
    # enough: a non-empty tuple or string would pass it and then be sent to
    # the model as if it were the message list.
    if not isinstance(messages, list) or not messages:
        raise HTTPException(status_code=500, detail="Failed to fetch news data")

    def process_response():
        # Relay the model's fragments to the client unchanged.
        yield from chat_with_llama_stream(messages, model=query.model_id)

    return StreamingResponse(process_response(), media_type="text/event-stream")
if __name__ == "__main__":
    # Direct execution: serve the app on every interface, port 7860.
    import uvicorn

    uvicorn.run(app, port=7860, host="0.0.0.0")