from openai import OpenAI
from config import get_api_keys
import logging
import tiktoken
import time
import asyncio

logger = logging.getLogger(__name__)

api_keys = get_api_keys()
# OpenAI SDK client pointed at the OpenRouter endpoint
or_client = OpenAI(api_key=api_keys["OPENROUTER_API_KEY"], base_url="https://openrouter.ai/api/v1")

# Token encoding
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
def limit_tokens(input_string, token_limit=6000):
    return encoding.decode(encoding.encode(input_string)[:token_limit])

def calculate_tokens(msgs):
    return sum(len(encoding.encode(str(m))) for m in msgs)
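# Illustrative note (this snippet is not used elsewhere in the module): limit_tokens
# truncates by token count rather than by characters, and calculate_tokens stringifies
# each message dict, so it slightly overestimates the exact chat-format token count.
#
#   snippet = limit_tokens(long_document_text, token_limit=500)
#   budget_used = calculate_tokens([{"role": "user", "content": snippet}])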
# In-memory storage for conversations
conversations = {}
last_activity = {}

async def clear_inactive_conversations():
    while True:
        current_time = time.time()
        inactive_convos = [conv_id for conv_id, last_time in last_activity.items()
                           if current_time - last_time > 3600 * 24]  # inactive for 24 hours
        for conv_id in inactive_convos:
            if conv_id in conversations:
                del conversations[conv_id]
            if conv_id in last_activity:
                del last_activity[conv_id]
        logger.info(f"Cleared {len(inactive_convos)} inactive conversations")
        await asyncio.sleep(600)  # Check every 10 minutes
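# Scheduling sketch (an assumption -- the app entry point is not part of this module):
# the cleanup loop only runs if something awaits it, e.g. created as a background task
# once an event loop exists, typically from an async startup hook:
#
#   cleanup_task = asyncio.create_task(clear_inactive_conversations())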
def chat_with_llama_stream(messages, model="openai/gpt-4o-mini", max_llm_history=4, max_output_tokens=2500):
    logger.info(f"Starting chat with model: {model}")

    # Trim history until the prompt fits the context budget (8000 tokens minus the
    # room reserved for the reply), keeping the first (system) message plus the most
    # recent max_llm_history messages.
    while calculate_tokens(messages) > (8000 - max_output_tokens):
        if len(messages) > max_llm_history + 1:
            messages = [messages[0]] + messages[-max_llm_history:]
        else:
            # Already trimmed to the window; shrink the window instead so the loop
            # cannot repeat the same trim forever.
            max_llm_history -= 1
            if max_llm_history < 2:
                error_message = "Token limit exceeded. Please shorten your input or start a new conversation."
                logger.error(error_message)
                raise Exception(error_message)
    try:
        response = or_client.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=max_output_tokens,
            stream=True
        )

        full_response = ""
        for chunk in response:
            if chunk.choices[0].delta.content is not None:
                content = chunk.choices[0].delta.content
                full_response += content
                yield content

        # After streaming, add the full response to the conversation history
        messages.append({"role": "assistant", "content": full_response})
        logger.info("Chat completed successfully")
    except Exception as e:
        logger.error(f"Error in model response: {str(e)}")
        raise Exception(f"Error in model response: {str(e)}")
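
# Minimal local smoke test (a sketch, not part of the Space's request flow; it assumes
# get_api_keys() resolves a valid OPENROUTER_API_KEY when this module is run directly).
if __name__ == "__main__":
    demo_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Say hello in one short sentence."},
    ]
    # chat_with_llama_stream is a generator: iterate it to receive text deltas; it also
    # appends the completed assistant reply to demo_messages once streaming finishes.
    for piece in chat_with_llama_stream(demo_messages):
        print(piece, end="", flush=True)
    print()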