import asyncio
import logging
import time

import tiktoken
from openai import OpenAI

from config import get_api_keys

logger = logging.getLogger(__name__)
api_keys = get_api_keys()

# OpenRouter exposes an OpenAI-compatible API, so the standard OpenAI client is pointed at it
or_client = OpenAI(api_key=api_keys["OPENROUTER_API_KEY"], base_url="https://openrouter.ai/api/v1")


# Tokenizer used to estimate prompt sizes (the gpt-3.5-turbo encoding is used as an approximation)
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

def limit_tokens(input_string, token_limit=6000):
    """Truncate a string to at most token_limit tokens."""
    return encoding.decode(encoding.encode(input_string)[:token_limit])

def calculate_tokens(msgs):
    """Estimate the total token count of a list of chat messages."""
    return sum(len(encoding.encode(str(m))) for m in msgs)
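
# Illustrative use of the two helpers (the names below are placeholders, not part of this module):
#   safe_prompt = limit_tokens(raw_user_input, token_limit=6000)
#   prompt_tokens = calculate_tokens([{"role": "user", "content": safe_prompt}])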

# In-memory storage for conversations
conversations = {}
last_activity = {}

async def clear_inactive_conversations():
    """Background task: drop conversations that have been idle for more than 24 hours."""
    while True:
        current_time = time.time()
        inactive_convos = [conv_id for conv_id, last_time in last_activity.items()
                           if current_time - last_time > 3600 * 24]  # 24 hours
        for conv_id in inactive_convos:
            if conv_id in conversations:
                del conversations[conv_id]
            if conv_id in last_activity:
                del last_activity[conv_id]
        logger.info(f"Cleared {len(inactive_convos)} inactive conversations")
        await asyncio.sleep(600)  # Check every 10 minutes
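
# Illustrative scheduling sketch (assumption: the host application runs an asyncio event loop,
# e.g. an async web framework). clear_inactive_conversations is meant to be started once as a
# background task, for example from a startup hook:
#
#   async def on_startup():                              # hypothetical hook name
#       asyncio.create_task(clear_inactive_conversations())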


def chat_with_llama_stream(messages, model="openai/gpt-4o-mini", max_llm_history=4, max_output_tokens=2500):
    """Stream a chat completion, trimming history so the prompt fits an 8,000-token budget."""
    logger.info(f"Starting chat with model: {model}")
    # Keep the first (system) message plus the most recent turns; if the messages still exceed
    # the budget left after reserving output tokens, shrink the history window until it fits
    # or becomes too small to be useful.
    while calculate_tokens(messages) > (8000 - max_output_tokens):
        if len(messages) > max_llm_history + 1:
            messages = [messages[0]] + messages[-max_llm_history:]
        else:
            max_llm_history -= 1
            if max_llm_history < 2:
                error_message = "Token limit exceeded. Please shorten your input or start a new conversation."
                logger.error(error_message)
                raise Exception(error_message)

    try:
        response = or_client.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=max_output_tokens,
            stream=True
        )
        
        full_response = ""
        for chunk in response:
            if chunk.choices[0].delta.content is not None:
                content = chunk.choices[0].delta.content
                full_response += content
                yield content
        
        # After streaming, add the full response to the conversation history
        messages.append({"role": "assistant", "content": full_response})
        logger.info("Chat completed successfully")
    except Exception as e:
        logger.error(f"Error in model response: {str(e)}")
        raise Exception(f"Error in model response: {str(e)}")
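

# --- Illustrative usage (assumptions: a valid OPENROUTER_API_KEY from config and network access;
# the prompt below is a placeholder, not part of the original module) ---
if __name__ == "__main__":
    demo_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Say hello in one short sentence."},
    ]
    # Consume the generator chunk by chunk, printing the streamed tokens as they arrive
    for chunk_text in chat_with_llama_stream(demo_messages):
        print(chunk_text, end="", flush=True)
    print()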