# Monkey patch: route translation_agent's get_completion through llama-index LLMs.
import os
import time
import gradio as gr
from functools import wraps
from threading import Lock
from typing import Optional
import src.translation_agent.utils as utils

from llama_index.llms.groq import Groq
from llama_index.llms.cohere import Cohere
from llama_index.llms.openai import OpenAI
from llama_index.llms.together import TogetherLLM
from llama_index.llms.ollama import Ollama
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI

from llama_index.core import Settings
from llama_index.core.llms import ChatMessage

RPM = 60  # default requests-per-minute cap; model_load() overwrites this

# Add your LLMs here
def model_load(
        endpoint: str,
        model: str,
        api_key: Optional[str] = None,
        context_window: int = 4096,
        num_output: int = 512,
        rpm: int = RPM,
):
    if endpoint == "Groq":
        llm = Groq(
            model=model,
            api_key=api_key if api_key else os.getenv("GROQ_API_KEY"),
        )
    elif endpoint == "Cohere":
        llm = Cohere(
            model=model,
            api_key=api_key if api_key else os.getenv("COHERE_API_KEY"),
        )
    elif endpoint == "OpenAI":
        llm = OpenAI(
            model=model,
            api_key=api_key if api_key else os.getenv("OPENAI_API_KEY"),
        )
    elif endpoint == "TogetherAI":
        llm = TogetherLLM(
            model=model,
            api_key=api_key if api_key else os.getenv("TOGETHER_API_KEY"),
        )
    elif endpoint == "Ollama":
        llm = Ollama(
            model=model,
            request_timeout=120.0)
    elif endpoint == "Huggingface":
        llm = HuggingFaceInferenceAPI(
            model_name=model,
            token=api_key if api_key else os.getenv("HF_TOKEN"),
            task="text-generation",
        )

    # Update the module-level rate limit so the rate_limit decorator picks it up.
    global RPM
    RPM = rpm

    Settings.llm = llm
    # maximum input size to the LLM
    Settings.context_window = context_window

    # number of tokens reserved for text generation.
    Settings.num_output = num_output
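
# A minimal usage sketch (hypothetical model names; any endpoint handled
# above works the same way):
#
#     model_load(endpoint="Groq", model="llama3-70b-8192", rpm=30)
#     model_load(endpoint="Ollama", model="llama3", context_window=8192)
#
# After the call, Settings.llm holds the selected client and RPM caps how
# often get_completion() below may run.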

def rate_limit(get_max_per_minute):
    """Throttle the wrapped function to at most get_max_per_minute() calls
    per minute; the limit is read lazily on every call, so updating RPM at
    runtime takes effect immediately."""
    def decorator(func):
        lock = Lock()        # serializes callers across threads
        last_called = [0.0]  # timestamp of the previous call

        @wraps(func)
        def wrapper(*args, **kwargs):
            with lock:
                max_per_minute = get_max_per_minute()
                min_interval = 60.0 / max_per_minute
                elapsed = time.time() - last_called[0]
                left_to_wait = min_interval - elapsed

                # Sleep just long enough to keep calls min_interval apart.
                if left_to_wait > 0:
                    time.sleep(left_to_wait)

                ret = func(*args, **kwargs)
                last_called[0] = time.time()
                return ret
        return wrapper
    return decorator

@rate_limit(lambda: RPM)
def get_completion(
    prompt: str,
    system_message: str = "You are a helpful assistant.",
    temperature: float = 0.3,
    json_mode: bool = False,
) -> str:
    """Generate a completion using the LLM configured in Settings.llm.

    Args:
        prompt (str): The user's prompt or query.
        system_message (str, optional): The system message that sets the
            context for the assistant. Defaults to "You are a helpful assistant.".
        temperature (float, optional): Sampling temperature controlling the
            randomness of the generated text. Defaults to 0.3.
        json_mode (bool, optional): Whether to ask the model to respond in
            JSON format. Defaults to False.

    Returns:
        str: The content of the generated message. Even with json_mode=True
            the JSON arrives as a string; parse it with json.loads() if you
            need a dict.
    """
    llm = Settings.llm
    if llm.class_name() == "HuggingFaceInferenceAPI":
        # The HF Inference API wrapper takes the system prompt as an
        # attribute rather than as a chat message.
        llm.system_prompt = system_message
        messages = [
            ChatMessage(role="user", content=prompt),
        ]
        try:
            response = llm.chat(
                messages=messages,
                temperature=temperature,
            )
            return response.message.content
        except Exception as e:
            raise gr.Error(f"An unexpected error occurred: {e}")
    else:
        messages = [
            ChatMessage(role="system", content=system_message),
            ChatMessage(role="user", content=prompt),
        ]

        try:
            if json_mode:
                response = llm.chat(
                    temperature=temperature,
                    response_format={"type": "json_object"},
                    messages=messages,
                )
            else:
                response = llm.chat(
                    temperature=temperature,
                    messages=messages,
                )
            return response.message.content
        except Exception as e:
            raise gr.Error(f"An unexpected error occurred: {e}")

# Patch translation_agent so every helper below uses the llama-index-backed
# completion above instead of the stock OpenAI client.
utils.get_completion = get_completion

# Re-export the (now patched) helpers for the Gradio app.
one_chunk_initial_translation = utils.one_chunk_initial_translation
one_chunk_reflect_on_translation = utils.one_chunk_reflect_on_translation
one_chunk_improve_translation = utils.one_chunk_improve_translation
one_chunk_translate_text = utils.one_chunk_translate_text
num_tokens_in_string = utils.num_tokens_in_string
multichunk_initial_translation = utils.multichunk_initial_translation
multichunk_reflect_on_translation = utils.multichunk_reflect_on_translation
multichunk_improve_translation = utils.multichunk_improve_translation
multichunk_translation = utils.multichunk_translation
calculate_chunk_size = utils.calculate_chunk_size
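
# A minimal end-to-end sketch, assuming GROQ_API_KEY is set and using a
# hypothetical model name; the helpers re-exported above keep their upstream
# signatures from src.translation_agent.utils.
if __name__ == "__main__":
    model_load(endpoint="Groq", model="llama3-70b-8192", rpm=30)
    source_text = "The quick brown fox jumps over the lazy dog."
    print(f"~{num_tokens_in_string(source_text)} tokens in source text")
    print(get_completion(f"Translate to French: {source_text}"))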