Update app.py
app.py
CHANGED
@@ -2,7 +2,7 @@ from llama_cpp import Llama
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import re
 import uvicorn
-from fastapi import FastAPI
+from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 import os
 from dotenv import load_dotenv
@@ -16,9 +16,7 @@ HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
 
 global_data = {'models': {}, 'tokens': {'eos': 'eos_token', 'pad': 'pad_token', 'padding': 'padding_token', 'unk': 'unk_token', 'bos': 'bos_token', 'sep': 'sep_token', 'cls': 'cls_token', 'mask': 'mask_token'}}
 
-model_configs = [
-    {"repo_id": "Hjgugugjhuhjggg/mergekit-ties-tzamfyy-Q2_K-GGUF", "filename": "mergekit-ties-tzamfyy-q2_k.gguf", "name": "my_model"}
-]
+model_configs = [{"repo_id": "Hjgugugjhuhjggg/mergekit-ties-tzamfyy-Q2_K-GGUF", "filename": "mergekit-ties-tzamfyy-q2_k.gguf", "name": "my_model"}]
 
 models = {}
 
@@ -40,7 +38,6 @@ for config in model_configs:
         print(f"Failed to load model {config['name']}. Exiting.")
         exit(1)
 
-
 class ChatRequest(BaseModel):
     message: str
 
@@ -62,7 +59,7 @@ def generate_model_response(model, inputs):
     try:
         if model is None:
             return "Model loading failed."
-        response = model(inputs)
+        response = model(inputs, max_tokens=-1)
         return remove_duplicates(response['choices'][0]['text'])
     except Exception as e:
         print(f"Error generating response: {e}")
@@ -81,20 +78,25 @@ app.add_middleware(
 @app.post("/generate")
 async def generate(request: ChatRequest):
     inputs = normalize_input(request.message)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    chunk_size = 500
+    chunks = [inputs[i:i + chunk_size] for i in range(0, len(inputs), chunk_size)]
+    overall_response = ""
+    for chunk in chunks:
+        with ThreadPoolExecutor() as executor:
+            futures = [executor.submit(generate_model_response, model, chunk) for model in models.values()]
+            responses = [{'model': model_name, 'response': future.result()} for model_name, future in zip(models.keys(), as_completed(futures))]
+
+        unique_responses = {}
+        for response in responses:
+            if response['model'] not in unique_responses and response['response']:
+                unique_responses[response['model']] = response['response']
+
+        chunk_response = ""
+        for model, response in unique_responses.items():
+            chunk_response += f"**{model}:**\n{response}\n\n"
+        overall_response += chunk_response
+
+    return {"response": overall_response}
 
 async def process_message(message, history):
     try:
@@ -106,12 +108,10 @@ async def process_message(message, history):
     except requests.exceptions.RequestException as e:
         return history, f"Error communicating with the backend: {e}"
 
+
 iface = gr.Interface(
     fn=process_message,
-    inputs=[
-        gr.Textbox(lines=2, placeholder="Enter your message here..."),
-        gr.State([])
-    ],
+    inputs=[gr.Textbox(lines=2, placeholder="Enter your message here..."), gr.State([])],
     outputs=[gr.Chatbot(), gr.State([])],
     title="Multi-Model LLM API",
     description="Enter a message and get responses from multiple LLMs.",