Hjgugugjhuhjggg committed
Commit 8806695 · verified · 1 Parent(s): 5a6f7e7

Update app.py

Files changed (1)
  1. app.py +24 -24
app.py CHANGED
@@ -2,7 +2,7 @@ from llama_cpp import Llama
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import re
 import uvicorn
-from fastapi import FastAPI, HTTPException
+from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 import os
 from dotenv import load_dotenv
@@ -16,9 +16,7 @@ HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
 
 global_data = {'models': {}, 'tokens': {'eos': 'eos_token', 'pad': 'pad_token', 'padding': 'padding_token', 'unk': 'unk_token', 'bos': 'bos_token', 'sep': 'sep_token', 'cls': 'cls_token', 'mask': 'mask_token'}}
 
-model_configs = [
-    {"repo_id": "Hjgugugjhuhjggg/mergekit-ties-tzamfyy-Q2_K-GGUF", "filename": "mergekit-ties-tzamfyy-q2_k.gguf", "name": "my_model"}
-]
+model_configs = [{"repo_id": "Hjgugugjhuhjggg/mergekit-ties-tzamfyy-Q2_K-GGUF", "filename": "mergekit-ties-tzamfyy-q2_k.gguf", "name": "my_model"}]
 
 models = {}
 
@@ -40,7 +38,6 @@ for config in model_configs:
         print(f"Failed to load model {config['name']}. Exiting.")
         exit(1)
 
-
 class ChatRequest(BaseModel):
     message: str
 
@@ -62,7 +59,7 @@ def generate_model_response(model, inputs):
     try:
         if model is None:
             return "Model loading failed."
-        response = model(inputs)
+        response = model(inputs, max_tokens=-1)
        return remove_duplicates(response['choices'][0]['text'])
     except Exception as e:
         print(f"Error generating response: {e}")
@@ -81,20 +78,25 @@ app.add_middleware(
 @app.post("/generate")
 async def generate(request: ChatRequest):
     inputs = normalize_input(request.message)
-    with ThreadPoolExecutor() as executor:
-        futures = [executor.submit(generate_model_response, model, inputs) for model in models.values()]
-        responses = [{'model': model_name, 'response': future.result()} for model_name, future in zip(models.keys(), as_completed(futures))]
-
-    unique_responses = {}
-    for response in responses:
-        if response['model'] not in unique_responses and response['response']:
-            unique_responses[response['model']] = response['response']
-
-    formatted_response = ""
-    for model, response in unique_responses.items():
-        formatted_response += f"**{model}:**\n{response}\n\n"
-
-    return {"response": formatted_response}
+    chunk_size = 500
+    chunks = [inputs[i:i + chunk_size] for i in range(0, len(inputs), chunk_size)]
+    overall_response = ""
+    for chunk in chunks:
+        with ThreadPoolExecutor() as executor:
+            futures = [executor.submit(generate_model_response, model, chunk) for model in models.values()]
+            responses = [{'model': model_name, 'response': future.result()} for model_name, future in zip(models.keys(), as_completed(futures))]
+
+        unique_responses = {}
+        for response in responses:
+            if response['model'] not in unique_responses and response['response']:
+                unique_responses[response['model']] = response['response']
+
+        chunk_response = ""
+        for model, response in unique_responses.items():
+            chunk_response += f"**{model}:**\n{response}\n\n"
+        overall_response += chunk_response
+
+    return {"response": overall_response}
 
 async def process_message(message, history):
     try:
@@ -106,12 +108,10 @@ async def process_message(message, history):
     except requests.exceptions.RequestException as e:
         return history, f"Error communicating with the backend: {e}"
 
+
 iface = gr.Interface(
     fn=process_message,
-    inputs=[
-        gr.Textbox(lines=2, placeholder="Enter your message here..."),
-        gr.State([])
-    ],
+    inputs=[gr.Textbox(lines=2, placeholder="Enter your message here..."), gr.State([])],
     outputs=[gr.Chatbot(), gr.State([])],
     title="Multi-Model LLM API",
     description="Enter a message and get responses from multiple LLMs.",
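The two behavioral changes in this commit are the fixed 500-character chunking of the normalized input before it is fanned out to the loaded models, and the max_tokens=-1 passed to the llama_cpp call, which per llama-cpp-python's handling of non-positive max_tokens should let generation run until EOS or the context limit rather than the small default. Below is a minimal sketch of the chunking step plus a hypothetical client call against the /generate endpoint; the helper names, host, and port are illustrative assumptions and do not come from the diff.

import requests

# Same fixed-size slicing the updated /generate handler performs before
# dispatching each chunk to every loaded model (illustrative helper, not from app.py).
def split_into_chunks(text: str, chunk_size: int = 500) -> list[str]:
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

assert "".join(split_into_chunks("x" * 1234)) == "x" * 1234                # chunks re-join losslessly
assert [len(c) for c in split_into_chunks("x" * 1234)] == [500, 500, 234]  # fixed 500-char windows

# Hypothetical client call; assumes the FastAPI app is reachable at this URL.
def query_backend(message: str, url: str = "http://localhost:8000/generate") -> str:
    resp = requests.post(url, json={"message": message}, timeout=600)  # body matches ChatRequest
    resp.raise_for_status()
    return resp.json()["response"]  # handler returns {"response": overall_response}

if __name__ == "__main__":
    print(query_backend("Summarize what chunked prompting changes here."))

Because the handler appends every chunk's per-model answers to overall_response, a long input yields one combined markdown-style string rather than separate replies per chunk.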