Hjgugugjhuhjggg committed on
Commit • 0a3c752
1 Parent(s): 50c545e
Update app.py
app.py
CHANGED
@@ -8,6 +8,8 @@ from dotenv import load_dotenv
 from pydantic import BaseModel
 import requests
 import traceback
+from sentence_transformers import SentenceTransformer, util
+from gptcache import Cache, get_cache
 
 load_dotenv()
 HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
@@ -27,7 +29,9 @@ global_data = {
 }
 
 model_configs = [
-    {"repo_id": "Hjgugugjhuhjggg/mergekit-ties-tzamfyy-Q2_K-GGUF", "filename": "mergekit-ties-tzamfyy-q2_k.gguf", "name": "my_model"}
+    {"repo_id": "Hjgugugjhuhjggg/mergekit-ties-tzamfyy-Q2_K-GGUF", "filename": "mergekit-ties-tzamfyy-q2_k.gguf", "name": "my_model"},
+    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-q2_k.gguf", "name": "meta-llama-3.1-70b"},
+    {"repo_id": "Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF", "filename": "gemma-2-27b-q2_k.gguf", "name": "gemma-2-27b"}
 ]
 
 models = {}
@@ -36,7 +40,7 @@ def load_model(model_config):
     model_name = model_config['name']
     if model_name not in models:
         try:
-            model = Llama.from_pretrained(repo_id=model_config['repo_id'], filename=model_config['filename'], use_auth_token=HUGGINGFACE_TOKEN)
+            model = Llama.from_pretrained(repo_id=model_config['repo_id'], filename=model_config['filename'], use_auth_token=HUGGINGFACE_TOKEN, verbose=True)
             models[model_name] = model
             global_data['models'] = models
             return model
@@ -80,7 +84,7 @@ def generate_model_response(model, inputs, max_tokens_per_part):
 
             text = response['choices'][0]['text']
             if text:
-
+                responses.append(remove_duplicates(text))
 
         return responses
 
@@ -89,6 +93,7 @@ def generate_model_response(model, inputs, max_tokens_per_part):
         traceback.print_exc()
         return [f"Error: {e}"]
 
+
 app = FastAPI()
 origins = ["*"]
 app.add_middleware(
@@ -99,28 +104,45 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
+cache = get_cache()
+
 @app.post("/generate")
 async def generate(request: ChatRequest):
     inputs = normalize_input(request.message)
+    cached_result = cache.get(inputs)
+    if cached_result:
+        return {"response": cached_result}
+
     with ThreadPoolExecutor() as executor:
         futures = [executor.submit(generate_model_response, model, inputs, request.max_tokens_per_part) for model in models.values()]
         responses = [{'model': model_name, 'response': future.result()} for model_name, future in zip(models.keys(), as_completed(futures))]
 
-
-
-
-
-
-
+    model_embeddings = {}
+    sentence_model = SentenceTransformer('all-mpnet-base-v2')
+
+    all_responses = {}
+    for res in responses:
+        all_responses[res['model']] = " ".join(res['response'])
 
+    for model_name, response_text in all_responses.items():
+        embeddings = sentence_model.encode(response_text)
+        model_embeddings[model_name] = embeddings
 
-
-
-        formatted_response += f"**{model}:**\n"
-        for i, part in enumerate(response_parts):
-            formatted_response += f"Part {i+1}:\n{part}\n\n"
+    best_model = None
+    best_similarity = -1
 
+    for model_name1, embedding1 in model_embeddings.items():
+        avg_similarity = 0
+        for model_name2, embedding2 in model_embeddings.items():
+            cosine_scores = util.cos_sim(embedding1, embedding2)
+            avg_similarity += cosine_scores.item()
+        avg_similarity /= len(model_embeddings)
+        if avg_similarity > best_similarity:
+            best_similarity = avg_similarity
+            best_model = model_name1
 
+    formatted_response = f"**Best Model ({best_model}):**\n{all_responses[best_model]}\n\n"
+    cache.set(inputs, formatted_response)
     return {"response": formatted_response}
 
 if __name__ == "__main__":
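The selection logic added in this commit keeps only the reply whose sentence-transformers embedding is, on average, closest to every model's reply. The minimal standalone sketch below shows that average-pairwise-cosine-similarity idea; the responses dict and prompt are made-up placeholders, and only SentenceTransformer, encode, and util.cos_sim mirror what the diff uses.

# Sketch: embed each model's full response, average its cosine similarity
# against all responses (itself included, as in the diff), keep the highest.
# The responses dict is an illustrative example, not output from the app.
from sentence_transformers import SentenceTransformer, util

responses = {
    "my_model": "Paris is the capital of France.",
    "meta-llama-3.1-70b": "The capital of France is Paris.",
    "gemma-2-27b": "It might be Lyon, but I am not sure.",
}

sentence_model = SentenceTransformer("all-mpnet-base-v2")
embeddings = {name: sentence_model.encode(text) for name, text in responses.items()}

best_model, best_similarity = None, -1.0
for name, emb in embeddings.items():
    avg = sum(util.cos_sim(emb, other).item() for other in embeddings.values()) / len(embeddings)
    if avg > best_similarity:
        best_model, best_similarity = name, avg

print(f"**Best Model ({best_model}):** {responses[best_model]}")

Because each response is also compared with itself, every average carries the same constant self-similarity term, so the ranking is the same as in the loop the commit adds.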
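The other addition is a cache consulted before any generation happens and updated afterwards, keyed on the normalized input. The sketch below reproduces that get-before-generate / set-after-generate flow with a plain in-process dict standing in for the gptcache object wired up in the diff; generate_answer and the key normalization here are illustrative stand-ins, not the app's actual helpers.

# Sketch of the request flow added around /generate: check the cache first,
# generate only on a miss, then store the formatted result for next time.
# A plain dict stands in for the gptcache object used in the commit.
cache: dict[str, str] = {}

def generate_answer(prompt: str) -> str:
    # Placeholder for the expensive multi-model generation and selection step.
    return f"answer for: {prompt}"

def handle_request(message: str) -> str:
    key = message.strip().lower()      # stand-in for normalize_input()
    cached = cache.get(key)
    if cached:                         # hit: skip generation entirely
        return cached
    answer = generate_answer(key)      # miss: do the work once
    cache[key] = answer
    return answer

print(handle_request("Hello"))   # computed
print(handle_request("hello "))  # served from the cache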