Hjgugugjhuhjggg committed
Commit 0a3c752
1 Parent(s): 50c545e

Update app.py

Files changed (1)
  1. app.py +36 -14
app.py CHANGED
@@ -8,6 +8,8 @@ from dotenv import load_dotenv
 from pydantic import BaseModel
 import requests
 import traceback
+from sentence_transformers import SentenceTransformer, util
+from gptcache import Cache, get_cache
 
 load_dotenv()
 HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
@@ -27,7 +29,9 @@ global_data = {
 }
 
 model_configs = [
-    {"repo_id": "Hjgugugjhuhjggg/mergekit-ties-tzamfyy-Q2_K-GGUF", "filename": "mergekit-ties-tzamfyy-q2_k.gguf", "name": "my_model"}
+    {"repo_id": "Hjgugugjhuhjggg/mergekit-ties-tzamfyy-Q2_K-GGUF", "filename": "mergekit-ties-tzamfyy-q2_k.gguf", "name": "my_model"},
+    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-q2_k.gguf", "name": "meta-llama-3.1-70b"},
+    {"repo_id": "Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF", "filename": "gemma-2-27b-q2_k.gguf", "name": "gemma-2-27b"}
 ]
 
 models = {}
@@ -36,7 +40,7 @@ def load_model(model_config):
     model_name = model_config['name']
     if model_name not in models:
         try:
-            model = Llama.from_pretrained(repo_id=model_config['repo_id'], filename=model_config['filename'], use_auth_token=HUGGINGFACE_TOKEN)
+            model = Llama.from_pretrained(repo_id=model_config['repo_id'], filename=model_config['filename'], use_auth_token=HUGGINGFACE_TOKEN, verbose=True)
             models[model_name] = model
             global_data['models'] = models
             return model
@@ -80,7 +84,7 @@ def generate_model_response(model, inputs, max_tokens_per_part):
 
             text = response['choices'][0]['text']
             if text:
-                responses.append(remove_duplicates(text))
+                responses.append(remove_duplicates(text))
 
         return responses
 
@@ -89,6 +93,7 @@ def generate_model_response(model, inputs, max_tokens_per_part):
         traceback.print_exc()
         return [f"Error: {e}"]
 
+
 app = FastAPI()
 origins = ["*"]
 app.add_middleware(
@@ -99,28 +104,45 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
+cache = get_cache()
+
 @app.post("/generate")
 async def generate(request: ChatRequest):
     inputs = normalize_input(request.message)
+    cached_result = cache.get(inputs)
+    if cached_result:
+        return {"response": cached_result}
+
     with ThreadPoolExecutor() as executor:
         futures = [executor.submit(generate_model_response, model, inputs, request.max_tokens_per_part) for model in models.values()]
         responses = [{'model': model_name, 'response': future.result()} for model_name, future in zip(models.keys(), as_completed(futures))]
 
-    unique_responses = {}
-    for response_set in responses:
-        model_name = response_set['model']
-        if model_name not in unique_responses:
-            unique_responses[model_name] = []
-        unique_responses[model_name].extend(response_set['response'])
-
-    formatted_response = ""
-    for model, response_parts in unique_responses.items():
-        formatted_response += f"**{model}:**\n"
-        for i, part in enumerate(response_parts):
-            formatted_response += f"Part {i+1}:\n{part}\n\n"
+    model_embeddings = {}
+    sentence_model = SentenceTransformer('all-mpnet-base-v2')
+
+    all_responses = {}
+    for res in responses:
+        all_responses[res['model']] = " ".join(res['response'])
 
+    for model_name, response_text in all_responses.items():
+        embeddings = sentence_model.encode(response_text)
+        model_embeddings[model_name] = embeddings
+
+    best_model = None
+    best_similarity = -1
+
+    for model_name1, embedding1 in model_embeddings.items():
+        avg_similarity = 0
+        for model_name2, embedding2 in model_embeddings.items():
+            cosine_scores = util.cos_sim(embedding1, embedding2)
+            avg_similarity += cosine_scores.item()
+        avg_similarity /= len(model_embeddings)
+        if avg_similarity > best_similarity:
+            best_similarity = avg_similarity
+            best_model = model_name1
+
+    formatted_response = f"**Best Model ({best_model}):**\n{all_responses[best_model]}\n\n"
+    cache.set(inputs, formatted_response)
     return {"response": formatted_response}
 
 if __name__ == "__main__":
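The new selection step in /generate keeps only the response whose embedding is, on average, most similar to the other models' responses (a consensus pick via sentence-transformers cosine similarity). A minimal standalone sketch of that logic, using the same 'all-mpnet-base-v2' model as the commit; the response strings below are made up for illustration:

# Sketch of the consensus pick introduced in this commit (hypothetical inputs).
from sentence_transformers import SentenceTransformer, util

responses = {
    "my_model": "Paris is the capital of France.",
    "meta-llama-3.1-70b": "The capital of France is Paris.",
    "gemma-2-27b": "France's capital city is Paris.",
}

sentence_model = SentenceTransformer("all-mpnet-base-v2")
embeddings = {name: sentence_model.encode(text) for name, text in responses.items()}

best_model, best_similarity = None, -1.0
for name, emb in embeddings.items():
    # Average cosine similarity of this response against all responses (including itself).
    avg = sum(util.cos_sim(emb, other).item() for other in embeddings.values()) / len(embeddings)
    if avg > best_similarity:
        best_model, best_similarity = name, avg

print(f"Best Model ({best_model}): {responses[best_model]}")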