Update app.py
Browse files
app.py
CHANGED
@@ -2,12 +2,13 @@ from fastapi import FastAPI, HTTPException, Request
|
|
2 |
from pydantic import BaseModel
|
3 |
import uvicorn
|
4 |
import requests
|
5 |
-
import io
|
6 |
import asyncio
|
|
|
|
|
|
|
7 |
from typing import List, Dict, Any
|
8 |
from llama_cpp import Llama # Ajusta según la biblioteca que estés utilizando
|
9 |
-
import
|
10 |
-
|
11 |
|
12 |
app = FastAPI()
|
13 |
|
@@ -38,96 +39,124 @@ class ModelManager:
|
|
38 |
self.load_lock = asyncio.Lock()
|
39 |
self.index_lock = asyncio.Lock()
|
40 |
self.part_size = 1024 * 1024 # Tamaño de cada parte en bytes (1 MB)
|
|
|
41 |
|
42 |
async def download_model_to_memory(self, model_config):
    """Download a model file from huggingface.co into an in-memory buffer.

    Args:
        model_config: Mapping with 'repo_id' and 'filename'; both are
            interpolated into the download URL.

    Returns:
        io.BytesIO holding the complete response body.

    Raises:
        HTTPException: status 500 when the request fails, times out, or the
            server answers with an error status.
    """
    url = f"https://huggingface.co/{model_config['repo_id']}/resolve/main/{model_config['filename']}"
    loop = asyncio.get_running_loop()
    try:
        # requests is synchronous; running it in the default executor keeps
        # the event loop free while the file downloads. The timeout is
        # (connect, read-inactivity) seconds so a dead peer cannot hang the
        # coroutine forever.
        response = await loop.run_in_executor(
            None, lambda: requests.get(url, timeout=(10, 60))
        )
        response.raise_for_status()
    except requests.RequestException as e:
        raise HTTPException(status_code=500, detail=f"Error al descargar el modelo: {e}") from e
    return io.BytesIO(response.content)
|
50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
async def load_model(self, model_config):
|
52 |
async with self.load_lock:
|
53 |
try:
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
except Exception as e:
|
74 |
print(f"Error al cargar el modelo: {e}")
|
75 |
|
76 |
-
async def handle_large_model(self,
|
77 |
-
total_size =
|
78 |
num_parts = (total_size + self.part_size - 1) // self.part_size
|
79 |
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
|
|
|
|
|
|
85 |
|
86 |
async def index_model_part(self, model_part, part_index):
    """Build a Llama object from one model slice and store it by part index.

    The result is stored in self.model_parts under the key
    "part_<part_index>"; the whole operation runs under self.index_lock.

    NOTE(review): model_part is passed straight to the Llama constructor;
    confirm that the constructor accepts this kind of argument.
    """
    async with self.index_lock:
        self.model_parts[f"part_{part_index}"] = Llama(model_part)
|
|
|
91 |
|
92 |
async def generate_response(self, user_input):
    """Collect one reply per entry of self.models for the given input.

    A generate_chat_response call is created for every value in
    self.models and all of them are awaited together with asyncio.gather.

    Returns:
        List of results, in the iteration order of self.models.
    """
    pending = []
    for entry in self.models.values():
        pending.append(self.generate_chat_response(user_input, entry))
    return await asyncio.gather(*pending)
|
96 |
|
97 |
async def generate_chat_response(self, user_input, model_data):
|
98 |
try:
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
|
|
118 |
except Exception as e:
|
119 |
-
print(f"Error al generar
|
120 |
-
return {
|
121 |
|
122 |
-
@app.post("/
|
123 |
-
async def
|
124 |
-
|
125 |
-
user_input =
|
126 |
if not user_input:
|
127 |
-
raise HTTPException(status_code=400, detail="
|
128 |
|
129 |
try:
|
130 |
model_manager = ModelManager()
|
|
|
|
|
131 |
responses = await model_manager.generate_response(user_input)
|
132 |
return {"responses": responses}
|
133 |
except Exception as e:
|
|
|
2 |
from pydantic import BaseModel
|
3 |
import uvicorn
|
4 |
import requests
|
|
|
5 |
import asyncio
|
6 |
+
import os
|
7 |
+
import io
|
8 |
+
import time
|
9 |
from typing import List, Dict, Any
|
10 |
from llama_cpp import Llama # Ajusta según la biblioteca que estés utilizando
|
11 |
+
from tqdm import tqdm
|
|
|
12 |
|
13 |
app = FastAPI()
|
14 |
|
|
|
39 |
self.load_lock = asyncio.Lock()
|
40 |
self.index_lock = asyncio.Lock()
|
41 |
self.part_size = 1024 * 1024 # Tamaño de cada parte en bytes (1 MB)
|
42 |
+
self.max_loading_time = 0 # Tiempo máximo en segundos para cargar un modelo
|
43 |
|
44 |
async def download_model_to_memory(self, model_config):
    """Download a model file from huggingface.co into an in-memory buffer.

    Args:
        model_config: Mapping with 'repo_id' and 'filename' (interpolated
            into the download URL) and 'name' (used in the completion
            message).

    Returns:
        io.BytesIO holding the complete response body.

    Raises:
        HTTPException: status 500 when the request fails, times out, or the
            server answers with an error status.
    """
    url = f"https://huggingface.co/{model_config['repo_id']}/resolve/main/{model_config['filename']}"
    print(f"Descargando modelo desde {url}")
    loop = asyncio.get_running_loop()
    try:
        # requests is synchronous; running it in the default executor keeps
        # the event loop free while the file downloads. The timeout is
        # (connect, read-inactivity) seconds so a dead peer cannot hang the
        # coroutine forever.
        response = await loop.run_in_executor(
            None, lambda: requests.get(url, timeout=(10, 60))
        )
        response.raise_for_status()
    except requests.RequestException as e:
        raise HTTPException(status_code=500, detail=f"Error al descargar el modelo: {e}") from e
    print(f"Descarga completa para {model_config['name']}")
    return io.BytesIO(response.content)
|
54 |
|
55 |
+
async def save_model_to_temp_file(self, model_config):
    """Download a model and write it to a file in the system temp directory.

    Args:
        model_config: Mapping handed to download_model_to_memory; the last
            path component of its 'filename' entry names the file on disk.

    Returns:
        Path of the file that was written.
    """
    import os
    import tempfile

    model_file = await self.download_model_to_memory(model_config)
    # Only the final path component is used: a filename such as
    # "subdir/model.gguf" would otherwise point into a directory that does
    # not exist, and "../" segments could escape the temp directory.
    temp_filename = os.path.join(
        tempfile.gettempdir(), os.path.basename(model_config['filename'])
    )
    print(f"Guardando el modelo en {temp_filename}")
    with open(temp_filename, 'wb') as f:
        # getbuffer() exposes the downloaded bytes without the extra full
        # copy that getvalue() makes.
        f.write(model_file.getbuffer())
    print(f"Modelo guardado en {temp_filename}")
    return temp_filename
|
63 |
+
|
64 |
async def load_model(self, model_config):
    """Fetch one model and either register it or route it to the split path.

    The model is saved to a temporary file first. When that step takes longer
    than self.max_loading_time seconds the file is handed to
    handle_large_model; otherwise a Llama object is built from the file and
    stored in self.models under model_config['name'] together with its
    tokenizer and special-token fields.

    Loads are serialized through self.load_lock. Any exception is reported
    with print() and not re-raised, so one failing model does not abort the
    caller.

    Args:
        model_config: Mapping with at least 'name', plus whatever
            save_model_to_temp_file needs.
    """
    async with self.load_lock:
        try:
            # monotonic() cannot jump backwards on system clock adjustments,
            # unlike time(), so the elapsed value is always meaningful.
            start_time = time.monotonic()
            temp_filename = await self.save_model_to_temp_file(model_config)
            elapsed_time = time.monotonic() - start_time
            if elapsed_time > self.max_loading_time:
                print(f"El modelo {model_config['name']} tardó {elapsed_time:.2f} segundos en cargar. Dividiendo el modelo.")
                await self.handle_large_model(temp_filename, model_config)
            else:
                print(f"Cargando modelo desde {temp_filename}")
                llama = Llama(temp_filename)  # Adjust to the library and class actually in use

                tokenizer = llama.tokenizer
                model_data = {'model': llama, 'tokenizer': tokenizer}
                # Missing special-token attributes are stored as None
                # instead of raising: a direct attribute access would land
                # in the broad except below and silently drop the model.
                for kind in ('pad', 'eos', 'bos', 'unk'):
                    model_data[f'{kind}_token'] = getattr(tokenizer, f'{kind}_token', None)
                    model_data[f'{kind}_token_id'] = getattr(tokenizer, f'{kind}_token_id', None)

                self.models[model_config['name']] = model_data
                print(f"Modelo {model_config['name']} cargado correctamente")
        except Exception as e:
            print(f"Error al cargar el modelo: {e}")
|
95 |
|
96 |
+
async def handle_large_model(self, model_filename, model_config):
    """Read a model file in slices of self.part_size bytes and index each one.

    Every slice is wrapped in an io.BytesIO and passed, with its zero-based
    position, to index_model_part. Progress is shown with tqdm.

    Args:
        model_filename: Path of the model file on disk.
        model_config: Mapping whose 'name' entry is used in messages.
    """
    total_size = os.path.getsize(model_filename)
    # Ceiling division: the last slice may be shorter than part_size.
    num_parts = -(-total_size // self.part_size)

    print(f"Modelo {model_config['name']} dividido en {num_parts} partes")
    with open(model_filename, 'rb') as file:
        for index in tqdm(range(num_parts), desc=f"Indexando {model_config['name']}"):
            offset = index * self.part_size
            length = min(self.part_size, total_size - offset)
            file.seek(offset)
            chunk = io.BytesIO(file.read(length))
            await self.index_model_part(chunk, index)
|
108 |
|
109 |
async def index_model_part(self, model_part, part_index):
    """Build a Llama object from one model slice and store it by part index.

    The result is stored in self.model_parts under the key
    "part_<part_index>"; the whole operation runs under self.index_lock and
    prints a message before and after.

    NOTE(review): model_part is passed straight to the Llama constructor;
    confirm that the constructor accepts this kind of argument.
    """
    async with self.index_lock:
        key = f"part_{part_index}"
        print(f"Indexando parte {part_index}")
        self.model_parts[key] = Llama(model_part)
        print(f"Parte {part_index} indexada")
|
116 |
|
117 |
async def generate_response(self, user_input):
    """Collect one reply per entry of self.models for the given input.

    A generate_chat_response call is created for every value in
    self.models and all of them are awaited together with asyncio.gather.

    Returns:
        List of results, in the iteration order of self.models.
    """
    print("Generando respuestas")
    pending = []
    for entry in self.models.values():
        pending.append(self.generate_chat_response(user_input, entry))
    return await asyncio.gather(*pending)
|
122 |
|
123 |
async def generate_chat_response(self, user_input, model_data):
    """Generate a reply with one model and split it into 1000-item parts.

    Args:
        user_input: Value passed to the model's generate() method.
        model_data: Dict with the model object under 'model' and,
            optionally, a display name under 'name'.

    Returns:
        On success, {'model_name': <str>, 'generated_text_parts': [...]}
        where each part holds at most 1000 items of the generated output
        (an empty output yields a single empty part). On any failure,
        {'model_name': <str>, 'error': <message>}; the exception is printed
        and not re-raised.
    """
    max_part_len = 1000
    model = model_data.get('model')
    # A plain string label keeps the returned dict JSON-serializable; the
    # model object itself is not. It is resolved before the try block so the
    # error path can never fail on a missing key.
    model_label = str(model_data.get('name', model))
    try:
        print(f"Generando respuesta usando el modelo {model_label}")
        start_time = time.perf_counter()
        generated_text = model.generate(user_input)
        elapsed_time = time.perf_counter() - start_time

        # Stepped slicing yields the same parts as peeling 1000 items off
        # the front repeatedly, without re-copying the remainder each time.
        parts = [
            generated_text[start:start + max_part_len]
            for start in range(0, len(generated_text), max_part_len)
        ] or [generated_text]

        print(f"Respuesta generada usando el modelo {model_label} en {elapsed_time:.2f} segundos")
        return {
            'model_name': model_label,
            'generated_text_parts': parts,
        }
    except Exception as e:
        print(f"Error al generar respuesta con el modelo {model_label}: {e}")
        return {'model_name': model_label, 'error': str(e)}
|
148 |
|
149 |
+
@app.post("/generate/")
|
150 |
+
async def generate(request: Request):
|
151 |
+
data = await request.json()
|
152 |
+
user_input = data.get('input', '')
|
153 |
if not user_input:
|
154 |
+
raise HTTPException(status_code=400, detail="Se requiere una entrada de usuario.")
|
155 |
|
156 |
try:
|
157 |
model_manager = ModelManager()
|
158 |
+
tasks = [model_manager.load_model(config) for config in model_configs]
|
159 |
+
await asyncio.gather(*tasks)
|
160 |
responses = await model_manager.generate_response(user_input)
|
161 |
return {"responses": responses}
|
162 |
except Exception as e:
|