Yjhhh committed on
Commit f122391
1 Parent(s): d7a8f97

Update app.py

Files changed (1)
app.py +71 -94
app.py CHANGED
@@ -1,18 +1,16 @@
-from fastapi import FastAPI, HTTPException, Request
-from pydantic import BaseModel
-import uvicorn
-import requests
-import asyncio
 import os
 import io
 import time
-from typing import List, Dict, Any
-from llama_cpp import Llama # Ajusta según la biblioteca que estés utilizando
+import asyncio
+import requests
 from tqdm import tqdm
+from fastapi import FastAPI, HTTPException, Request
+import uvicorn
+from llama_cpp import Llama
 
 app = FastAPI()
 
-# Configuración de los modelos
+# Configuración de modelos
 model_configs = [
     {"repo_id": "Ffftdtd5dtft/gpt2-xl-Q2_K-GGUF", "filename": "gpt2-xl-q2_k.gguf", "name": "GPT-2 XL"},
     {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-instruct-q2_k.gguf", "name": "Meta Llama 3.1-8B Instruct"},
@@ -35,25 +33,24 @@ model_configs = [
 class ModelManager:
     def __init__(self):
         self.models = {}
-        self.model_parts = {}
-        self.load_lock = asyncio.Lock()
-        self.index_lock = asyncio.Lock()
         self.part_size = 1024 * 1024 # Tamaño de cada parte en bytes (1 MB)
-        self.max_loading_time = 0 # Tiempo máximo en segundos para cargar un modelo
 
     async def download_model_to_memory(self, model_config):
         url = f"https://huggingface.co/{model_config['repo_id']}/resolve/main/{model_config['filename']}"
         print(f"Descargando modelo desde {url}")
         try:
+            start_time = time.time()
             response = requests.get(url)
             response.raise_for_status()
-            print(f"Descarga completa para {model_config['name']}")
-            return io.BytesIO(response.content)
+            model_file = io.BytesIO(response.content)
+            end_time = time.time()
+            download_duration = end_time - start_time
+            print(f"Descarga completa para {model_config['name']} en {download_duration:.2f} segundos")
+            return model_file
         except requests.RequestException as e:
             raise HTTPException(status_code=500, detail=f"Error al descargar el modelo: {e}")
 
-    async def save_model_to_temp_file(self, model_config):
-        model_file = await self.download_model_to_memory(model_config)
+    async def save_model_to_temp_file(self, model_file, model_config):
         temp_filename = f"/tmp/{model_config['filename']}"
         print(f"Guardando el modelo en {temp_filename}")
         with open(temp_filename, 'wb') as f:
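
requests.get(url) above buffers the entire GGUF file in memory before it is written to /tmp. A streaming variant avoids that; this is a sketch, not part of the commit, and the 1 MB chunk size and 60-second timeout are arbitrary choices.

# Sketch (not part of this commit): stream the download straight to disk
# so the whole GGUF file never has to sit in memory at once.
import requests

def download_to_file(url: str, dest_path: str, chunk_size: int = 1024 * 1024) -> str:
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(dest_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)
    return dest_path
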
@@ -62,41 +59,39 @@ class ModelManager:
         return temp_filename
 
     async def load_model(self, model_config):
-        async with self.load_lock:
-            try:
-                start_time = time.time()
-                temp_filename = await self.save_model_to_temp_file(model_config)
-                elapsed_time = time.time() - start_time
-                if elapsed_time > self.max_loading_time:
-                    print(f"El modelo {model_config['name']} tardó {elapsed_time:.2f} segundos en cargar. Dividiendo el modelo.")
-                    await self.handle_large_model(temp_filename, model_config)
-                else:
-                    print(f"Cargando modelo desde {temp_filename}")
-                    llama = Llama(temp_filename) # Ajusta según la biblioteca y clase correctas
-
-                    tokenizer = llama.tokenizer
-                    model_data = {
-                        'model': llama,
-                        'tokenizer': tokenizer,
-                        'pad_token': tokenizer.pad_token,
-                        'pad_token_id': tokenizer.pad_token_id,
-                        'eos_token': tokenizer.eos_token,
-                        'eos_token_id': tokenizer.eos_token_id,
-                        'bos_token': tokenizer.bos_token,
-                        'bos_token_id': tokenizer.bos_token_id,
-                        'unk_token': tokenizer.unk_token,
-                        'unk_token_id': tokenizer.unk_token_id
-                    }
-
-                    self.models[model_config['name']] = model_data
-                    print(f"Modelo {model_config['name']} cargado correctamente")
-            except Exception as e:
-                print(f"Error al cargar el modelo: {e}")
+        model_file = await self.download_model_to_memory(model_config)
+        temp_filename = await self.save_model_to_temp_file(model_file, model_config)
+        try:
+            start_time = time.time()
+            print(f"Cargando modelo desde {temp_filename}")
+            llama = Llama.load(temp_filename)
+            end_time = time.time()
+            load_duration = end_time - start_time
+            if load_duration > 0:
+                print(f"Modelo {model_config['name']} tardó {load_duration:.2f} segundos en cargar, dividiendo automáticamente")
+                await self.handle_large_model(temp_filename, model_config)
+            else:
+                print(f"Modelo {model_config['name']} cargado correctamente en {load_duration:.2f} segundos")
+            tokenizer = llama.tokenizer
+            model_data = {
+                'model': llama,
+                'tokenizer': tokenizer,
+                'pad_token': tokenizer.pad_token,
+                'pad_token_id': tokenizer.pad_token_id,
+                'eos_token': tokenizer.eos_token,
+                'eos_token_id': tokenizer.eos_token_id,
+                'bos_token': tokenizer.bos_token,
+                'bos_token_id': tokenizer.bos_token_id,
+                'unk_token': tokenizer.unk_token,
+                'unk_token_id': tokenizer.unk_token_id
+            }
+            self.models[model_config['name']] = model_data
+        except Exception as e:
+            print(f"Error al cargar el modelo: {e}")
 
     async def handle_large_model(self, model_filename, model_config):
         total_size = os.path.getsize(model_filename)
         num_parts = (total_size + self.part_size - 1) // self.part_size
-
         print(f"Modelo {model_config['name']} dividido en {num_parts} partes")
         with open(model_filename, 'rb') as file:
             for i in tqdm(range(num_parts), desc=f"Indexando {model_config['name']}"):
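
Llama.load(temp_filename) and the transformers-style tokenizer attributes (pad_token, eos_token_id, ...) are not part of the llama-cpp-python API as far as I know; that library constructs a model with Llama(model_path=...) and exposes special tokens as ids through methods such as token_bos() and token_eos(). A minimal loader sketch under that assumption, not code from this commit:

# Sketch (assumes llama-cpp-python; Llama.load() as used in the diff is not a
# documented constructor there - models are opened via Llama(model_path=...)).
from llama_cpp import Llama

def load_gguf(temp_filename: str, n_ctx: int = 2048) -> dict:
    llama = Llama(model_path=temp_filename, n_ctx=n_ctx)
    return {
        "model": llama,
        # Special tokens are exposed as ids, not HF-style attributes.
        "bos_token_id": llama.token_bos(),
        "eos_token_id": llama.token_eos(),
    }
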
@@ -107,42 +102,31 @@ class ModelManager:
                 await self.index_model_part(model_part, i)
 
     async def index_model_part(self, model_part, part_index):
-        async with self.index_lock:
-            part_name = f"part_{part_index}"
-            print(f"Indexando parte {part_index}")
-            llama_part = Llama.from_bytes(model_part.getvalue()) # Ajusta según la biblioteca y clase correctas
-            self.model_parts[part_name] = llama_part
+        part_name = f"part_{part_index}"
+        print(f"Indexando parte {part_index}")
+        temp_filename = f"/tmp/{part_name}.gguf"
+        with open(temp_filename, 'wb') as f:
+            f.write(model_part.getvalue())
+        print(f"Parte {part_index} indexada y guardada")
 
     async def generate_response(self, user_input):
-        tasks = [self.generate_chat_response(user_input, model_data) for model_data in self.models.values()]
-        responses = await asyncio.gather(*tasks)
-        return responses
-
-    async def generate_chat_response(self, user_input, model_data):
-        try:
-            print(f"Generando respuesta usando el modelo {model_data['model']}")
-            start_time = time.time()
-            generated_text = model_data['model'].generate(user_input)
-            elapsed_time = time.time() - start_time
-
-            if len(generated_text) > 1000:
-                parts = []
-                while len(generated_text) > 1000:
-                    part = generated_text[:1000]
-                    parts.append(part)
-                    generated_text = generated_text[1000:]
-                parts.append(generated_text)
-            else:
-                parts = [generated_text]
-
-            print(f"Respuesta generada usando el modelo {model_data['model']} en {elapsed_time:.2f} segundos")
-            return {
-                'model_name': model_data['model'],
-                'generated_text_parts': parts
-            }
-        except Exception as e:
-            print(f"Error al generar respuesta con el modelo {model_data['model']}: {e}")
-            return {'model_name': model_data['model'], 'error': str(e)}
+        results = []
+        for model_name, model_data in self.models.items():
+            print(f"Generando respuesta con el modelo {model_name}")
+            try:
+                tokenizer = model_data['tokenizer']
+                input_ids = tokenizer(user_input, return_tensors="pt").input_ids
+                outputs = model_data['model'].generate(input_ids)
+                generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+                parts = [generated_text[i:i + 1000] for i in range(0, len(generated_text), 1000)]
+                results.append({
+                    'model_name': model_name,
+                    'generated_text_parts': parts
+                })
+            except Exception as e:
+                print(f"Error al generar respuesta con el modelo {model_name}: {e}")
+                results.append({'model_name': model_name, 'error': str(e)})
+        return results
 
 @app.post("/generate/")
 async def generate(request: Request):
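
Likewise, tokenizer(user_input, return_tensors="pt") and model.generate(input_ids) follow the Hugging Face transformers API; a llama-cpp-python Llama object is instead called directly with a prompt string and returns a completion dict. A sketch of the same per-model loop under that assumption (max_tokens=256 is an arbitrary choice), not code from this commit:

# Sketch (assumes llama-cpp-python, where the model object is called directly;
# the transformers-style tokenizer/generate calls in the diff are a different API).
def generate_all(models: dict, user_input: str, max_tokens: int = 256) -> list:
    results = []
    for model_name, model_data in models.items():
        try:
            output = model_data["model"](user_input, max_tokens=max_tokens)
            generated_text = output["choices"][0]["text"]
            # Split long outputs into 1000-character parts, as the endpoint does.
            parts = [generated_text[i:i + 1000] for i in range(0, len(generated_text), 1000)]
            results.append({"model_name": model_name, "generated_text_parts": parts})
        except Exception as e:
            results.append({"model_name": model_name, "error": str(e)})
    return results
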
@@ -151,21 +135,14 @@ async def generate(request: Request):
     if not user_input:
         raise HTTPException(status_code=400, detail="Se requiere una entrada de usuario.")
 
-    try:
-        model_manager = ModelManager()
-        tasks = [model_manager.load_model(config) for config in model_configs]
-        await asyncio.gather(*tasks)
-        responses = await model_manager.generate_response(user_input)
-        return {"responses": responses}
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
+    model_manager = ModelManager()
+    tasks = [model_manager.load_model(config) for config in model_configs]
+    await asyncio.gather(*tasks)
+    responses = await model_manager.generate_response(user_input)
+    return {"responses": responses}
 
 def start_uvicorn():
     uvicorn.run(app, host="0.0.0.0", port=7860)
 
 if __name__ == "__main__":
-    loop = asyncio.get_event_loop()
-    model_manager = ModelManager()
-    tasks = [model_manager.load_model(config) for config in model_configs]
-    loop.run_until_complete(asyncio.gather(*tasks))
-    start_uvicorn()
+    asyncio.run(start_uvicorn())
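
Two caveats on the new entry point: asyncio.run() expects a coroutine, while start_uvicorn() is a plain blocking function, and creating a fresh ModelManager inside /generate/ re-downloads every model on each request. A sketch of the more common arrangement, reusing the app, ModelManager, and model_configs defined above and loading the models once at startup (the on_startup name is only illustrative), not code from this commit:

# Sketch (not part of this commit): run uvicorn directly and load models once
# at application startup instead of inside every request. Reuses app,
# ModelManager, model_configs, asyncio and uvicorn from the module above.
model_manager = ModelManager()

@app.on_event("startup")
async def on_startup():
    # Load all configured models concurrently before serving requests.
    await asyncio.gather(*(model_manager.load_model(cfg) for cfg in model_configs))

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)  # blocking call, no asyncio.run needed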