File size: 4,092 Bytes
b212c94
 
 
53eee33
b212c94
 
 
18000a9
b212c94
 
 
 
 
53eee33
b212c94
 
 
 
 
 
 
36c9f0a
71df925
18000a9
b212c94
 
 
 
 
 
 
 
 
18000a9
36c9f0a
b212c94
 
 
 
 
 
 
e038371
b212c94
e038371
b212c94
36c9f0a
18000a9
36c9f0a
 
b212c94
2164e21
b212c94
 
 
 
53eee33
b212c94
 
 
 
 
 
 
 
 
 
 
 
 
 
36c9f0a
 
 
18000a9
 
 
2164e21
18000a9
b212c94
53eee33
2164e21
18000a9
53eee33
 
18000a9
b212c94
18000a9
2164e21
 
 
e038371
b212c94
18000a9
e038371
18000a9
 
e038371
 
18000a9
 
e038371
b212c94
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from llama_cpp import Llama
from concurrent.futures import ThreadPoolExecutor, as_completed
import uvicorn
from dotenv import load_dotenv
from difflib import SequenceMatcher
from tqdm import tqdm  # Importa tqdm para la barra de progreso

# Load environment variables via python-dotenv before anything else runs.
load_dotenv()

app = FastAPI()

# Model configuration: each entry holds the repo id and GGUF filename that
# are handed to Llama.from_pretrained below.
models = [
    {
        "repo_id": "Ffftdtd5dtft/gpt2-xl-Q2_K-GGUF",
        "filename": "gpt2-xl-q2_k.gguf",
    },
    {
        "repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Instruct-Q2_K-GGUF",
        "filename": "meta-llama-3.1-8b-instruct-q2_k.gguf",
    },
    {
        "repo_id": "Ffftdtd5dtft/gemma-2-9b-it-Q2_K-GGUF",
        "filename": "gemma-2-9b-it-q2_k.gguf",
    },
    {
        "repo_id": "Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF",
        "filename": "gemma-2-27b-q2_k.gguf",
    },
]

# Load every model into memory exactly once, when the module is imported.
llms = [
    Llama.from_pretrained(repo_id=spec["repo_id"], filename=spec["filename"])
    for spec in models
]
print(f"Modelos cargados: {[spec['repo_id'] for spec in models]}")

# Request body for the /generate_chat endpoint.
class ChatRequest(BaseModel):
    # User prompt; sent to each model as the single "user" chat message.
    message: str
    # Sampling settings, forwarded unchanged to llm.create_chat_completion.
    top_k: int = 50
    top_p: float = 0.95
    temperature: float = 0.7

def generate_chat_response(request, llm):
    """Ask one model for a chat completion and package the outcome.

    Args:
        request: object exposing ``message``, ``top_k``, ``top_p`` and
            ``temperature`` (a ChatRequest in this file).
        llm: model object exposing ``create_chat_completion``.

    Returns:
        A dict with two keys:
        ``"response"`` - the model's reply text, or ``"Error: <details>"``
        if anything raised while normalizing the input or generating;
        ``"literal"`` - the normalized input, or the raw message if
        normalization itself failed.
    """
    # Bind the fallback before entering the try block: the except branch
    # reads user_input, so it must exist even when normalize_input raises.
    # Previously that case surfaced as an UnboundLocalError from the handler.
    user_input = request.message
    try:
        # Normalize the message so downstream handling is robust.
        user_input = normalize_input(request.message)
        response = llm.create_chat_completion(
            messages=[{"role": "user", "content": user_input}],
            top_k=request.top_k,
            top_p=request.top_p,
            temperature=request.temperature
        )
        reply = response['choices'][0]['message']['content']
        return {"response": reply, "literal": user_input}
    except Exception as e:
        # Best-effort worker: failures are reported in-band so the caller
        # can inspect every model's outcome.
        return {"response": f"Error: {str(e)}", "literal": user_input}

def normalize_input(input_text):
    """Return *input_text* with leading and trailing whitespace removed.

    This is the hook where any further normalization logic should be added.
    """
    trimmed = input_text.strip()
    return trimmed

def select_best_response(responses, request):
    """Choose the reply to return from the candidate *responses*.

    The candidates first go through the coherence filter (which receives
    *request*), and the survivors are handed to the similarity filter,
    whose pick is returned.
    """
    return filter_by_similarity(filter_by_coherence(responses, request))

def filter_by_coherence(responses, request):
    """Return the candidate replies considered coherent for *request*.

    Currently a pass-through: no coherence filter is implemented yet, so
    the very same *responses* object is returned and *request* is unused.
    """
    coherent = responses
    return coherent

def filter_by_similarity(responses):
    """Pick one reply from *responses* using length and textual similarity.

    Candidates are ranked longest-first (ties keep their input order). The
    longest is the default pick; scanning the remaining candidates in rank
    order, the first one whose ``SequenceMatcher`` ratio against the longest
    is below 0.9 is returned instead.

    Args:
        responses: non-empty iterable of reply strings. It is not modified.

    Returns:
        The selected reply string.

    Raises:
        IndexError: if *responses* is empty.
    """
    # sorted() ranks a copy, so the caller's list keeps its original order
    # (list.sort() here used to reorder the argument in place).
    ranked = sorted(responses, key=len, reverse=True)
    longest = ranked[0]
    for candidate in ranked[1:]:
        if SequenceMatcher(None, longest, candidate).ratio() < 0.9:
            return candidate
    return longest

@app.post("/generate_chat")
async def generate_chat(request: ChatRequest):
    """Send the message to every loaded model and return the selected reply.

    Args:
        request: validated request body carrying the message and the
            sampling settings.

    Returns:
        A dict with ``"best_response"`` (the reply chosen by
        select_best_response), ``"all_responses"`` (every model's reply) and
        ``"literal_inputs"`` (the input text each worker reported). The two
        lists are in completion order, not model order.

    Raises:
        HTTPException: 400 if the message is empty or whitespace-only;
            500 if any model worker reported a failure.
    """
    if not request.message.strip():
        raise HTTPException(status_code=400, detail="The message cannot be empty.")

    print(f"Procesando solicitud: {request.message}")

    # Run one generation task per model in a thread pool.
    # NOTE(review): waiting on the futures below blocks inside an async
    # handler for as long as the models run - confirm this is intended.
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(generate_chat_response, request, llm) for llm in llms]
        responses = []

        # tqdm renders a progress bar as the futures complete.
        for future in tqdm(as_completed(futures), total=len(futures), desc="Generando respuestas"):
            response = future.result()
            responses.append(response)
            # Logs the first 30 characters of the input text, not of the reply.
            print(f"Modelo procesado: {response['literal'][:30]}...")

    # Workers report failures as replies of the form "Error: <details>".
    # Match that exact prefix: testing for the bare substring "Error" turned
    # any legitimate reply that merely mentions the word into a 500.
    error_prefix = "Error: "
    failed = next(
        (resp for resp in responses if resp['response'].startswith(error_prefix)),
        None,
    )
    if failed is not None:
        raise HTTPException(status_code=500, detail=failed['response'])

    # Hand selection its own list so it cannot disturb the order of the
    # lists returned below.
    best_response = select_best_response([resp['response'] for resp in responses], request)

    print(f"Mejor respuesta seleccionada: {best_response}")

    return {
        "best_response": best_response,
        "all_responses": [resp['response'] for resp in responses],
        "literal_inputs": [resp['literal'] for resp in responses]
    }

if __name__ == "__main__":
    # Serve the API directly when this file is executed as a script.
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=7860,
    )