# Make sure the required libraries are installed:
# pip install ollama langchain langchain-community langgraph fastapi uvicorn duckduckgo-search

import asyncio
import os
import subprocess
import time

import uvicorn
from fastapi import FastAPI, HTTPException
from langchain.prompts import PromptTemplate
from langchain_community.chat_models import ChatOllama
from langchain_community.tools import DuckDuckGoSearchRun
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
from langchain_core.output_parsers import JsonOutputParser, StrOutputParser
from langgraph.graph import END, StateGraph
from pydantic import BaseModel
from typing_extensions import TypedDict

OLLAMA_PATH = os.path.expanduser("/home/user/ollama")  # Path to the Ollama binary

# Download the Ollama binary without root privileges
async def download_ollama_binary():
    try:
        if not os.path.exists(OLLAMA_PATH):
            print("Ollama not found, downloading...")
            # Download the Ollama binary for Linux (adjust the URL for other platforms)
            subprocess.run(
                ["curl", "-L", "https://ollama.com/download/ollama-linux-amd64", "-o", OLLAMA_PATH],
                check=True,  # Without check=True the except below would never fire
            )
            os.chmod(OLLAMA_PATH, 0o755)  # Make the binary executable
            print(f"Ollama downloaded to {OLLAMA_PATH}")
    except subprocess.CalledProcessError as e:
        print(f"Error downloading Ollama: {e}")
        raise

# Start the Ollama server in the background
async def start_ollama_service():
    try:
        print("Starting the Ollama service...")
        # `ollama serve` runs indefinitely; launch it with Popen instead of
        # subprocess.run(check=True), which would block here forever and the
        # model pull below would never run.
        subprocess.Popen([OLLAMA_PATH, "serve"])
        time.sleep(5)  # Give the server a moment to come up before pulling
    except OSError as e:
        print(f"Error starting the Ollama service: {e}")
        raise

# Pull the model once the server is running
async def pull_ollama_model(model_name):
    try:
        print(f"Pulling model: {model_name}")
        subprocess.run([OLLAMA_PATH, "pull", model_name], check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error pulling the model: {e}")
        raise

# Download the binary, start the server, and pull the model
async def setup_ollama(model_name):
    await download_ollama_binary()
    await start_ollama_service()
    await pull_ollama_model(model_name)
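# The fixed time.sleep(5) in start_ollama_service() is fragile on slow machines.
# A minimal sketch of a readiness probe that could replace it, assuming Ollama's
# default port 11434 and its /api/tags endpoint; adjust OLLAMA_URL if you serve
# on a different host or port. This helper is optional and is not called below.
import urllib.request

OLLAMA_URL = "http://127.0.0.1:11434"

def wait_for_ollama(timeout: float = 30.0) -> bool:
    """Poll the Ollama HTTP API until it responds or `timeout` seconds pass."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(f"{OLLAMA_URL}/api/tags", timeout=2):
                return True  # The server answered; it is ready to accept pulls
        except OSError:
            time.sleep(0.5)  # Not up yet; retry shortly
    return False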
# Start the Ollama server and download the model before the API comes up
asyncio.run(setup_ollama('hf.co/MaziyarPanahi/Llama-3.2-3B-Instruct-uncensored-GGUF:IQ1_S'))

# FastAPI setup
app = FastAPI()

# Request body schema for incoming queries
class QueryRequest(BaseModel):
    query: str

# Ollama language model (no 'temperature' set, so the default is used)
local_llm = 'hf.co/MaziyarPanahi/Llama-3.2-3B-Instruct-uncensored-GGUF:IQ1_S'
llama3 = ChatOllama(model=local_llm)

# Web search tool backed by DuckDuckGo
wrapper = DuckDuckGoSearchAPIWrapper(max_results=1)
web_search_tool = DuckDuckGoSearchRun(api_wrapper=wrapper)

# Prompts for generation, routing, and query rewriting
generate_prompt = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are an AI assistant for Research Question Tasks, that synthesizes web search results.
Strictly use the following pieces of web search context to answer the question.
If you don't know the answer, just say that you don't know.
Keep the answer concise, but provide all of the details you can in the form of a research report.
Only make direct references to material if provided in the context.
<|eot_id|><|start_header_id|>user<|end_header_id|>
Question: {question}
Web Search Context: {context}
Answer:
<|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
    input_variables=["question", "context"],
)
generate_chain = generate_prompt | llama3 | StrOutputParser()

router_prompt = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are an expert at routing a user question to either the generation stage or web search.
Use the web search for questions that require more context for a better answer, or recent events.
Otherwise, you can skip and go straight to the generation phase to respond.
You do not need to be stringent with the keywords in the question related to these topics.
Give a binary choice 'web_search' or 'generate' based on the question.
Return the JSON with a single key 'choice' with no preamble or explanation.
Question to route: {question}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
    input_variables=["question"],
)
question_router = router_prompt | llama3 | JsonOutputParser()

query_prompt = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are an expert at crafting web search queries for research questions.
More often than not, a user will ask a basic question that they wish to learn more about, however it might not be in the best format.
Reword their query to be the most effective web search string possible.
Return the JSON with a single key 'query' with no preamble or explanation.
Question to transform: {question}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
    input_variables=["question"],
)
query_chain = query_prompt | llama3 | JsonOutputParser()

# Graph state shared across the nodes
class GraphState(TypedDict):
    question: str
    generation: str
    search_query: str
    context: str

# Processing nodes
def generate(state):
    print("Step: Generating Final Response")
    question = state["question"]
    context = state.get("context", "")  # May be empty if web search failed or was skipped
    generation = generate_chain.invoke({"context": context, "question": question})
    return {"generation": generation}

def transform_query(state):
    print("Step: Optimizing Query for Web Search")
    question = state['question']
    gen_query = query_chain.invoke({"question": question})
    search_query = gen_query.get("query", "")  # Make sure we read the expected key
    return {"search_query": search_query}

def web_search(state):
    search_query = state['search_query']
    print(f'Step: Searching the Web for: "{search_query}"')
    try:
        search_result = web_search_tool.invoke(search_query)
        if isinstance(search_result, (str, dict)):
            # The tool normally returns a plain string of results; some
            # wrappers return a dict. Either way, pass it through as context.
            return {"context": search_result}
        raise ValueError("Web search returned an unexpected type")
    except Exception as e:
        print(f"Web search failed: {e}")
        # Returning None here would leave 'context' unset and crash generate();
        # fall through to generation with an empty context instead.
        return {"context": ""}

def route_question(state):
    print("Step: Routing Query")
    question = state['question']
    output = question_router.invoke({"question": question})
    if output.get('choice') == "web_search":
        print("Step: Routing Query to Web Search")
        return "websearch"
    # Default to generation if the router returns 'generate' or anything unexpected
    print("Step: Routing Query to Generation")
    return "generate"
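# A quick way to sanity-check the router chain in isolation before wiring up
# the graph; a minimal sketch, assuming the model reliably emits JSON such as
# {"choice": "web_search"}. The sample questions are placeholders. Call this
# manually, e.g. from a REPL; it is not invoked anywhere below.
def smoke_test_router():
    for q in ["What is 2 + 2?", "Who won the football match yesterday?"]:
        decision = question_router.invoke({"question": q})
        print(f"{q!r} -> {decision}")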
# Build the state graph
workflow = StateGraph(GraphState)
workflow.add_node("websearch", web_search)
workflow.add_node("transform_query", transform_query)
workflow.add_node("generate", generate)
workflow.set_conditional_entry_point(
    route_question,
    {
        "websearch": "transform_query",
        "generate": "generate",
    },
)
workflow.add_edge("transform_query", "websearch")
workflow.add_edge("websearch", "generate")
workflow.add_edge("generate", END)

# Compile the agent
local_agent = workflow.compile()

# Run the agent; invoke() is blocking, so offload it to a worker thread
# to keep the FastAPI event loop responsive.
async def run_agent_parallel(query):
    output = await asyncio.to_thread(local_agent.invoke, {"question": query})
    if "generation" not in output:
        # Web search failed and no generated response was produced
        print("Web search failed, no generation produced.")
        return {"response": "Sorry, no answer could be found."}
    return {"response": output["generation"]}

@app.post("/ask/")
async def ask_question(request: QueryRequest):
    try:
        result = await run_agent_parallel(request.query)
        return result
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing the request: {e}")

# Start the API with Uvicorn
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
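# Example client call once the server is running; a minimal sketch using the
# `requests` package (pip install requests). The sample question is a
# placeholder; the endpoint, port, and JSON key match QueryRequest and the
# uvicorn.run() call above.
#
#   import requests
#   resp = requests.post(
#       "http://localhost:8000/ask/",
#       json={"query": "What are the latest developments in fusion energy?"},
#   )
#   print(resp.json()["response"])
#
# Or with curl:
#   curl -X POST http://localhost:8000/ask/ \
#        -H "Content-Type: application/json" \
#        -d '{"query": "What are the latest developments in fusion energy?"}'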