updated for streaming
Changed files:
- app.py (+13 -11)
- utils/generator.py (+77 -10)
app.py
CHANGED
@@ -1,13 +1,11 @@
 import gradio as gr
-from …
+from .generator import generate, generate_streaming
 
 # ---------------------------------------------------------------------
-# Gradio Interface with MCP support
+# Gradio Interface with MCP support and streaming
 # ---------------------------------------------------------------------
-
-
 ui = gr.Interface(
-    fn=…
+    fn=generate_streaming,  # Use streaming function
     inputs=[
         gr.Textbox(
             label="Query",
@@ -22,10 +20,15 @@ ui = gr.Interface(
             info="Provide the context/documents to use for answering. The API expects a list of dictionaries, but the UI should except anything"
         ),
     ],
-    outputs=…
-
-
-
+    outputs=gr.Textbox(
+        label="Generated Answer",
+        lines=6,
+        show_copy_button=True,
+        streaming=True  # Enable streaming in the output
+    ),
+    title="ChatFed Generation Module",
+    description="Ask questions based on provided context. Intended for use in RAG pipelines as an MCP server with other ChatFed modules (i.e. context supplied by semantic retriever service).",
+    api_name="generate"
 )
 
 # Launch with MCP server enabled
@@ -33,7 +36,6 @@ if __name__ == "__main__":
     ui.launch(
         server_name="0.0.0.0",
         server_port=7860,
-
+        mcp_server=True,
         show_error=True
     )
-
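For context on the UI change: Gradio streams output whenever `fn` is a (possibly async) generator, re-rendering the output component on every `yield`. The following is a minimal standalone sketch of that pattern, not part of this commit; the fake token loop and component labels are illustrative only, and unlike the commit's `generate_streaming` it accumulates the partial answer before yielding so the textbox always shows the full text so far.

import asyncio
import gradio as gr

async def fake_stream(query: str, context: str):
    # Stand-in for generate_streaming: yield a progressively longer answer.
    # Gradio replaces the output textbox content with each yielded value.
    answer = ""
    for token in f"Echoing '{query}' against {len(context)} characters of context".split():
        answer += token + " "
        await asyncio.sleep(0.05)  # simulate model latency
        yield answer

demo = gr.Interface(
    fn=fake_stream,  # async generator => streamed output
    inputs=[gr.Textbox(label="Query"), gr.Textbox(label="Context")],
    outputs=gr.Textbox(label="Generated Answer", lines=6),
)

if __name__ == "__main__":
    demo.launch()
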
utils/generator.py
CHANGED
@@ -2,7 +2,7 @@ import logging
 import asyncio
 import json
 import ast
-from typing import List, Dict, Any, Union
+from typing import List, Dict, Any, Union, Generator, AsyncGenerator
 from dotenv import load_dotenv
 
 # LangChain imports
@@ -24,8 +24,6 @@ PROVIDER = config.get("generator", "PROVIDER")
 MODEL = config.get("generator", "MODEL")
 MAX_TOKENS = int(config.get("generator", "MAX_TOKENS"))
 TEMPERATURE = float(config.get("generator", "TEMPERATURE"))
-INFERENCE_PROVIDER = config.get("generator", "INFERENCE_PROVIDER")
-ORGANIZATION = config.get("generator", "ORGANIZATION")
 
 # Set up authentication for the selected provider
 auth_config = get_auth(PROVIDER)
@@ -41,18 +39,21 @@ def get_chat_model():
         return ChatOpenAI(
             model=MODEL,
             openai_api_key=auth_config["api_key"],
+            streaming=True,  # Enable streaming
             **common_params
         )
     elif PROVIDER == "anthropic":
         return ChatAnthropic(
             model=MODEL,
             anthropic_api_key=auth_config["api_key"],
+            streaming=True,  # Enable streaming
             **common_params
         )
     elif PROVIDER == "cohere":
         return ChatCohere(
             model=MODEL,
             cohere_api_key=auth_config["api_key"],
+            streaming=True,  # Enable streaming
             **common_params
         )
     elif PROVIDER == "huggingface":
@@ -61,10 +62,9 @@ def get_chat_model():
             repo_id=MODEL,
             huggingfacehub_api_token=auth_config["api_key"],
             task="text-generation",
-            provider=INFERENCE_PROVIDER,
-            server_kwargs={"bill_to": ORGANIZATION},
             temperature=TEMPERATURE,
-            max_new_tokens=MAX_TOKENS
+            max_new_tokens=MAX_TOKENS,
+            streaming=True  # Enable streaming
         )
         return ChatHuggingFace(llm=llm)
     else:
@@ -143,7 +143,7 @@ def format_context_from_results(processed_results: List[Dict[str, Any]]) -> str:
 # ---------------------------------------------------------------------
 async def _call_llm(messages: list) -> str:
     """
-    Provider-agnostic LLM call using LangChain.
+    Provider-agnostic LLM call using LangChain (non-streaming).
 
     Args:
         messages: List of LangChain message objects
@@ -159,6 +159,25 @@ async def _call_llm(messages: list) -> str:
         logging.exception(f"LLM generation failed with provider '{PROVIDER}' and model '{MODEL}': {e}")
         raise
 
+async def _call_llm_streaming(messages: list) -> AsyncGenerator[str, None]:
+    """
+    Provider-agnostic streaming LLM call using LangChain.
+
+    Args:
+        messages: List of LangChain message objects
+
+    Yields:
+        Generated response chunks as strings
+    """
+    try:
+        # Use async stream for streaming responses
+        async for chunk in chat_model.astream(messages):
+            if hasattr(chunk, 'content') and chunk.content:
+                yield chunk.content
+    except Exception as e:
+        logging.exception(f"LLM streaming failed with provider '{PROVIDER}' and model '{MODEL}': {e}")
+        yield f"Error: {str(e)}"
+
 def build_messages(question: str, context: str) -> list:
     """
     Build messages in LangChain format.
@@ -222,9 +241,57 @@ async def generate(query: str, context: Union[str, List[Dict[str, Any]]]) -> str
     try:
         messages = build_messages(query, formatted_context)
         answer = await _call_llm(messages)
-
         return answer
-
     except Exception as e:
         logging.exception("Generation failed")
-        return f"Error: {str(e)}"
+        return f"Error: {str(e)}"
+
+async def generate_streaming(query: str, context: Union[str, List[Dict[str, Any]]]) -> AsyncGenerator[str, None]:
+    """
+    Generate a streaming answer to a query using provided context through RAG.
+
+    This function takes a user query and relevant context, then uses a language model
+    to generate a streaming answer based on the provided information.
+
+    Args:
+        query (str): User query
+        context (Union[str, List[Dict[str, Any]]]): Context as string or list of retrieval results
+
+    Yields:
+        str: Streaming chunks of the generated answer
+    """
+    if not query.strip():
+        yield "Error: Query cannot be empty"
+        return
+
+    # Handle both string context (for Gradio UI) and list context (from retriever)
+    if isinstance(context, list):
+        if not context:
+            yield "Error: No retrieval results provided"
+            return
+
+        # Process the retrieval results
+        processed_results = extract_relevant_fields(context)
+        formatted_context = format_context_from_results(processed_results)
+
+        if not formatted_context.strip():
+            yield "Error: No valid content found in retrieval results"
+            return
+
+    elif isinstance(context, str):
+        if not context.strip():
+            yield "Error: Context cannot be empty"
+            return
+        formatted_context = context
+
+    else:
+        yield "Error: Context must be either a string or list of retrieval results"
+        return
+
+    try:
+        messages = build_messages(query, formatted_context)
+        async for chunk in _call_llm_streaming(messages):
+            yield chunk
+    except Exception as e:
+        logging.exception("Streaming generation failed")
+        yield f"Error: {str(e)}"
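
For context on `_call_llm_streaming`: LangChain chat models expose `astream()`, which yields message chunks whose `.content` carries the incremental text. The following is a minimal standalone sketch of that pattern outside the ChatFed config machinery, not part of this commit; the provider, model name, and prompt are illustrative assumptions, and an OPENAI_API_KEY would be needed to actually run it.

import asyncio
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage, SystemMessage

async def main() -> None:
    # streaming=True plus astream() yields AIMessageChunk objects as they arrive
    chat_model = ChatOpenAI(model="gpt-4o-mini", streaming=True, temperature=0)
    messages = [
        SystemMessage(content="Answer strictly from the provided context."),
        HumanMessage(content="Context: ChatFed is a RAG pipeline.\n\nQuestion: What is ChatFed?"),
    ]
    async for chunk in chat_model.astream(messages):
        if chunk.content:  # skip empty keep-alive chunks
            print(chunk.content, end="", flush=True)
    print()

if __name__ == "__main__":
    asyncio.run(main())

Concatenating the yielded chunks reproduces what the non-streaming `_call_llm` would return in a single response.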