mtyrrell committed
Commit f256208 · Parent(s): d049b68
Files changed (1):
  1. utils/generator.py +129 -342

utils/generator.py CHANGED
@@ -3,7 +3,7 @@ import asyncio
  import json
  import ast
  import re
- from typing import List, Dict, Any, Union, Generator, AsyncGenerator
  from dotenv import load_dotenv

  # LangChain imports
@@ -17,188 +17,150 @@ from langchain_core.messages import SystemMessage, HumanMessage
  from .utils import getconfig, get_auth

  # ---------------------------------------------------------------------
- # Model / client initialization (non exaustive list of providers)
  # ---------------------------------------------------------------------
  config = getconfig("params.cfg")
-
  PROVIDER = config.get("generator", "PROVIDER")
  MODEL = config.get("generator", "MODEL")
  MAX_TOKENS = int(config.get("generator", "MAX_TOKENS"))
  TEMPERATURE = float(config.get("generator", "TEMPERATURE"))

- # Set up authentication for the selected provider
  auth_config = get_auth(PROVIDER)

- def get_chat_model():
      """Initialize the appropriate LangChain chat model based on provider"""
-     common_params = {
-         "temperature": TEMPERATURE,
-         "max_tokens": MAX_TOKENS,
      }

-     if PROVIDER == "openai":
-         return ChatOpenAI(
-             model=MODEL,
-             openai_api_key=auth_config["api_key"],
-             streaming=True, # Enable streaming
-             **common_params
-         )
-     elif PROVIDER == "anthropic":
-         return ChatAnthropic(
-             model=MODEL,
-             anthropic_api_key=auth_config["api_key"],
-             streaming=True, # Enable streaming
-             **common_params
-         )
-     elif PROVIDER == "cohere":
-         return ChatCohere(
-             model=MODEL,
-             cohere_api_key=auth_config["api_key"],
-             streaming=True, # Enable streaming
-             **common_params
-         )
-     elif PROVIDER == "huggingface":
-         # Initialize HuggingFaceEndpoint with explicit parameters
-         llm = HuggingFaceEndpoint(
-             repo_id=MODEL,
-             huggingfacehub_api_token=auth_config["api_key"],
-             task="text-generation",
-             temperature=TEMPERATURE,
-             max_new_tokens=MAX_TOKENS,
-             streaming=True # Enable streaming
-         )
-         return ChatHuggingFace(llm=llm)
-     else:
          raise ValueError(f"Unsupported provider: {PROVIDER}")
-
- # Initialize provider-agnostic chat model
- chat_model = get_chat_model()

  # ---------------------------------------------------------------------
- # Citation parsing and source filtering
  # ---------------------------------------------------------------------
- def parse_citations_from_response(response: str) -> List[int]:
-     """
-     Parse citation numbers from the generated response.
-
-     Args:
-         response: The generated response text
-
-     Returns:
-         List of unique citation numbers found in the response
-     """
-     # Find all citation patterns like [1], [2], [1][2], etc.
      citation_pattern = r'\[(\d+)\]'
      matches = re.findall(citation_pattern, response)
-
-     # Convert to integers and return unique values
-     citation_numbers = [int(match) for match in matches]
-     return sorted(list(set(citation_numbers)))

- def filter_sources_by_citations(processed_results: List[Dict[str, Any]], cited_numbers: List[int]) -> List[Dict[str, Any]]:
-     """
-     Filter sources to only include those that were cited in the response.
-
-     Args:
-         processed_results: All processed retrieval results
-         cited_numbers: List of citation numbers found in the response
-
-     Returns:
-         List of sources that were actually cited
-     """
      if not cited_numbers:
          return []

-     # Filter sources based on citation numbers (1-indexed)
      cited_sources = []
      for citation_num in cited_numbers:
-         # Convert to 0-indexed for list access
          source_index = citation_num - 1
          if 0 <= source_index < len(processed_results):
              cited_sources.append(processed_results[source_index])

      return cited_sources

- # ---------------------------------------------------------------------
- # Context processing - may need further refinement (i.e. to manage other data sources)
- # ---------------------------------------------------------------------
- def extract_relevant_fields(retrieval_results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-     """
-     Extract only relevant fields from retrieval results.
-
-     Args:
-         retrieval_results: List of JSON objects from retriever
-
-     Returns:
-         List of processed objects with only relevant fields
-     """
-     if isinstance(retrieval_results, str):
-         retrieval_results = ast.literal_eval(retrieval_results)
-
      processed_results = []

-     for result in retrieval_results:
-         # Extract the answer content
-         answer = result.get('answer', '')

-         # Extract document identification from metadata
-         metadata = result.get('answer_metadata', {})
-         doc_info = {
-             'answer': answer,
-             'filename': metadata.get('filename', 'Unknown'),
-             'page': metadata.get('page', 'Unknown'),
-             'year': metadata.get('year', 'Unknown'),
-             'source': metadata.get('source', 'Unknown'),
-             'document_id': metadata.get('_id', 'Unknown')
-         }

-         processed_results.append(doc_info)

-     return processed_results

- def format_context_from_results(processed_results: List[Dict[str, Any]]) -> str:
-     """
-     Format processed retrieval results into a context string for the LLM.
-
-     Args:
-         processed_results: List of processed objects with relevant fields
-
-     Returns:
-         Formatted context string
-     """
-     if not processed_results:
-         return ""
-
-     context_parts = []
-
-     for i, result in enumerate(processed_results, 1):
-         doc_reference = f"[Document {i}: {result['filename']}"
-         if result['page'] != 'Unknown':
-             doc_reference += f", Page {result['page']}"
-         if result['year'] != 'Unknown':
-             doc_reference += f", Year {result['year']}"
-         doc_reference += "]"

-         context_part = f"{doc_reference}\n{result['answer']}\n"
-         context_parts.append(context_part)

-     return "\n".join(context_parts)

  # ---------------------------------------------------------------------
- # Core generation function for both Gradio UI and MCP
  # ---------------------------------------------------------------------
  async def _call_llm(messages: list) -> str:
-     """
-     Provider-agnostic LLM call using LangChain (non-streaming).
-
-     Args:
-         messages: List of LangChain message objects
-
-     Returns:
-         Generated response content as string
-     """
      try:
-         # Use async invoke for better performance
          response = await chat_model.ainvoke(messages)
          return response.content.strip()
      except Exception as e:
@@ -206,17 +168,8 @@ async def _call_llm(messages: list) -> str:
          raise

  async def _call_llm_streaming(messages: list) -> AsyncGenerator[str, None]:
-     """
-     Provider-agnostic streaming LLM call using LangChain.
-
-     Args:
-         messages: List of LangChain message objects
-
-     Yields:
-         Generated response chunks as strings
-     """
      try:
-         # Use async stream for streaming responses
          async for chunk in chat_model.astream(messages):
              if hasattr(chunk, 'content') and chunk.content:
                  yield chunk.content
@@ -224,191 +177,50 @@ async def _call_llm_streaming(messages: list) -> AsyncGenerator[str, None]:
          logging.exception(f"LLM streaming failed with provider '{PROVIDER}' and model '{MODEL}': {e}")
          yield f"Error: {str(e)}"

- def build_messages(question: str, context: str) -> list:
-     """
-     Build messages in LangChain format.
-
-     Args:
-         question: The user's question
-         context: The relevant context for answering
-
-     Returns:
-         List of LangChain message objects
-     """
-     system_content = """
-     You are AuditQ&A, an AI Assistant created by Auditors and Data Scientist. \
-     You are given a question and extracted passages of the consolidated/departmental/thematic focus audit reports.\
-     Provide a clear and structured answer based on the passages/context provided and the guidelines.
-     Guidelines:
-     - If the passages have useful facts or numbers, use them in your answer.
-     - Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
-     - If it makes sense, use bullet points and lists to make your answers easier to understand.
-     - You do not need to use every passage. Only use the ones that help answer the question.
-     - Answer the USER question using only the CONTEXT provided.
-     - When referencing information from the context, use inline citations in square brackets like [1], [2], etc. to reference the document numbers shown in the context.
-     - Use multiple citations when information comes from multiple documents, like [1][2].
-     - Do not use the sentence 'Doc x says ...' to say where information came from, but rather just include the citation at the end of the sentence.
-     - If the context is insufficient, say "I don't have sufficient information to answer the question. Please try rephrasing your query."
-     """
-
-     user_content = f"### CONTEXT\n{context}\n\n### USER QUESTION\n{question}"
-
-     return [
-         SystemMessage(content=system_content),
-         HumanMessage(content=user_content)
-     ]
-
  async def generate(query: str, context: Union[str, List[Dict[str, Any]]], chatui_format: bool = False) -> Union[str, Dict[str, Any]]:
-     """
-     Generate an answer to a query using provided context through RAG.
-
-     This function takes a user query and relevant context, then uses a language model
-     to generate a comprehensive answer based on the provided information.
-
-     Args:
-         query (str): User query
-         context (Union[str, List[Dict[str, Any]]]): Context as string or list of retrieval results
-         chatui_format (bool): If True, return ChatUI format with sources
-
-     Returns:
-         Union[str, Dict]: The generated answer or ChatUI format response
-     """
      if not query.strip():
-         return {"error": "Query cannot be empty"} if chatui_format else "Error: Query cannot be empty"
-
-     processed_results = []
-
-     # Handle both string context (for Gradio UI) and list context (from retriever)
-     if isinstance(context, list):
-         if not context:
-             return {"error": "No retrieval results provided"} if chatui_format else "Error: No retrieval results provided"
-
-         # Process the retrieval results
-         processed_results = extract_relevant_fields(context)
-         formatted_context = format_context_from_results(processed_results)
-
-         if not formatted_context.strip():
-             return {"error": "No valid content found in retrieval results"} if chatui_format else "Error: No valid content found in retrieval results"
-
-     elif isinstance(context, str):
-         if not context.strip():
-             return {"error": "Context cannot be empty"} if chatui_format else "Error: Context cannot be empty"
-         formatted_context = context
-
-     else:
-         return {"error": "Context must be either a string or list of retrieval results"} if chatui_format else "Error: Context must be either a string or list of retrieval results"

      try:
-         messages = build_messages(query, formatted_context)
          answer = await _call_llm(messages)

          if chatui_format:
-             # Return ChatUI format
              result = {"answer": answer}
              if processed_results:
-                 # Parse citations from the response
-                 cited_numbers = parse_citations_from_response(answer)
-
-                 # Filter sources to only include cited ones
-                 cited_sources = filter_sources_by_citations(processed_results, cited_numbers)
-
-                 # Extract sources for ChatUI
-                 sources = []
-                 for result_item in cited_sources: # Only cited sources
-                     filename = result_item.get('filename', 'Unknown')
-                     page = result_item.get('page', 'Unknown')
-                     year = result_item.get('year', 'Unknown')
-
-                     # Create link using doc:// scheme
-                     link = f"doc://{filename}"
-
-                     # Create descriptive title
-                     title_parts = [filename]
-                     if page != 'Unknown':
-                         title_parts.append(f"Page {page}")
-                     if year != 'Unknown':
-                         title_parts.append(f"({year})")
-
-                     title = " - ".join(title_parts)
-
-                     sources.append({
-                         "link": link,
-                         "title": title
-                     })
-
-                 result["sources"] = sources
              return result
          else:
              return answer

      except Exception as e:
          logging.exception("Generation failed")
-         return {"error": str(e)} if chatui_format else f"Error: {str(e)}"

  async def generate_streaming(query: str, context: Union[str, List[Dict[str, Any]]], chatui_format: bool = False) -> AsyncGenerator[Union[str, Dict[str, Any]], None]:
-     """
-     Generate a streaming answer to a query using provided context through RAG.
-
-     This function takes a user query and relevant context, then uses a language model
-     to generate a streaming answer based on the provided information.
-
-     Args:
-         query (str): User query
-         context (Union[str, List[Dict[str, Any]]]): Context as string or list of retrieval results
-         chatui_format (bool): If True, yield ChatUI format events
-
-     Yields:
-         Union[str, Dict]: Streaming chunks or ChatUI format events
-     """
      if not query.strip():
          if chatui_format:
-             yield {"event": "error", "data": {"error": "Query cannot be empty"}}
          else:
-             yield "Error: Query cannot be empty"
-         return
-
-     processed_results = []
-
-     # Handle both string context (for Gradio UI) and list context (from retriever)
-     if isinstance(context, list):
-         if not context:
-             if chatui_format:
-                 yield {"event": "error", "data": {"error": "No retrieval results provided"}}
-             else:
-                 yield "Error: No retrieval results provided"
-             return
-
-         # Process the retrieval results
-         processed_results = extract_relevant_fields(context)
-         formatted_context = format_context_from_results(processed_results)
-
-         if not formatted_context.strip():
-             if chatui_format:
-                 yield {"event": "error", "data": {"error": "No valid content found in retrieval results"}}
-             else:
-                 yield "Error: No valid content found in retrieval results"
-             return
-
-     elif isinstance(context, str):
-         if not context.strip():
-             if chatui_format:
-                 yield {"event": "error", "data": {"error": "Context cannot be empty"}}
-             else:
-                 yield "Error: Context cannot be empty"
-             return
-         formatted_context = context
-
-     else:
-         if chatui_format:
-             yield {"event": "error", "data": {"error": "Context must be either a string or list of retrieval results"}}
-         else:
-             yield "Error: Context must be either a string or list of retrieval results"
          return

      try:
-         messages = build_messages(query, formatted_context)

-         # Stream the text response and accumulate it for citation parsing
          accumulated_response = ""
          async for chunk in _call_llm_streaming(messages):
              accumulated_response += chunk
@@ -419,44 +231,19 @@ async def generate_streaming(query: str, context: Union[str, List[Dict[str, Any]

          # Send sources at the end if available and in ChatUI format
          if chatui_format and processed_results:
-             # Parse citations from the complete response
-             cited_numbers = parse_citations_from_response(accumulated_response)
-
-             # Filter sources to only include cited ones
-             cited_sources = filter_sources_by_citations(processed_results, cited_numbers)
-
-             sources = []
-             for result in cited_sources: # Only cited sources
-                 filename = result.get('filename', 'Unknown')
-                 page = result.get('page', 'Unknown')
-                 year = result.get('year', 'Unknown')
-
-                 # Create link using doc:// scheme
-                 link = f"doc://{filename}"
-
-                 # Create descriptive title
-                 title_parts = [filename]
-                 if page != 'Unknown':
-                     title_parts.append(f"Page {page}")
-                 if year != 'Unknown':
-                     title_parts.append(f"({year})")
-
-                 title = " - ".join(title_parts)
-
-                 sources.append({
-                     "link": link,
-                     "title": title
-                 })
-
              yield {"event": "sources", "data": {"sources": sources}}

-         # Send end event for ChatUI format
          if chatui_format:
              yield {"event": "end", "data": {}}

      except Exception as e:
          logging.exception("Streaming generation failed")
          if chatui_format:
-             yield {"event": "error", "data": {"error": str(e)}}
          else:
-             yield f"Error: {str(e)}"

utils/generator.py (updated file)
  import json
  import ast
  import re
+ from typing import List, Dict, Any, Union, AsyncGenerator
  from dotenv import load_dotenv

  # LangChain imports

  from .utils import getconfig, get_auth

  # ---------------------------------------------------------------------
+ # Configuration and Model Initialization
  # ---------------------------------------------------------------------
  config = getconfig("params.cfg")
  PROVIDER = config.get("generator", "PROVIDER")
  MODEL = config.get("generator", "MODEL")
  MAX_TOKENS = int(config.get("generator", "MAX_TOKENS"))
  TEMPERATURE = float(config.get("generator", "TEMPERATURE"))

+ # Initialize chat model
  auth_config = get_auth(PROVIDER)
+ chat_model = _get_chat_model()

+ def _get_chat_model():
      """Initialize the appropriate LangChain chat model based on provider"""
+     common_params = {"temperature": TEMPERATURE, "max_tokens": MAX_TOKENS}
+
+     providers = {
+         "openai": lambda: ChatOpenAI(model=MODEL, openai_api_key=auth_config["api_key"], streaming=True, **common_params),
+         "anthropic": lambda: ChatAnthropic(model=MODEL, anthropic_api_key=auth_config["api_key"], streaming=True, **common_params),
+         "cohere": lambda: ChatCohere(model=MODEL, cohere_api_key=auth_config["api_key"], streaming=True, **common_params),
+         "huggingface": lambda: ChatHuggingFace(llm=HuggingFaceEndpoint(
+             repo_id=MODEL, huggingfacehub_api_token=auth_config["api_key"],
+             task="text-generation", temperature=TEMPERATURE, max_new_tokens=MAX_TOKENS, streaming=True
+         ))
      }

+     if PROVIDER not in providers:
          raise ValueError(f"Unsupported provider: {PROVIDER}")
+
+     return providers[PROVIDER]()
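
The new _get_chat_model replaces the old if/elif chain with a dict of zero-argument lambdas, so no SDK client is constructed until the configured provider's factory is actually called. A standalone sketch of that dispatch pattern, with stand-in factories and an illustrative provider value rather than anything read from params.cfg:

PROVIDER = "openai"  # illustrative; the real module reads this from params.cfg

def make_openai_client():
    return "openai client"      # stand-in for ChatOpenAI(...)

def make_anthropic_client():
    return "anthropic client"   # stand-in for ChatAnthropic(...)

providers = {
    "openai": make_openai_client,        # factories are stored, not called
    "anthropic": make_anthropic_client,
}

if PROVIDER not in providers:
    raise ValueError(f"Unsupported provider: {PROVIDER}")

chat_model = providers[PROVIDER]()       # only the selected factory runs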
 
  # ---------------------------------------------------------------------
+ # Core Processing Functions
  # ---------------------------------------------------------------------
+ def _parse_citations(response: str) -> List[int]:
+     """Parse citation numbers from response text"""
      citation_pattern = r'\[(\d+)\]'
      matches = re.findall(citation_pattern, response)
+     return sorted(list(set(int(match) for match in matches)))

+ def _extract_sources(processed_results: List[Dict[str, Any]], cited_numbers: List[int]) -> List[Dict[str, Any]]:
+     """Extract sources that were cited in the response"""
      if not cited_numbers:
          return []

      cited_sources = []
      for citation_num in cited_numbers:
          source_index = citation_num - 1
          if 0 <= source_index < len(processed_results):
              cited_sources.append(processed_results[source_index])

      return cited_sources
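
To see the two helpers work together, here is a small self-contained example that mirrors the same regex and 1-indexed lookup; the response text and source list are invented for illustration:

import re

response = "Arrears increased in FY2021 [1][3]. Controls remained weak [2]."
cited = sorted(set(int(m) for m in re.findall(r'\[(\d+)\]', response)))   # [1, 2, 3]

sources = [{"filename": "report_a.pdf"}, {"filename": "report_b.pdf"}, {"filename": "report_c.pdf"}]
cited_sources = [sources[n - 1] for n in cited if 0 <= n - 1 < len(sources)]
# cited_sources keeps only the documents the answer actually referenced, in citation order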
+ def _process_context(context: Union[str, List[Dict[str, Any]]]) -> tuple[str, List[Dict[str, Any]]]:
+     """Process context and return formatted context string and processed results"""
      processed_results = []

+     if isinstance(context, list):
+         if not context:
+             raise ValueError("No retrieval results provided")
+
+         # Extract relevant fields from retrieval results
+         for result in context:
+             if isinstance(result, str):
+                 result = ast.literal_eval(result)
+
+             metadata = result.get('answer_metadata', {})
+             doc_info = {
+                 'answer': result.get('answer', ''),
+                 'filename': metadata.get('filename', 'Unknown'),
+                 'page': metadata.get('page', 'Unknown'),
+                 'year': metadata.get('year', 'Unknown'),
+                 'source': metadata.get('source', 'Unknown'),
+                 'document_id': metadata.get('_id', 'Unknown')
+             }
+             processed_results.append(doc_info)
+
+         # Format context string
+         context_parts = []
+         for i, result in enumerate(processed_results, 1):
+             doc_ref = f"[Document {i}: {result['filename']}"
+             if result['page'] != 'Unknown':
+                 doc_ref += f", Page {result['page']}"
+             if result['year'] != 'Unknown':
+                 doc_ref += f", Year {result['year']}"
+             doc_ref += "]"
+             context_parts.append(f"{doc_ref}\n{result['answer']}\n")
+
+         formatted_context = "\n".join(context_parts)

+     elif isinstance(context, str):
+         if not context.strip():
+             raise ValueError("Context cannot be empty")
+         formatted_context = context
+     else:
+         raise ValueError("Context must be either a string or list of retrieval results")
+
+     return formatted_context, processed_results
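
For orientation, a retrieval item consumed by _process_context is assumed to look roughly like the dict below; the field names come from the diff, while the values are invented:

# Hypothetical retriever item (illustrative values only)
item = {
    "answer": "The audit noted unreconciled balances of UGX 1.2bn.",
    "answer_metadata": {
        "filename": "consolidated_report_2022.pdf",
        "page": 14,
        "year": 2022,
        "source": "OAG",
        "_id": "abc123",
    },
}
# For a single such item the formatted context would read:
# [Document 1: consolidated_report_2022.pdf, Page 14, Year 2022]
# The audit noted unreconciled balances of UGX 1.2bn.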
+
+ def _build_messages(question: str, context: str) -> list:
+     """Build messages in LangChain format"""
+     system_content = """You are AuditQ&A, an AI Assistant created by Auditors and Data Scientist. \
+     You are given a question and extracted passages of the consolidated/departmental/thematic focus audit reports.\
+     Provide a clear and structured answer based on the passages/context provided and the guidelines.
+     Guidelines:
+     - If the passages have useful facts or numbers, use them in your answer.
+     - Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
+     - If it makes sense, use bullet points and lists to make your answers easier to understand.
+     - You do not need to use every passage. Only use the ones that help answer the question.
+     - Answer the USER question using only the CONTEXT provided.
+     - When referencing information from the context, use inline citations in square brackets like [1], [2], etc. to reference the document numbers shown in the context.
+     - Use multiple citations when information comes from multiple documents, like [1][2].
+     - Do not use the sentence 'Doc x says ...' to say where information came from, but rather just include the citation at the end of the sentence.
+     - If the context is insufficient, say "I don't have sufficient information to answer the question. Please try rephrasing your query."
+     """

+     user_content = f"### CONTEXT\n{context}\n\n### USER QUESTION\n{question}"
+     return [SystemMessage(content=system_content), HumanMessage(content=user_content)]

+ def _create_sources_list(cited_sources: List[Dict[str, Any]]) -> List[Dict[str, str]]:
+     """Create sources list for ChatUI format"""
+     sources = []
+     for result in cited_sources:
+         filename = result.get('filename', 'Unknown')
+         page = result.get('page', 'Unknown')
+         year = result.get('year', 'Unknown')

+         link = f"doc://{filename}"
+         title_parts = [filename]
+         if page != 'Unknown':
+             title_parts.append(f"Page {page}")
+         if year != 'Unknown':
+             title_parts.append(f"({year})")

+         sources.append({"link": link, "title": " - ".join(title_parts)})

+     return sources
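
A single cited document therefore surfaces in the ChatUI payload as a link/title pair along these lines (illustrative filename and metadata, not taken from the repo):

{"link": "doc://consolidated_report_2022.pdf",
 "title": "consolidated_report_2022.pdf - Page 14 - (2022)"}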

  # ---------------------------------------------------------------------
+ # LLM Call Functions
  # ---------------------------------------------------------------------
  async def _call_llm(messages: list) -> str:
+     """Provider-agnostic LLM call using LangChain (non-streaming)"""
      try:
          response = await chat_model.ainvoke(messages)
          return response.content.strip()
      except Exception as e:

          raise

  async def _call_llm_streaming(messages: list) -> AsyncGenerator[str, None]:
+     """Provider-agnostic streaming LLM call using LangChain"""
      try:
          async for chunk in chat_model.astream(messages):
              if hasattr(chunk, 'content') and chunk.content:
                  yield chunk.content

          logging.exception(f"LLM streaming failed with provider '{PROVIDER}' and model '{MODEL}': {e}")
          yield f"Error: {str(e)}"

+ # ---------------------------------------------------------------------
+ # Main Generation Functions
+ # ---------------------------------------------------------------------
  async def generate(query: str, context: Union[str, List[Dict[str, Any]]], chatui_format: bool = False) -> Union[str, Dict[str, Any]]:
+     """Generate an answer to a query using provided context through RAG"""
      if not query.strip():
+         error_msg = "Query cannot be empty"
+         return {"error": error_msg} if chatui_format else f"Error: {error_msg}"

      try:
+         formatted_context, processed_results = _process_context(context)
+         messages = _build_messages(query, formatted_context)
          answer = await _call_llm(messages)

          if chatui_format:
              result = {"answer": answer}
              if processed_results:
+                 cited_numbers = _parse_citations(answer)
+                 cited_sources = _extract_sources(processed_results, cited_numbers)
+                 result["sources"] = _create_sources_list(cited_sources)
              return result
          else:
              return answer

      except Exception as e:
          logging.exception("Generation failed")
+         error_msg = str(e)
+         return {"error": error_msg} if chatui_format else f"Error: {error_msg}"

  async def generate_streaming(query: str, context: Union[str, List[Dict[str, Any]]], chatui_format: bool = False) -> AsyncGenerator[Union[str, Dict[str, Any]], None]:
+     """Generate a streaming answer to a query using provided context through RAG"""
      if not query.strip():
+         error_msg = "Query cannot be empty"
          if chatui_format:
+             yield {"event": "error", "data": {"error": error_msg}}
          else:
+             yield f"Error: {error_msg}"
          return

      try:
+         formatted_context, processed_results = _process_context(context)
+         messages = _build_messages(query, formatted_context)

+         # Stream the response and accumulate for citation parsing (filter out any sources that were not cited)
          accumulated_response = ""
          async for chunk in _call_llm_streaming(messages):
              accumulated_response += chunk

          # Send sources at the end if available and in ChatUI format
          if chatui_format and processed_results:
+             cited_numbers = _parse_citations(accumulated_response)
+             cited_sources = _extract_sources(processed_results, cited_numbers)
+             sources = _create_sources_list(cited_sources)
              yield {"event": "sources", "data": {"sources": sources}}

+         # Send END event for ChatUI format
          if chatui_format:
              yield {"event": "end", "data": {}}

      except Exception as e:
          logging.exception("Streaming generation failed")
+         error_msg = str(e)
          if chatui_format:
+             yield {"event": "error", "data": {"error": error_msg}}
          else:
+             yield f"Error: {error_msg}"
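
Assuming the module is importable as utils.generator and the provider credentials expected by params.cfg are in place, the refactored entry points might be exercised roughly like this (untested sketch; the query and context strings are invented):

import asyncio
from utils.generator import generate, generate_streaming  # assumes the repo's package layout

async def main():
    # Plain-string context, plain-string answer
    answer = await generate("What were the main audit findings?", "Sample context passage.")
    print(answer)

    # ChatUI-style streaming: text chunks followed by "sources" and "end" events
    async for event in generate_streaming("What were the main audit findings?",
                                          "Sample context passage.", chatui_format=True):
        print(event)

asyncio.run(main())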