import os
import re
import math
from typing import List
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_groq import ChatGroq
from dotenv import load_dotenv

load_dotenv()
TABULAR_VERBOSE = os.environ.get("TABULAR_VERBOSE", "0") in ("1", "true", "True", "yes", "YES")

# Initialize Groq LLM for tabular data using specialized API key
TABULAR_MODEL = os.environ.get("GROQ_TABULAR_MODEL", os.environ.get("GROQ_MODEL_TABULAR", "qwen/qwen3-32b"))
GROQ_LLM = ChatGroq(
    groq_api_key=os.environ.get("GROQ_API_KEY_TABULAR", os.environ.get("GROQ_API_KEY")),
    model_name=TABULAR_MODEL
)

def get_answer_for_tabluar(
    data: str,
    questions: List[str],
    batch_size: int = 10,
    verbose: bool = False
) -> List[str]:
    """
    Query the Groq LLM for tabular data analysis, handling batches and preserving the order of answers.

    Args:
        data (str): Tabular context in markdown or plain text.
        questions (List[str]): List of questions to ask.
        batch_size (int): Maximum number of questions per batch.
        verbose (bool): If True, print raw LLM responses.

    Returns:
        List[str]: Ordered list of answers corresponding to the input questions.
    """

    def parse_numbered_answers(text: str, expected: int) -> List[str]:
        """

        Parse answers from a numbered list format ('1.', '2.', etc.)

        Use non-greedy capture with lookahead to stop at the next number or end.

        """
        # Non-greedy capture up to the next numbered item; \Z (end of string) prevents
        # re.MULTILINE from truncating multi-line answers at the first line break.
        pattern = re.compile(r"^\s*(\d{1,2})[\.)\-]\s*(.*?)(?=\n\s*\d{1,2}[\.)\-]\s*|\Z)", re.MULTILINE | re.DOTALL)
        matches = pattern.findall(text)

        result = {}
        for num_str, answer in matches:
            try:
                num = int(num_str)
            except ValueError:
                continue
            if 1 <= num <= expected:
                clean_answer = re.sub(r'\s+', ' ', answer).strip()
                result[num] = clean_answer

        # If no structured matches, fall back to line-based heuristic
        if not result:
            lines = [ln.strip() for ln in text.strip().splitlines() if ln.strip()]
            for i in range(min(expected, len(lines))):
                result[i + 1] = lines[i]

        # Build fixed-length list
        answers = []
        for i in range(1, expected + 1):
            answers.append(result.get(i, f"Unable to answer question {i}"))

        return answers

    if not questions:
        return []

    # Process questions in batches
    all_answers = []
    total_batches = math.ceil(len(questions) / batch_size)
    
    for batch_idx in range(total_batches):
        start = batch_idx * batch_size
        end = min(start + batch_size, len(questions))
        batch_questions = questions[start:end]
        
        print(f"Processing batch {batch_idx + 1}/{total_batches} ({len(batch_questions)} questions)")
        
        # Create numbered question list
        numbered_questions = "\n".join([f"{i+1}. {q}" for i, q in enumerate(batch_questions)])
        
        # Create prompt
        system_prompt = """You are an expert data analyst. Analyze the provided tabular data and answer the questions accurately.

Instructions:
- Answer each question based ONLY on the data provided
- If data is insufficient, state "Information not available in the provided data"
- Provide clear, concise answers
- Format your response as a numbered list (1., 2., 3., etc.)
- Do not add explanations unless specifically asked"""

        user_prompt = f"""Data:
{data}

Questions:
{numbered_questions}

Please provide numbered answers (1., 2., 3., etc.) for each question."""

        try:
            # Create messages
            messages = [
                SystemMessage(content=system_prompt),
                HumanMessage(content=user_prompt)
            ]
            
            # Get response from LLM
            response = GROQ_LLM.invoke(messages)
            raw_response = response.content or ""
            
            if verbose or TABULAR_VERBOSE:
                print(f"🟢 Raw LLM Response (batch {batch_idx + 1}):\n{raw_response[:1200]}\n--- END RAW ---")
            
            # Parse the response
            batch_answers = parse_numbered_answers(raw_response, len(batch_questions))
            all_answers.extend(batch_answers)
            
        except Exception as e:
            print(f"Error processing batch {batch_idx + 1}: {str(e)}")
            # Add error answers for this batch
            error_answers = [f"Error processing question: {str(e)}" for _ in batch_questions]
            all_answers.extend(error_answers)
    
    return all_answers
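
# Example usage: a minimal sketch showing how get_answer_for_tabluar might be called
# with a small markdown table. Assumes GROQ_API_KEY (or GROQ_API_KEY_TABULAR) is set
# in the environment; the sample table and questions below are illustrative only.
if __name__ == "__main__":
    sample_table = (
        "| product | units_sold | revenue |\n"
        "|---------|------------|---------|\n"
        "| A       | 120        | 2400    |\n"
        "| B       | 75         | 1875    |\n"
    )
    sample_questions = [
        "Which product sold the most units?",
        "What is the total revenue across all products?",
    ]
    answers = get_answer_for_tabluar(sample_table, sample_questions, verbose=True)
    for question, answer in zip(sample_questions, answers):
        print(f"Q: {question}\nA: {answer}\n")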