import math
import os
import re
from typing import Dict, List

from dotenv import load_dotenv
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_groq import ChatGroq

load_dotenv()

# Opt-in debug printing of raw LLM responses, toggled via environment.
TABULAR_VERBOSE = os.environ.get("TABULAR_VERBOSE", "0") in ("1", "true", "True", "yes", "YES")

# Initialize Groq LLM for tabular data using a specialized API key.
# Falls back to the generic GROQ_API_KEY / default model when the
# tabular-specific environment variables are unset.
TABULAR_MODEL = os.environ.get(
    "GROQ_TABULAR_MODEL",
    os.environ.get("GROQ_MODEL_TABULAR", "qwen/qwen3-32b"),
)
GROQ_LLM = ChatGroq(
    groq_api_key=os.environ.get("GROQ_API_KEY_TABULAR", os.environ.get("GROQ_API_KEY")),
    model_name=TABULAR_MODEL,
)

# Compiled once at module load (was recompiled on every call).
# Matches numbered answers of the form "N." / "N)" / "N-", capturing
# non-greedily up to the next numbered item or end of text.
# NOTE: \d+ rather than \d{1,2} so batches larger than 99 questions
# still parse correctly.
_NUMBERED_ANSWER_RE = re.compile(
    r"^\s*(\d+)[\.)\-]\s*(.*?)(?=\n\s*\d+[\.)\-]\s*|$)",
    re.MULTILINE | re.DOTALL,
)


def get_answer_for_tabluar(
    data: str,
    questions: List[str],
    batch_size: int = 10,
    verbose: bool = False
) -> List[str]:
    """
    Query Groq LLM for tabular data analysis, handling batches and preserving
    order of answers.

    Args:
        data (str): Tabular context in markdown or plain-text.
        questions (List[str]): List of questions to ask.
        batch_size (int): Max number of questions per batch (values < 1 are
            treated as 1).
        verbose (bool): If True, print raw LLM responses.

    Returns:
        List[str]: Ordered list of answers corresponding to input questions.
            Batches that fail produce per-question error strings rather than
            raising, so the result always has len(questions) entries.
    """
    def parse_numbered_answers(text: str, expected: int) -> List[str]:
        """
        Parse answers from a numbered list format ('1.', '2.', etc.)
        Use non-greedy capture with lookahead to stop at the next number or end.
        Returns a fixed-length list of `expected` answers; missing numbers get
        a placeholder so positions stay aligned with the input questions.
        """
        matches = _NUMBERED_ANSWER_RE.findall(text)
        result: Dict[int, str] = {}
        for num_str, answer in matches:
            try:
                num = int(num_str)
            except ValueError:
                continue
            if 1 <= num <= expected:
                # Collapse internal whitespace/newlines into single spaces.
                clean_answer = re.sub(r'\s+', ' ', answer).strip()
                result[num] = clean_answer

        # If no structured matches, fall back to line-based heuristic:
        # assume the i-th non-empty line answers the i-th question.
        if not result:
            lines = [ln.strip() for ln in text.strip().splitlines() if ln.strip()]
            for i in range(min(expected, len(lines))):
                result[i + 1] = lines[i]

        # Build fixed-length list, filling gaps with a placeholder.
        answers = []
        for i in range(1, expected + 1):
            answers.append(result.get(i, f"Unable to answer question {i}"))
        return answers

    if not questions:
        return []

    # Guard against batch_size <= 0, which would raise ZeroDivisionError
    # in the ceil division below.
    batch_size = max(1, batch_size)

    # Process questions in batches to keep each prompt within model limits.
    all_answers = []
    total_batches = math.ceil(len(questions) / batch_size)

    for batch_idx in range(total_batches):
        start = batch_idx * batch_size
        end = min(start + batch_size, len(questions))
        batch_questions = questions[start:end]

        print(f"Processing batch {batch_idx + 1}/{total_batches} ({len(batch_questions)} questions)")

        # Create numbered question list.
        # BUGFIX: was "\\n".join(...) — joined questions with a literal
        # backslash-n two-character sequence instead of real newlines,
        # collapsing the numbered list onto one line in the prompt.
        numbered_questions = "\n".join([f"{i+1}. {q}" for i, q in enumerate(batch_questions)])

        # Create prompt
        system_prompt = """You are an expert data analyst. Analyze the provided tabular data and answer the questions accurately.

Instructions:
- Answer each question based ONLY on the data provided
- If data is insufficient, state "Information not available in the provided data"
- Provide clear, concise answers
- Format your response as a numbered list (1., 2., 3., etc.)
- Do not add explanations unless specifically asked"""

        user_prompt = f"""Data:
{data}

Questions:
{numbered_questions}

Please provide numbered answers (1., 2., 3., etc.) for each question."""

        try:
            # Create messages
            messages = [
                SystemMessage(content=system_prompt),
                HumanMessage(content=user_prompt)
            ]

            # Get response from LLM
            response = GROQ_LLM.invoke(messages)
            raw_response = response.content or ""
            if verbose or TABULAR_VERBOSE:
                print(f"🟢 Raw LLM Response (batch {batch_idx + 1}):\n{raw_response[:1200]}\n--- END RAW ---")

            # Parse the response into exactly len(batch_questions) answers.
            batch_answers = parse_numbered_answers(raw_response, len(batch_questions))
            all_answers.extend(batch_answers)

        except Exception as e:
            # Best-effort per batch: a failed batch yields error placeholders
            # so the returned list stays aligned with the input questions.
            print(f"Error processing batch {batch_idx + 1}: {str(e)}")
            # Add error answers for this batch
            error_answers = [f"Error processing question: {str(e)}" for _ in batch_questions]
            all_answers.extend(error_answers)

    return all_answers


# Correctly-spelled alias; the original (misspelled) name is kept as the
# primary definition for backward compatibility with existing callers.
get_answer_for_tabular = get_answer_for_tabluar