# rag-bajaj / LLM / tabular_answer.py
# (Hugging Face upload header: uploaded by quantumbit, "Upload 39 files", commit e8051be verified)
import os
import re
import math
from typing import List
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_groq import ChatGroq
from dotenv import load_dotenv
load_dotenv()  # pull API keys / model overrides from a local .env file
# Global verbosity switch for raw-LLM-response logging, controlled by the
# TABULAR_VERBOSE environment variable (any truthy spelling enables it).
TABULAR_VERBOSE = os.environ.get("TABULAR_VERBOSE", "0") in ("1", "true", "True", "yes", "YES")
# Initialize Groq LLM for tabular data using specialized API key.
# Model name resolution order: GROQ_TABULAR_MODEL, then GROQ_MODEL_TABULAR,
# then the hard-coded default "qwen/qwen3-32b".
TABULAR_MODEL = os.environ.get("GROQ_TABULAR_MODEL", os.environ.get("GROQ_MODEL_TABULAR", "qwen/qwen3-32b"))
# Dedicated key GROQ_API_KEY_TABULAR takes precedence; falls back to the
# shared GROQ_API_KEY when the tabular-specific key is not set.
GROQ_LLM = ChatGroq(
    groq_api_key=os.environ.get("GROQ_API_KEY_TABULAR", os.environ.get("GROQ_API_KEY")),
    model_name=TABULAR_MODEL
)
def get_answer_for_tabluar(
    data: str,
    questions: List[str],
    batch_size: int = 10,
    verbose: bool = False
) -> List[str]:
    """
    Query the Groq LLM for tabular data analysis, handling batches and
    preserving the order of answers.

    Args:
        data (str): Tabular context in markdown or plain text.
        questions (List[str]): List of questions to ask about the data.
        batch_size (int): Max number of questions per LLM call. Values
            below 1 are treated as 1 (the original raised ZeroDivisionError).
        verbose (bool): If True, print raw LLM responses. The TABULAR_VERBOSE
            environment flag enables the same logging globally.

    Returns:
        List[str]: Ordered list of answers corresponding to the input
        questions. Slots that cannot be answered get a placeholder string;
        a failed batch yields one error string per question in that batch,
        so the output length always equals ``len(questions)``.
    """
    def parse_numbered_answers(text: str, expected: int) -> List[str]:
        """
        Parse answers from a numbered-list response ('1.', '2.', etc.).

        Uses a non-greedy capture with a lookahead so each answer stops at
        the next numbered item or at end-of-text. If no numbered items are
        found, falls back to treating each non-empty line as the next answer.
        """
        pattern = re.compile(
            r"^\s*(\d{1,2})[\.)\-]\s*(.*?)(?=\n\s*\d{1,2}[\.)\-]\s*|$)",
            re.MULTILINE | re.DOTALL,
        )
        result = {}
        for num_str, answer in pattern.findall(text):
            try:
                num = int(num_str)
            except ValueError:
                continue
            if 1 <= num <= expected:
                # Collapse internal whitespace/newlines into single spaces.
                result[num] = re.sub(r'\s+', ' ', answer).strip()
        # Fallback heuristic: no structured matches, map lines to slots.
        if not result:
            lines = [ln.strip() for ln in text.strip().splitlines() if ln.strip()]
            for i in range(min(expected, len(lines))):
                result[i + 1] = lines[i]
        # Build a fixed-length list: every question slot gets some string.
        return [
            result.get(i, f"Unable to answer question {i}")
            for i in range(1, expected + 1)
        ]

    if not questions:
        return []

    # ROBUSTNESS FIX: batch_size <= 0 previously crashed with
    # ZeroDivisionError in math.ceil(len(questions) / batch_size).
    batch_size = max(1, batch_size)

    # Process questions in batches, accumulating answers in input order.
    all_answers = []
    total_batches = math.ceil(len(questions) / batch_size)
    for batch_idx in range(total_batches):
        start = batch_idx * batch_size
        batch_questions = questions[start:start + batch_size]
        print(f"Processing batch {batch_idx + 1}/{total_batches} ({len(batch_questions)} questions)")

        # BUG FIX: the original used "\\n".join(...), which inserted the
        # two-character sequence backslash-n into the prompt instead of
        # real newlines, so the LLM saw all questions on one line.
        numbered_questions = "\n".join(f"{i+1}. {q}" for i, q in enumerate(batch_questions))

        system_prompt = """You are an expert data analyst. Analyze the provided tabular data and answer the questions accurately.
Instructions:
- Answer each question based ONLY on the data provided
- If data is insufficient, state "Information not available in the provided data"
- Provide clear, concise answers
- Format your response as a numbered list (1., 2., 3., etc.)
- Do not add explanations unless specifically asked"""
        user_prompt = f"""Data:
{data}
Questions:
{numbered_questions}
Please provide numbered answers (1., 2., 3., etc.) for each question."""
        try:
            messages = [
                SystemMessage(content=system_prompt),
                HumanMessage(content=user_prompt)
            ]
            # Get response from LLM (any failure is handled per-batch below).
            response = GROQ_LLM.invoke(messages)
            raw_response = response.content or ""
            if verbose or TABULAR_VERBOSE:
                print(f"🟢 Raw LLM Response (batch {batch_idx + 1}):\n{raw_response[:1200]}\n--- END RAW ---")
            # Parse the response into ordered, fixed-length answers.
            all_answers.extend(parse_numbered_answers(raw_response, len(batch_questions)))
        except Exception as e:
            print(f"Error processing batch {batch_idx + 1}: {str(e)}")
            # Keep output aligned with input: one error string per question
            # in the failed batch.
            all_answers.extend(f"Error processing question: {str(e)}" for _ in batch_questions)
    return all_answers