# NOTE(review): removed non-Python extraction residue ("Spaces: / Sleeping / Sleeping",
# HuggingFace Spaces status text) that had been scraped into the file and broke parsing.
import os
import re
import math
from typing import List

from langchain_core.messages import SystemMessage, HumanMessage
from langchain_groq import ChatGroq
from dotenv import load_dotenv

# Pull API keys / model names from a local .env file into the environment.
load_dotenv()

# Module-wide verbose switch: any of these truthy spellings enables raw-response logging.
TABULAR_VERBOSE = os.environ.get("TABULAR_VERBOSE", "0") in ("1", "true", "True", "yes", "YES")

# Model selection: GROQ_TABULAR_MODEL wins, then legacy GROQ_MODEL_TABULAR, then the default.
TABULAR_MODEL = os.environ.get("GROQ_TABULAR_MODEL", os.environ.get("GROQ_MODEL_TABULAR", "qwen/qwen3-32b"))

# Groq client dedicated to tabular analysis; falls back to the general
# GROQ_API_KEY when no tabular-specific key is configured.
GROQ_LLM = ChatGroq(
    groq_api_key=os.environ.get("GROQ_API_KEY_TABULAR", os.environ.get("GROQ_API_KEY")),
    model_name=TABULAR_MODEL,
)
def get_answer_for_tabluar(
    data: str,
    questions: List[str],
    batch_size: int = 10,
    verbose: bool = False,
) -> List[str]:
    """
    Query the Groq LLM for tabular data analysis, batching questions and
    preserving the order of answers.

    Args:
        data (str): Tabular context in markdown or plain text.
        questions (List[str]): Questions to ask about the data.
        batch_size (int): Maximum number of questions per LLM call.
        verbose (bool): If True, print raw LLM responses (also enabled
            globally via the TABULAR_VERBOSE environment flag).

    Returns:
        List[str]: Answers ordered to match the input questions. A failed
        batch contributes per-question error strings instead of raising.
    """

    def parse_numbered_answers(text: str, expected: int) -> List[str]:
        """
        Parse answers from a numbered list ('1.', '2)', '3-', ...).

        Uses a non-greedy capture with a lookahead so each answer stops at
        the next item number or end of text. Falls back to a line-per-answer
        heuristic when no numbered items are found, and pads missing slots
        with a placeholder so the result always has `expected` entries.
        """
        pattern = re.compile(
            r"^\s*(\d{1,2})[\.)\-]\s*(.*?)(?=\n\s*\d{1,2}[\.)\-]\s*|$)",
            re.MULTILINE | re.DOTALL,
        )
        result = {}
        for num_str, answer in pattern.findall(text):
            try:
                num = int(num_str)
            except ValueError:
                continue  # defensive; the \d{1,2} group should always parse
            if 1 <= num <= expected:
                # Collapse internal whitespace/newlines to single spaces.
                result[num] = re.sub(r"\s+", " ", answer).strip()

        # Fallback: treat each non-empty line as one answer, in order.
        if not result:
            lines = [ln.strip() for ln in text.strip().splitlines() if ln.strip()]
            for i in range(min(expected, len(lines))):
                result[i + 1] = lines[i]

        # Fixed-length output: one slot per question, placeholder when missing.
        return [
            result.get(i, f"Unable to answer question {i}")
            for i in range(1, expected + 1)
        ]

    if not questions:
        return []

    all_answers: List[str] = []
    total_batches = math.ceil(len(questions) / batch_size)
    for batch_idx in range(total_batches):
        start = batch_idx * batch_size
        batch_questions = questions[start:start + batch_size]
        print(f"Processing batch {batch_idx + 1}/{total_batches} ({len(batch_questions)} questions)")

        # BUG FIX: join with a real newline, not the two-character literal
        # backslash-n ("\\n"), so the model sees one question per line.
        numbered_questions = "\n".join(f"{i+1}. {q}" for i, q in enumerate(batch_questions))

        system_prompt = """You are an expert data analyst. Analyze the provided tabular data and answer the questions accurately.
Instructions:
- Answer each question based ONLY on the data provided
- If data is insufficient, state "Information not available in the provided data"
- Provide clear, concise answers
- Format your response as a numbered list (1., 2., 3., etc.)
- Do not add explanations unless specifically asked"""
        user_prompt = f"""Data:
{data}
Questions:
{numbered_questions}
Please provide numbered answers (1., 2., 3., etc.) for each question."""

        try:
            messages = [
                SystemMessage(content=system_prompt),
                HumanMessage(content=user_prompt),
            ]
            # Single LLM call per batch; response.content may be None.
            response = GROQ_LLM.invoke(messages)
            raw_response = response.content or ""
            if verbose or TABULAR_VERBOSE:
                print(f"🟢 Raw LLM Response (batch {batch_idx + 1}):\n{raw_response[:1200]}\n--- END RAW ---")
            all_answers.extend(parse_numbered_answers(raw_response, len(batch_questions)))
        except Exception as e:
            # Best-effort: keep output aligned with input by emitting one
            # error string per question in the failed batch.
            print(f"Error processing batch {batch_idx + 1}: {str(e)}")
            all_answers.extend(f"Error processing question: {str(e)}" for _ in batch_questions)

    return all_answers