import os
import re
import math
from typing import List
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_groq import ChatGroq
from dotenv import load_dotenv

load_dotenv()
TABULAR_VERBOSE = os.environ.get("TABULAR_VERBOSE", "0") in ("1", "true", "True", "yes", "YES")

# Initialize Groq LLM for tabular data using specialized API key
TABULAR_MODEL = os.environ.get("GROQ_TABULAR_MODEL", os.environ.get("GROQ_MODEL_TABULAR", "qwen/qwen3-32b"))
GROQ_LLM = ChatGroq(
    groq_api_key=os.environ.get("GROQ_API_KEY_TABULAR", os.environ.get("GROQ_API_KEY")),
    model_name=TABULAR_MODEL
)

def get_answer_for_tabluar(
    data: str,
    questions: List[str],
    batch_size: int = 10,
    verbose: bool = False
) -> List[str]:
    """
    Query the Groq LLM for tabular data analysis, handling batches and preserving the order of answers.

    Args:
        data (str): Tabular context in markdown or plain text.
        questions (List[str]): List of questions to ask.
        batch_size (int): Maximum number of questions per batch.
        verbose (bool): If True, print raw LLM responses.

    Returns:
        List[str]: Ordered list of answers corresponding to the input questions.
    """

    def parse_numbered_answers(text: str, expected: int) -> List[str]:
        """

        Parse answers from a numbered list format ('1.', '2.', etc.)

        Use non-greedy capture with lookahead to stop at the next number or end.

        """
        # Non-greedy capture up to the next numbered item; \Z (end of string) prevents
        # re.MULTILINE from truncating multi-line answers at the first line break.
        pattern = re.compile(r"^\s*(\d{1,2})[\.)\-]\s*(.*?)(?=\n\s*\d{1,2}[\.)\-]\s*|\Z)", re.MULTILINE | re.DOTALL)
        matches = pattern.findall(text)

        result = {}
        for num_str, answer in matches:
            try:
                num = int(num_str)
            except ValueError:
                continue
            if 1 <= num <= expected:
                clean_answer = re.sub(r'\s+', ' ', answer).strip()
                result[num] = clean_answer

        # If no structured matches, fall back to line-based heuristic
        if not result:
            lines = [ln.strip() for ln in text.strip().splitlines() if ln.strip()]
            for i in range(min(expected, len(lines))):
                result[i + 1] = lines[i]

        # Build fixed-length list
        answers = []
        for i in range(1, expected + 1):
            answers.append(result.get(i, f"Unable to answer question {i}"))

        return answers

    if not questions:
        return []

    # Process questions in batches
    all_answers = []
    total_batches = math.ceil(len(questions) / batch_size)
    
    for batch_idx in range(total_batches):
        start = batch_idx * batch_size
        end = min(start + batch_size, len(questions))
        batch_questions = questions[start:end]
        
        print(f"Processing batch {batch_idx + 1}/{total_batches} ({len(batch_questions)} questions)")
        
        # Create numbered question list
        numbered_questions = "\n".join([f"{i+1}. {q}" for i, q in enumerate(batch_questions)])
        
        # Create prompt
        system_prompt = """You are an expert data analyst. Analyze the provided tabular data and answer the questions accurately.

Instructions:
- Answer each question based ONLY on the data provided
- If data is insufficient, state "Information not available in the provided data"
- Provide clear, concise answers
- Format your response as a numbered list (1., 2., 3., etc.)
- Do not add explanations unless specifically asked"""

        user_prompt = f"""Data:
{data}

Questions:
{numbered_questions}

Please provide numbered answers (1., 2., 3., etc.) for each question."""

        try:
            # Create messages
            messages = [
                SystemMessage(content=system_prompt),
                HumanMessage(content=user_prompt)
            ]
            
            # Get response from LLM
            response = GROQ_LLM.invoke(messages)
            raw_response = response.content or ""
            
            if verbose or TABULAR_VERBOSE:
                print(f"🟢 Raw LLM Response (batch {batch_idx + 1}):\n{raw_response[:1200]}\n--- END RAW ---")
            
            # Parse the response
            batch_answers = parse_numbered_answers(raw_response, len(batch_questions))
            all_answers.extend(batch_answers)
            
        except Exception as e:
            print(f"Error processing batch {batch_idx + 1}: {str(e)}")
            # Add error answers for this batch
            error_answers = [f"Error processing question: {str(e)}" for _ in batch_questions]
            all_answers.extend(error_answers)
    
    return all_answers
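
# Example usage: a minimal sketch showing how get_answer_for_tabluar might be called
# with a small markdown table. Assumes GROQ_API_KEY (or GROQ_API_KEY_TABULAR) is set
# in the environment; the sample table and questions below are illustrative only.
if __name__ == "__main__":
    sample_table = (
        "| product | units_sold | revenue |\n"
        "|---------|------------|---------|\n"
        "| A       | 120        | 2400    |\n"
        "| B       | 75         | 1875    |\n"
    )
    sample_questions = [
        "Which product sold the most units?",
        "What is the total revenue across all products?",
    ]
    answers = get_answer_for_tabluar(sample_table, sample_questions, verbose=True)
    for question, answer in zip(sample_questions, answers):
        print(f"Q: {question}\nA: {answer}\n")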