# rag-bajaj / LLM / tabular_answer.py
# (Hugging Face upload header: uploaded by quantumbit, "Upload 39 files", commit e8051be verified)
import os
import re
import math
from typing import List
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_groq import ChatGroq
from dotenv import load_dotenv
load_dotenv()  # pull API keys / model overrides from a local .env file
# Global verbosity switch for raw-LLM-response logging, controlled by the
# TABULAR_VERBOSE environment variable (any truthy spelling enables it).
TABULAR_VERBOSE = os.environ.get("TABULAR_VERBOSE", "0") in ("1", "true", "True", "yes", "YES")
# Initialize Groq LLM for tabular data using specialized API key.
# Model name resolution order: GROQ_TABULAR_MODEL, then GROQ_MODEL_TABULAR,
# then the hard-coded default "qwen/qwen3-32b".
TABULAR_MODEL = os.environ.get("GROQ_TABULAR_MODEL", os.environ.get("GROQ_MODEL_TABULAR", "qwen/qwen3-32b"))
# Dedicated key GROQ_API_KEY_TABULAR takes precedence; falls back to the
# shared GROQ_API_KEY when the tabular-specific key is not set.
GROQ_LLM = ChatGroq(
    groq_api_key=os.environ.get("GROQ_API_KEY_TABULAR", os.environ.get("GROQ_API_KEY")),
    model_name=TABULAR_MODEL
)
def get_answer_for_tabluar(
    data: str,
    questions: List[str],
    batch_size: int = 10,
    verbose: bool = False
) -> List[str]:
    """
    Query the Groq LLM for tabular data analysis, handling batches and
    preserving the order of answers.

    Args:
        data (str): Tabular context in markdown or plain text.
        questions (List[str]): List of questions to ask about the data.
        batch_size (int): Max number of questions per LLM call. Values
            below 1 are treated as 1 (the original raised ZeroDivisionError).
        verbose (bool): If True, print raw LLM responses. The TABULAR_VERBOSE
            environment flag enables the same logging globally.

    Returns:
        List[str]: Ordered list of answers corresponding to the input
        questions. Slots that cannot be answered get a placeholder string;
        a failed batch yields one error string per question in that batch,
        so the output length always equals ``len(questions)``.
    """
    def parse_numbered_answers(text: str, expected: int) -> List[str]:
        """
        Parse answers from a numbered-list response ('1.', '2.', etc.).

        Uses a non-greedy capture with a lookahead so each answer stops at
        the next numbered item or at end-of-text. If no numbered items are
        found, falls back to treating each non-empty line as the next answer.
        """
        pattern = re.compile(
            r"^\s*(\d{1,2})[\.)\-]\s*(.*?)(?=\n\s*\d{1,2}[\.)\-]\s*|$)",
            re.MULTILINE | re.DOTALL,
        )
        result = {}
        for num_str, answer in pattern.findall(text):
            try:
                num = int(num_str)
            except ValueError:
                continue
            if 1 <= num <= expected:
                # Collapse internal whitespace/newlines into single spaces.
                result[num] = re.sub(r'\s+', ' ', answer).strip()
        # Fallback heuristic: no structured matches, map lines to slots.
        if not result:
            lines = [ln.strip() for ln in text.strip().splitlines() if ln.strip()]
            for i in range(min(expected, len(lines))):
                result[i + 1] = lines[i]
        # Build a fixed-length list: every question slot gets some string.
        return [
            result.get(i, f"Unable to answer question {i}")
            for i in range(1, expected + 1)
        ]

    if not questions:
        return []

    # ROBUSTNESS FIX: batch_size <= 0 previously crashed with
    # ZeroDivisionError in math.ceil(len(questions) / batch_size).
    batch_size = max(1, batch_size)

    # Process questions in batches, accumulating answers in input order.
    all_answers = []
    total_batches = math.ceil(len(questions) / batch_size)
    for batch_idx in range(total_batches):
        start = batch_idx * batch_size
        batch_questions = questions[start:start + batch_size]
        print(f"Processing batch {batch_idx + 1}/{total_batches} ({len(batch_questions)} questions)")

        # BUG FIX: the original used "\\n".join(...), which inserted the
        # two-character sequence backslash-n into the prompt instead of
        # real newlines, so the LLM saw all questions on one line.
        numbered_questions = "\n".join(f"{i+1}. {q}" for i, q in enumerate(batch_questions))

        system_prompt = """You are an expert data analyst. Analyze the provided tabular data and answer the questions accurately.
Instructions:
- Answer each question based ONLY on the data provided
- If data is insufficient, state "Information not available in the provided data"
- Provide clear, concise answers
- Format your response as a numbered list (1., 2., 3., etc.)
- Do not add explanations unless specifically asked"""
        user_prompt = f"""Data:
{data}
Questions:
{numbered_questions}
Please provide numbered answers (1., 2., 3., etc.) for each question."""
        try:
            messages = [
                SystemMessage(content=system_prompt),
                HumanMessage(content=user_prompt)
            ]
            # Get response from LLM (any failure is handled per-batch below).
            response = GROQ_LLM.invoke(messages)
            raw_response = response.content or ""
            if verbose or TABULAR_VERBOSE:
                print(f"🟢 Raw LLM Response (batch {batch_idx + 1}):\n{raw_response[:1200]}\n--- END RAW ---")
            # Parse the response into ordered, fixed-length answers.
            all_answers.extend(parse_numbered_answers(raw_response, len(batch_questions)))
        except Exception as e:
            print(f"Error processing batch {batch_idx + 1}: {str(e)}")
            # Keep output aligned with input: one error string per question
            # in the failed batch.
            all_answers.extend(f"Error processing question: {str(e)}" for _ in batch_questions)
    return all_answers