Spaces:

Dhruv-Ty
/

chat

Sleeping

App Files Files Community

chat / src /utils.py

Dhruv-Ty

Update src/utils.py

64c1f54 verified 3 months ago

raw

history blame contribute delete

10.9 kB

	"""
	This module contains utility functions for text processing and other helper functions.
	"""

	import re
	import os
	import base64


	def has_meaningful_content(text):
	    """
	    Check if explanation has meaningful content.

	    Text is treated as NOT meaningful when it is falsy, contains only
	    whitespace, contains only separator characters (``=``, ``-``, ``_``,
	    ``*``) optionally mixed with whitespace, or contains the
	    "## REASONING" header while being shorter than 20 characters overall.

	    Args:
	        text (str): The text to check

	    Returns:
	        bool: True if the text has meaningful content, False otherwise
	    """
	    if not text:
	        return False

	    stripped_text = text.strip()

	    # A string made only of whitespace is truthy, so it must be rejected
	    # explicitly after stripping.
	    if not stripped_text:
	        return False

	    # Only separators (e.g. "=====", "---\n---", "=== ===") and whitespace.
	    if re.fullmatch(r'[=\-_*\s]+', stripped_text):
	        return False

	    # A bare "## REASONING" header with essentially nothing after it.
	    if "## REASONING" in stripped_text and len(stripped_text) < 20:
	        return False

	    return True


	def remove_reasoning_and_sources(text):
	    """
	    Remove reasoning, follow-up questions, and sources sections from the main response text.

	    The cleanup runs in several passes:

	    1. Regex removal of whole "reasoning", "follow-up questions" and
	       "sources"/"references" sections (plain ``name:``, ``**bold**`` and
	       ``#``-heading forms), each bounded by the next recognised header.
	    2. Removal of bracketed citations such as ``[1]`` or ``[source_id]``.
	    3. A line-by-line pass that drops any such section still present (for
	       example one that runs to the end of a text with no trailing newline).
	    4. Markdown links ``[label](http...)`` are collapsed to their label, and
	       a leading "Immediate Response" / "Main Response" header is removed
	       while its content is kept.

	    Args:
	        text (str): The text to clean

	    Returns:
	        str: Text without reasoning, follow-up questions, and sources sections.
	            Falsy input is returned unchanged.
	    """
	    if not text:
	        return text

	    # DOTALL lets ".*?" run across line breaks so a whole section is consumed.
	    section_flags = re.IGNORECASE | re.DOTALL

	    # First, remove any reasoning sections
	    pattern_reasoning = (
	        r'(\n+\s*reasoning:'
	        r'|\n+\s*\*{0,2}reasoning\*{0,2}:?'
	        r'|\n+\s*#{1,3}\s*reasoning)'
	        r'.*?'
	        r'(?=\n+\s*(?:#{1,3}|follow[ -]?up questions:|sources:|references:|\Z))'
	    )
	    cleaned_text = re.sub(pattern_reasoning, '', text, flags=section_flags)

	    # Remove follow-up questions sections
	    pattern_followup = (
	        r'(\n+\s*follow[ -]?up questions:'
	        r'|\n+\s*additional questions:'
	        r'|\n+\s*\*{0,2}follow[ -]?up questions\*{0,2}:?'
	        r'|\n+\s*#{1,3}\s*follow[ -]?up questions)'
	        r'.*?'
	        r'(?=\n+\s*(?:#{1,3}|reasoning:|sources:|references:|\Z))'
	    )
	    cleaned_text = re.sub(pattern_followup, '', cleaned_text, flags=section_flags)

	    # Then, remove any sources/references sections
	    pattern_sources = (
	        r'(\n+\s*sources:'
	        r'|\n+\s*references:'
	        r'|\n+\s*\*{0,2}sources\*{0,2}:?'
	        r'|\n+\s*\*{0,2}references\*{0,2}:?'
	        r'|\n+\s*#{1,3}\s*sources'
	        r'|\n+\s*#{1,3}\s*references)'
	        r'.*?'
	        r'(?=\n+\s*(?:#{1,3}|\Z))'
	    )
	    cleaned_text = re.sub(pattern_sources, '', cleaned_text, flags=section_flags)

	    # Also remove any source citations in the text (e.g., [1], [source_id])
	    cleaned_text = re.sub(r'\[([\w\d:_\-\.+]+)\]', '', cleaned_text)

	    # Line-level patterns, compiled once outside the loop.
	    skipped_header = re.compile(
	        r'^(\s*reasoning:'
	        r'|\s*follow[ -]?up questions:'
	        r'|\s*additional questions:'
	        r'|\s*sources:'
	        r'|\s*references:'
	        r'|\s*\*{0,2}reasoning\*{0,2}:?'
	        r'|\s*\*{0,2}follow[ -]?up questions\*{0,2}:?'
	        r'|\s*\*{0,2}sources\*{0,2}:?'
	        r'|\s*\*{0,2}references\*{0,2}:?'
	        r'|\s*#{1,3}\s*reasoning'
	        r'|\s*#{1,3}\s*follow[ -]?up questions'
	        r'|\s*#{1,3}\s*sources'
	        r'|\s*#{1,3}\s*references)',
	        re.IGNORECASE,
	    )
	    other_header = re.compile(r'^(\s*#{1,3}|\s*[a-zA-Z]+:)', re.IGNORECASE)

	    # Process line by line to handle sections more comprehensively
	    filtered_lines = []
	    skip_section = False
	    for line in cleaned_text.split('\n'):
	        # Header of a section we drop: skip it and everything that follows.
	        if skipped_header.match(line):
	            skip_section = True
	            continue
	        # Any other heading or "Label:" line ends the skipped section.
	        if skip_section and other_header.match(line):
	            skip_section = False
	        if not skip_section:
	            filtered_lines.append(line)

	    result = '\n'.join(filtered_lines).strip()

	    # Collapse remaining markdown links to their label, dropping the URL.
	    # NOTE(review): single-token labels such as "[WHO]" were already removed
	    # by the citation pass above, which leaves "(https://...)" behind --
	    # confirm whether this step should run before citation removal.
	    result = re.sub(r'\[([^\]]+)\]\(https?://[^)]+\)', r'\1', result)

	    # Drop a leading "Immediate Response" / "Main Response" header line;
	    # the content under it is preserved.
	    result = re.sub(r'(?i)^(\s*#{1,3}\s*)?immediate response:?\s*\n', '', result)
	    result = re.sub(r'(?i)^(\s*#{1,3}\s*)?main response:?\s*\n', '', result)

	    return result


	def clean_explanation(text):
	    """
	    Remove duplicate sources sections and data availability notes from explanation.

	    Steps, in order:

	    1. Drop any "DATA AVAILABILITY NOTE" heading section (up to the next
	       ``#`` heading or the end of the text).
	    2. Tidy numbered reasoning points that were emitted as headings
	       directly after a "REASONING" heading.
	    3. Split the text on "SOURCES" / "Sources" (optionally "USED")
	       headings. Everything before the first such heading is kept as the
	       body; only the content following the LAST sources heading is kept
	       and re-attached under a single "## Sources" heading.

	    NOTE(review): the content of a sources section extends to the next
	    sources heading (or end of text), so any non-sources heading placed
	    after a sources heading is treated as part of that section -- confirm
	    this matches the expected explanation layout.

	    Args:
	        text (str): The explanation text to clean

	    Returns:
	        str: Cleaned explanation text. Falsy input is returned unchanged.
	    """
	    if not text:
	        return text

	    # Remove DATA AVAILABILITY NOTE section
	    pattern_data_note = r'\n+\s*#{1,3}\s*DATA AVAILABILITY NOTE.*?(?=\n+\s*#{1,3}|\Z)'
	    cleaned_text = re.sub(pattern_data_note, '', text, flags=re.DOTALL)

	    # "## REASONING ... ### 1. point" -> strip the heading marks in front of
	    # the first numbered point so it is not rendered as a heading.
	    pattern_reasoning_headers = r'(#{1,3}\s*REASONING[^#]*?)#{1,3}\s*(\d+\.\s+)'
	    cleaned_text = re.sub(pattern_reasoning_headers, r'\1\2', cleaned_text, flags=re.DOTALL)

	    # Remove any "REASONING1." pattern which creates the heading effect
	    cleaned_text = re.sub(r'(#{1,3}\s*REASONING)(\d+\.)', r'\1', cleaned_text)

	    # Normalize all reasoning points to use the same format
	    cleaned_text = re.sub(r'(\n+)(\d+\.)', r'\1 \2', cleaned_text)

	    # Split on sources headings. Because the pattern is one capturing group,
	    # re.split returns [body, header, content, header, content, ...].
	    # "\A" also catches a sources heading on the very first line.
	    pattern_sources = r'((?:\A|\n+)\s*#{1,3}\s+(?:SOURCES|Sources)(?:\s+USED)?[^\n]*)'
	    sections = re.split(pattern_sources, cleaned_text)

	    source_content = ""
	    if len(sections) > 1:
	        # Keep only the text before the first sources heading; rebuilding
	        # from the split avoids substring-replace side effects when one
	        # sources section is a prefix of another.
	        cleaned_text = sections[0]
	        source_content = sections[-1]

	    # Clean up any double newlines
	    cleaned_text = re.sub(r'\n{3,}', '\n\n', cleaned_text)

	    # Add the sources section back with a consistent heading
	    source_content = source_content.strip()
	    if source_content:
	        # Numbered entries ("1.", "2.", ...) become bullet points.
	        source_content = re.sub(r'^(\s*)(\d+\.)', r'\1•', source_content, flags=re.MULTILINE)

	        cleaned_text = cleaned_text.strip()
	        if cleaned_text:
	            cleaned_text += "\n\n"
	        cleaned_text += "## Sources\n" + source_content

	    return cleaned_text.strip()


	def get_image_base64(image_path):
	    """
	    Encode image to base64.

	    The file is opened directly rather than probed with ``os.path.exists``
	    first, so there is no window between the check and the read in which
	    the file can disappear. Failures are reported with ``print`` and never
	    raised to the caller.

	    Args:
	        image_path (str): Path to the image file

	    Returns:
	        str: Base64 encoded image or None if error
	    """
	    try:
	        with open(image_path, "rb") as img_file:
	            return base64.b64encode(img_file.read()).decode()
	    except (FileNotFoundError, NotADirectoryError):
	        # Path (or one of its parent components) does not exist.
	        print(f"Image not found: {image_path}")
	        return None
	    except Exception as e:
	        # Best-effort helper: any other failure (permissions, path is a
	        # directory, invalid argument type, ...) is reported, not raised.
	        print(f"Error loading image: {e}")
	        return None


	def format_conversation_history(history, patient_info=None):
	    """
	    Format the conversation history into a string suitable for LLM processing.

	    Messages with the role "user" are emitted as ``PATIENT:`` lines and
	    messages with the role "assistant" as ``ASSISTANT:`` lines (role
	    comparison is case-insensitive). An assistant message with a non-empty
	    "explanation" additionally gets a ``REASONING:`` line. Messages with
	    empty content or any other role are skipped.

	    Missing or ``None`` values for "role", "content" and "explanation" are
	    treated as empty strings, and a ``None`` history is treated as empty.

	    Args:
	        history (list): List of message dictionaries
	        patient_info (dict, optional): Dictionary with patient information

	    Returns:
	        str: Formatted conversation text for report generation
	    """

	    def _as_text(value):
	        # Normalise a message field to a stripped string; None becomes "".
	        return "" if value is None else str(value).strip()

	    # Collect the pieces and join once instead of repeated string "+=".
	    pieces = ["# Medical Consultation\n\n"]

	    # Add patient info if provided
	    if patient_info:
	        pieces.append("## Patient Information\n")
	        pieces.append(f"* Name: {patient_info.get('name', '')}\n")
	        pieces.append(f"* Age: {patient_info.get('age', '')}\n")
	        pieces.append(f"* Gender: {patient_info.get('gender', '')}\n\n")

	    pieces.append("## Conversation Transcript\n\n")

	    for message in history or ():
	        content = _as_text(message.get("content"))
	        if not content:
	            continue  # Skip empty messages

	        role = _as_text(message.get("role")).lower()
	        if role == "user":
	            pieces.append(f"PATIENT: {content}\n\n")
	        elif role == "assistant":
	            pieces.append(f"ASSISTANT: {content}\n\n")
	            # Include explanations which often contain diagnostic reasoning
	            explanation = _as_text(message.get("explanation"))
	            if explanation:
	                pieces.append(f"REASONING: {explanation}\n\n")

	    return "".join(pieces)


	def format_follow_up_questions(questions_text):
	    """
	    Format follow-up questions text for display.

	    A leading "Follow-up questions" header line (plain or ``#`` heading) is
	    removed. Each line starting with a number ("1."), "-", "•" or "*"
	    begins a new question. Other non-empty lines either continue the
	    previous question or, when the previous question already ends with
	    "?", start a new one. Questions are renumbered from 1 and each is
	    guaranteed to end with a question mark.

	    Args:
	        questions_text (str): Raw follow-up questions text

	    Returns:
	        str: Formatted follow-up questions, one per line; "" for falsy input
	    """
	    if not questions_text:
	        return ""

	    # Clean up any header text
	    cleaned_text = re.sub(
	        r'(?i)^(\s*#{1,3}\s*)?follow[ -]?up questions:?\s*\n', '', questions_text
	    )

	    # Number or bullet marker, then the question text itself.
	    bullet_line = re.compile(r'^\s*(?:\d+\.|[-•*])\s*(.*)')

	    questions = []
	    for line in cleaned_text.split('\n'):
	        stripped = line.strip()
	        if not stripped:
	            continue

	        bullet_match = bullet_line.match(line)
	        if bullet_match:
	            question = bullet_match.group(1).strip()
	            # A marker with nothing after it is not a question; skip it
	            # rather than emitting an empty "N. ?" entry.
	            if question:
	                questions.append(question)
	        elif questions and not questions[-1].endswith('?'):
	            # Previous question is still open: treat this as its continuation.
	            questions[-1] += " " + stripped
	        else:
	            # No previous question, or it is already complete: start a new one.
	            questions.append(stripped)

	    # Number consistently and ensure each question ends with a question mark.
	    return '\n'.join(
	        f"{number}. {question if question.endswith('?') else question + '?'}"
	        for number, question in enumerate(questions, start=1)
	    )