Spaces:

mriusero
/

A-Mistral-Agent

Sleeping

A-Mistral-Agent / src /tools /analyze_document.py

mriusero

feat: improve tools

dc1621b 4 months ago

2.69 kB

	from src.utils.tooling import tool
	import PyPDF2
	import re

	@tool
	def analyze_document(file_path: str, keywords: list) -> str:
	    """
	    Extracts specific information from a local PDF or local text document based on given keywords.
	    (WARNING: This tool does not support URLs or web pages as input.)
	    Args:
	        file_path (str): The path to the PDF or text document to analyze.
	        keywords (list): A list of keywords to search for in the document.
	    Returns:
	        str: The extracted information as text, one "Keyword '<keyword>': <match>, <match>, ..." line per keyword found. Keywords without any match are omitted, so an empty string means nothing was found.
	    Raises:
	        ValueError: If the file extension is neither .pdf nor .txt.
	        Exception: If the file cannot be opened or parsed (the original error is chained as the cause).
	    """
	    def extract_text_from_pdf(file_path: str) -> str:
	        """
	        Extracts text from a PDF file.
	        Args:
	            file_path (str): The path to the PDF file.
	        Returns:
	            str: The extracted text of all pages, separated by newlines.
	        """
	        try:
	            with open(file_path, 'rb') as file:
	                # PdfFileReader / numPages / getPage were removed in PyPDF2 3.0;
	                # PdfReader and .pages are available in every release that
	                # also provides page.extract_text().
	                reader = PyPDF2.PdfReader(file)
	                # Pages must be read while the file is still open. A newline
	                # between pages keeps the last word of one page from fusing
	                # with the first word of the next, and `or ''` guards against
	                # pages that yield no text.
	                return "\n".join(page.extract_text() or '' for page in reader.pages)
	        except Exception as e:
	            raise Exception(f"Error reading PDF file: {e}") from e

	    def extract_text_from_txt(file_path: str) -> str:
	        """
	        Extracts text from a text file.
	        Args:
	            file_path (str): The path to the UTF-8 encoded text file.
	        Returns:
	            str: The extracted text from the text file.
	        """
	        try:
	            with open(file_path, 'r', encoding='utf-8') as file:
	                return file.read()
	        except Exception as e:
	            raise Exception(f"Error reading text file: {e}") from e

	    def extract_information(text: str, keywords: list) -> str:
	        """
	        Extracts information based on keywords from the text.
	        Args:
	            text (str): The text to analyze.
	            keywords (list): A list of keywords to search for in the text.
	        Returns:
	            str: One line per keyword that occurs in the text, listing every case-insensitive whole-word match.
	        """
	        # A bare string would otherwise be iterated character by character.
	        if isinstance(keywords, str):
	            keywords = [keywords]

	        extracted_info = []
	        for keyword in keywords:
	            term = str(keyword)
	            # An empty or blank keyword would match at every word boundary.
	            if not term.strip():
	                continue
	            # Lookarounds instead of \b: \b requires a word character on one
	            # side, so keywords such as "C++" or ".NET" could never match
	            # next to whitespace or punctuation.
	            pattern = re.compile(rf'(?<!\w){re.escape(term)}(?!\w)', re.IGNORECASE)
	            matches = pattern.findall(text)
	            if matches:
	                extracted_info.append(f"Keyword '{keyword}': {', '.join(matches)}")
	        return "\n".join(extracted_info)

	    # Dispatch on the (case-insensitive) file extension.
	    lowered_path = file_path.lower()
	    if lowered_path.endswith('.pdf'):
	        text = extract_text_from_pdf(file_path)
	    elif lowered_path.endswith('.txt'):
	        text = extract_text_from_txt(file_path)
	    else:
	        raise ValueError("Unsupported file format. Please provide a PDF or text file.")

	    return extract_information(text, keywords)