Spaces:

mriusero
/

A-Mistral-Agent

Sleeping

File size: 2,689 Bytes

from src.utils.tooling import tool
import PyPDF2
import re

@tool
def analyze_document(file_path: str, keywords: list) -> str:
    """
    Extracts specific information from a local PDF or local text document based on given keywords.
    (WARNING: This tool does not support URLs or web pages as input.)
    Args:
        file_path (str): The path to the PDF (.pdf) or text (.txt) document to analyze.
        keywords (list): A list of keywords to search for in the document.
    Returns:
        str: The extracted information as text (one line per keyword found, empty if nothing matches).
    """
    # Failure modes:
    #   * ValueError  - the path does not end in .pdf or .txt (case-insensitive).
    #   * Exception   - the file could not be opened or parsed; the original
    #                   error is attached as __cause__.

    def extract_text_from_pdf(file_path: str) -> str:
        """
        Extracts text from a PDF file.
        Args:
            file_path (str): The path to the PDF file.
        Returns:
            str: The extracted text of all pages, concatenated in page order.
        """
        try:
            with open(file_path, 'rb') as file:
                # PdfReader / .pages replace PdfFileReader / numPages / getPage,
                # which PyPDF2 3.x removed. Any release that provides
                # page.extract_text() also provides these names.
                reader = PyPDF2.PdfReader(file)
                # Pages are read lazily, so the join must run while the file
                # is still open. `or ''` guards against pages without text.
                return ''.join(page.extract_text() or '' for page in reader.pages)
        except Exception as e:
            raise Exception(f"Error reading PDF file: {e}") from e

    def extract_text_from_txt(file_path: str) -> str:
        """
        Extracts text from a text file.
        Args:
            file_path (str): The path to the UTF-8 encoded text file.
        Returns:
            str: The full content of the file.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except Exception as e:
            raise Exception(f"Error reading text file: {e}") from e

    def extract_information(text: str, keywords: list) -> str:
        """
        Extracts information based on keywords from the text.
        Args:
            text (str): The text to analyze.
            keywords (list): A list of keywords to search for in the text.
        Returns:
            str: One "Keyword '<keyword>': <matches>" line per keyword that occurs.
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(keywords, str):
            keywords = [keywords]

        extracted_info = []
        for keyword in keywords:
            if not keyword:
                # An empty keyword matches between every pair of characters
                # and only produces noise.
                continue
            # Lookarounds instead of \b: identical for plain words, but they
            # also work when the keyword starts or ends with a non-word
            # character (e.g. "C++"), where \b can never match.
            pattern = re.compile(
                r'(?<!\w){}(?!\w)'.format(re.escape(keyword)),
                re.IGNORECASE,
            )
            matches = pattern.findall(text)
            if matches:
                extracted_info.append(f"Keyword '{keyword}': {', '.join(matches)}")
        return "\n".join(extracted_info)

    lowered_path = file_path.lower()
    if lowered_path.endswith('.pdf'):
        text = extract_text_from_pdf(file_path)
    elif lowered_path.endswith('.txt'):
        text = extract_text_from_txt(file_path)
    else:
        raise ValueError("Unsupported file format. Please provide a PDF or text file.")

    return extract_information(text, keywords)