Spaces:
Sleeping
Sleeping
File size: 2,689 Bytes
6078833 197e03a dc1621b 6078833 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 |
from src.utils.tooling import tool
import PyPDF2
import re
@tool
def analyze_document(file_path: str, keywords: list) -> str:
    """
    Extracts specific information from a local PDF or local text document based on given keywords.
    (WARNING: This tool does not support URLs or web pages as input.)

    The file type is chosen from the file extension (case-insensitive):
    ``.pdf`` files are read with PyPDF2, ``.txt`` files are read as UTF-8.
    Each keyword is searched case-insensitively as a whole term, and one
    line per keyword that occurs at least once is included in the result.

    Args:
        file_path (str): The path to the PDF or text document to analyze.
        keywords (list): A list of keywords to search for in the document.

    Returns:
        str: One line per matched keyword, formatted as
            ``Keyword '<keyword>': <match>, <match>, ...``; an empty string
            when no keyword matches.

    Raises:
        ValueError: If the file extension is neither ``.pdf`` nor ``.txt``.
        Exception: If the file cannot be opened or parsed.
    """

    def extract_text_from_pdf(file_path: str) -> str:
        """
        Extracts text from a PDF file.

        Args:
            file_path (str): The path to the PDF file.

        Returns:
            str: The concatenated text of every page in the PDF.
        """
        try:
            with open(file_path, 'rb') as file:
                # PdfFileReader / numPages / getPage were removed in
                # PyPDF2 3.0; PdfReader and .pages are the supported API and
                # ship alongside the extract_text() method used below.
                reader = PyPDF2.PdfReader(file)
                # Pages are parsed lazily, so join while the file is open.
                # A page with no text layer contributes an empty string.
                return ''.join(
                    page.extract_text() or '' for page in reader.pages
                )
        except Exception as e:
            # Same exception type and message as before; the original error
            # is chained so the root cause stays visible in tracebacks.
            raise Exception(f"Error reading PDF file: {e}") from e

    def extract_text_from_txt(file_path: str) -> str:
        """
        Extracts text from a text file.

        Args:
            file_path (str): The path to the text file.

        Returns:
            str: The full content of the file, decoded as UTF-8.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except Exception as e:
            raise Exception(f"Error reading text file: {e}") from e

    def extract_information(text: str, keywords: list) -> str:
        """
        Extracts information based on keywords from the text.

        Args:
            text (str): The text to analyze.
            keywords (list): A list of keywords to search for in the text.

        Returns:
            str: One line per keyword found, listing each occurrence as it
                appears in the text; empty if nothing was found.
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(keywords, str):
            keywords = [keywords]

        extracted_info = []
        for keyword in keywords:
            # An empty keyword would match at every boundary in the text.
            if not keyword:
                continue
            # Lookarounds instead of \b: \b needs a word character on one
            # side, so keywords that start or end with punctuation (such as
            # "C++") could never match. For plain words both are equivalent.
            pattern = re.compile(
                rf'(?<!\w){re.escape(keyword)}(?!\w)', re.IGNORECASE
            )
            matches = pattern.findall(text)
            if matches:
                extracted_info.append(
                    f"Keyword '{keyword}': {', '.join(matches)}"
                )
        return "\n".join(extracted_info)

    lowered_path = file_path.lower()
    if lowered_path.endswith('.pdf'):
        text = extract_text_from_pdf(file_path)
    elif lowered_path.endswith('.txt'):
        text = extract_text_from_txt(file_path)
    else:
        raise ValueError("Unsupported file format. Please provide a PDF or text file.")
    return extract_information(text, keywords)