# Final_Assignment_Template/utils/document_parser_tool.py
# Author: chevisli — "Alpha version app" (commit bfb26a0)
import os
import pandas as pd
from langchain_community.document_loaders import PyPDFLoader
from langchain.tools import Tool
from utils.file_downloader import FileDownloader
from dotenv import load_dotenv
load_dotenv()
class DocumentParserTool:
    """A tool for parsing PDF and XLSX documents.

    All public methods return strings — including error messages — rather
    than raising, so a LangChain agent can always read the outcome.
    """

    def __init__(self):
        """Initialize the DocumentParserTool with FileDownloader."""
        # Used to resolve URLs to local file paths (downloads when needed).
        self.downloader = FileDownloader()

    def parse_document_from_url_or_path(self, path_or_url: str) -> str:
        """
        Parse a document from URL or file path. Downloads if URL, uses directly if path.

        Args:
            path_or_url (str): URL to download from or file path to use

        Returns:
            str: Parsed content of the document prefixed with source/provenance
                info, or an error message string (never raises).
        """
        try:
            # Get file path (download if URL, verify if file path)
            file_path = self.downloader.get_file_path(path_or_url)
            # Parse the document
            result = self.parse_document(file_path)
            # Add context about the source so the agent can cite provenance
            source_info = f"Source: {'Downloaded from ' + path_or_url if self.downloader.is_url(path_or_url) else 'File at ' + path_or_url}\n"
            source_info += f"Local file path: {file_path}\n\n"
            return source_info + result
        except Exception as e:
            return f"Error processing {path_or_url}: {str(e)}"

    def parse_document(self, document_path: str) -> str:
        """
        Parse a document from the given file path.

        Dispatches on file extension: .pdf -> text extraction,
        .xlsx/.xls -> structured sheet summary.

        Args:
            document_path (str): Path to the document file

        Returns:
            str: Parsed content of the document, or an error message string.
        """
        if not os.path.exists(document_path):
            return f"Error: File not found at path: {document_path}"
        try:
            file_extension = os.path.splitext(document_path)[1].lower()
            if file_extension == ".pdf":
                return self._parse_pdf(document_path)
            elif file_extension in [".xlsx", ".xls"]:
                return self._parse_excel(document_path)
            else:
                return f"Error: Unsupported file format '{file_extension}'. Supported formats: PDF (.pdf), Excel (.xlsx, .xls)"
        except Exception as e:
            return f"Error parsing document: {str(e)}"

    def _parse_pdf(self, document_path: str) -> str:
        """Parse PDF document and extract text content.

        Uses PyPDFLoader's load_and_split and joins all chunk texts with a
        single space.
        """
        try:
            loader = PyPDFLoader(document_path)
            pages = loader.load_and_split()
            pdf_text = " ".join(page.page_content for page in pages)
            if not pdf_text.strip():
                # Scanned/image-only PDFs yield no extractable text.
                return (
                    "Warning: PDF appears to be empty or contains no extractable text."
                )
            return (
                f"PDF Content (from {os.path.basename(document_path)}):\n\n{pdf_text}"
            )
        except Exception as e:
            return f"Error parsing PDF: {str(e)}"

    def _parse_excel(self, document_path: str) -> str:
        """Parse Excel document and extract structured data.

        For each sheet: dimensions, column headers, up to 5 sample rows, and
        describe() statistics for numeric columns. Per-sheet read errors are
        reported inline without aborting the remaining sheets.
        """
        try:
            # Open the workbook once inside a context manager: the original
            # left the pd.ExcelFile handle unclosed (file-handle leak) and
            # re-opened the file via pd.read_excel for every sheet.
            with pd.ExcelFile(document_path) as excel_file:
                sheet_names = excel_file.sheet_names
                if not sheet_names:
                    return "Warning: Excel file contains no sheets."
                parsed_content = (
                    f"Excel Content (from {os.path.basename(document_path)}):\n\n"
                )
                parsed_content += f"Number of sheets: {len(sheet_names)}\n"
                parsed_content += f"Sheet names: {', '.join(sheet_names)}\n\n"
                for sheet_name in sheet_names:
                    try:
                        # Reuse the already-open workbook handle.
                        df = excel_file.parse(sheet_name)
                        parsed_content += f"--- Sheet: {sheet_name} ---\n"
                        parsed_content += (
                            f"Dimensions: {df.shape[0]} rows × {df.shape[1]} columns\n"
                        )
                        if df.empty:
                            parsed_content += "Sheet is empty.\n\n"
                            continue
                        parsed_content += (
                            f"Columns: {', '.join(df.columns.astype(str))}\n\n"
                        )
                        # Include first few rows as sample data
                        sample_rows = min(5, len(df))
                        parsed_content += f"Sample data (first {sample_rows} rows):\n"
                        parsed_content += df.head(sample_rows).to_string(index=False)
                        parsed_content += "\n\n"
                        # Include summary statistics for numeric columns
                        numeric_cols = df.select_dtypes(include=["number"]).columns
                        if not numeric_cols.empty:
                            parsed_content += "Summary statistics for numeric columns:\n"
                            parsed_content += df[numeric_cols].describe().to_string()
                            parsed_content += "\n\n"
                    except Exception as sheet_error:
                        # Keep going: report this sheet's failure inline.
                        parsed_content += (
                            f"Error reading sheet '{sheet_name}': {str(sheet_error)}\n\n"
                        )
                return parsed_content
        except Exception as e:
            return f"Error parsing Excel file: {str(e)}"
# Singleton parser shared by both LangChain tool wrappers below.
document_parser_tool_instance = DocumentParserTool()

# Capability sentence common to both tool descriptions (values concatenate
# to exactly the same description strings as before).
_SHARED_CAPABILITIES = (
    "For PDFs, extracts all text content. For Excel files, provides structured data "
    "including sheet names, dimensions, column headers, sample data, and summary statistics. "
)

# LangChain Tool wrapper: accepts local file paths only.
document_parser_tool = Tool(
    name="document_parser",
    description=(
        "Parse PDF and Excel (.xlsx, .xls) documents to extract their content. "
        + _SHARED_CAPABILITIES
        + "Input should be a file path to the document."
    ),
    func=document_parser_tool_instance.parse_document,
)

# LangChain Tool wrapper: additionally accepts http/https URLs (downloads first).
document_parser_url_tool = Tool(
    name="document_parser_url",
    description=(
        "Parse PDF and Excel (.xlsx, .xls) documents from URLs or file paths. "
        "If URL is provided, downloads the file first. If file path is provided, uses it directly. "
        + _SHARED_CAPABILITIES
        + "Input can be either a URL (http/https) or a local file path."
    ),
    func=document_parser_tool_instance.parse_document_from_url_or_path,
)
if __name__ == "__main__":
    print("Start testing document parser tool with file downloader integration")

    # Import here to avoid circular import
    from utils.agent_executor import create_agent_executor

    dl = FileDownloader()

    # Mix of URLs (downloaded first) and local paths (used directly).
    sources = [
        "https://arxiv.org/pdf/2501.00147",  # URL - should be downloaded
        # "https://agents-course-unit4-scoring.hf.space/files/7bd855d8-463d-4ed5-93ca-5fe35145f733",  # URL - should be downloaded
        # "./test_document.pdf",  # File path - should be used directly (if exists)
    ]

    to_cleanup = []  # Downloaded files, removed at the end.
    for source in sources:
        print(f"\n--- Processing: {source} ---")
        try:
            # Resolve to a local path (downloads when source is a URL).
            local_path = dl.get_file_path(source)
            print(f"Using file path: {local_path}")
            if dl.is_url(source):
                to_cleanup.append(local_path)

            # Direct call to the parser with the resolved path.
            result = document_parser_tool_instance.parse_document(local_path)
            preview = result if len(result) <= 500 else result[:500] + "..."
            print(f"Parse result preview: {preview}")

            # Same source, this time routed through the agent executor.
            agent_executor = create_agent_executor(tools=[document_parser_url_tool])
            prompt_with_input = f"""Please analyze the document from this source: {source}
Use the document_parser_url tool to download (if URL) and analyze the content.
Provide a comprehensive summary of what you find in the document.
The tool will handle both URLs (by downloading) and file paths (by using directly)."""
            print(f"\n--- Testing with Agent Executor (URL-capable tool) ---")
            response = agent_executor.invoke({"input": prompt_with_input})
            print("Agent Response:")
            print(response["output"])
        except Exception as e:
            print(f"Error processing {source}: {str(e)}")

    # Remove anything we downloaded during the run.
    print(f"\n--- Cleanup ---")
    for local_path in to_cleanup:
        try:
            dl.delete_file(local_path)
        except Exception as e:
            print(f"Warning: Could not delete {local_path}: {e}")
    print(f"Final downloader state: {repr(dl)}")