| | import os |
| | import pandas as pd |
| | from langchain_community.document_loaders import PyPDFLoader |
| | from langchain.tools import Tool |
| | from utils.file_downloader import FileDownloader |
| | from dotenv import load_dotenv |
| |
|
# Load environment variables from a local .env file at import time so that
# downstream components (e.g. API clients used by the agent) see their config.
load_dotenv()
| |
|
| |
|
class DocumentParserTool:
    """A tool for parsing PDF and XLSX documents.

    All public methods return strings (content or error messages) and never
    raise, so they are safe to expose directly as LangChain tool functions.
    """

    def __init__(self):
        """Initialize the DocumentParserTool with FileDownloader."""
        # Downloader resolves URLs to local file paths (downloading when needed)
        # and passes local paths through untouched.
        self.downloader = FileDownloader()

    def parse_document_from_url_or_path(self, path_or_url: str) -> str:
        """
        Parse a document from URL or file path. Downloads if URL, uses directly if path.

        Args:
            path_or_url (str): URL to download from or file path to use

        Returns:
            str: Parsed content of the document, prefixed with source info,
                or an error message string (this method never raises)
        """
        try:
            file_path = self.downloader.get_file_path(path_or_url)
            result = self.parse_document(file_path)

            # Prefix the result with provenance so the caller (often an LLM
            # agent) can tell downloaded content apart from local files.
            source_info = f"Source: {'Downloaded from ' + path_or_url if self.downloader.is_url(path_or_url) else 'File at ' + path_or_url}\n"
            source_info += f"Local file path: {file_path}\n\n"

            return source_info + result

        except Exception as e:
            # Errors are returned as strings rather than raised so that agent
            # tool invocations never crash the executor.
            return f"Error processing {path_or_url}: {str(e)}"

    def parse_document(self, document_path: str) -> str:
        """
        Parse a document from the given file path.

        Dispatches on file extension: .pdf -> _parse_pdf, .xlsx/.xls ->
        _parse_excel; anything else yields an error string.

        Args:
            document_path (str): Path to the document file

        Returns:
            str: Parsed content of the document, or an error message string
        """
        if not os.path.exists(document_path):
            return f"Error: File not found at path: {document_path}"

        try:
            # Extension is lower-cased so ".PDF" and ".pdf" are treated alike.
            file_extension = os.path.splitext(document_path)[1].lower()

            if file_extension == ".pdf":
                return self._parse_pdf(document_path)
            elif file_extension in [".xlsx", ".xls"]:
                return self._parse_excel(document_path)
            else:
                return f"Error: Unsupported file format '{file_extension}'. Supported formats: PDF (.pdf), Excel (.xlsx, .xls)"

        except Exception as e:
            return f"Error parsing document: {str(e)}"

    def _parse_pdf(self, document_path: str) -> str:
        """Parse PDF document and extract text content."""
        try:
            loader = PyPDFLoader(document_path)
            pages = loader.load_and_split()
            # Concatenate all page chunks into one flat text blob.
            pdf_text = " ".join(page.page_content for page in pages)

            if not pdf_text.strip():
                # e.g. scanned/image-only PDFs with no embedded text layer.
                return (
                    "Warning: PDF appears to be empty or contains no extractable text."
                )

            return (
                f"PDF Content (from {os.path.basename(document_path)}):\n\n{pdf_text}"
            )

        except Exception as e:
            return f"Error parsing PDF: {str(e)}"

    def _parse_excel(self, document_path: str) -> str:
        """Parse Excel document and extract structured data.

        For each sheet reports dimensions, column headers, up to 5 sample
        rows, and describe() statistics for numeric columns.
        """
        try:
            # Context manager guarantees the workbook's file handle is closed
            # even on error (the previous version leaked the open ExcelFile).
            with pd.ExcelFile(document_path) as excel_file:
                sheet_names = excel_file.sheet_names

                if not sheet_names:
                    return "Warning: Excel file contains no sheets."

                parsed_content = (
                    f"Excel Content (from {os.path.basename(document_path)}):\n\n"
                )
                parsed_content += f"Number of sheets: {len(sheet_names)}\n"
                parsed_content += f"Sheet names: {', '.join(sheet_names)}\n\n"

                for sheet_name in sheet_names:
                    try:
                        # Parse from the already-open workbook instead of
                        # re-opening the file for every sheet.
                        df = excel_file.parse(sheet_name)

                        parsed_content += f"--- Sheet: {sheet_name} ---\n"
                        parsed_content += (
                            f"Dimensions: {df.shape[0]} rows × {df.shape[1]} columns\n"
                        )

                        if df.empty:
                            parsed_content += "Sheet is empty.\n\n"
                            continue

                        parsed_content += (
                            f"Columns: {', '.join(df.columns.astype(str))}\n\n"
                        )

                        # Show at most 5 rows to keep tool output compact.
                        sample_rows = min(5, len(df))
                        parsed_content += f"Sample data (first {sample_rows} rows):\n"
                        parsed_content += df.head(sample_rows).to_string(index=False)
                        parsed_content += "\n\n"

                        numeric_cols = df.select_dtypes(include=["number"]).columns
                        if not numeric_cols.empty:
                            parsed_content += "Summary statistics for numeric columns:\n"
                            parsed_content += df[numeric_cols].describe().to_string()
                            parsed_content += "\n\n"

                    except Exception as sheet_error:
                        # One bad sheet should not abort the whole workbook.
                        parsed_content += (
                            f"Error reading sheet '{sheet_name}': {str(sheet_error)}\n\n"
                        )

                return parsed_content

        except Exception as e:
            return f"Error parsing Excel file: {str(e)}"
| |
|
| |
|
| | |
# Shared singleton: both Tool wrappers below delegate to this one instance,
# so they share the same FileDownloader state.
document_parser_tool_instance = DocumentParserTool()

# Tool variant that expects a LOCAL file path (no downloading).
document_parser_tool = Tool(
    name="document_parser",
    description=(
        "Parse PDF and Excel (.xlsx, .xls) documents to extract their content. "
        "For PDFs, extracts all text content. For Excel files, provides structured data "
        "including sheet names, dimensions, column headers, sample data, and summary statistics. "
        "Input should be a file path to the document."
    ),
    func=document_parser_tool_instance.parse_document,
)

# Tool variant that additionally accepts http/https URLs and downloads the
# file before parsing.
document_parser_url_tool = Tool(
    name="document_parser_url",
    description=(
        "Parse PDF and Excel (.xlsx, .xls) documents from URLs or file paths. "
        "If URL is provided, downloads the file first. If file path is provided, uses it directly. "
        "For PDFs, extracts all text content. For Excel files, provides structured data "
        "including sheet names, dimensions, column headers, sample data, and summary statistics. "
        "Input can be either a URL (http/https) or a local file path."
    ),
    func=document_parser_tool_instance.parse_document_from_url_or_path,
)
| |
|
if __name__ == "__main__":
    # Manual integration test: download a real PDF, parse it directly, then
    # exercise the URL-capable tool through an agent executor. Requires
    # network access and agent credentials — not a unit test.
    print("Start testing document parser tool with file downloader integration")

    # Imported lazily so the module itself does not depend on the agent stack.
    from utils.agent_executor import create_agent_executor

    downloader = FileDownloader()

    # Inputs may be URLs or local paths; get_file_path handles both.
    test_files = [
        "https://arxiv.org/pdf/2501.00147",
    ]

    # Track files we downloaded (vs pre-existing local paths) for cleanup.
    downloaded_files = []

    for test_input in test_files:
        print(f"\n--- Processing: {test_input} ---")

        try:
            file_path = downloader.get_file_path(test_input)
            print(f"Using file path: {file_path}")

            # Only delete files we actually downloaded, never user-supplied paths.
            if downloader.is_url(test_input):
                downloaded_files.append(file_path)

            # Direct (non-agent) parse, truncated preview to keep logs short.
            result = document_parser_tool_instance.parse_document(file_path)
            print(
                f"Parse result preview: {result[:500] + '...' if len(result) > 500 else result}"
            )

            # End-to-end check: let an agent drive the URL-capable tool.
            tools = [document_parser_url_tool]
            agent_executor = create_agent_executor(tools=tools)

            prompt_with_input = f"""Please analyze the document from this source: {test_input}

Use the document_parser_url tool to download (if URL) and analyze the content.
Provide a comprehensive summary of what you find in the document.

The tool will handle both URLs (by downloading) and file paths (by using directly)."""

            print(f"\n--- Testing with Agent Executor (URL-capable tool) ---")
            response = agent_executor.invoke({"input": prompt_with_input})
            print("Agent Response:")
            print(response["output"])

        except Exception as e:
            # Keep going: one bad input should not abort the whole test run.
            print(f"Error processing {test_input}: {str(e)}")

    # Best-effort cleanup of downloaded temp files.
    print(f"\n--- Cleanup ---")
    for file_path in downloaded_files:
        try:
            downloader.delete_file(file_path)
        except Exception as e:
            print(f"Warning: Could not delete {file_path}: {e}")

    print(f"Final downloader state: {repr(downloader)}")
| |
|