Spaces:
Running
Running
| """ | |
| Example script showing how to use the document ingestion system programmatically. | |
| """ | |
| from ingestion import DocumentIngestion | |
def main():
    """Run the document-ingestion demo from start to finish.

    A ``DocumentIngestion`` object is created with the
    "all-MiniLM-L6-v2" embedding model, then the two source lists below
    (empty until you fill them in) are inspected. When neither list has
    an entry, a hint is printed and the function returns. Otherwise the
    sources are passed to ``process_documents``, ``build_vector_store``
    and ``save("data/vector_store")`` are called, and one sample query is
    run through ``search`` with ``k=3``; each returned result is printed.
    """
    engine = DocumentIngestion(embedding_model="all-MiniLM-L6-v2")

    # Example 1: PDF files to ingest.
    pdf_paths = [
        # Add your PDF file paths here
        # "path/to/document1.pdf",
        # "path/to/document2.pdf",
    ]

    # Example 2: URLs to ingest.
    urls = [
        # Add URLs here
        # "https://en.wikipedia.org/wiki/Artificial_intelligence",
        # "https://huggingface.co/docs/transformers",
    ]

    # Guard clause: with no sources configured there is nothing to ingest.
    if not pdf_paths and not urls:
        print("Please add PDF paths or URLs to the script to test ingestion.")
        return

    print("Processing documents...")
    chunks = engine.process_documents(pdf_paths=pdf_paths, urls=urls)
    print(f"Processed {len(chunks)} document chunks")

    # Build the vector store, then save it.
    engine.build_vector_store()
    engine.save("data/vector_store")

    # Run one sample query against the freshly built store.
    question = "What is artificial intelligence?"
    hits = engine.search(question, k=3)

    print(f"\nSearch results for: '{question}'")
    print("-" * 50)
    for position, hit in enumerate(hits, start=1):
        # Each field is read immediately before it is printed, so a
        # result missing a key still shows the fields that precede it.
        print(f"\nResult {position}:")
        origin = hit['metadata']['source']
        print(f"Source: {origin}")
        score_value = hit['score']
        print(f"Score: {score_value:.4f}")
        excerpt = hit['text'][:200]
        print(f"Text: {excerpt}...")
# Run the demo only when this file is executed as a script; importing it
# as a module must not call main().
if __name__ == "__main__":
    main()