Spaces:

dsimeone
/

organic-chatbot

Running

daniel-simeone

improve quality

66c4741 about 1 month ago

1.61 kB

	"""
	Example script showing how to use the document ingestion system programmatically.
	"""
	from ingestion import DocumentIngestion


	def main():
	    """Run the ingestion example end to end.

	    Creates a ``DocumentIngestion`` instance, then, if at least one PDF path
	    or URL has been listed below, processes those sources, builds the vector
	    store, saves it under ``data/vector_store`` and prints the top three
	    results for a sample query. When both lists are empty, a hint asking the
	    user to fill them in is printed instead.
	    """
	    # Create the ingestion object with the embedding model name used by this example
	    pipeline = DocumentIngestion(embedding_model="all-MiniLM-L6-v2")

	    # Example 1: PDF inputs
	    pdf_sources = [
	        # Add your PDF file paths here
	        # "path/to/document1.pdf",
	        # "path/to/document2.pdf",
	    ]

	    # Example 2: URL inputs
	    url_sources = [
	        # Add URLs here
	        # "https://en.wikipedia.org/wiki/Artificial_intelligence",
	        # "https://huggingface.co/docs/transformers",
	    ]

	    # Nothing listed: tell the user what to edit and stop here
	    if not (pdf_sources or url_sources):
	        print("Please add PDF paths or URLs to the script to test ingestion.")
	        return

	    # Process documents
	    print("Processing documents...")
	    documents = pipeline.process_documents(pdf_paths=pdf_sources, urls=url_sources)
	    print(f"Processed {len(documents)} document chunks")

	    # Build the vector store, then save it
	    pipeline.build_vector_store()
	    pipeline.save("data/vector_store")

	    # Example search
	    query = "What is artificial intelligence?"
	    results = pipeline.search(query, k=3)

	    print(f"\nSearch results for: '{query}'")
	    print("-" * 50)
	    # Number the results from 1 for display
	    for i, result in enumerate(results, start=1):
	        print(f"\nResult {i}:")
	        print(f"Source: {result['metadata']['source']}")
	        print(f"Score: {result['score']:.4f}")
	        # Only the first 200 characters of each result's text are shown
	        print(f"Text: {result['text'][:200]}...")


	# Run the example only when this file is executed as a script, not when imported.
	if __name__ == "__main__":
	    main()