Spaces:
Runtime error
Runtime error
| import os | |
| import weaviate | |
| from llama_index import download_loader | |
| from llama_index.vector_stores import WeaviateVectorStore | |
| from llama_index import VectorStoreIndex, StorageContext | |
| from pathlib import Path | |
| import argparse | |
def get_pdf_files(base_path, loader):
    """
    Load every PDF found in a directory and its subdirectories.

    Despite the function's name, the result is not a list of paths: each PDF
    file is handed to ``loader.load_data`` and the loaded items are collected.

    Parameters:
    - base_path (str): The path to the starting directory.
    - loader: Object exposing ``load_data(file=<pathlib.Path>)`` that returns
      an iterable of loaded items for a single PDF file.

    Returns:
    - list: Everything ``loader`` produced for the PDF files found, visited in
      sorted directory and file-name order.

    Raises:
    - FileNotFoundError: If ``base_path`` does not exist.
    - NotADirectoryError: If ``base_path`` exists but is not a directory.
    """
    # Check if the base path exists and is a directory
    if not os.path.exists(base_path):
        raise FileNotFoundError(f"The specified base path does not exist: {base_path}")
    if not os.path.isdir(base_path):
        raise NotADirectoryError(
            f"The specified base_path is not a directory: {base_path}"
        )

    documents = []
    # Loop through all directories and files starting from the base path
    for root, dirs, files in os.walk(base_path):
        # Sorting ``dirs`` in place makes os.walk descend in a stable order,
        # so the result does not depend on filesystem listing order.
        dirs.sort()
        for filename in sorted(files):
            # Compare the extension case-insensitively so files such as
            # "REPORT.PDF" are not silently skipped.
            if filename.lower().endswith(".pdf"):
                documents.extend(loader.load_data(file=Path(root, filename)))
    return documents
def main(args):
    """
    Index the PDFs under ``args.pdf_dir`` in Weaviate and print a query answer.

    Parameters:
    - args: Parsed command-line namespace providing ``customer`` (used as the
      Weaviate index name), ``pdf_dir`` (directory scanned for PDFs) and
      ``query`` (the question sent to the query engine).

    Raises:
    - KeyError: If WEAVIATE_URL, WEAVIATE_API_KEY or OPENAI_API_KEY is not set
      in the environment.
    """
    # Validate the environment before downloading the loader and parsing PDFs,
    # so a misconfigured run fails immediately and names every missing variable.
    required_env = ("WEAVIATE_URL", "WEAVIATE_API_KEY", "OPENAI_API_KEY")
    missing = [name for name in required_env if name not in os.environ]
    if missing:
        raise KeyError(
            f"Missing required environment variable(s): {', '.join(missing)}"
        )

    PDFReader = download_loader("PDFReader")
    loader = PDFReader()
    documents = get_pdf_files(args.pdf_dir, loader)

    client = weaviate.Client(
        url=os.environ["WEAVIATE_URL"],
        auth_client_secret=weaviate.AuthApiKey(api_key=os.environ["WEAVIATE_API_KEY"]),
        additional_headers={"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]},
    )

    # construct vector store
    vector_store = WeaviateVectorStore(
        weaviate_client=client, index_name=args.customer, text_key="content"
    )
    # setting up the storage for the embeddings
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Build the index with from_documents, the documented constructor for
    # loaded documents; the first positional parameter of VectorStoreIndex(...)
    # itself is a list of nodes, not documents.
    index = VectorStoreIndex.from_documents(
        documents, storage_context=storage_context
    )
    query_engine = index.as_query_engine()
    response = query_engine.query(args.query)
    print(response)
if __name__ == "__main__":
    # Command-line options as (flag, default, help) triples, in the order
    # they are registered with the parser.
    option_specs = (
        ("--customer", "Ausy", "Customer name"),
        ("--pdf_dir", "./data", "Directory containing PDFs"),
        ("--query", "What is CX0 customer exprience office?", "Query to execute"),
    )
    cli = argparse.ArgumentParser(description="Process and query PDF files.")
    for flag, default_value, help_text in option_specs:
        cli.add_argument(flag, default=default_value, help=help_text)
    # Parse the command line and run the index-and-query pipeline.
    main(cli.parse_args())