import os
import uuid

from pinecone import Pinecone
from langchain_community.document_loaders import (
    CSVLoader,
    Docx2txtLoader,
    PyPDFLoader,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings

# Maps a lower-cased file extension to the loader class that can read it.
# BUG FIX: the original mapped 'csv' to Docx2txtLoader (a .docx reader) and
# had no fallback, so unsupported extensions died with NameError on `loader`.
_LOADERS = {
    "csv": CSVLoader,
    "docx": Docx2txtLoader,
    "pdf": PyPDFLoader,
}


def create_vector_store_index(file_path):
    """Load a document, chunk it, embed the chunks, and upsert them to Pinecone.

    Parameters
    ----------
    file_path : str
        Path to a ``.csv``, ``.docx``, or ``.pdf`` file.

    Returns
    -------
    str
        A confirmation message once all batches are upserted.

    Raises
    ------
    ValueError
        If the file extension is not one of the supported types.

    Notes
    -----
    Reads ``PINECONE_API_KEY`` and ``PINECONE_INDEX`` from the environment.
    """
    # Keep the original parsing (last dot-separated token, trailing '/'
    # stripped) but normalize case so '.PDF' is recognized too.
    file_type = file_path.split(".")[-1].rstrip("/").lower()
    try:
        loader = _LOADERS[file_type](file_path)
    except KeyError:
        raise ValueError(f"Unsupported file type: {file_type!r}") from None

    pages = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=128,
    )
    docs = text_splitter.split_documents(pages)

    pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
    index = pc.Index(os.environ.get("PINECONE_INDEX"))
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2",
    )

    # Upsert in batches to stay under Pinecone's per-request payload limits.
    batch_size = 32
    for start in range(0, len(docs), batch_size):
        batch = docs[start:start + batch_size]
        ids = [str(uuid.uuid4()) for _ in batch]
        texts = [doc.page_content for doc in batch]
        embeds = embeddings.embed_documents(texts)
        # Store the raw chunk text alongside any loader-supplied metadata so
        # query results can surface the original passage.
        metadata = [{"text": doc.page_content, **doc.metadata} for doc in batch]
        # Materialize the (id, vector, metadata) tuples — passing a lazy zip
        # iterator relies on the client consuming it exactly once.
        index.upsert(vectors=list(zip(ids, embeds, metadata)))

    return "Vector store index is created."


def upload_and_create_vector_store(files):
    """Index every file in *files* and return the last status message.

    Returns ``None`` when *files* is empty (the original raised
    ``UnboundLocalError`` in that case).
    """
    index_success_msg = None
    for file in files:
        index_success_msg = create_vector_store_index(file)
    return index_success_msg