# LLM-Langchain-Personal-Information-Helper-Bot / dataIngestionFromPdfToPinecone.py
# anujmaha's picture
# Upload 8 files
# 6951a91
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.vectorstores import Pinecone
import pinecone
from dotenv import load_dotenv
from consts import INDEX_NAME
# Load variables via python-dotenv before the environment is read below.
load_dotenv()

# Configure the Pinecone client once, at import time. Both settings are read
# from the environment; os.getenv yields None for a variable that is not set.
pinecone.init(
    api_key=os.getenv("PINECONE_API_KEY"),
    environment=os.getenv("PINECONE_ENVIRONMENT_REGION"),
)
def ingestDataFromPdfIntoPinecone(pdf_path=None, chunk_size=1000, chunk_overlap=30):
    """Load a PDF, split it into chunks, embed them and store them in Pinecone.

    The chunks are written to the index named by ``INDEX_NAME`` using
    ``OpenAIEmbeddings`` constructed with its default settings.

    Args:
        pdf_path: Path of the PDF to ingest. When ``None`` (the default) the
            original hard-coded CV path is used, so existing zero-argument
            callers keep working unchanged.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to
            ``RecursiveCharacterTextSplitter``.

    Returns:
        None. Progress is reported with ``print``.
    """
    print('Reading Data from PDF')
    if pdf_path is None:
        # Machine-specific fallback kept only for backward compatibility;
        # callers on any other machine should pass ``pdf_path`` explicitly.
        pdf_path = "/Users/anujmahajan/Desktop/Anuj Documents/Resume/PDF/Amazon/Anuj Mahajan - IUB MS CS - CV.pdf"

    pages = PyPDFLoader(file_path=pdf_path).load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""],
    )
    chunks = splitter.split_documents(documents=pages)
    print(f"Going to insert {len(chunks)} to Pinecone")

    # NOTE(review): OpenAIEmbeddings() presumably picks up its API key from the
    # environment populated at module import -- confirm against deployment.
    Pinecone.from_documents(chunks, OpenAIEmbeddings(), index_name=INDEX_NAME)
    print("****** Added to Pinecone vectorstore vectors")
# Script entry point: run the ingestion with its default arguments only when
# this file is executed directly, not when it is imported.
if __name__ == "__main__":
    ingestDataFromPdfIntoPinecone()