|
import os |
|
|
|
from langchain.document_loaders import PyPDFLoader |
|
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter |
|
from langchain.embeddings.openai import OpenAIEmbeddings |
|
from langchain.chains import RetrievalQA |
|
from langchain.llms import OpenAI |
|
|
|
from langchain.vectorstores import Pinecone |
|
import pinecone |
|
from dotenv import load_dotenv |
|
|
|
from consts import INDEX_NAME |
|
|
|
# Load variables from a local .env file (if one exists) into the process
# environment before they are read below.
load_dotenv()

# Configure the Pinecone client once at import time. Both values come from the
# environment; a missing variable is passed through as None rather than
# raising here.
pinecone.init(
    api_key=os.getenv("PINECONE_API_KEY"),
    environment=os.getenv("PINECONE_ENVIRONMENT_REGION"),
)
|
|
|
def ingestDataFromPdfIntoPinecone(pdf_path=None, *, chunk_size=1000, chunk_overlap=30):
    """Load a PDF, split it into chunks, embed them and store them in Pinecone.

    The PDF is read with ``PyPDFLoader``, split with
    ``RecursiveCharacterTextSplitter``, embedded with ``OpenAIEmbeddings`` and
    written to the Pinecone index named by ``INDEX_NAME``. Progress is
    reported on stdout.

    Args:
        pdf_path: Location of the PDF to ingest. When omitted, the original
            hard-coded CV path is used so existing no-argument callers keep
            working.
        chunk_size: Maximum chunk size handed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks handed to the text
            splitter.

    Returns:
        None.
    """
    if pdf_path is None:
        # Historical default: this path only exists on the original author's
        # machine. Pass ``pdf_path`` explicitly everywhere else.
        pdf_path = "/Users/anujmahajan/Desktop/Anuj Documents/Resume/PDF/Amazon/Anuj Mahajan - IUB MS CS - CV.pdf"

    print('Reading Data from PDF')
    loader = PyPDFLoader(file_path=pdf_path)
    documents = loader.load()

    # Separators are tried in order: paragraph break, line break, space, and
    # finally the empty string as a last resort.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""],
    )
    documents = text_splitter.split_documents(documents=documents)

    print(f"Going to insert {len(documents)} to Pinecone")
    embeddings = OpenAIEmbeddings()
    # Embeds every chunk and writes the vectors to the configured index.
    Pinecone.from_documents(documents, embeddings, index_name=INDEX_NAME)
    print("****** Added to Pinecone vectorstore vectors")
|
|
|
# Run the ingestion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    ingestDataFromPdfIntoPinecone()