"""Index a PDF into Pinecone and run a sample similarity search.

Steps performed at import/run time:

1. Load environment variables from a ``.env`` file (overriding existing ones).
2. Load the PDF and split it into chunks of at most 2500 characters with no
   overlap.
3. Create the Pinecone serverless index if it does not exist yet and wait
   until it reports ready.
4. Embed the chunk texts with OpenAI embeddings and upload them to the index.
5. Run one test query and print the best-matching chunk.
"""
import os
import time

from dotenv import load_dotenv, find_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Pinecone
from langchain_openai import OpenAIEmbeddings
from pinecone import Pinecone as PineconeClient, ServerlessSpec

load_dotenv(find_dotenv(), override=True)

PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
# Read for backward compatibility; nothing below uses it.
PINECONE_ENV = os.environ.get('PINECONE_ENV')

# Fail fast with a clear message instead of erroring only after the PDF has
# been loaded and the embeddings client has been constructed.
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; define it in the environment or in a .env file"
    )

# Upper bound (seconds) on how long to wait for a freshly created index.
_INDEX_READY_TIMEOUT_S = 120

embeddings = OpenAIEmbeddings()

loader = PyPDFLoader("docs/M92TB4_2023-24_online.pdf")
data = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=0)
texts = text_splitter.split_documents(data)

pinecone = PineconeClient(api_key=PINECONE_API_KEY)

index_name = "linuxtips"

if index_name not in pinecone.list_indexes().names():
    pinecone.create_index(
        name=index_name,
        # NOTE(review): presumably matches the output size of the default
        # OpenAIEmbeddings model -- confirm if the embedding model changes.
        dimension=1536,
        metric='euclidean',
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Index creation is not instantaneous; writing to it before it is ready
    # can fail, so poll its status with a bounded wait.
    deadline = time.monotonic() + _INDEX_READY_TIMEOUT_S
    while not pinecone.describe_index(index_name).status['ready']:
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Pinecone index {index_name!r} was not ready after "
                f"{_INDEX_READY_TIMEOUT_S} seconds"
            )
        time.sleep(1)

# Handle to the index; not used below but kept as a module-level name.
index = pinecone.Index(index_name)

# NOTE(review): only the chunk text is uploaded (document metadata is not
# passed), and every run uploads the chunks again -- presumably this creates
# duplicates on repeated runs; verify against the vector store's ID handling.
docsearch = Pinecone.from_texts(
    [t.page_content for t in texts],
    embeddings,
    index_name=index_name,
)

# Test
query = "Assistant, please tell me what are the main functions of an autarchy department"
docs = docsearch.similarity_search(query)

# Guard against an empty result list instead of raising IndexError.
if docs:
    print(docs[0].page_content)
else:
    print("No matching documents found.")