"""Index a local document folder into Pinecone and run a sample similarity query.

Pipeline: load every document under ``directory``, split the documents into
overlapping chunks, embed the chunks with OpenAI, upsert them into the
Pinecone index named ``index_name`` (creating it if needed), then run one
example similarity search and print the hits.

Credentials are read from the environment (optionally via a ``.env`` file):
``PINECONE_API_KEY`` is read explicitly below; the OpenAI embeddings client is
presumably configured through ``OPENAI_API_KEY`` -- verify against your setup.
"""

import os

from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

# NOTE: ``langchain_community.vectorstores.Pinecone`` was previously imported
# here; this script uses ``langchain_pinecone.PineconeVectorStore`` instead.

load_dotenv()

directory = "D:/Projects/Aido/data"


def load_docs(directory: str) -> list:
    """Load every document found under *directory*.

    Args:
        directory: Path of the folder handed to ``DirectoryLoader``.

    Returns:
        The list of documents produced by ``DirectoryLoader.load()``.
    """
    loader = DirectoryLoader(directory)
    return loader.load()


documents = load_docs(directory)
print(f"Number of documents in the dataset: {len(documents)}")


def split_docs(documents, chunk_size: int = 400, chunk_overlap: int = 150) -> list:
    """Split *documents* into overlapping chunks.

    Args:
        documents: Documents to split (as returned by ``load_docs``).
        chunk_size: Maximum chunk size passed to the splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the splitter.

    Returns:
        The list of chunk documents produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)


docs = split_docs(documents)
print(f"There are total of {len(docs)} chunks derived from {len(documents)} document")

embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# Fail early with a clear message instead of an opaque client error later on.
_pinecone_api_key = os.getenv("PINECONE_API_KEY")  # next to api key in console
if not _pinecone_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to the environment or the .env file."
    )

pc = Pinecone(api_key=_pinecone_api_key)

index_name = "aido"

if index_name not in pc.list_indexes().names():
    # The v3+ Pinecone client requires ``spec`` when creating an index; without
    # it ``create_index`` raises. Cloud/region default to aws/us-east-1 -- an
    # assumption, override through PINECONE_CLOUD / PINECONE_REGION if your
    # project lives elsewhere.
    pc.create_index(
        name=index_name,
        dimension=1536,  # dimensionality of text-embedding-ada-002
        metric="cosine",
        spec=ServerlessSpec(
            cloud=os.getenv("PINECONE_CLOUD", "aws"),
            region=os.getenv("PINECONE_REGION", "us-east-1"),
        ),
    )

# Raw Pinecone index handle; not used below but kept as a module-level name.
pinecone_index = pc.Index(index_name)

# NOTE(review): this embeds and upserts every chunk on each run, so re-running
# the script presumably adds duplicate vectors to the index -- confirm and
# consider ``PineconeVectorStore.from_existing_index`` for query-only runs.
index = PineconeVectorStore.from_documents(
    docs,
    embeddings,
    index_name=index_name,
)


def get_similiar_docs(query, k: int = 3, score: bool = False):
    """Return the *k* chunks in the vector store most similar to *query*.

    Args:
        query: Natural-language query text.
        k: Number of results to request.
        score: When true, use ``similarity_search_with_score`` so each result
            carries its similarity score; otherwise use ``similarity_search``.

    Returns:
        Whatever the selected search method of the module-level ``index``
        returns for the query.
    """
    if score:
        return index.similarity_search_with_score(query, k=k)
    return index.similarity_search(query, k=k)


query = "What do students from Albino doubtful on their return on investment when considering studying in the USA?"
similar_docs = get_similiar_docs(query)
print(similar_docs)