File size: 2,014 Bytes
9a6c384 8b95e7f 9a6c384 8b95e7f 9a6c384 8b95e7f 9a6c384 8b95e7f 9a6c384 8b95e7f 9a6c384 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
import openai
import os
import langchain
import pinecone
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI
from dotenv import load_dotenv
load_dotenv()
## Let's read the documents
def read_doc(directory):
    """Load the PDF files found under *directory* and return the loaded documents."""
    return PyPDFDirectoryLoader(directory).load()
# Load everything read_doc finds under the local 'documents/' folder.
doc=read_doc('documents/') #PDF directory
# Bare expression left over from interactive use: the length is computed but
# discarded when this file runs as a plain script.
len(doc)
## Divide the docs into chunks
### https://api.python.langchain.com/en/latest/text_splitter/langchain.text_splitter.RecursiveCharacterTextSplitter.html#
def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    """Split loaded documents into overlapping chunks.

    Args:
        docs: Documents to split (e.g. the list returned by a document loader).
        chunk_size: ``chunk_size`` forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter.

    Returns:
        The chunked documents produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    # Return the split result. The earlier version stored it in a local and
    # then returned the untouched input, so no chunking ever took effect.
    return text_splitter.split_documents(docs)
# Run the loaded documents through chunk_data with its default
# chunk_size (800) and chunk_overlap (50).
documents=chunk_data(docs=doc) # Function that divides the PDF into chunks
# Bare expression left over from interactive use; the value is discarded in a script.
len(documents)
# Build the OpenAI embeddings client. The key is read from the OPENAI_API_KEY
# environment variable; indexing os.environ raises KeyError if it is unset.
embeddings=OpenAIEmbeddings(api_key=os.environ['OPENAI_API_KEY'])
# Bare expression left over from interactive use; no effect in a script.
embeddings
# Embed a sample query as a quick smoke test of the embeddings client.
vectors=embeddings.embed_query("How are you?")
# Length of the returned embedding (presumably its dimensionality); discarded here.
len(vectors)
# Connect to Pinecone. The API key is a secret, so it is read from the
# environment (which load_dotenv() populates from a .env file) instead of being
# committed to source, the same way OPENAI_API_KEY is handled.
# NOTE: the key that used to be hard-coded here should be treated as leaked
# and rotated in the Pinecone console.
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    # Keep the previous environment as the default so existing setups still work.
    environment=os.environ.get("PINECONE_ENVIRONMENT", "gcp-starter"),
)
# Name of the Pinecone index that stores the document embeddings.
index_name = "knowledgebase"
# Embed and upload the chunked `documents` rather than the raw `doc` list:
# the output of chunk_data is what is meant to be embedded and retrieved.
index = Pinecone.from_documents(documents, embeddings, index_name=index_name)
## Cosine similarity: retrieve results from the vector DB
def retrieve_query(query, k=2):
    """Return the ``k`` results of a similarity search for ``query`` on the module-level ``index``."""
    return index.similarity_search(query, k=k)
from langchain.chains.question_answering import load_qa_chain
from langchain import OpenAI
# text-davinci-003 has been retired by OpenAI, so requests naming it fail.
# gpt-3.5-turbo-instruct is the documented replacement for completion-style
# models such as the one this LLM wrapper calls.
llm = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0.5)
# Question-answering chain built with the "stuff" chain type.
chain = load_qa_chain(llm, chain_type="stuff")
## Search answers from VectorDB
def retrieve_answers(query):
    """Answer ``query`` from the documents retrieved for it.

    The matches returned by ``retrieve_query`` are printed, then passed to the
    module-level QA ``chain`` together with the question; the chain's
    response is returned.
    """
    matches = retrieve_query(query)
    print(matches)
    return chain.run(input_documents=matches, question=query)
# Example run: ask a single question and print the generated answer.
our_query = "What is my name?"
answer = retrieve_answers(our_query)
# The stray trailing "|" after this call was removed; it was a syntax error.
print(answer)