# RAG/SimpleRAG.py
# Import libraries
import os

from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
    DirectoryLoader,
    UnstructuredWordDocumentLoader,
)
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore  # replaces the deprecated langchain.vectorstores.Pinecone
from pinecone import Pinecone

load_dotenv()
## Let's read the documents
def read_doc(directory):
    loader = DirectoryLoader(
        directory,
        glob="**/*.docx",  # match .docx files anywhere under the directory
        loader_cls=UnstructuredWordDocumentLoader,
    )
    documents = loader.load()
    return documents

doc = read_doc('documents/')
print(f"Loaded {len(doc)} documents")
def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    # Split documents; each chunk keeps its source document's metadata
    chunks = text_splitter.split_documents(docs)
    # Print information about the chunks
    print(f"Split {len(docs)} documents into {len(chunks)} chunks")
    for i, chunk in enumerate(chunks):
        print(f"Chunk {i}: Source: {chunk.metadata.get('source', 'unknown')}, "
              f"Length: {len(chunk.page_content)} chars")
    return chunks  # return chunks rather than the original docs

documents = chunk_data(docs=doc)
print(f"Total chunks: {len(documents)}")
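# Quick sanity check (illustrative, not in the original): with chunk_overlap=50,
# the tail of one chunk can reappear at the head of the next when both come
# from the same source file, preserving context across split boundaries.
if len(documents) >= 2 and documents[0].metadata.get('source') == documents[1].metadata.get('source'):
    print("End of chunk 0:  ...", documents[0].page_content[-50:])
    print("Start of chunk 1:", documents[1].page_content[:50])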
## Embedding technique of OpenAI
embeddings = OpenAIEmbeddings(api_key=os.environ['OPENAI_API_KEY'])

vectors = embeddings.embed_query("How are you?")
print(f"Embedding dimension: {len(vectors)}")
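# Small illustration (assumes numpy is installed): retrieval below ranks
# chunks by cosine similarity, the normalized dot product of two embeddings.
import numpy as np

def cosine_similarity(a, b):
    a, b = np.asarray(a), np.asarray(b)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

print("Similarity with 'How is it going?':",
      round(cosine_similarity(vectors, embeddings.embed_query("How is it going?")), 3))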
## Vector search DB in Pinecone
pc = Pinecone(
    api_key=os.environ['PINECONE_API_KEY']  # keep API keys in .env, never hard-coded
)
index_name = "advrag"
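# Hedged sketch: create the index if it doesn't exist yet. Assumes a
# serverless index on AWS us-east-1; dimension 1536 matches the default
# OpenAIEmbeddings model (text-embedding-ada-002).
from pinecone import ServerlessSpec

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )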
index = PineconeVectorStore.from_documents(
    documents,
    embeddings,
    index_name=index_name,
)
## Cosine similarity: retrieve results from the vector DB
def retrieve_query(query, k=2):
    matching_results = index.similarity_search(query, k=k)
    return matching_results
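# Illustrative usage (the query string here is an assumption, not taken from
# the indexed documents): print each match's source and a content preview.
for match in retrieve_query("coaching session homework"):
    print(match.metadata.get('source', 'unknown'), "->", match.page_content[:80])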
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
def initialize_qa_chain():
    llm = ChatOpenAI(
        model="gpt-4",
        temperature=0.5,
    )
    prompt_template = """
System: You are a helpful AI assistant that provides accurate and concise answers based on the given context. Always cite the specific source document when providing information.

Context: {context}

Question: {question}

Please provide a clear and direct answer based on the context above. If the information isn't available in the context, say so.
"""
    PROMPT = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"],
    )
    chain = load_qa_chain(llm, chain_type="stuff", prompt=PROMPT)
    return chain
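# A hedged alternative sketch: recent LangChain releases deprecate
# load_qa_chain in favor of create_stuff_documents_chain, which accepts the
# same {context}/{question} prompt and returns the answer string directly.
# Not wired into the flow below; shown for reference.
def initialize_qa_chain_v2(prompt):
    from langchain.chains.combine_documents import create_stuff_documents_chain
    llm = ChatOpenAI(model="gpt-4", temperature=0.5)
    # Invoke with {"context": [Document, ...], "question": "..."}
    return create_stuff_documents_chain(llm, prompt)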
qa_chain = None

def retrieve_answers(query, k=2):
    global qa_chain
    if qa_chain is None:
        qa_chain = initialize_qa_chain()
    try:
        # Get relevant documents
        matching_docs = retrieve_query(query, k=k)
        # Create the input dictionary
        chain_input = {
            "input_documents": matching_docs,
            "question": query,
        }
        # Use invoke instead of __call__
        result = qa_chain.invoke(chain_input)
        return result['output_text']
    except Exception as e:
        return f"Error processing query: {str(e)}"
# Test the function
our_query = "Identify the homework items that the client agreed to complete in each of the two coaching sessions."
answer = retrieve_answers(our_query)
print("\nAnswer:", answer)