# Import libraries
import os

from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_pinecone import PineconeVectorStore  # requires the langchain-pinecone package
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import UnstructuredWordDocumentLoader

load_dotenv()  # load OPENAI_API_KEY and PINECONE_API_KEY from a .env file
## Let's read the documents
def read_doc(directory):
    loader = DirectoryLoader(
        directory,
        glob="**/*.docx",  # match .docx files anywhere under the directory
        loader_cls=UnstructuredWordDocumentLoader,
    )
    documents = loader.load()
    return documents

doc = read_doc('documents/')
print(f"Loaded {len(doc)} documents")
def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    # Split documents while keeping each chunk's source-document metadata
    chunks = text_splitter.split_documents(docs)
    # Print information about the chunks
    print(f"Split {len(docs)} documents into {len(chunks)} chunks")
    for i, chunk in enumerate(chunks):
        print(f"Chunk {i}: Source: {chunk.metadata['source']}, Length: {len(chunk.page_content)} chars")
    return chunks  # return the chunks, not the original docs

documents = chunk_data(docs=doc)
len(documents)
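# Quick peek at the first chunk to confirm content and metadata survived
# the split (assumes at least one chunk was produced):
print(documents[0].metadata)
print(documents[0].page_content[:200])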
## Embedding technique of OpenAI
embeddings = OpenAIEmbeddings(api_key=os.environ['OPENAI_API_KEY'])
embeddings

vectors = embeddings.embed_query("How are you?")
len(vectors)
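# embed_query returns one vector per string; with the default
# text-embedding-ada-002 model this is 1536 dimensions, and the Pinecone
# index below must be created with a matching dimension. A minimal sketch
# of batch embedding with embed_documents (the strings are illustrative):
batch_vectors = embeddings.embed_documents(["first text", "second text"])
print(len(batch_vectors), len(batch_vectors[0]))  # 2 vectors of 1536 dims each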
## Vector search DB in Pinecone
from pinecone import Pinecone

# Never hard-code API keys; read the key from the environment instead
pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])
| index_name = "advrag" | |
vectorstore = PineconeVectorStore.from_documents(
    documents,
    embedding=embeddings,
    index_name=index_name,
)
## Cosine similarity: retrieve results from the vector DB
def retrieve_query(query, k=2):
    matching_results = vectorstore.similarity_search(query, k=k)
    return matching_results
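# Usage sketch: similarity_search_with_score also returns each match's
# similarity score, useful for eyeballing retrieval quality (the query
# string here is illustrative).
for match, score in vectorstore.similarity_search_with_score("coaching session homework", k=2):
    print(f"{score:.3f}", match.metadata.get("source"))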
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate

def initialize_qa_chain():
    llm = ChatOpenAI(
        model="gpt-4",
        temperature=0.5,
    )
    prompt_template = """
System: You are a helpful AI assistant that provides accurate and concise answers based on the given context. Always cite the specific source document when providing information.

Context: {context}

Question: {question}

Please provide a clear and direct answer based on the context above. If the information isn't available in the context, say so.
"""
    PROMPT = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"],
    )
    chain = load_qa_chain(llm, chain_type="stuff", prompt=PROMPT)
    return chain
qa_chain = None

def retrieve_answers(query, k=2):
    global qa_chain
    if qa_chain is None:
        qa_chain = initialize_qa_chain()
    try:
        # Get the k most relevant chunks from the vector store
        matching_docs = retrieve_query(query, k=k)
        # Build the chain input: retrieved documents plus the user question
        chain_input = {
            "input_documents": matching_docs,
            "question": query,
        }
        # Use invoke instead of the deprecated __call__
        result = qa_chain.invoke(chain_input)
        return result['output_text']
    except Exception as e:
        return f"Error processing query: {str(e)}"
# Test the function
our_query = "Identify the homework items that the client agreed to complete in each of the two coaching sessions."
answer = retrieve_answers(our_query)
print("\nAnswer:", answer)