File size: 3,516 Bytes
6b33c23
13d8e3b
 
 
 
 
 
 
 
 
6b33c23
13d8e3b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6b33c23
 
13d8e3b
 
 
 
 
 
 
 
 
 
 
 
 
 
9a29948
13d8e3b
 
 
 
 
83aaa54
cb56324
 
 
 
 
13d8e3b
 
 
 
 
 
 
67c35bb
 
cb56324
13d8e3b
fe46ac0
 
13d8e3b
 
 
 
 
 
79a6421
13d8e3b
 
 
 
 
 
 
 
 
cb56324
13d8e3b
 
 
 
 
 
79a6421
13d8e3b
 
 
 
 
79a6421
13d8e3b
 
 
 
 
6b33c23
13d8e3b
 
 
6b33c23
13d8e3b
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import streamlit as st
from langchain_community.document_loaders import PyPDFDirectoryLoader
from pypdf import PdfReader
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from pinecone import Pinecone as PineconeClient
from langchain.chains.question_answering import load_qa_chain
from datetime import datetime
from langchain_community.vectorstores import Pinecone
import os
import time


def get_pdf_text(pdf_doc):
    """Return the text of every page of *pdf_doc* as a single string.

    Args:
        pdf_doc: The PDF source handed straight to ``PdfReader``
            (presumably a path or an uploaded file object -- confirm
            against the caller).

    Returns:
        The extracted page texts concatenated in page order, with no
        separator between pages.
    """
    pdf_reader = PdfReader(pdf_doc)
    # Join once instead of growing a string with ``+=`` on every page;
    # ``or ""`` keeps a page that yields no text from breaking the join.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

def create_docs(user_pdf_list, unique_id):
    """Build one ``Document`` per PDF in *user_pdf_list*.

    Args:
        user_pdf_list: Iterable of file-like objects that expose ``name``,
            ``type`` and ``size`` attributes and can be read by
            ``get_pdf_text`` (presumably Streamlit uploads -- confirm
            against the caller).
        unique_id: Identifier stored unchanged in every document's metadata.

    Returns:
        A list of ``Document`` objects whose ``page_content`` is the full
        extracted text of the PDF and whose metadata holds the keys
        ``name``, ``type``, ``size``, ``unique_id`` and ``time``.
    """
    return [
        Document(
            page_content=get_pdf_text(pdf_file),
            metadata={
                "name": pdf_file.name,
                # Plain "type" key; it used to be misspelled as "type=".
                "type": pdf_file.type,
                "size": pdf_file.size,
                "unique_id": unique_id,
                # Local wall-clock time at which this document was built.
                "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            },
        )
        for pdf_file in user_pdf_list
    ]

# transform documents
def split_docs(documents, chunk_size=400, chunk_overlap=20):
    """Split *documents* into smaller pieces.

    Args:
        documents: Documents passed to the splitter's ``split_documents``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        Whatever ``split_documents`` returns for *documents*.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)

def get_embeddings():
    """Return a new ``OpenAIEmbeddings`` instance built with default arguments."""
    return OpenAIEmbeddings()


def push_to_pinecone(docs, embedding, namespace):
    """Replace the contents of the ``rag_bot`` namespace with *docs*.

    The index name and API key are read from the ``PINECONE_INDEX_NAME``
    and ``PINECONE_API_KEY`` environment variables.

    Args:
        docs: Documents to embed and store; the first five are also shown
            in a Streamlit "info" expander.
        embedding: Embedding object handed to ``Pinecone.from_documents``.
        namespace: Currently unused. Writes always go to ``rag_bot``, the
            same namespace ``pull_from_pinecone`` reads from.

    Returns:
        The vector store returned by ``Pinecone.from_documents``.
    """
    target_namespace = "rag_bot"
    index_name = os.environ.get("PINECONE_INDEX_NAME")
    pc = PineconeClient(api_key=os.environ.get("PINECONE_API_KEY"))
    raw_index = pc.Index(index_name)

    # Wipe previously stored vectors, but only when the namespace is listed
    # in the index stats (presumably deleting a missing namespace errors
    # out -- confirm against the Pinecone client).
    stats = raw_index.describe_index_stats()
    if target_namespace in stats["namespaces"]:
        raw_index.delete(delete_all=True, namespace=target_namespace)

    with st.expander("info"):
        st.write(docs[:5])

    # ``from_documents`` does the embedding and the upload itself, so no
    # per-document ``embed_query`` pass is needed beforehand.
    vector_store = Pinecone.from_documents(
        docs,
        embedding,
        index_name=index_name,
        namespace=target_namespace,
    )

    st.sidebar.write("This 35 seconds delay was added Manually... \n(because I'm using some free resources)")
    time.sleep(35)

    return vector_store



#Function to pull index data from Pinecone
def pull_from_pinecone(embeddings):
    """Open the existing Pinecone index as a vector store.

    The index name and API key are read from the ``PINECONE_INDEX_NAME``
    and ``PINECONE_API_KEY`` environment variables.

    Args:
        embeddings: Embedding object handed to ``Pinecone.from_existing_index``.

    Returns:
        The vector store bound to the ``rag_bot`` namespace of that index.
    """
    api_key = os.environ.get("PINECONE_API_KEY")
    index_name = os.environ.get("PINECONE_INDEX_NAME")

    # NOTE(review): the client object is built and immediately discarded;
    # kept as-is so this function's behaviour is unchanged.
    PineconeClient(api_key=api_key)

    # ``Pinecone`` is the LangChain vector-store class; the raw Pinecone
    # client is imported under the name ``PineconeClient``.
    return Pinecone.from_existing_index(index_name, embeddings, namespace="rag_bot")




def get_similar_doc(query, embedding, k=2):
    """Return the documents most similar to *query*.

    Args:
        query: Text to search for.
        embedding: Embedding object forwarded to ``pull_from_pinecone``.
        k: Number of results requested; converted with ``int`` before use.

    Returns:
        A list of the matching documents with their similarity scores
        dropped.
    """
    # ``pull_from_pinecone`` already yields the store to search, so no
    # separate raw client or index handle is created here.
    store = pull_from_pinecone(embeddings=embedding)
    scored_matches = store.similarity_search_with_score(query, int(k))

    return [doc for doc, _score in scored_matches]
    


def get_answer(query, embedding, k=2):
    """Answer *query* using the most similar stored documents as context.

    Args:
        query: The question to answer.
        embedding: Embedding object forwarded to ``get_similar_doc``.
        k: Number of similar documents to retrieve as context.

    Returns:
        The response produced by running the question-answering chain.
    """
    llm = ChatOpenAI(temperature=0.5)
    chain = load_qa_chain(llm, chain_type="stuff")

    # Forward the caller's ``k`` rather than a fixed value of 2.
    relevant_docs = get_similar_doc(query, embedding, k=k)
    return chain.run(input_documents=relevant_docs, question=query)