File size: 3,600 Bytes
6b33c23
13d8e3b
 
 
 
 
 
 
 
 
6b33c23
13d8e3b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6b33c23
 
13d8e3b
 
 
 
 
 
 
 
 
 
 
 
 
 
9a29948
13d8e3b
 
 
 
 
83aaa54
cb56324
 
 
 
 
13d8e3b
 
 
 
 
 
 
 
2b2f2f5
06bb820
24a6666
 
4e48e34
cb56324
13d8e3b
fe46ac0
 
13d8e3b
 
 
 
 
 
79a6421
13d8e3b
 
 
 
 
 
 
 
 
cb56324
13d8e3b
 
 
 
 
 
79a6421
13d8e3b
 
 
 
 
79a6421
13d8e3b
 
 
 
 
6b33c23
13d8e3b
 
 
6b33c23
13d8e3b
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import streamlit as st
from langchain_community.document_loaders import PyPDFDirectoryLoader
from pypdf import PdfReader
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from pinecone import Pinecone as PineconeClient
from langchain.chains.question_answering import load_qa_chain
from datetime import datetime
from langchain_community.vectorstores import Pinecone
import os
import time


def get_pdf_text(pdf_doc):
    """Extract and concatenate the text of every page in a PDF.

    Args:
        pdf_doc: A file path or binary file-like object accepted by
            pypdf's ``PdfReader`` (e.g. a Streamlit ``UploadedFile``).

    Returns:
        str: Text of all pages concatenated in page order. Pages with
        no extractable text layer contribute an empty string.
    """
    pdf_reader = PdfReader(pdf_doc)
    # extract_text() may return None for image-only pages; coalesce to ""
    # so the join never raises TypeError. join also avoids quadratic +=.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

def create_docs(user_pdf_list, unique_id):
    """Wrap each uploaded PDF into a single langchain ``Document``.

    Args:
        user_pdf_list: Iterable of uploaded PDF files (Streamlit
            ``UploadedFile`` objects exposing ``.name``, ``.type``, ``.size``).
        unique_id: Identifier stamped into every document's metadata so a
            batch of uploads can be traced back to one session.

    Returns:
        list[Document]: One Document per PDF, with the full extracted text
        as ``page_content`` and upload metadata attached.
    """
    docs = []
    for uploaded_file in user_pdf_list:
        text = get_pdf_text(uploaded_file)
        docs.append(Document(
            page_content=text,
            metadata={
                "name": uploaded_file.name,
                # BUG FIX: key was "type=" (stray '=' in the literal).
                "type": uploaded_file.type,
                "size": uploaded_file.size,
                "unique_id": unique_id,
                "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            },
        ))
    return docs

# transform documents
def split_docs(documents, chunk_size=400, chunk_overlap=20):
    """Split documents into overlapping character chunks.

    Args:
        documents: Sequence of langchain ``Document`` objects.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Characters shared between consecutive chunks.

    Returns:
        list[Document]: The chunked documents, metadata preserved.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)

def get_embeddings():
    """Return an ``OpenAIEmbeddings`` instance.

    Relies on the ``OPENAI_API_KEY`` environment variable being set.
    """
    return OpenAIEmbeddings()


def push_to_pinecone(docs, embedding, namespace):
    """Replace the contents of a Pinecone namespace with embedded chunks.

    Deletes any existing vectors in ``namespace``, embeds ``docs`` via
    ``Pinecone.from_documents``, and uploads them to the index named by
    the ``PINECONE_INDEX_NAME`` environment variable.

    Args:
        docs: Chunked langchain ``Document`` objects to index.
        embedding: Embedding model (e.g. ``OpenAIEmbeddings``).
        namespace: Pinecone namespace to (re)populate.

    Returns:
        The langchain ``Pinecone`` vector-store wrapping the index.
    """
    pc = PineconeClient(api_key=os.environ.get("PINECONE_API_KEY"))
    index_name = os.environ.get("PINECONE_INDEX_NAME")
    index = pc.Index(index_name)

    # BUG FIX: the namespace parameter was previously ignored in favor of a
    # hard-coded 'rag_bot'. Clear stale vectors from the requested namespace
    # so old uploads don't pollute retrieval.
    stats = index.describe_index_stats()
    if namespace in stats["namespaces"]:
        index.delete(delete_all=True, namespace=namespace)

    # NOTE: a previous version built a `vector` list here with one
    # embedding.embed_query() call per chunk and never used it — removed,
    # since Pinecone.from_documents() below performs the embedding itself.

    with st.expander("click here:"):
        st.write(type(docs))
        for doc in docs:
            st.write(doc)
        st.write(docs)

    index = Pinecone.from_documents(docs, embedding, index_name=index_name, namespace=namespace)

    st.sidebar.write("This 35 seconds delay was added Manually... \n(because I'm using some free resources)")
    time.sleep(35)

    return index



#Function to pull index data from Pinecone
def pull_from_pinecone(embeddings, namespace='rag_bot'):
    """Connect to an existing Pinecone index as a langchain vector store.

    Args:
        embeddings: Embedding model used to embed queries at search time
            (must match the model used at upload time).
        namespace: Pinecone namespace to read from. Defaults to 'rag_bot',
            matching the namespace used by ``push_to_pinecone`` callers.

    Returns:
        The langchain ``Pinecone`` vector-store over the existing index
        named by the ``PINECONE_INDEX_NAME`` environment variable.
    """
    pinecone_apikey = os.environ.get("PINECONE_API_KEY")
    pinecone_index_name = os.environ.get("PINECONE_INDEX_NAME")

    # Instantiating the client configures the global Pinecone connection
    # that from_existing_index relies on.
    PineconeClient(api_key=pinecone_apikey)

    # Pinecone here is the langchain vectorstore class (see imports above).
    index = Pinecone.from_existing_index(pinecone_index_name, embeddings, namespace=namespace)

    return index




def get_similar_doc(query, embedding, k=2):
    """Return the ``k`` documents most similar to ``query``.

    Args:
        query: Natural-language search string.
        embedding: Embedding model matching the one used at indexing time.
        k: Number of documents to return.

    Returns:
        list[Document]: The top-k matches, scores discarded.
    """
    # BUG FIX: dead code removed — the original constructed a Pinecone
    # client and Index here, then immediately shadowed it with the
    # vector store from pull_from_pinecone (a wasted network round-trip).
    index = pull_from_pinecone(embeddings=embedding)
    scored_docs = index.similarity_search_with_score(query, int(k))

    return [doc for doc, _score in scored_docs]
    


def get_answer(query, embedding, k=2):
    """Answer ``query`` using the top-k retrieved documents as context.

    Args:
        query: The user's question.
        embedding: Embedding model used for retrieval.
        k: Number of context documents to retrieve.

    Returns:
        str: The LLM's answer produced by a "stuff" QA chain.
    """
    llm = ChatOpenAI(temperature=0.5)
    chain = load_qa_chain(llm, chain_type="stuff")

    # BUG FIX: forward the caller's k instead of a hard-coded k=2.
    relevant_docs = get_similar_doc(query, embedding, k=k)
    return chain.run(input_documents=relevant_docs, question=query)