import os
from uuid import uuid4

import gradio as gr
from astrapy.db import AstraDB
from dotenv import load_dotenv
from openai import OpenAI
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer

# Load environment variables (OPENAI_API_KEY, ASTRA_DB_APPLICATION_TOKEN,
# ASTRA_DB_API_ENDPOINT) from a local .env file before any client is created.
load_dotenv()

client = OpenAI()


# Initialization: Astra DB client (the vector store) and the embedding model.
db = AstraDB(
    token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
    api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
)

# bge-base-en-v1.5 embeds text into 768-dimensional vectors; this must match
# the `dimension` passed to create_collection below.
model = SentenceTransformer('BAAI/bge-base-en-v1.5')

def get_embeddings(text):
    # Encode text into a normalized embedding; with normalized vectors,
    # cosine similarity reduces to a dot product.
    embeddings = model.encode(text, normalize_embeddings=True)
    return embeddings.tolist()

def read_pdf(pdf_path):
    # Concatenate the text of every page; extract_text() can return None for
    # pages with no text layer, hence the `or ""`.
    reader = PdfReader(pdf_path)
    pdf_content = ""
    for page in reader.pages:
        pdf_content += page.extract_text() or ""
    return pdf_content

def create_chunks(content):
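    # Split text into 1,000-character chunks with a 100-character overlap so
    # sentences that straddle a boundary stay intact in at least one chunk.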
    batch_size = 1000
    overlap_size = 100

    chunks = []

    for i in range(0, len(content), batch_size - overlap_size):
        chunk = content[i:i + batch_size]
        chunks.append(chunk)

    return chunks

def create_docs(chunks):
    # Wrap each chunk in an Astra DB document: a sequential _id, the raw
    # text, and its embedding under the reserved "$vector" field.
    documents = []
    for i, chunk in enumerate(chunks, start=1):
        documents.append({
            "_id": i,
            "text": chunk,
            "$vector": get_embeddings(chunk),
        })
    return documents

def create_and_insert_docs(docs):
    # Create a throwaway collection with a unique name per request; 768 must
    # match the embedding dimension of bge-base-en-v1.5.
    col = db.create_collection(f"user_{uuid4().hex}", dimension=768, metric="cosine")
    col.insert_many(docs, partial_failures_allowed=True)
    return col

def get_answer(context, query):
    # Ask GPT-3.5-turbo to answer strictly from the retrieved context.
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": "You are a document mining bot created by Kanva Bhatia and Kanjika Singh. You will be given a user query and a context. Answer the user's query if the answer is in the context. If it isn't, reply with \"I don't know\"."
            },
            {
                "role": "user",
                "content": f"Below is a context and a query. Reply from the context if the answer is there in the context, otherwise say I don't know.\nContext: {context}\nQuery: {query}"
            }
        ],
        temperature=0.3,
        max_tokens=2000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    return response.choices[0].message.content

def query(col, ques):
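    # Retrieve the two chunks closest to the question from Astra DB and pass
    # them to the LLM as context.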
    q = get_embeddings(ques)
    results = col.vector_find(q, limit=2, fields={"text", "$vector"})
    context = ""
    for res in results:
        context += res['text'] + "\n"
    return get_answer(context, ques)

def delete_col(col):
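    # Collections are per-request; drop this one as soon as it has been queried.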
    db.delete_collection(col.collection_name)

def pipeline(files, user_input):
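    # End-to-end flow: read PDFs -> chunk -> embed -> store -> answer -> clean up.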
    total_chunks = []
    for file in files:
        content = read_pdf(file.name)
        chunks = create_chunks(content)
        total_chunks.extend(chunks)
    docs = create_docs(total_chunks)
    try:
        col = create_and_insert_docs(docs)
        ans = query(col, user_input)
        delete_col(col)
    except Exception as e:
        print(e)
        return "Sorry, we can't query that document right now. Please try a different document."
    return ans 

with gr.Blocks() as demo:
    gr.Markdown("# Chatbot Demo using DataStax Astra DB and OpenAI")
    about_bot = """## About the bot
    We created this bot using [DataStax Astra DB](https://www.datastax.com/products/datastax-astra) to store and search the vectors, the [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model to create embeddings, and [OpenAI's GPT-3.5-turbo](https://platform.openai.com/docs/models) to turn the closest-matching chunks into a human-friendly response.
    You can upload your PDF documents and chat with them!
    """
    gr.Markdown(about_bot)
    with gr.Row():
        with gr.Column():
            files = gr.Files(label="Upload PDF Files", file_types=['.pdf'])
            user_input = gr.Textbox(label="Enter Query")
        with gr.Column():
            output = gr.Textbox(label="Chatbot Response")
    with gr.Row():
        btn = gr.Button("Submit")

    btn.click(fn=pipeline, inputs=[files, user_input], outputs=output)
    about_team = """
    ### About the team
    This product was created by [Kanjika Singh](https://www.linkedin.com/in/kanjika-singh/) and [Kanva Bhatia](https://www.linkedin.com/in/kanva-bhatia/).
    """
    gr.Markdown(about_team)

demo.launch()