KanjikaSingh committed on
Commit
458e7b9
1 Parent(s): 961c352

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +123 -0
app.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers import SentenceTransformer
2
+ from PyPDF2 import PdfReader
3
+ from uuid import uuid4
4
+ from uuid import UUID
5
+ import os
6
+ from astrapy.db import AstraDB
7
+ import gradio as gr
8
+ from dotenv import load_dotenv
9
+ load_dotenv()
10
+ from openai import OpenAI
11
+ client = OpenAI()
12
+
13
+
14
# Initialization
# Astra DB client; both credentials are read from the process environment,
# so a missing variable raises KeyError at import time.
db = AstraDB(
    token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
    api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
)

# Sentence-embedding model used by get_embeddings().
model = SentenceTransformer("BAAI/bge-base-en-v1.5")
21
+
22
def get_embeddings(text):
    """Encode ``text`` with the module-level model and return the result as a list.

    The model is asked for normalized embeddings; the encoded array is
    converted with ``.tolist()`` before being returned.
    """
    return model.encode(text, normalize_embeddings=True).tolist()
25
+
26
def query(ques, col):
    """Embed ``ques`` and run a vector search (limit 2) against ``col``.

    Returns whatever ``col.vector_find`` returns, requesting the ``text`` and
    ``$vector`` fields.

    NOTE: a second ``query(col, ques)`` defined further down this file rebinds
    the name, so this variant is not the one reached by later callers.
    """
    question_vector = get_embeddings(ques)
    return col.vector_find(question_vector, limit=2, fields={"text", "$vector"})
30
+
31
def read_pdf(pdf_path):
    """Return the extracted text of every page of the PDF at ``pdf_path``.

    Pages are processed in order and their text is concatenated with no
    separator between pages.
    """
    reader = PdfReader(pdf_path)
    page_texts = [page.extract_text() for page in reader.pages]
    return "".join(page_texts)
37
+
38
def create_chunks(content, batch_size=1000, overlap_size=100):
    """Split ``content`` into overlapping, fixed-size character windows.

    Args:
        content: Text to split.
        batch_size: Maximum number of characters per chunk.
        overlap_size: Number of characters each chunk shares with the chunk
            before it. Must be smaller than ``batch_size``.

    Returns:
        The chunks in document order; an empty list when ``content`` is empty.

    Raises:
        ValueError: If ``batch_size`` is not positive, or ``overlap_size`` is
            negative or not smaller than ``batch_size``.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be positive")
    if not 0 <= overlap_size < batch_size:
        raise ValueError("overlap_size must be in the range [0, batch_size)")

    stride = batch_size - overlap_size
    chunks = []
    for start in range(0, len(content), stride):
        chunks.append(content[start:start + batch_size])
        # Once a window reaches the end of the text, the next window would be
        # a pure suffix of this one (overlap only), so stop instead of
        # emitting a redundant tail chunk.
        if start + batch_size >= len(content):
            break
    return chunks
49
+
50
def create_docs(chunks):
    """Build one document dict per chunk.

    Each document carries a 1-based integer ``_id`` (the chunk's position),
    the chunk itself under ``text``, and its embedding under ``$vector``.
    """
    return [
        {
            "_id": position,
            "text": chunk,
            "$vector": get_embeddings(chunk),
        }
        for position, chunk in enumerate(chunks, start=1)
    ]
60
+
61
def create_and_insert_docs(docs):
    """Create a fresh, uniquely named collection and insert ``docs`` into it.

    Args:
        docs: Document dicts to store (as produced by ``create_docs``).

    Returns:
        The newly created collection object.

    Raises:
        Exception: Whatever the database client raises while creating the
            collection or inserting. If an insert fails, the new collection
            is deleted before the error is re-raised so it is not orphaned.
    """
    # NOTE(review): the Data API caps the number of documents accepted per
    # insertMany request (20 at the time of writing -- confirm against the
    # current docs), so longer PDFs must be sent in batches.
    max_docs_per_request = 20

    col = db.create_collection(f"user_{uuid4().hex}", dimension=768, metric="cosine")
    try:
        for start in range(0, len(docs), max_docs_per_request):
            batch = docs[start:start + max_docs_per_request]
            col.insert_many(batch, partial_failures_allowed=True)
    except Exception:
        # The caller never receives ``col`` on failure, so this is the only
        # place the half-populated collection can be cleaned up.
        db.delete_collection(col.collection_name)
        raise
    return col
66
+
67
def get_answer(context, query):
    """Ask the chat model to answer ``query`` using only ``context``.

    Sends a system message and a user message (embedding ``context`` and
    ``query``) to ``gpt-3.5-turbo`` and returns the text content of the first
    choice in the response.
    """
    system_message = {
        "role": "system",
        "content": "You are a document mining bot createed by Kanva Bhatia and Kanjika Singh. You will given a user query, and user context. You have to give the reply to the user's query if the query's answer is in the context. If it isn't you reply with \"I don't know\"",
    }
    user_message = {
        "role": "user",
        "content": f"Below is a context and a query, reply from the context if the answer is there in the context, otherwise say I don't know.\nContext: {context}\nQuery: {query}",
    }
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[system_message, user_message],
        temperature=0.3,
        max_tokens=2000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
87
+
88
def query(col, ques, limit=2):
    """Answer ``ques`` from the documents in ``col`` that best match it.

    Args:
        col: Collection to search; must expose ``vector_find``.
        ques: The user's question.
        limit: Maximum number of matching documents to use as context.

    Returns:
        The answer text produced by ``get_answer`` for the retrieved context.
    """
    question_vector = get_embeddings(ques)
    # Only "text" is read below, so the stored vectors are not requested
    # back; ``fields`` is passed as a list of field names.
    matches = col.vector_find(question_vector, limit=limit, fields=["text"])
    # One matched text per line, each terminated by a newline.
    context = "".join(match["text"] + "\n" for match in matches)
    return get_answer(context, ques)
95
+
96
def delete_col(col):
    """Ask the database to delete the collection named ``col.collection_name``."""
    collection_name = col.collection_name
    db.delete_collection(collection_name)
98
+
99
def pipeline(files, user_input):
    """Answer ``user_input`` from the uploaded PDF ``files``.

    Reads and chunks every file, embeds the chunks, stores them in a
    temporary collection, queries it, and removes the collection afterwards.

    Args:
        files: Uploaded file objects; each must expose its path as ``.name``.
        user_input: The user's question.

    Returns:
        The generated answer, or a fallback message when nothing was uploaded
        or the store/query step fails.
    """
    if not files:
        # Nothing to read; iterating ``None`` would otherwise raise TypeError.
        return "Please upload at least one PDF file."

    total_chunks = []
    for file in files:
        total_chunks.extend(create_chunks(read_pdf(file.name)))
    docs = create_docs(total_chunks)

    col = None
    try:
        col = create_and_insert_docs(docs)
        return query(col, user_input)
    except Exception as e:
        print(e)
        return "Sorry, we can't query that document right now. Please try a different document."
    finally:
        # Always remove the temporary collection, including when the query
        # failed, so collections do not pile up in the database.
        if col is not None:
            try:
                delete_col(col)
            except Exception as e:
                # A cleanup failure must not discard an answer that was
                # already computed; just log it.
                print(e)
114
# Gradio UI wiring: PDF uploads plus a question go in, the answer comes out.
chatbot_input = [
    gr.Files(label="Upload PDF Files"),
    gr.components.Textbox(label="User Input"),
]
chatbot_output = gr.components.Textbox(label="Chatbot Response")

# Markdown shown as the interface description.
article_mdx = """## About the bot
We created this bot using [DataStax AstraDB](https://www.datastax.com/products/datastax-astra) to store the vectors, and [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model to create embeddings, and [OpenAI's GPT-3.5-turbo](https://platform.openai.com/docs/models) for collecting the closest vectors and creating a human-friendly response.

## About the team
This product is created by [Kanjika Singh](https://www.linkedin.com/in/kanjika-singh/) and [Kanva Bhatia](https://www.linkedin.com/in/kanva-bhatia/).
"""

iface = gr.Interface(
    fn=pipeline,
    inputs=chatbot_input,
    outputs=chatbot_output,
    title="Chatbot Demo using DataStax AstraDB and OpenAI",
    description=article_mdx,
).queue()
iface.launch(debug=True, share=True)