documentchatbotcloud

Sleeping

App Files Community

KanvaBhatia committed on Jan 30

Commit

87f4167

•

1 Parent(s): fb92e61

Create app.py

Browse files

Files changed (1) hide show

app.py +138 -0

app.py ADDED Viewed

	@@ -0,0 +1,138 @@

+from sentence_transformers import SentenceTransformer
+from PyPDF2 import PdfReader
+from uuid import uuid4
+from uuid import UUID
+import os
+from astrapy.db import AstraDB
+import gradio as gr
+from dotenv import load_dotenv
+load_dotenv()
+from openai import OpenAI
+client = OpenAI()
+# Initialization
+# Astra DB client. Both credentials are read from the environment with
+# os.environ[...], so a missing variable raises KeyError when the module loads.
+db = AstraDB(
+    token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
+    api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
+)
+# Sentence-embedding model shared by document chunks and user queries.
+# NOTE(review): collections in this file are created with dimension=768;
+# presumably that matches this model's output size -- confirm if the model changes.
+model = SentenceTransformer('BAAI/bge-base-en-v1.5')
+def get_embeddings(text):
+    """Encode *text* with the module-level model and return the vector as a list.
+
+    The encoder is called with ``normalize_embeddings=True`` and its result is
+    converted with ``.tolist()`` before being returned.
+    """
+    return model.encode(text, normalize_embeddings=True).tolist()
+def query(ques, col):
+    """Run a vector search in *col* for *ques*, limited to two results.
+
+    NOTE: a second ``query(col, ques)`` is defined later in this file and
+    rebinds the name, so this version is not reachable once the module has
+    finished loading.
+    """
+    question_vector = get_embeddings(ques)
+    return col.vector_find(question_vector, limit=2, fields={"text", "$vector"})
+def read_pdf(pdf_path):
+    """Return the extracted text of every page of the PDF at *pdf_path*.
+
+    Page texts are concatenated in page order with no separator between them.
+    """
+    pages = PdfReader(pdf_path).pages
+    return "".join(page.extract_text() for page in pages)
+def create_chunks(content, batch_size=1000, overlap_size=100):
+    """Split *content* into overlapping character windows.
+
+    Args:
+        content: The text to split.
+        batch_size: Maximum number of characters per chunk.
+        overlap_size: Number of characters shared by consecutive chunks.
+
+    Returns:
+        A list of chunks in document order; empty when *content* is empty.
+
+    Raises:
+        ValueError: If *batch_size* is not positive, or *overlap_size* is
+            negative or not smaller than *batch_size* (the window would
+            never advance).
+    """
+    if batch_size <= 0:
+        raise ValueError("batch_size must be positive")
+    if not 0 <= overlap_size < batch_size:
+        raise ValueError("overlap_size must satisfy 0 <= overlap_size < batch_size")
+    step = batch_size - overlap_size
+    chunks = []
+    for start in range(0, len(content), step):
+        chunks.append(content[start:start + batch_size])
+        # Stop once a window reaches the end of the text: any later window
+        # would only repeat the overlap already contained in this chunk.
+        if start + batch_size >= len(content):
+            break
+    return chunks
+def create_docs(chunks):
+    """Build one document dict per chunk, ready for insertion.
+
+    Each document carries a 1-based integer ``_id``, the chunk under ``text``
+    and the chunk's embedding under ``$vector``.
+    """
+    return [
+        {
+            "_id": position,
+            "text": piece,
+            "$vector": get_embeddings(piece),
+        }
+        for position, piece in enumerate(chunks, start=1)
+    ]
+def create_and_insert_docs(docs):
+    """Create a uniquely named collection, insert *docs* and return it.
+
+    The collection name is ``user_<uuid4 hex>`` and the collection is created
+    with ``dimension=768`` and the cosine metric. Documents are inserted with
+    ``partial_failures_allowed=True``.
+
+    Raises:
+        Exception: Whatever ``insert_many`` raises is re-raised after the
+            freshly created collection has been deleted.
+    """
+    col = db.create_collection(f"user_{uuid4().hex}", dimension=768, metric="cosine")
+    try:
+        col.insert_many(docs, partial_failures_allowed=True)
+    except Exception:
+        # The caller never receives ``col`` on failure, so delete it here
+        # instead of leaving an orphaned collection behind.
+        db.delete_collection(col.collection_name)
+        raise
+    return col
+def get_answer(context, query):
+    """Ask the chat model to answer *query* using only *context*.
+
+    Sends a fixed system prompt plus a user message embedding both arguments
+    to ``gpt-3.5-turbo`` and returns the content of the first choice.
+    """
+    system_prompt = "You are a document mining bot createed by Kanva Bhatia and Kanjika Singh. You will given a user query, and user context. You have to give the reply to the user's query if the query's answer is in the context. If it isn't you reply with \"I don't know\""
+    user_prompt = f"Below is a context and a query, reply from the context if the answer is there in the context, otherwise say I don't know.\nContext: {context}\nQuery: {query}"
+    completion = client.chat.completions.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt},
+        ],
+        temperature=0.3,
+        max_tokens=2000,
+        top_p=1,
+        frequency_penalty=0,
+        presence_penalty=0,
+    )
+    return completion.choices[0].message.content
+def query(col, ques):
+    """Answer *ques* from the documents stored in *col*.
+
+    Runs a vector search limited to two results, joins their ``text`` fields
+    (each followed by a newline) into a context string and passes that
+    context together with the question to ``get_answer``.
+    """
+    question_vector = get_embeddings(ques)
+    matches = col.vector_find(question_vector, limit=2, fields={"text", "$vector"})
+    context = "".join(match['text'] + "\n" for match in matches)
+    return get_answer(context, ques)
+def delete_col(col):
+    """Delete the collection *col* from the database by its ``collection_name``."""
+    name = col.collection_name
+    db.delete_collection(name)
+def pipeline(files, user_input):
+    """Answer *user_input* from the uploaded PDF *files*.
+
+    Reads and chunks every file, embeds the chunks into a temporary
+    collection, queries it, and always deletes the collection afterwards.
+
+    Args:
+        files: Uploaded files from the UI; each item is either a path string
+            or an object exposing the path as ``.name``. May be ``None`` or
+            empty when nothing was uploaded.
+        user_input: The user's question.
+
+    Returns:
+        The model's answer, or a short user-facing message when input is
+        missing or processing fails.
+    """
+    failure_message = "Sorry, we can't query that document right now. Please try a different document."
+    if not files:
+        return "Please upload at least one PDF document."
+    if not user_input or not user_input.strip():
+        return "Please enter a query."
+    col = None
+    try:
+        total_chunks = []
+        for file in files:
+            # Accept both plain path strings and file objects carrying ``.name``.
+            path = getattr(file, "name", file)
+            total_chunks.extend(create_chunks(read_pdf(path)))
+        if not total_chunks:
+            # No extractable text: skip creating a collection for nothing.
+            return failure_message
+        docs = create_docs(total_chunks)
+        col = create_and_insert_docs(docs)
+        return query(col, user_input)
+    except Exception as e:
+        print(e)
+        return failure_message
+    finally:
+        # Clean up even when querying failed, and never let a cleanup error
+        # replace an answer that was already computed.
+        if col is not None:
+            try:
+                delete_col(col)
+            except Exception as e:
+                print(e)
+# Build the Gradio UI. Components are created in the order they are declared
+# inside the nested ``with`` contexts below.
+with gr.Blocks() as demo:
+    gr.Markdown("# Chatbot Demo using DataStax Astra DB and OpenAI")
+    # Introductory Markdown shown under the title.
+    about_bot = """## About the bot
+    We created this bot using [DataStax Astra DB](https://www.datastax.com/products/datastax-astra) to store the vectors, and [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model to create embeddings, and [OpenAI's GPT-3.5-turbo](https://platform.openai.com/docs/models) for collecting the closest vectors and creating a human-friendly response.
+    You can upload your pdf documents and chat with them!
+    """
+    gr.Markdown(about_bot)
+    # One row with two columns: inputs (file upload and query box) in the
+    # first column, the response box in the second.
+    with gr.Row():
+        with gr.Column():
+            # file_types=['.pdf'] is passed so the upload is limited to PDFs.
+            files = gr.Files(label = "Upload PDF Files", file_types = ['.pdf'])
+            user_input = gr.Textbox(label = "Enter Query")
+        with gr.Column():
+            output = gr.Textbox(label = "Chatbot Response")
+    with gr.Row():
+        btn = gr.Button("Submit")
+    # Clicking the button calls pipeline(files, user_input) and writes its
+    # return value into the response textbox.
+    btn.click(fn = pipeline, inputs=[files, user_input], outputs=output)
+    # Credits section rendered at the bottom of the page.
+    about_team = """
+    ### About the team
+    This product is created by [Kanjika Singh](https://www.linkedin.com/in/kanjika-singh/) and [Kanva Bhatia](https://www.linkedin.com/in/kanva-bhatia/).
+    """
+    gr.Markdown(about_team)
+# Called unconditionally at module level (there is no __main__ guard).
+demo.launch()