Spaces:

KushwanthK
/

chat-with-docs

Sleeping

File size: 8,742 Bytes

4997aeb
 
da863bf
3ae066d
 
0bff6fd
940c185
517f1a0
f06193e
bb5b4ac
9c2d532
4997aeb
0fb4cf5
4997aeb
8eb3e51
4997aeb
2da6f20
4997aeb
 
2da6f20
4997aeb
2da6f20
 
4997aeb
2da6f20
 
 
4997aeb
 
 
 
 
 
e496258
 
 
 
4997aeb
 
8ce3d9b
4997aeb
 
9eb3e78
11bc07e
9eb3e78
dcca063
 
9eb3e78
 
 
 
 
79497a3
c9dd21c
9eb3e78
2376b2f
64523d8
 
 
 
 
9eb3e78
 
 
 
 
 
c9dd21c
9eb3e78
 
 
 
 
c9dd21c
9eb3e78
 
f76455a
64523d8
9eb3e78
 
a1f90e6
38e40b5
a1f90e6
 
 
 
 
 
 
 
 
 
 
 
 
 
abef2ac
a1f90e6
 
8c993f6
065cb17
a1f90e6
c15629f
a1f90e6
8ce3d9b
9eb3e78
 
5d2299c
9eb3e78
8ce3d9b
 
 
 
dcb00f7
 
3d67d69
dcb00f7
a1f90e6
a969331
 
 
a1f90e6
a969331
c2e5bed
a969331
 
78c999a
a969331
 
 
 
909aec0
64523d8
e488916
160526f
 
 
 
557b724
544f3f0
150b8d9
909aec0
 
 
a969331
909aec0
 
6725ef7
8ce3d9b
 
 
 
23247c4
8ce3d9b
 
 
 
9eb3e78
8a3a5d7
940c185
 
 
 
f06193e
 
 
 
 
80682f3
f06193e
80682f3
f06193e
517f1a0
f06193e
7c92df9
 
 
 
 
 
 
 
ec7fc78
 
517f1a0
609ef88
 
 
 
 
 
 
517f1a0
 
 
 
 
 
a78496d
 
 
db780e3
a78496d
 
517f1a0
a78496d
517f1a0
 
 
a78496d
517f1a0
a78496d
517f1a0
 
 
 
 
 
 
 
 
ec7fc78
 
 
 
 
d34a703
792de2f
c15629f
7a0f54c
 
 
c15629f
 
a1f90e6
81d7170
792de2f
61c3232
ec7fc78
7d43644
 
 
 
 
39dae03
 
7d43644
 
 
 
 
 
940c185
 
609ef88
ec7fc78
 
 
 
517f1a0

import streamlit as st
import os
from streamlit_chat import message
import numpy as np
import pandas as pd
from io import StringIO
import PyPDF2
from tqdm.auto import tqdm
import math
from transformers import pipeline
# import json

# st.config(PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION="python")

# from datasets import load_dataset

# dataset = load_dataset("wikipedia", "20220301.en", split="train[240000:250000]")


# wikidata = []

# for record in dataset:
#     wikidata.append(record["text"])

# wikidata = list(set(wikidata))
# # print("\n".join(wikidata[:5]))
# # print(len(wikidata))

from sentence_transformers import SentenceTransformer
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

if device != 'cuda':
    st.markdown(f"you are using {device}. This is much slower than using "
    "a CUDA-enabled GPU. If on colab you can change this by "
    "clicking Runtime > change runtime type > GPU.")

model = SentenceTransformer("all-MiniLM-L6-v2", device=device)
st.divider()

# Creating a Index(Pinecone Vector Database)
import os
# import pinecone

from pinecone.grpc import PineconeGRPC


PINECONE_API_KEY=os.getenv("PINECONE_API_KEY")
PINECONE_ENV=os.getenv("PINECONE_ENV")
PINECONE_ENVIRONMENT=os.getenv("PINECONE_ENVIRONMENT")

# pc = PineconeGRPC( api_key=os.environ.get("PINECONE_API_KEY") ) # Now do stuff if 'my_index' not in pc.list_indexes().names(): pc.create_index( name='my_index', dimension=1536, metric='euclidean', spec=ServerlessSpec( cloud='aws', region='us-west-2' ) )

def connect_pinecone():
    pinecone = PineconeGRPC(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
    # st.code(pinecone)
    # st.divider()
    # st.text(pinecone.list_indexes().names())
    # st.divider()
    # st.text(f"Succesfully connected to the pinecone")
    return pinecone

def get_pinecone_semantic_index(pinecone):
    index_name = "sematic-search"

    # only create if it deosnot exists
    if index_name not in pinecone.list_indexes().names():
        pinecone.create_index(
            name=index_name,
            description="Semantic search",
            dimension=model.get_sentence_embedding_dimension(),
            metric="cosine",
            spec=ServerlessSpec( cloud='gcp', region='us-central1' )
        )
    # now connect to index
    index = pinecone.Index(index_name)
    # st.text(f"Succesfully connected to the pinecone index")
    return index

def promt_engineer(text):
    prompt_template = """
    write a concise summary of the following text delimited by triple backquotes.
    return your response in bullet points which convers the key points of the text.

    ```{text}```

    BULLET POINT SUMMARY:
    """
    # Load the summarization pipeline with the specified model
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

    # Generate the prompt
    prompt = prompt_template.format(text=text)

    # Generate the summary
    summary = summarizer(prompt, max_length=1024, min_length=50)[0]["summary_text"]
    
    with st.sidebar:
        st.divider()
        st.markdown("*:red[Text Summary Generation]* from above Top 5 **:green[similarity search results]**.")
        st.write(summary)
        st.divider()

def chat_actions():
    
    pinecone = connect_pinecone()
    index = get_pinecone_semantic_index(pinecone)

    st.session_state["chat_history"].append(
        {"role": "user", "content": st.session_state["chat_input"]},
    )

    query_embedding = model.encode(st.session_state["chat_input"])
    # create the query vector
    query_vector = query_embedding.tolist()
    # now query vector database
    result = index.query(query_vector, top_k=5, include_metadata=True)  # result is a list of tuples

    # Create a list of lists
    data = []
    consolidated_text = ""
    i = 0
    for res in result['matches']:
        i = i + 1
        data.append([f"{i}⭐", res['score'], res['metadata']['text']])
        consolidated_text += res['metadata']['text']

    # Create a DataFrame from the list of lists
    resdf = pd.DataFrame(data, columns=['TopRank', 'Score', 'Text'])

    with st.sidebar:
        st.markdown("*:red[semantic search results]* with **:green[Retrieval Augmented Generation]** ***(RAG)***.")
        st.dataframe(resdf)
        bytesize = consolidated_text.encode("utf-8")
        p = math.pow(1024, 2)
        mbsize = round(len(bytesize) / p, 2)
        st.write(f"Text lenth of {len(consolidated_text)} characters with {mbsize}MB size")
        promt_engineer(consolidated_text[:1024])

    for res in result['matches']:
        st.session_state["chat_history"].append(
            {
                "role": "assistant",
                "content": f"{res['metadata']['text']}",
            },  # This can be replaced with your chat response logic
        )
        break;

if "chat_history" not in st.session_state:
    st.session_state["chat_history"] = []

st.chat_input("show me the contents of ML paper published on xxx with article no. xx?", on_submit=chat_actions, key="chat_input")

for i in st.session_state["chat_history"]:
    with st.chat_message(name=i["role"]):
        st.write(i["content"])

def print_out(pages):
    for i in range(len(pages)):
        text = pages[i].extract_text().strip()
        st.write(f"Page {i} : {text}")

def combine_text(pages):
    concatenates_text = ""
    for page in tqdm(pages):
        text = page.extract_text().strip()
        concatenates_text += text
    bytesize = concatenates_text.encode("utf-8")
    p = math.pow(1024, 2)
    mbsize = round(len(bytesize) / p, 2)
    st.write(f"There are {len(concatenates_text)} characters in the pdf with {mbsize}MB size")
    return concatenates_text

def split_into_chunks(text, chunk_size):

    chunks = []
    for i in range(0, len(text), chunk_size):
        chunks.append(text[i:i + chunk_size])

    return chunks

def create_embeddings():
    # Get the uploaded file
    inputtext = ""
    with st.sidebar:
        uploaded_files = st.session_state["uploaded_files"]
        for uploaded_file in uploaded_files:
            # Read the contents of the file
            reader = PyPDF2.PdfReader(uploaded_file)
            pages = reader.pages
            print_out(pages)
            inputtext = combine_text(pages)

    # connect to pinecone index
    pinecone = connect_pinecone()
    index = get_pinecone_semantic_index(pinecone)

    # The maximum metadata size per vector is 40KB ~ 40000Bytes ~ each text character is 1 to 2 bytes. so rougly given chunk size of 10000 to 40000
    chunk_size = 10000
    batch_size = 2
    chunks = split_into_chunks(inputtext, chunk_size)

    for i in tqdm(range(0, len(chunks), batch_size)):
        # find end of batch
        end = min(i + batch_size, len(chunks))
        # create ids batch
        ids = [str(i) for i in range(i, end)]
        # create metadata batch
        metadata = [{"text": text} for text in chunks[i:end]]
        # create embeddings
        xc = model.encode(chunks[i:end])
        # create records list for upsert
        records = zip(ids, xc, metadata)
        # upsert records
        index.upsert(vectors=records)

    with st.sidebar:
        st.write("created vector embeddings!")
        # check no of records in the index
        st.write(f"{index.describe_index_stats()}")


    # Display the contents of the file
    # st.write(file_contents)

with st.sidebar:
    st.markdown("""
    ***:red[Follow this steps]***
    - upload pdf file to create embeddings using model on your own docs
    - wait see success message on embeddings creation 
    - It Takes couple of mins after upload the pdf
    - Now Chat with your documents with help of this RAG system 
    - It Generate Promted reponses on the upload pdf
    - Provides summarized results and QA's using GPT models
    - This system already trained on some wikipedia datasets too
    """)
    uploaded_files = st.file_uploader('Choose your .pdf file', type="pdf", accept_multiple_files=True, key="uploaded_files", on_change=create_embeddings)
    # for uploaded_file in uploaded_files:
        # To read file as bytes:
        # bytes_data = uploaded_file.getvalue()
        # st.write(bytes_data)

        # To convert to a string based IO:
        # stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
        # st.write(stringio)

        # To read file as string:
        # string_data = stringio.read()
        # st.write(string_data)

        # Can be used wherever a "file-like" object is accepted:
        # dataframe = pd.read_csv(uploaded_file)
        # st.write(dataframe)

        # reader = PyPDF2.PdfReader(uploaded_file)
        # pages = reader.pages
        # print_out(pages)
        # combine_text(pages)
        # promt_engineer(text)