Vishnu-add committed
Commit: 1f6b1f0
Parent(s): 9d2bd49

Upload 17 files
.gitattributes CHANGED
@@ -1,3 +1,8 @@
+db/ filter=lfs diff=lfs merge=lfs -text
+# LaMini-T5-738M/ filter=lfs diff=lfs merge=lfs -text
+*.sqlite3 filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+# HF
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
@@ -33,3 +38,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,6 @@
+__pycache__/
+# Lib/
+search_pdf_env/
+LaMini-T5-738M/
+# db/
+# uploaded/
Commands.txt ADDED
@@ -0,0 +1,17 @@
+Youtube video : https://youtu.be/rIV1EseKwU4?si=YOJ2a_9eYVPhxn6X
+Github : https://github.com/AIAnytime/Search-Your-PDF-App/tree/main
+LLM : https://huggingface.co/MBZUAI/LaMini-T5-738M
+
+
+NOTE: Remove the Chroma settings from the code to work with the latest versions.
+
+
+
+1) Creating a virtual env
+python -m venv <env_name>
+
+2) Activating the virtual environment
+search_pdf_env\Scripts\activate
+
+3) Installing requirements
+pip install -r requirements.txt
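The NOTE above refers to the CHROMA_SETTINGS object defined in constants.py further down. As a minimal sketch of what "remove the Chroma settings" means in practice (not part of this commit, and assuming chromadb 0.4+, where the duckdb+parquet backend was dropped), the persisted store can be opened with only a directory, exactly as chatbot.py below already does:

    # Sketch, assuming chromadb >= 0.4: the legacy Settings(chroma_db_impl=...)
    # argument no longer exists, so drop CHROMA_SETTINGS and pass only the
    # persist directory.
    from langchain.embeddings import SentenceTransformerEmbeddings
    from langchain.vectorstores import Chroma

    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    db = Chroma(persist_directory="db", embedding_function=embeddings)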
README.md CHANGED
@@ -1,13 +1,10 @@
----
-title: Chat With Your Doc
+metadata
+title: Chat With Doc
 emoji: 😻
-colorFrom: green
-colorTo: green
+colorFrom: gray
+colorTo: pink
 sdk: streamlit
 sdk_version: 1.29.0
 app_file: app.py
 pinned: false
-license: mit
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+license: mit
app.ipynb ADDED
@@ -0,0 +1,69 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from google.colab import drive\n",
+    "drive.mount('/content/drive')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install -r \"/content/drive/MyDrive/Colab Notebooks/Search_your_pdf APP/requirements.txt\"\n",
+    "!pip install pyngrok"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!streamlit run \"/content/drive/MyDrive/Colab Notebooks/Search_your_pdf APP/app.py\" &>\"/content/drive/MyDrive/Colab Notebooks/Search_your_pdf APP/logs_streamlit.txt\" &\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!ngrok config add-authtoken 2Z7XecBchSB7U8OxYamQIBoDH4F_7huod8eqNPzz6W5hgu1Uz"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyngrok import ngrok\n",
+    "ngrok_tunnel = ngrok.connect(8501)\n",
+    "print('Public URL:', ngrok_tunnel.public_url)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ngrok.kill()"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
app.py ADDED
@@ -0,0 +1,161 @@
+import streamlit as st
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+from transformers import pipeline
+import torch
+import base64
+import textwrap
+from langchain.embeddings import SentenceTransformerEmbeddings
+from langchain.vectorstores import Chroma
+from langchain.llms.huggingface_pipeline import HuggingFacePipeline
+from langchain.chains import RetrievalQA
+from streamlit_chat import message
+from langchain.document_loaders import PyPDFLoader, DirectoryLoader, PDFMinerLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+import os
+
+st.set_page_config(page_title="pdf-GPT", page_icon="📖", layout="wide")
+
+# @st.cache_resource
+# def get_model():
+#     device = torch.device('cpu')
+#     # device = torch.device('cuda:0')
+#
+#     checkpoint = "LaMini-T5-738M"
+#     checkpoint = "MBZUAI/LaMini-T5-738M"
+#     tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+#     base_model = AutoModelForSeq2SeqLM.from_pretrained(
+#         checkpoint,
+#         device_map=device,
+#         torch_dtype=torch.float32,
+#         # offload_folder="/model_ck"
+#     )
+#     return base_model, tokenizer
+
+# @st.cache_resource
+# def llm_pipeline():
+#     base_model, tokenizer = get_model()
+#     pipe = pipeline(
+#         'text2text-generation',
+#         model=base_model,
+#         tokenizer=tokenizer,
+#         max_length=512,
+#         do_sample=True,
+#         temperature=0.3,
+#         top_p=0.95,
+#         # device=device
+#     )
+#     local_llm = HuggingFacePipeline(pipeline=pipe)
+#     return local_llm
+
+# @st.cache_resource
+# def qa_llm():
+#     llm = llm_pipeline()
+#     embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+#     db = Chroma(persist_directory="db", embedding_function=embeddings)
+#     retriever = db.as_retriever()
+#     qa = RetrievalQA.from_chain_type(
+#         llm=llm,
+#         chain_type="stuff",
+#         retriever=retriever,
+#         return_source_documents=True
+#     )
+#     return qa
+
+# def process_answer(instruction):
+#     qa = qa_llm()
+#     generated_text = qa(instruction)
+#     answer = generated_text['result']
+#     return answer, generated_text
+
+# Display conversation history using Streamlit messages
+def display_conversation(history):
+    for i in range(len(history["generated"])):
+        message(history["past"][i], is_user=True, key=str(i) + "_user")
+        # Generated entries are either plain strings or (answer, metadata) tuples
+        if isinstance(history["generated"][i], str):
+            message(history["generated"][i], key=str(i))
+        else:
+            message(history["generated"][i][0], key=str(i))
+            # sources_list = []
+            # for source in history["generated"][i][1]['source_documents']:
+            #     sources_list.append(source.metadata['source'])
+            # message(str(set(sources_list)), key="sources_" + str(i))
+
+# Function to display the PDF of a given file
+@st.cache_data
+def displayPDF(file, file_name):
+    # Opening file from file path
+    with open(file, "rb") as f:
+        base64_pdf = base64.b64encode(f.read()).decode('utf-8')
+
+    # Embedding PDF in HTML; assumes a static file server on port 8900 (a base64 data-URI iframe is the commented alternative)
+    # pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="700" height="1000" type="application/pdf"></iframe>'
+    pdf_display = f'<iframe src="http://localhost:8900/{file_name}" width="700" height="900" type="application/pdf"></iframe>'
+    st.markdown(pdf_display, unsafe_allow_html=True)
+
+@st.cache_resource
+def data_ingestion(file_path, persist_directory):
+    if file_path.endswith(".pdf"):
+        print(file_path)
+        loader = PDFMinerLoader(file_path)
+        documents = loader.load()
+        # NOTE: chunk_overlap equal to chunk_size duplicates every chunk; a smaller overlap (e.g. 50) is the usual choice
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=500)
+        texts = text_splitter.split_documents(documents)
+        # create embeddings
+        embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+        # create and persist the vector store
+        db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory)
+        db.persist()
+        db = None
+
+def main():
+    st.markdown("<h1 style='text-align:center; color: blue;'>Chat with Your PDF 📑</h1>", unsafe_allow_html=True)
+    st.markdown("<h3 style='text-align:center; color: grey;'>Built by Vicky</h3>", unsafe_allow_html=True)
+    st.markdown("<h2 style='text-align:center; color: red;'>Upload your PDF</h2>", unsafe_allow_html=True)
+
+    uploaded_file = st.file_uploader("", type=["pdf"])
+
+    if uploaded_file is not None:
+        file_details = {
+            "name": uploaded_file.name,
+            "type": uploaded_file.type,
+            "size": uploaded_file.size
+        }
+        os.makedirs("uploaded", exist_ok=True)
+        filepath = "uploaded/" + uploaded_file.name
+        with open(filepath, "wb") as temp_file:
+            temp_file.write(uploaded_file.read())
+
+        col1, col2 = st.columns([1, 1])
+        with col1:
+            st.markdown("<h2 style='text-align:center; color: grey;'>PDF Preview</h2>", unsafe_allow_html=True)
+            displayPDF(filepath, uploaded_file.name)
+        with col2:
+            with st.spinner("Embeddings are in process......."):
+                data_ingestion(filepath, "uploaded/db")
+            st.success('Embeddings are created Successfully!')
+            st.markdown("<h2 style='text-align:center; color: grey;'>Chat Here</h2>", unsafe_allow_html=True)
+
+            user_input = st.text_input(label="Message", key="input")
+
+            # Initialize session state for generated responses and past messages
+            if "generated" not in st.session_state:
+                st.session_state["generated"] = ["I am ready to help you"]
+            if "past" not in st.session_state:
+                st.session_state["past"] = ["Hey There!"]
+
+            # Echo the input for now; switch to process_answer once the commented model code above is enabled
+            if user_input:
+                # answer, metadata = process_answer({"query": user_input})
+                answer = user_input
+                st.session_state["past"].append(user_input)
+                st.session_state["generated"].append(answer)
+                st.write(st.session_state)
+
+            # Display conversation history using Streamlit messages
+            if st.session_state["generated"]:
+                display_conversation(st.session_state)
+
+if __name__ == "__main__":
+    main()
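A note on displayPDF above: the iframe points at http://localhost:8900/<file_name>, so something must serve the uploaded/ folder on port 8900, and this commit does not include that server. A minimal sketch using only the standard library (an assumption, not part of the commit):

    # Sketch: serve uploaded/ on port 8900 so the iframe URL in displayPDF
    # resolves. Run alongside the Streamlit app, or equivalently:
    #     python -m http.server 8900 --directory uploaded
    import functools
    import http.server

    handler = functools.partial(http.server.SimpleHTTPRequestHandler, directory="uploaded")
    http.server.ThreadingHTTPServer(("", 8900), handler).serve_forever()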
chatbot.py ADDED
@@ -0,0 +1,96 @@
+import streamlit as st
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+from transformers import pipeline
+import torch
+import base64
+import textwrap
+from langchain.embeddings import SentenceTransformerEmbeddings
+from langchain.vectorstores import Chroma
+from langchain.llms.huggingface_pipeline import HuggingFacePipeline
+from langchain.chains import RetrievalQA
+from streamlit_chat import message
+
+# Use the GPU when available, otherwise fall back to the CPU
+device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+
+# Local checkpoint directory (see Commands.txt for the model link)
+checkpoint = "LaMini-T5-738M"
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+base_model = AutoModelForSeq2SeqLM.from_pretrained(
+    checkpoint,
+    device_map=device,
+    torch_dtype=torch.float32,
+    # offload_folder="/model_ck"
+)
+
+@st.cache_resource
+def llm_pipeline():
+    pipe = pipeline(
+        'text2text-generation',
+        model=base_model,
+        tokenizer=tokenizer,
+        max_length=256,
+        do_sample=True,
+        temperature=0.3,
+        top_p=0.95,
+    )
+    local_llm = HuggingFacePipeline(pipeline=pipe)
+    return local_llm
+
+@st.cache_resource
+def qa_llm():
+    llm = llm_pipeline()
+    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+    db = Chroma(persist_directory="db", embedding_function=embeddings)
+    retriever = db.as_retriever()
+    qa = RetrievalQA.from_chain_type(
+        llm=llm,
+        chain_type="stuff",
+        retriever=retriever,
+        return_source_documents=True
+    )
+    return qa
+
+def process_answer(instruction):
+    # Returns the answer text plus the full result (which includes source documents)
+    qa = qa_llm()
+    generated_text = qa(instruction)
+    answer = generated_text['result']
+    return answer, generated_text
+
+# Display conversation history using Streamlit messages
+def display_conversation(history):
+    for i in range(len(history["generated"])):
+        message(history["past"][i], is_user=True, key=str(i) + "_user")
+        message(history["generated"][i], key=str(i))
+
+def main():
+    st.title("Chat with your pdf📚")
+    with st.expander("About the App"):
+        st.markdown(
+            """
+            This is a Generative AI powered Question and Answering app that responds to questions about your PDF file.
+            """
+        )
+
+    user_input = st.text_input("", key="input")
+
+    # Initialize session state for generated responses and past messages
+    if "generated" not in st.session_state:
+        st.session_state["generated"] = ["I am ready to help you"]
+    if "past" not in st.session_state:
+        st.session_state["past"] = ["Hey There!"]
+
+    # Search the database for a response based on user input and update session state
+    if user_input:
+        # process_answer returns (answer, full_result); display only the answer text
+        answer, _generated_text = process_answer({"query": user_input})
+        st.session_state["past"].append(user_input)
+        st.session_state["generated"].append(answer)
+
+    # Display conversation history using Streamlit messages
+    if st.session_state["generated"]:
+        display_conversation(st.session_state)
+
+if __name__ == "__main__":
+    main()
constants.py ADDED
@@ -0,0 +1,9 @@
+import os
+from chromadb.config import Settings
+
+# Define Chroma settings (legacy chromadb < 0.4 API; see the NOTE in Commands.txt)
+CHROMA_SETTINGS = Settings(
+    chroma_db_impl='duckdb+parquet',
+    persist_directory="db",
+    anonymized_telemetry=False
+)
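For reference, a hedged sketch of the chromadb 0.4+ equivalent of these settings (an assumption, not part of the commit): the chroma_db_impl option no longer exists there, and persistence is configured on the client instead.

    # Sketch, assuming chromadb >= 0.4: a persistent client rooted at "db"
    # with telemetry disabled replaces the legacy Settings above.
    import chromadb
    from chromadb.config import Settings

    client = chromadb.PersistentClient(path="db", settings=Settings(anonymized_telemetry=False))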
db/c811917d-8276-48ba-b913-6ed6196f4484/data_level0.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0656652b4f3db81247ca6f4a0365416da3b66a0ed0cd46e9392400ee92da06ef
+size 62012000
db/c811917d-8276-48ba-b913-6ed6196f4484/header.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:44c6e025ebb371f800e844ce62d9b7dde9b123633b5d9e3bf6199de9a6580582
+size 100
db/c811917d-8276-48ba-b913-6ed6196f4484/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:05b13caae7bf03a47b0bc51c04f39eb07ffdc234fe6b7f369b872a2447117da8
+size 2144478
db/c811917d-8276-48ba-b913-6ed6196f4484/length.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f4fd7fddbb7246719bc06423736fe0cebe9b417bdb555ae72f6061248bc1e995
+size 148000
db/c811917d-8276-48ba-b913-6ed6196f4484/link_lists.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9fbff72c999b684e5ef2d0dfbeb81e5179ca48fa5c62b8ccadf3ef53f2561744
+size 317184
db/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6c5ae7212513205065174fc77e7fd813e803de0635f4fb32947eeeb2fbb067cf
+size 264290304
docs/Alfred V. Aho, Monica S. Lam, Ravi Sethi, Jeffrey D. Ullman-Compilers - Principles, Techniques, and Tools-Pearson_Addison Wesley (2006).pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92646e7788a17653fbcd9aaf16724ae62e67b4990f4289ee39ca55e5fb9ab62a
+size 6060190
ingest.py ADDED
@@ -0,0 +1,28 @@
+from langchain.document_loaders import PyPDFLoader, DirectoryLoader, PDFMinerLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.embeddings import SentenceTransformerEmbeddings
+from langchain.vectorstores import Chroma
+import os
+from constants import CHROMA_SETTINGS
+
+persist_directory = "db"
+
+def main():
+    for root, dirs, files in os.walk("docs"):
+        for file in files:
+            if file.endswith(".pdf"):
+                print(file)
+                loader = PDFMinerLoader(os.path.join(root, file))
+                documents = loader.load()
+                # NOTE: chunk_overlap equal to chunk_size duplicates every chunk; a smaller overlap (e.g. 50) is typical
+                text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=500)
+                texts = text_splitter.split_documents(documents)
+                # create embeddings
+                embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+                # create vector store
+                db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory)
+                db.persist()
+                db = None
+
+if __name__ == "__main__":
+    main()
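After ingest.py runs, the db/ folder committed above (via LFS) should answer similarity queries. A quick sanity check, as a sketch assuming the same embedding model (the query just targets the compilers textbook in docs/):

    # Sketch, not part of this commit: verify the persisted index is usable.
    from langchain.embeddings import SentenceTransformerEmbeddings
    from langchain.vectorstores import Chroma

    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    db = Chroma(persist_directory="db", embedding_function=embeddings)
    for doc in db.similarity_search("What is a compiler?", k=2):
        print(doc.metadata.get("source"), doc.page_content[:120])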
requirements.txt ADDED
@@ -0,0 +1,17 @@
+langchain
+streamlit
+transformers
+requests
+torch
+einops
+accelerate
+bitsandbytes
+pdfminer.six
+bs4
+sentence-transformers
+chromadb
+torchvision
+torchaudio
+sentencepiece
+uvicorn
+streamlit-chat