KushwanthK committed on
Commit
77ce35a
1 Parent(s): b0d0b31

chat with llama3 on bhagvatgeetha

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Bhagavad-Gita-As-It-Is.pdf filter=lfs diff=lfs merge=lfs -text
Bhagavad-Gita-As-It-Is.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f44f18a2408ca4dce5d7a7f134a6ba560afc2a4417113dfb849e514293cff3e
3
+ size 3891095
HuggingFaceEmbeddings.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1e1f3c2fa8e46dca0be5a4282669586839343bf47426c4c626313d97bdd08a6
3
+ size 443737887
README.md CHANGED
@@ -9,5 +9,5 @@ app_file: app.py
9
  pinned: false
10
  license: mit
11
  ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
9
  pinned: false
10
  license: mit
11
  ---
12
+ # vedic_scriptures
13
+ Text generation over the Vedic scriptures using LLaMA-3 with retrieval-augmented generation (RAG) — a Streamlit app
app.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ from streamlit_chat import message
4
+ import numpy as np
5
+ import pandas as pd
6
+ from io import StringIO
7
+ import io
8
+ import PyPDF2
9
+ import pymupdf
10
+ import tempfile
11
+ import base64
12
+ from tqdm.auto import tqdm
13
+ import math
14
+ from transformers import pipeline
15
+
16
+ from collections import Counter
17
+ import nltk
18
+ from nltk.corpus import stopwords
19
+
20
+
21
+ from sentence_transformers import SentenceTransformer
22
+ import torch
23
+ from langchain_community.llms.ollama import Ollama
24
+ from langchain.prompts import ChatPromptTemplate
25
+
26
# Run the sentence encoder on GPU when available, otherwise fall back to CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# if device != 'cuda':
#     st.markdown(f"you are using {device}. This is much slower than using "
#                 "a CUDA-enabled GPU. If on colab you can change this by "
#                 "clicking Runtime > change runtime type > GPU.")

# Encoder used for semantic similarity over scripture chunks.
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2", device=device)
34
def display_title():
    """Render the page header for the scripture currently chosen in the sidebar."""
    chosen = st.session_state["value"]
    st.header(f'Vedic Scriptures: {chosen} :blue[book] :books:')
38
+
39
# Default placeholder shown in the chat input box.
question = "ask anything about scriptures"

def open_chat():
    """on_change callback for the FAQ selectbox: promote the chosen FAQ
    to the module-level chat placeholder.

    Bug fix: the original assigned to a *local* `question`, so the selection
    was silently discarded; `global` makes the assignment take effect.
    """
    global question
    question = st.session_state["faq"]
42
+
43
+
44
+
45
# Seed session-state keys on first run so later reads never KeyError.
for _key in ("value", "faq"):
    if _key not in st.session_state:
        st.session_state[_key] = None

st.divider()
52
+
53
def highlight_pdf(file_path, text_to_highlight, page_numbers):
    """Build a temporary PDF containing only *page_numbers*, with the
    significant words of *text_to_highlight* highlighted on each page.

    Args:
        file_path: path to the source PDF.
        text_to_highlight: free text; its non-stopword tokens get highlighted.
        page_numbers: iterable of 0-based page indices to keep.

    Returns:
        Path of the saved temporary PDF. The caller is responsible for
        deleting it when done.
    """
    doc = pymupdf.open(file_path)
    try:
        pages_to_display = [doc.load_page(n) for n in page_numbers]
        print("pages_to_display")
        print(pages_to_display)

        # Tokenize and drop English stopwords so only meaningful terms light up.
        stop_words = set(stopwords.words("english"))
        words = [w for w in text_to_highlight.split() if w.lower() not in stop_words]
        print(words)

        for page in pages_to_display:
            for word in words:
                quads = page.search_for(word, quads=True)
                # search_for returns an empty list when the word is absent;
                # skip it — annotating with no quads is an error.
                if quads:
                    page.add_highlight_annot(quads)

        # Copy just the selected (now annotated) pages into a fresh document;
        # insert_pdf carries annotations across by default.
        new_doc = pymupdf.open()
        try:
            for page in pages_to_display:
                new_doc.insert_pdf(doc, from_page=page.number, to_page=page.number)

            # Reserve a temp filename first, then save after the handle is
            # closed (saving into an open NamedTemporaryFile breaks on Windows).
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
                temp_pdf_path = temp_file.name
            new_doc.save(temp_pdf_path)
        finally:
            new_doc.close()
    finally:
        doc.close()

    print(temp_pdf_path)
    return temp_pdf_path
114
+
115
# Defaults used by the chat flow. The PDF ships at the repo root in this
# Space (this commit adds Bhagavad-Gita-As-It-Is.pdf via LFS); the original
# "../Transformers/" path does not exist once deployed.
file_path = "./Bhagavad-Gita-As-It-Is.pdf"
text_to_highlight = ""
sources = []
118
+
119
# Function to display PDF in Streamlit
def display_highlighted_pdf(file_path, text_to_highlight, sources):
    """Highlight *text_to_highlight* on the given source pages and embed the
    resulting PDF in the Streamlit page as a base64-encoded <iframe>.

    Args:
        file_path: path to the source PDF.
        text_to_highlight: text whose significant words get highlighted.
        sources: 0-based page indices to display.
    """
    pdf_path = highlight_pdf(
        file_path=file_path, text_to_highlight=text_to_highlight, page_numbers=sources
    )
    try:
        with open(pdf_path, "rb") as file:
            pdf_bytes = file.read()
    finally:
        # highlight_pdf leaves a NamedTemporaryFile(delete=False) behind;
        # without this every chat message leaked a temp PDF.
        os.remove(pdf_path)

    base64_pdf = base64.b64encode(pdf_bytes).decode("utf-8")
    pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="700" height="1000" type="application/pdf"></iframe>'
    st.markdown(pdf_display, unsafe_allow_html=True)
132
+
133
# Creating a Index(Pinecone Vector Database)
import os
# import pinecone

# Cache slot: the pickled index is ~440 MB, and the original reloaded it on
# every chat submission. Load once per process instead.
_SEMANTIC_INDEX = None


def get_faiss_semantic_index():
    """Load (once) and return the pickled semantic index.

    NOTE(review): pickle.load executes arbitrary code from the file — only
    ever load a pickle you produced yourself.

    Returns:
        The unpickled index object (presumably a LangChain FAISS store —
        callers use .similarity_search; confirm against the producer script).
    """
    global _SEMANTIC_INDEX
    if _SEMANTIC_INDEX is None:
        import pickle

        # File path to the pickle file
        file_path = "./HuggingFaceEmbeddings.pkl"
        with open(file_path, "rb") as f:
            _SEMANTIC_INDEX = pickle.load(f)
        print("Embeddings loaded successfully.")
    return _SEMANTIC_INDEX
150
+
151
# RAG prompt handed to the llama3 model. {context} and {question} are filled
# by ChatPromptTemplate in chat_actions(). The original wording was garbled
# ("You 1000 Dollars rewards...", "shouldnot", "penality"); rewritten for
# clarity while keeping the same constraints and placeholders.
PROMPT_TEMPLATE = """
Instructions:
--------------------------------------------------------
You are an AI expert on the Vedic scriptures. Do not answer questions from any other domain.
Before answering, always map the question to TITLE > CHAPTER > TEXT > PURPORT.
Always cite the chapter and text number in the format: Chapter <no> : Text <no>.
Always provide the title of the chapter and the source path you are answering from.
Answer strictly from the context below; never answer from public sources.
If the question is not related to the context, reply that the question does not belong to the Vedic scriptures or Vedic literature.
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""
184
+
185
def chat_actions():
    """on_submit callback for the chat input.

    Retrieves the two most similar scripture chunks, asks the local llama3
    model with the RAG prompt, records both conversation turns in
    session_state, and renders the source pages with highlights.
    """
    index = get_faiss_semantic_index()

    query = st.session_state["chat_input"]
    st.session_state["chat_history"].append({"role": "user", "content": query})

    # Top-2 semantically similar chunks become the grounding context.
    docs = index.similarity_search(query, k=2)
    for doc in docs:
        print("\n")
        print(str(doc.metadata["page"] + 1) + ":", doc.page_content)
    context_text = "\n\n---\n\n".join(doc.page_content for doc in docs)
    sources = [doc.metadata.get("page", None) for doc in docs]

    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query)

    # Renamed from `model`: the original shadowed the module-level
    # SentenceTransformer with the Ollama client.
    llm = Ollama(model="llama3")
    response_text = llm.invoke(prompt)

    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(formatted_response)

    st.session_state["chat_history"].append(
        {"role": "assistant", "content": f"{response_text}"}
    )

    # The PDF ships at the repo root in this Space (was ../Transformers/,
    # which does not exist when deployed).
    file_path = "./Bhagavad-Gita-As-It-Is.pdf"
    display_highlighted_pdf(file_path, response_text, sources)
224
+
225
with st.sidebar:
    # Scripture picker; display_title re-renders the header on change.
    option = st.selectbox(
        "Select Your Favorite Scriptures",
        ("Bhagvatgeetha", "Bhagavatham", "Ramayanam"),
        # index=None,
        # placeholder="Select scriptures...",
        key="value",
        on_change=display_title,
    )

    st.write("You selected:", option)

    # Label fixed: this box lists FAQs, not scriptures (copy-paste error
    # duplicated the label above).
    faq = st.selectbox(
        "Frequently Asked Questions",
        ("Why does atheism exist even when all questions are answered in Bhagavad Gita?",
         "Why don’t all souls surrender to Lord Krishna, although he has demonstrated that everyone is part and parcel of Him, and all can be liberated from all sufferings by surrendering to Him?",
         "Why do souls misuse their independence by rebelling against Lord Krishna?"),
        # index=None,
        # placeholder="Select scriptures...",
        key="faq",
        on_change=open_chat,
    )
    st.write("You selected:", faq)
248
+
249
+
250
# Transcript state + chat input widget.
if "chat_history" not in st.session_state:
    st.session_state["chat_history"] = []

st.chat_input(question, on_submit=chat_actions, key="chat_input")

# Replay the transcript so the conversation survives Streamlit reruns.
for turn in st.session_state["chat_history"]:
    with st.chat_message(name=turn["role"]):
        st.write(turn["content"])
258
+
259
+
260
+
261
+
262
+
model.py ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ sentence-transformers
2
+ datasets
3
+ torch
4
+ streamlit-chat-media
5
+ streamlit-chat
6
+ transformers
7
+ PyPDF2
8
+ ratelimit
9
+ backoff
10
+ tqdm
11
+ openai
12
+ PyMuPDF # instead of fitz