Update app.py
Browse files
app.py
CHANGED
@@ -3,8 +3,40 @@ import time
|
|
3 |
import streamlit as st
|
4 |
from streamlit_chat import message
|
5 |
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
|
|
6 |
|
7 |
-
from chat import generate_response
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
if "tokenizer" not in st.session_state:
|
10 |
st.session_state["tokenizer"] = AutoTokenizer.from_pretrained(
|
@@ -14,7 +46,7 @@ if "tokenizer" not in st.session_state:
|
|
14 |
"MBZUAI/LaMini-Flan-T5-783M"
|
15 |
)
|
16 |
|
17 |
-
st.title("
|
18 |
# Storing the chat
|
19 |
if "generated" not in st.session_state:
|
20 |
st.session_state["generated"] = []
|
@@ -30,25 +62,58 @@ def get_text():
|
|
30 |
|
31 |
|
32 |
user_input = get_text()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
if user_input:
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
prompt_template = "\nBased on the above content, try to answer the following question.\n\n"
|
39 |
-
end_prompt = "Please make meaningful sentence and try to be descriptive as possible, ending with proper punctuations. If you think, there is good descriptive answers to the question from the above content, write sorry and advise them to contact Bibek directly.\n" # NoQA"
|
40 |
-
short_response_template = "\nIf your response is very short like 1 or 2 sentence, add a followup sentence like 'Let me know if there's anything else I can help you with. or If there's anything else I can assist with, please don't hesitate to ask. I mean something similar in polite way." # NoQA
|
41 |
-
input = output + prompt_template + user_input + end_prompt
|
42 |
start = time.time()
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
-
outputs = model.generate(input_ids, max_length=512, do_sample=True)
|
49 |
-
output = tokenizer.decode(outputs[0]).strip('<pad></s>').strip()
|
50 |
end = time.time()
|
51 |
|
|
|
|
|
52 |
print("Time for model inference: ", end - start)
|
53 |
# Checks for memory overflow
|
54 |
if len(st.session_state.past) == 15:
|
|
|
3 |
import streamlit as st
|
4 |
from streamlit_chat import message
|
5 |
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
6 |
+
import textwrap
|
7 |
|
8 |
+
from chat import generate_response, generate_tag
|
9 |
+
|
10 |
+
# `hash_funcs` maps `_json.Scanner` objects to a constant hash (None) for
# st.cache's hashing step.
@st.cache(allow_output_mutation=True, hash_funcs={"_json.Scanner": lambda x: None})
def create_database():
    """Build the FAISS vector store that is searched for answer context.

    Reads ``./new_dataset.json`` line by line (one JSON value per non-blank
    line), wraps each parsed value in a LangChain ``Document``, embeds the
    documents with ``HuggingFaceEmbeddings`` and indexes them with FAISS.

    Returns:
        The FAISS vector store. If ``st.session_state`` already contains a
        ``"db"`` entry, that entry is returned instead of rebuilding, so the
        function never falls through without a result.

    Raises:
        OSError: If the dataset file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Function-scope imports, kept local to this function as in the original.
    import json
    from langchain.docstore.document import Document
    from langchain.embeddings import HuggingFaceEmbeddings
    from langchain.vectorstores import FAISS

    # Reuse a store that was already placed in the session, if any.
    if "db" in st.session_state:
        return st.session_state["db"]

    # Maybe it is better to save the embeddings than the text dataset
    json_file_path = "./new_dataset.json"

    # Explicit UTF-8 keeps decoding independent of the host locale; lines that
    # contain only whitespace are skipped rather than handed to json.loads.
    with open(json_file_path, "r", encoding="utf-8") as json_file:
        string_chunks = [json.loads(line) for line in json_file if line.strip()]

    # NOTE(review): each parsed value is passed straight to `page_content`,
    # so the dataset presumably holds one JSON string per line - confirm.
    documents_ = [Document(page_content=chunk) for chunk in string_chunks]

    embeddings = HuggingFaceEmbeddings()
    db = FAISS.from_documents(documents_, embeddings)
    print(type(db))
    return db
|
38 |
+
|
39 |
+
db = create_database()
|
40 |
|
41 |
if "tokenizer" not in st.session_state:
|
42 |
st.session_state["tokenizer"] = AutoTokenizer.from_pretrained(
|
|
|
46 |
"MBZUAI/LaMini-Flan-T5-783M"
|
47 |
)
|
48 |
|
49 |
+
st.title("BGPT : Bibek's Personal Chatbot")
|
50 |
# Storing the chat
|
51 |
if "generated" not in st.session_state:
|
52 |
st.session_state["generated"] = []
|
|
|
62 |
|
63 |
|
64 |
user_input = get_text()
|
65 |
+
def wrap_text_preserve_newlines(text, width=110):
    """Wrap ``text`` to ``width`` columns while keeping its own line breaks.

    Each newline-separated segment is wrapped independently and the segments
    are re-joined with newlines. Afterwards the literal markers
    ``page_content=`` and ``metadata={}`` are removed from the result.

    Args:
        text: The string to wrap.
        width: Maximum line width passed to ``textwrap``.

    Returns:
        The wrapped text with the two markers stripped out.
    """
    wrapper = textwrap.TextWrapper(width=width)

    # Wrap every original line on its own so existing newlines survive.
    rewrapped = "\n".join(wrapper.fill(segment) for segment in text.split("\n"))

    # Remove the markers only after wrapping, in this fixed order.
    for marker in ("page_content=", "metadata={}"):
        rewrapped = rewrapped.replace(marker, "")

    return rewrapped
|
76 |
|
77 |
if user_input:
|
78 |
+
|
79 |
+
tag = generate_tag(user_input)
|
80 |
+
|
|
|
|
|
|
|
|
|
81 |
start = time.time()
|
82 |
+
# print(tag)
|
83 |
+
if tag in ["greeting"]:
|
84 |
+
output = "Hello π! Thanks for visiting!\n I am BGPT! I am here to assist you in obtaining information about Bibek. Feel free to ask me any questions about Bibek. These are some sample questions:\n (I) Tell me about Bibek.\n (II) What skills does Bibek have?\n (III) What work experience does Bibek have?\n (IV) What is Bibek's educational background?\n (V) What awards has Bibek won?\n (VI) What projects have Bibek completed? &\n (VII) How can I contact Bibek?"
|
85 |
+
else:
|
86 |
+
tokenizer = st.session_state["tokenizer"]
|
87 |
+
model = st.session_state["model"]
|
88 |
+
docs = db.similarity_search(user_input)
|
89 |
+
output = wrap_text_preserve_newlines(str(docs[0]))
|
90 |
+
if tag in ["welcome", "thanks", "exit"]:
|
91 |
+
input = user_input
|
92 |
+
elif tag in ["BibekBOT-introduction"]:
|
93 |
+
input = "I am BGPT, a large language model. I am here to assist you in obtaining information about Bibek. Feel free to ask me any questions about Bibek and I will make every effort to respond to all inquiries. These are some sample questions:\n (I) Tell me about Bibek.\n (II) What skills does Bibek have?\n (III) What work experience does Bibek have?\n (IV) What is Bibek's educational background?\n (V) What awards has Bibek won?\n (VI) What projects have Bibek completed? &\n (VII) How can I contact Bibek?. \n Can you paraphrase the above without changing the tone and contents."
|
94 |
+
elif tag in ["decline"]:
|
95 |
+
input = "Okay, if there's anything else I can assist with, please don't hesitate to ask. \n Can you paraphrase the above without changing much content and tone."
|
96 |
+
else:
|
97 |
+
# output = generate_response(user_input)
|
98 |
+
task_description_prompt = "I want you to act like my personal assistant chatbot named 'BGPT'. You are provided with some content and you will get one question. Try to answer the question in details based on the provided content. You may paraphrase the contents to reach your answer too. The below is the content: \n"
|
99 |
+
prompt_template = "\nBased on the above content, try to answer the following question.\n\n"
|
100 |
+
end_prompt = "\nPlease make meaningful sentence and try to be descriptive as possible responding with many sentences and ending with proper punctuations. If you think the content doesn't contain good answer to the question, give some polite respones telling them that you do not have specific response to the query and apologize and refer them to contact Bibek directly.\n" # NoQA"
|
101 |
+
short_response_template = "\nIf your response is very short like 1 or 2 sentence, add a followup sentence like 'Let me know if there's anything else I can help you with. or If there's anything else I can assist with, please don't hesitate to ask. I mean something similar in polite way." # NoQA
|
102 |
+
|
103 |
+
input = task_description_prompt + output + prompt_template + user_input + end_prompt
|
104 |
+
|
105 |
+
input_ids = tokenizer(
|
106 |
+
input,
|
107 |
+
return_tensors="pt",
|
108 |
+
).input_ids
|
109 |
+
|
110 |
+
outputs = model.generate(input_ids, max_length=512, do_sample=True)
|
111 |
+
output = tokenizer.decode(outputs[0]).strip("<pad></s>").strip()
|
112 |
|
|
|
|
|
113 |
end = time.time()
|
114 |
|
115 |
+
# print(input)
|
116 |
+
|
117 |
print("Time for model inference: ", end - start)
|
118 |
# Checks for memory overflow
|
119 |
if len(st.session_state.past) == 15:
|