tg131 committed on
Commit
9f6ab40
1 Parent(s): edfac63

application ready

Browse files
Files changed (2) hide show
  1. app.py +178 -0
  2. requirements.txt +2 -0
app.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pinecone
2
+ import streamlit as st
3
+ from sentence_transformers import SentenceTransformer
4
+ from transformers import BartTokenizer, BartForConditionalGeneration
5
+
6
+
7
class BartGenerator:
    """Pair a BART tokenizer with its conditional-generation model.

    Both are loaded from the same pretrained checkpoint name and are used
    together to turn a text prompt into a generated answer string.
    """

    def __init__(self, model_name):
        """Load the tokenizer and the generation model for ``model_name``."""
        self.tokenizer = BartTokenizer.from_pretrained(model_name)
        self.generator = BartForConditionalGeneration.from_pretrained(model_name)

    def tokenize(self, query, max_length=1024):
        """Encode ``query`` as a one-item batch of PyTorch tensors.

        Args:
            query: Prompt text to encode.
            max_length: Upper bound on the number of tokens kept.

        Returns:
            The tokenizer output (contains at least ``"input_ids"``).
        """
        # truncation=True is what makes max_length take effect; without it the
        # tokenizer is asked for a limit but not told to shorten long prompts.
        return self.tokenizer(
            [query],
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )

    def generate(self, query, min_length=20, max_length=40, temperature=None):
        """Generate an answer string for ``query``.

        Args:
            query: Prompt text passed to the model.
            min_length: Minimum length of the generated sequence.
            max_length: Maximum length of the generated sequence.
            temperature: Optional sampling temperature. When ``None`` (the
                default) decoding is deterministic with a single beam. When
                given, sampling is switched on and the value is used as-is
                (as a float, not truncated to an integer).

        Returns:
            The first decoded sequence with special tokens removed.
        """
        inputs = self.tokenize(query)

        generation_kwargs = {
            "num_beams": 1,
            "min_length": int(min_length),
            "max_length": int(max_length),
        }
        # Temperature is an explicit argument rather than a name looked up
        # outside this method, and it only matters when sampling is enabled.
        if temperature is not None:
            generation_kwargs["do_sample"] = True
            generation_kwargs["temperature"] = float(temperature)

        ids = self.generator.generate(inputs["input_ids"], **generation_kwargs)
        decoded = self.tokenizer.batch_decode(
            ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        return decoded[0]
21
+
22
@st.experimental_singleton
def init_models():
    """Construct the retriever and the answer generator.

    Decorated with ``st.experimental_singleton`` — presumably so the models
    are built once and reused across script reruns (confirm against the
    Streamlit version in use).

    Returns:
        A ``(retriever, generator)`` tuple: a ``SentenceTransformer`` and a
        ``BartGenerator``.
    """
    # Alternative retriever checkpoint noted by the original author:
    # "multi-qa-mpnet-base-cos-v1".
    retriever_checkpoint = "flax-sentence-embeddings/all_datasets_v3_mpnet-base"
    generator_checkpoint = "vblagoje/bart_lfqa"

    sentence_encoder = SentenceTransformer(retriever_checkpoint)
    answer_generator = BartGenerator(generator_checkpoint)
    return sentence_encoder, answer_generator
27
+
28
# Pinecone API key, looked up in Streamlit's secrets mapping.
PINECONE_KEY = st.secrets["PINECONE_KEY"]


@st.experimental_singleton
def init_pinecone():
    """Initialise the Pinecone client and return the "history-qa" index handle."""
    pinecone.init(environment="us-west1-gcp", api_key=PINECONE_KEY)
    history_index = pinecone.Index("history-qa")
    return history_index


# Shared resources used by the rest of the script.
retriever, generator = init_models()
index = init_pinecone()
37
+
38
def display_answer(answer):
    """Render the generated answer as grey text inside a Bootstrap container.

    The markup is rendered with ``unsafe_allow_html=True``, so the answer is
    HTML-escaped first: generated text containing ``<``, ``>`` or ``&`` is
    shown literally instead of being interpreted as markup.

    Args:
        answer: Text to display.

    Returns:
        Whatever ``st.markdown`` returns.
    """
    import html  # function-scope import keeps this block self-contained

    safe_answer = html.escape(str(answer))
    return st.markdown(f"""
<div class="container-fluid">
<div class="row align-items-start">
<div class="col-md-12 col-sm-12">
<span style="color: #808080;">
{safe_answer}
</span>
</div>
</div>
</div>
""", unsafe_allow_html=True)
50
+
51
def display_context(title, context, url):
    """Render one retrieved passage: a linked title followed by the passage text.

    The markup is rendered with ``unsafe_allow_html=True``, so every
    interpolated value is HTML-escaped, and the ``href`` attribute is quoted
    so that URLs containing spaces, ``&`` or ``>`` cannot break out of the tag.

    Args:
        title: Link text for the source article.
        context: Passage text shown beneath the link.
        url: Link target.

    Returns:
        Whatever ``st.markdown`` returns.
    """
    import html  # function-scope import keeps this block self-contained

    safe_title = html.escape(str(title))
    safe_context = html.escape(str(context))
    # quote=True also escapes quotes, which is required inside an attribute.
    safe_url = html.escape(str(url), quote=True)
    return st.markdown(f"""
<div class="container-fluid">
<div class="row align-items-start">
<div class="col-md-12 col-sm-12">
<a href="{safe_url}">{safe_title}</a>
<br>
<span style="color: #808080;">
<small>{safe_context}</small>
</span>
</div>
</div>
</div>
""", unsafe_allow_html=True)
65
+
66
# CSS rules that hide the elements matched by "#MainMenu" and "footer".
hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)

# Landing-page title and description, written as Markdown.
intro_markdown = """
# Jua Historia Yetu
### An AI Powered Search Engine for East African History and Tourism!

This is an AI powered system designed to help learn about our history, heroes, cultures and tourist destinations.

The system generates a Human-like response to questions asked and points users to where they
can get more information on what they would like to know.
It is intended to act as a one-stop search engine for all things East Africa including the people, history, culture, wildlife and tourist destinations.
It can be of use to locals, tourists, students or anyone who would like to learn about The East African Community.
The data is to be sourced from the EAC e-resourse database, member nations' meuseums, archives and relevant tourism bodies.

Once queried, the system generates a short answer that the user can quickly read through and also points the user to
some resources they might find usefull. The user can click on the links to learn more.
"""
st.write(intro_markdown)

# Stylesheet link for Bootstrap 4; the result/answer markup uses its grid classes.
bootstrap_link = """
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@4.0.0/dist/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
"""
st.markdown(bootstrap_link, unsafe_allow_html=True)
93
+
94
def format_query(query, context):
    """Build the generator prompt from a question and its retrieved matches.

    Args:
        query: The user's question.
        context: Iterable of matches; each must expose
            ``match["metadata"]["passage_text"]``.

    Returns:
        ``"question: <query> context: <passages>"`` where every passage is
        prefixed with ``"<P> "`` and passages are separated by single spaces.
    """
    tagged_passages = []
    for match in context:
        passage_text = match["metadata"]["passage_text"]
        tagged_passages.append(f"<P> {passage_text}")
    joined_passages = " ".join(tagged_passages)
    return f"question: {query} context: {joined_passages}"
99
+
100
# Retrieval and generation settings.
top_k = 5  # number of matches requested from the index per query
min_length = 1  # lower bound passed to the answer generator
max_length = 150  # upper bound passed to the answer generator
temperature = 3.5  # generation temperature setting
105
+
106
# Example questions listed in the sidebar, in display order. Wording, spelling
# and repeated entries are kept exactly as they appear in the rendered text.
example_questions = (
    "who was the first person on the moon?",
    "Which was the first radio station at Auburn University",
    "where is Damastown located",
    "What is the Lohanipur Torso ",
    "when was The Coliseum Theatre opened",
    "Who invented the tatoo machine",
    "whats th erecipe for Corn chowder",
    "when was the Tamil Methodist Church built",
    "when was the first electric power system built?",
    "How was the first wireless message sent?",
    "what was the war of currents?",
    "what was NASAs most expensive project?",
    "What brands of smokoing paper are manufactured by Miguel y Costas",
    "what influenced the naming Holy Forty Martyrs Church",
    "When was the world first power system built",
    "which is the largest island within the Halifax Harbour",
    "Who was Joseph Monier",
    "who were the Karadjordjevic dynasty",
    "how many royal tombs were excavated at Tillia Tepe",
    "What did the HEICO company manufacture",
    "tell me about The Battle of Antietam",
    "Which was the smallest microbrewery in the United States",
    "when did queen marie recieve the bran castle",
    "Whe was York Township founded",
    "When did the United Nations Security Council reform the security sector",
    "When was Magandang Umaga Po first aired",
    "when was Mae Lan District formed",
    "what is Voice over Internet Protocol",
    "When was InfluxDB developed",
    "When was the Semanário Económico newspaper started",
    "who owned Kasteln Castle",
    "when was The Steinbach Haus built",
    "when was the Guerrero ship in Africa",
    "tell me about the Guerrero ship",
    "When was the Companhia Paulista de Trens Metropolitanos rilway built",
    "When was the lincoln mall demolished",
    "where is Damastown located",
    "when was solo diving first practiced",
    "when was Consumers Credit Union History Consumers Credit Union was founded",
    "Who built the castle of Daroynk",
    "What is the prime meridian",
    "Which was the first radio station at Auburn University",
    "What are the origins of feminist music",
    "What were the earliest insecticides to be used",
    "who were the Drevlians",
    "Who were the founders of A.F.C. Euro Kickers",
    "when was the camera-on-a-chip developed",
)

sidebar_heading = (
    "\n## Here are some questions you can try out:\n"
    "### Copy and paste to test\n"
)
# Each question is followed by a blank line so Markdown renders it as its own
# paragraph.
sidebar_body = "".join(question + "\n\n" for question in example_questions)
st.sidebar.write(sidebar_heading + sidebar_body)
157
+
158
# Standard-library helper used to percent-encode article titles for URLs.
from urllib.parse import quote

st.write("If you encounter an error, search again.")
# Surrounding whitespace is dropped so a blank-looking input does not start a search.
query = st.text_input("Search!", "").strip()

if query:
    with st.spinner(text="Wait a sec 🚀🚀🚀"):
        # Embed the question and fetch the closest passages with their metadata.
        xq = retriever.encode([query]).tolist()
        xc = index.query(xq, top_k=int(top_k), include_metadata=True)
        matches = xc["matches"]
        prompt = format_query(query, matches)

    with st.spinner(text="Just a minute ✍️✍️✍️"):
        answer = generator.generate(prompt, min_length=min_length, max_length=max_length)

    st.write("#### System generated response:")
    display_answer(answer)
    st.write("#### Here are some resources you might find relevant:")

    for m in matches:
        title = m["metadata"]["article_title"]
        # Percent-encode the title so characters such as "?", "#" or "%" stay
        # part of the path instead of being read as URL syntax.
        url = "https://en.wikipedia.org/wiki/" + quote(title.replace(" ", "_"), safe="/:(),")
        context = m["metadata"]["passage_text"]
        display_context(title, context, url)
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ pinecone-client
2
+ sentence-transformers