jumuiya / app.py
tg131
application ready
9f6ab40
import pinecone
import streamlit as st
from sentence_transformers import SentenceTransformer
from transformers import BartTokenizer, BartForConditionalGeneration
class BartGenerator:
def __init__(self, model_name):
self.tokenizer = BartTokenizer.from_pretrained(model_name)
self.generator = BartForConditionalGeneration.from_pretrained(model_name)
def tokenize(self, query, max_length=1024):
inputs = self.tokenizer([query], max_length=max_length, return_tensors="pt")
return inputs
def generate(self, query, min_length=20, max_length=40):
inputs = self.tokenize(query)
ids = self.generator.generate(inputs["input_ids"], num_beams=1, min_length=int(min_length), max_length=int(max_length), temperature=int(temperature))
answer = self.tokenizer.batch_decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
return answer
@st.experimental_singleton
def init_models():
retriever = SentenceTransformer("flax-sentence-embeddings/all_datasets_v3_mpnet-base") #("multi-qa-mpnet-base-cos-v1") ("flax-sentence-embeddings/all_datasets_v3_mpnet-base")
generator = BartGenerator("vblagoje/bart_lfqa")
return retriever, generator
PINECONE_KEY = st.secrets["PINECONE_KEY"]
@st.experimental_singleton
def init_pinecone():
pinecone.init(api_key=PINECONE_KEY, environment="us-west1-gcp")
return pinecone.Index("history-qa")
retriever, generator = init_models()
index = init_pinecone()
def display_answer(answer):
return st.markdown(f"""
<div class="container-fluid">
<div class="row align-items-start">
<div class="col-md-12 col-sm-12">
<span style="color: #808080;">
{answer}
</span>
</div>
</div>
</div>
""", unsafe_allow_html=True)
def display_context(title, context, url):
return st.markdown(f"""
<div class="container-fluid">
<div class="row align-items-start">
<div class="col-md-12 col-sm-12">
<a href={url}>{title}</a>
<br>
<span style="color: #808080;">
<small>{context}</small>
</span>
</div>
</div>
</div>
""", unsafe_allow_html=True)
hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
st.write("""
# Jua Historia Yetu
### An AI Powered Search Engine for East African History and Tourism!
This is an AI powered system designed to help learn about our history, heroes, cultures and tourist destinations.
The system generates a Human-like response to questions asked and points users to where they
can get more information on what they would like to know.
It is intended to act as a one-stop search engine for all things East Africa including the people, history, culture, wildlife and tourist destinations.
It can be of use to locals, tourists, students or anyone who would like to learn about The East African Community.
The data is to be sourced from the EAC e-resourse database, member nations' meuseums, archives and relevant tourism bodies.
Once queried, the system generates a short answer that the user can quickly read through and also points the user to
some resources they might find usefull. The user can click on the links to learn more.
""")
st.markdown("""
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@4.0.0/dist/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
""", unsafe_allow_html=True)
def format_query(query, context):
context = [f"<P> {m['metadata']['passage_text']}" for m in context]
context = " ".join(context)
query = f"question: {query} context: {context}"
return query
# set parameters
top_k = 5
min_length = 1
max_length = 150
temperature = 3.5
st.sidebar.write("""
## Here are some questions you can try out:
### Copy and paste to test
who was the first person on the moon?\n
Which was the first radio station at Auburn University\n
where is Damastown located\n
What is the Lohanipur Torso \n
when was The Coliseum Theatre opened\n
Who invented the tatoo machine\n
whats th erecipe for Corn chowder\n
when was the Tamil Methodist Church built\n
when was the first electric power system built?\n
How was the first wireless message sent?\n
what was the war of currents?\n
what was NASAs most expensive project?\n
What brands of smokoing paper are manufactured by Miguel y Costas\n
what influenced the naming Holy Forty Martyrs Church\n
When was the world first power system built\n
which is the largest island within the Halifax Harbour\n
Who was Joseph Monier\n
who were the Karadjordjevic dynasty\n
how many royal tombs were excavated at Tillia Tepe\n
What did the HEICO company manufacture\n
tell me about The Battle of Antietam\n
Which was the smallest microbrewery in the United States\n
when did queen marie recieve the bran castle\n
Whe was York Township founded\n
When did the United Nations Security Council reform the security sector\n
When was Magandang Umaga Po first aired\n
when was Mae Lan District formed\n
what is Voice over Internet Protocol\n
When was InfluxDB developed\n
When was the Semanário Económico newspaper started\n
who owned Kasteln Castle\n
when was The Steinbach Haus built\n
when was the Guerrero ship in Africa\n
tell me about the Guerrero ship\n
When was the Companhia Paulista de Trens Metropolitanos rilway built\n
When was the lincoln mall demolished\n
where is Damastown located\n
when was solo diving first practiced\n
when was Consumers Credit Union History Consumers Credit Union was founded\n
Who built the castle of Daroynk\n
What is the prime meridian\n
Which was the first radio station at Auburn University\n
What are the origins of feminist music\n
What were the earliest insecticides to be used\n
who were the Drevlians\n
Who were the founders of A.F.C. Euro Kickers\n
when was the camera-on-a-chip developed\n
""")
st.write("If you encounter an error, search again.")
query = st.text_input("Search!", "")
if query != "":
with st.spinner(text="Wait a sec 🚀🚀🚀"):
xq = retriever.encode([query]).tolist()
xc = index.query(xq, top_k=int(top_k), include_metadata=True)
query = format_query(query, xc["matches"])
with st.spinner(text="Just a minute ✍️✍️✍️"):
answer = generator.generate(query, min_length=min_length, max_length=max_length)
st.write("#### System generated response:")
display_answer(answer)
st.write("#### Here are some resources you might find relevant:")
for m in xc["matches"]:
title = m["metadata"]["article_title"]
url = "https://en.wikipedia.org/wiki/" + title.replace(" ", "_")
context = m["metadata"]["passage_text"]
display_context(title, context, url)