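# Streamlit app: semantic search over Sean Carroll's "The Biggest Ideas in the
# Universe" video series, with optional GPT-3 summaries of the retrieved context.
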
import streamlit as st
import pinecone
from sentence_transformers import SentenceTransformer
import logging
import openai
PINECONE_KEY = st.secrets["PINECONE_KEY"] # app.pinecone.io
OPENAI_KEY = st.secrets["OPENAI_KEY"]
INDEX_ID = 'sean-carrol-biggest-ideas-of-the-universe'
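
# cache heavyweight resources (API key setup, index connection, embedding model)
# across Streamlit reruns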
@st.experimental_singleton
def init_openai():
    openai.api_key = OPENAI_KEY


@st.experimental_singleton
def init_pinecone():
    pinecone.init(api_key=PINECONE_KEY, environment="us-west1-gcp")
    return pinecone.Index(INDEX_ID)


@st.experimental_singleton
def init_retriever():
    return SentenceTransformer("multi-qa-mpnet-base-dot-v1")
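
# embed the query and search the Pinecone index, retrying with a fresh
# connection (up to three times) if the query fails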
def make_query(query, retriever, top_k=3, include_values=True, include_metadata=True, filter=None):
    xq = retriever.encode([query]).tolist()
    logging.info(f"Query: {query}")
    attempt = 0
    matches = []
    while attempt < 3:
        try:
            xc = st.session_state.index.query(
                xq,
                top_k=top_k,
                include_values=include_values,
                include_metadata=include_metadata,
                filter=filter
            )
            matches = xc['matches']
            break
        except Exception:
            # force reload of the Pinecone connection, then retry
            pinecone.init(api_key=PINECONE_KEY, environment="us-west1-gcp")
            st.session_state.index = pinecone.Index(INDEX_ID)
            attempt += 1
            matches = []
    if len(matches) == 0:
        logging.error("Query failed")
    return matches
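
# build the GPT-3 prompt from the retrieved transcript snippets, keeping the
# combined context under an approximate character limit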
def get_prompt(query, matches):
    contexts = [
        x['metadata']['text'] for x in matches
    ]
    prompt_start = (
        "Answer the question based on the context below.\n\n" +
        "Context:\n"
    )
    prompt_end = (
        f"\n\nQuestion: {query}\nAnswer:"
    )
    # approximate character budget for the context portion of the prompt
    limit = 3750
    # default to using every retrieved context, then truncate below if needed
    prompt = (
        prompt_start +
        "\n\n--\n\n".join(contexts) +
        prompt_end
    )
    for i in range(1, len(contexts)):
        if len("\n\n--\n\n".join(contexts[:i])) >= limit:
            prompt = (
                prompt_start +
                "\n\n--\n\n".join(contexts[:i-1]) +
                prompt_end
            )
            break
    return prompt
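
# initialise the OpenAI key, Pinecone index connection, and retriever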
init_openai()
st.session_state.index = init_pinecone()
retriever = init_retriever()
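
# render a single video result as an HTML card: thumbnail, title, and
# timestamped transcript snippets linking to the matching moments in the video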
def card(thumbnail: str, title: str, urls: list, contexts: list, starts: list, ends: list):
    meta = [(e, s, u, c) for e, s, u, c in zip(ends, starts, urls, contexts)]
    # sort snippets by end time so they are processed in chronological order
    meta.sort(reverse=False)
    text_content = []
    current_start = 0
    current_end = 0
    for end, start, url, context in meta:
        # reformat the start time in seconds to a mm:ss timestamp
        mins, secs = divmod(int(start), 60)
        timestamp = f"{mins:02d}:{secs:02d}"
        if start < current_end and start > current_start:
            # this snippet continues the previous one, so trim the overlapping
            # text from the previous entry before appending
            text_content[-1][0] = text_content[-1][0].split(context[:10])[0]
            text_content.append([f"[{timestamp}] {context.capitalize()}", url])
        else:
            text_content.append(["xxLINEBREAKxx", ""])
            text_content.append([f"[{timestamp}] {context}", url])
        current_start = start
        current_end = end
    html_text = ""
    for text, url in text_content:
        if text == "xxLINEBREAKxx":
            html_text += "<br>"
        else:
            html_text += f"<small><a href={url}>{text.strip()}... </a></small>"
            print(text)
    html = f"""
    <div class="container-fluid">
        <div class="row align-items-start">
            <div class="col-md-4 col-sm-4">
                <div class="position-relative">
                    <a href={urls[0]}><img src={thumbnail} class="img-fluid" style="width: 192px; height: 106px"></a>
                </div>
            </div>
            <div class="col-md-8 col-sm-8">
                <h2>{title}</h2>
            </div>
            <div>
                {html_text}
            </div>
        </div>
    </div>
    <br><br>
    """
    return st.markdown(html, unsafe_allow_html=True)
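
# YouTube channel IDs, used only by the commented-out channel filter below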
channel_map = {
    'James Briggs': 'UCv83tO5cePwHMt1952IVVHw',
    'Daniel Bourke': 'UCr8O8l5cCX85Oem1d18EezQ',
    'Yannic Kilcher': 'UCZHmQk67mSJgfCCTn7xBfew',
    'AI Coffee Break with Letitia': 'UCobqgqE4i5Kf7wrxRxhToQA',
    'sentdex': 'UCfzlCWGWYyIQ0aLC5w48gBQ'
}
st.write("""
# Sean Carroll Explains
""")
st.info("""
Ask any question about Sean Carroll's video series 'The Biggest Ideas in the Universe'.
The search is built with OpenAI's Whisper, SentenceTransformer, GPT-3, and Pinecone, and is based on James Briggs's [example](https://pinecone.io/learn/openai-whisper)!
""")
st.markdown("""
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@4.0.0/dist/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
""", unsafe_allow_html=True)
query = st.text_input("Ask about the universe...", "")
st.checkbox("Generate summary with GPT-3?", key="summarize")
# with st.expander("Advanced Options"):
# channel_options = st.multiselect(
# 'Channels to Search',
# ['James Briggs', 'Daniel Bourke', 'Yannic Kilcher', 'AI Coffee Break with Letitia', 'sentdex'],
# ['James Briggs', 'Daniel Bourke', 'Yannic Kilcher', 'AI Coffee Break with Letitia', 'sentdex']
# )
if query != "":
    # channels = [channel_map[name] for name in channel_options]
    print(f"query: {query}")
    matches = make_query(
        query, retriever, top_k=5,
        # filter={
        #     'channel_id': {'$in': channels}
        # }
    )
    if st.session_state.summarize:
        # build a prompt from the retrieved contexts and summarise with GPT-3
        prompt = get_prompt(query, matches)
        res = openai.Completion.create(
            engine='text-davinci-003',
            prompt=prompt,
            temperature=0,
            max_tokens=300,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=".",
        )
        summary = res['choices'][0]['text'].strip()
        st.info(f"Summary:\n{summary}")
    results = {}
    order = []
    for context in matches:
        video_id = context['metadata']['url'].split('/')[-1]
        if video_id not in results:
            results[video_id] = {
                'title': context['metadata']['title'],
                'urls': [f"{context['metadata']['url']}?t={int(context['metadata']['start'])}"],
                'contexts': [context['metadata']['text']],
                'starts': [int(context['metadata']['start'])],
                'ends': [int(context['metadata']['end'])]
            }
            order.append(video_id)
        else:
            results[video_id]['urls'].append(
                f"{context['metadata']['url']}?t={int(context['metadata']['start'])}"
            )
            results[video_id]['contexts'].append(
                context['metadata']['text']
            )
            results[video_id]['starts'].append(int(context['metadata']['start']))
            results[video_id]['ends'].append(int(context['metadata']['end']))
    # now display cards
    for video_id in order:
        card(
            thumbnail=f"https://img.youtube.com/vi/{video_id}/maxresdefault.jpg",
            title=results[video_id]['title'],
            urls=results[video_id]['urls'],
            contexts=results[video_id]['contexts'],
            starts=results[video_id]['starts'],
            ends=results[video_id]['ends']
        )