successor committed on
Commit
0beb012
1 Parent(s): eeb83d0

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +154 -0
app.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pinecone
3
+ from sentence_transformers import SentenceTransformer
4
+ import logging
5
+
6
# Pinecone API key, read from Streamlit secrets (keys are issued at app.pinecone.io).
PINECONE_KEY = st.secrets["PINECONE_KEY"]
# Name of the Pinecone index this app queries.
INDEX_ID = "ask-youtube"
8
+
9
@st.experimental_singleton
def init_pinecone():
    """Initialise the Pinecone client and return a handle to the ``INDEX_ID`` index.

    Decorated with ``st.experimental_singleton`` so the handle is created once
    and reused instead of being rebuilt on every script run.
    """
    pinecone.init(environment="us-west1-gcp", api_key=PINECONE_KEY)
    index = pinecone.Index(INDEX_ID)
    return index
13
+
14
@st.experimental_singleton
def init_retriever():
    """Load and return the sentence-embedding model used to encode queries.

    Decorated with ``st.experimental_singleton`` so the model is loaded once
    and reused instead of being reloaded on every script run.
    """
    model_name = "multi-qa-mpnet-base-dot-v1"
    return SentenceTransformer(model_name)
17
+
18
def make_query(query, retriever, top_k=10, include_values=True, include_metadata=True, filter=None):
    """Embed ``query`` and search the Pinecone index held in session state.

    The query is attempted up to three times. After each failed attempt the
    Pinecone client is re-initialised and ``st.session_state.index`` is
    replaced with a fresh handle before retrying.

    Args:
        query: Free-text search string.
        retriever: Encoder exposing ``encode(list_of_str)`` whose result has
            ``tolist()`` (the app passes a ``SentenceTransformer``).
        top_k: Maximum number of matches to request.
        include_values: Forwarded to ``index.query``.
        include_metadata: Forwarded to ``index.query``.
        filter: Optional metadata filter forwarded to ``index.query``.

    Returns:
        The ``'matches'`` entry of the query response, or an empty list when
        every attempt failed.
    """
    max_attempts = 3
    xq = retriever.encode([query]).tolist()
    logging.info("Query: %s", query)

    matches = []
    succeeded = False
    for attempt in range(1, max_attempts + 1):
        try:
            xc = st.session_state.index.query(
                xq,
                top_k=top_k,
                include_values=include_values,
                include_metadata=include_metadata,
                filter=filter,
            )
            matches = xc['matches']
            succeeded = True
            break
        except Exception:
            # Catch Exception (not a bare except) so Ctrl-C / SystemExit still
            # propagate, and record the traceback instead of hiding it.
            logging.exception(
                "Pinecone query attempt %d/%d failed; reconnecting",
                attempt,
                max_attempts,
            )
            # force reload
            pinecone.init(api_key=PINECONE_KEY, environment="us-west1-gcp")
            st.session_state.index = pinecone.Index(INDEX_ID)
            matches = []

    if not succeeded:
        logging.error("Query failed")
    elif not matches:
        # A successful search with zero hits is not a failure.
        logging.warning("Query returned no matches")
    return matches
42
+
43
# The index handle lives in session state; make_query replaces it there when
# it has to reconnect after a failed query.
st.session_state.index = init_pinecone()
# Embedding model used to encode search queries.
retriever = init_retriever()
45
+
46
def card(thumbnail: str, title: str, urls: list, contexts: list, starts: list, ends: list):
    """Render one video's search hits as an HTML card through ``st.markdown``.

    ``urls``, ``contexts``, ``starts`` and ``ends`` are parallel lists with one
    entry per matched transcript snippet. Snippets are sorted by
    ``(end, start, url, context)``. A snippet whose start falls strictly inside
    the previous snippet's time span is treated as a continuation: the previous
    text is cut where the new snippet's first ten characters begin, so the
    overlap is not shown twice. Any other snippet is preceded by a line break.

    Args:
        thumbnail: Image URL for the video thumbnail.
        title: Video title shown as the card heading.
        urls: Per-snippet links; the first one is used for the thumbnail link.
        contexts: Per-snippet transcript text.
        starts: Per-snippet start offsets in seconds.
        ends: Per-snippet end offsets in seconds.

    Returns:
        Whatever ``st.markdown`` returns for the rendered card.
    """
    # Local import: the file does not import ``html`` at module level, and the
    # name ``html`` is used below for the rendered markup.
    from html import escape

    snippets = sorted(zip(ends, starts, urls, contexts))

    # Each entry is [text, url]; None marks a line break between snippets that
    # do not overlap.
    text_content = []
    current_start = 0
    current_end = 0
    for end, start, url, context in snippets:
        # divmod keeps minutes and seconds consistent for any offset,
        # including starts of 100 minutes or more.
        mins, secs = divmod(int(start), 60)
        timestamp = f"{mins:02d}:{secs:02d}"
        if current_start < start < current_end:
            # this means it is a continuation of the previous sentence
            overlap_prefix = context[:10]
            if overlap_prefix:  # str.split("") would raise ValueError
                text_content[-1][0] = text_content[-1][0].split(overlap_prefix)[0]
            text_content.append([f"[{timestamp}] {context.capitalize()}", url])
        else:
            text_content.append(None)
            text_content.append([f"[{timestamp}] {context}", url])
        current_start = start
        current_end = end

    # Text and URLs come from index metadata and are rendered with
    # unsafe_allow_html=True, so escape them and quote attribute values.
    # NOTE(review): assumes the metadata is not already HTML-escaped - confirm.
    pieces = []
    for entry in text_content:
        if entry is None:
            pieces.append("<br>")
            continue
        text, url = entry
        logging.debug("Rendering snippet: %s", text)
        pieces.append(
            f'<small><a href="{escape(str(url), quote=True)}">'
            f"{escape(text.strip())}... </a></small>"
        )
    html_text = "".join(pieces)

    video_url = escape(str(urls[0]), quote=True) if urls else ""
    thumbnail_url = escape(str(thumbnail), quote=True)

    # Markup lines are kept unindented and every opened <div> is closed.
    html = "\n".join([
        "",
        '<div class="container-fluid">',
        '<div class="row align-items-start">',
        '<div class="col-md-4 col-sm-4">',
        '<div class="position-relative">',
        f'<a href="{video_url}"><img src="{thumbnail_url}" class="img-fluid" '
        'style="width: 192px; height: 106px"></a>',
        "</div>",
        "</div>",
        '<div class="col-md-8 col-sm-8">',
        f"<h2>{escape(str(title))}</h2>",
        "</div>",
        "<div>",
        html_text,
        "<br><br>",
        "</div>",
        "</div>",
        "</div>",
        "",
    ])
    return st.markdown(html, unsafe_allow_html=True)
90
+
91
# Maps the channel names offered in the UI to the values matched against the
# 'channel_id' metadata field when searching.
channel_map = {
}

st.write("""
# YouTube Q&A
""")

st.info("""
YouTube search for the Quantum Resistant Ledger Youtube Channel. Credits to: James Briggs.
""")

# Bootstrap stylesheet: the result cards use its grid and utility classes.
st.markdown("""
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@4.0.0/dist/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
""", unsafe_allow_html=True)

query = st.text_input("Search!", "")

with st.expander("Advanced Options"):
    # st.multiselect requires an options sequence; omitting it raises a
    # TypeError as soon as the page loads.
    channel_options = st.multiselect(
        'Channels to Search',
        options=list(channel_map),
    )
112
+
113
if query != "":
    channels = [channel_map[name] for name in channel_options]
    print(f"query: {query}")
    # With no channels selected the filter would be {'$in': []}, which
    # presumably matches nothing; search without a filter in that case.
    channel_filter = {'channel_id': {'$in': channels}} if channels else None
    matches = make_query(
        query, retriever, top_k=5,
        filter=channel_filter
    )

    # Group the hits by video. Dicts keep insertion order, so videos are shown
    # in the order their first hit was returned.
    results = {}
    for context in matches:
        metadata = context['metadata']
        start = int(metadata['start'])
        video_id = metadata['url'].split('/')[-1]
        entry = results.setdefault(video_id, {
            'title': metadata['title'],
            'urls': [],
            'contexts': [],
            'starts': [],
            'ends': [],
        })
        entry['urls'].append(f"{metadata['url']}?t={start}")
        entry['contexts'].append(metadata['text'])
        entry['starts'].append(start)
        entry['ends'].append(int(metadata['end']))

    # now display cards
    for video_id, entry in results.items():
        card(
            thumbnail=f"https://img.youtube.com/vi/{video_id}/maxresdefault.jpg",
            title=entry['title'],
            urls=entry['urls'],
            contexts=entry['contexts'],
            starts=entry['starts'],
            ends=entry['ends']
        )