Spaces:
Runtime error
Runtime error
Commit
•
eb2a4ce
0
Parent(s):
Duplicate from jamescalam/ask-youtube
Browse files

Co-authored-by: James Briggs <jamescalam@users.noreply.huggingface.co>
- .gitattributes +31 -0
- README.md +15 -0
- app.py +162 -0
- requirements.txt +4 -0
.gitattributes
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
23 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
26 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Ask YouTube
|
3 |
+
emoji: 🦾
|
4 |
+
colorFrom: purple
|
5 |
+
colorTo: blue
|
6 |
+
sdk: streamlit
|
7 |
+
sdk_version: 1.10.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
duplicated_from: jamescalam/ask-youtube
|
11 |
+
---
|
12 |
+
|
13 |
+
Curious about how this works? Check out the [article](https://pinecone.io/learn/openai-whisper)!
|
14 |
+
|
15 |
+
The current version of the app has a very limited video scope. We'd love to add more, so if you'd like to see more content added, feel free to send CSV data, including video title, channel ID, and video ID (at a minimum) to *james\@pinecone.io*. Even better if you could follow a format similar to [this](https://huggingface.co/datasets/jamescalam/channel-metadata).
|
app.py
ADDED
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pinecone
|
3 |
+
from sentence_transformers import SentenceTransformer
|
4 |
+
import logging
|
5 |
+
|
6 |
+
# Pinecone API key, read from Streamlit secrets (obtain one at app.pinecone.io).
PINECONE_KEY = st.secrets["PINECONE_KEY"]  # app.pinecone.io
# Name of the Pinecone index queried below (holds the video transcript vectors
# and their metadata: url, title, text, start, end).
INDEX_ID = 'ask-youtube'
8 |
+
|
9 |
+
@st.experimental_singleton
def init_pinecone():
    """Connect to Pinecone and return a handle to the app's index.

    Cached as a Streamlit singleton so the connection is set up once per
    process rather than on every script rerun.
    """
    pinecone.init(api_key=PINECONE_KEY, environment="us-west1-gcp")
    index = pinecone.Index(INDEX_ID)
    return index
|
13 |
+
|
14 |
+
@st.experimental_singleton
def init_retriever():
    """Load the sentence-transformer used to embed user queries (cached singleton)."""
    model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")
    return model
|
17 |
+
|
18 |
+
def make_query(query, retriever, top_k=10, include_values=True, include_metadata=True, filter=None):
    """Embed *query* with *retriever* and run a vector search on the index.

    Retries up to three times, re-initializing the Pinecone connection after
    each failure (the connection in ``st.session_state.index`` can go stale
    between Streamlit reruns).

    :param query: free-text user query.
    :param retriever: SentenceTransformer-like object exposing ``encode``.
    :param top_k: number of matches to return.
    :param include_values: forward to Pinecone — return vectors with matches.
    :param include_metadata: forward to Pinecone — return metadata with matches.
    :param filter: optional Pinecone metadata filter dict.
    :return: list of match dicts (empty list if every attempt failed).
    """
    xq = retriever.encode([query]).tolist()
    logging.info("Query: %s", query)
    matches = []
    for attempt in range(3):
        try:
            xc = st.session_state.index.query(
                xq,
                top_k=top_k,
                include_values=include_values,
                include_metadata=include_metadata,
                filter=filter,
            )
            matches = xc['matches']
            break
        except Exception:  # noqa: BLE001 — any client error triggers a reconnect
            # Log the cause (the old bare `except:` silently hid it), then
            # force-reload the connection before the next attempt.
            logging.exception("Pinecone query failed (attempt %d of 3)", attempt + 1)
            pinecone.init(api_key=PINECONE_KEY, environment="us-west1-gcp")
            st.session_state.index = pinecone.Index(INDEX_ID)
    if not matches:
        logging.error("Query failed")
    return matches
|
42 |
+
|
43 |
+
# Create (or reuse) the Pinecone connection and the query encoder.
# The index handle is kept in session_state so make_query() can replace it
# with a fresh connection when a query fails.
st.session_state.index = init_pinecone()
retriever = init_retriever()
|
45 |
+
|
46 |
+
def card(thumbnail: str, title: str, urls: list, contexts: list, starts: list, ends: list):
    """Render one video's matched transcript snippets as an HTML card.

    :param thumbnail: URL of the video thumbnail image.
    :param title: video title shown as the card heading.
    :param urls: per-snippet deep links (with ``?t=`` offsets) into the video.
    :param contexts: matched transcript text snippets.
    :param starts: snippet start offsets in seconds.
    :param ends: snippet end offsets in seconds.
    :return: the Streamlit element produced by ``st.markdown``.
    """
    # Order snippets chronologically (tuples sort by end, then start).
    meta = [(e, s, u, c) for e, s, u, c in zip(ends, starts, urls, contexts)]
    meta.sort(reverse=False)
    text_content = []
    current_start = 0
    current_end = 0
    for end, start, url, context in meta:
        # Reformat seconds to an MM:SS timestamp. divmod is exact for any
        # length of video; the previous string-slicing approach
        # (f"0{int(time)}"[-2:]) truncated the minutes field past 99 minutes.
        mins, secs = divmod(int(start), 60)
        timestamp = f"{mins:02d}:{secs:02d}"
        if start < current_end and start > current_start:
            # This snippet overlaps the previous one: cut the previous text at
            # the point where this one begins, then append as a continuation.
            text_content[-1][0] = text_content[-1][0].split(context[:10])[0]
            text_content.append([f"[{timestamp}] {context.capitalize()}", url])
        else:
            # Non-contiguous snippet: start a new paragraph.
            text_content.append(["xxLINEBREAKxx", ""])
            text_content.append([f"[{timestamp}] {context}", url])
        current_start = start
        current_end = end
    html_text = ""
    for text, url in text_content:
        if text == "xxLINEBREAKxx":
            html_text += "<br>"
        else:
            html_text += f"<small><a href={url}>{text.strip()}... </a></small>"
    html = f"""
    <div class="container-fluid">
        <div class="row align-items-start">
            <div class="col-md-4 col-sm-4">
                <div class="position-relative">
                    <a href={urls[0]}><img src={thumbnail} class="img-fluid" style="width: 192px; height: 106px"></a>
                </div>
            </div>
            <div class="col-md-8 col-sm-8">
                <h2>{title}</h2>
            </div>
        <div>
            {html_text}
        <br><br>
    """
    return st.markdown(html, unsafe_allow_html=True)
|
90 |
+
|
91 |
+
# Human-readable channel name -> YouTube channel ID. Used to translate the
# multiselect choices into the `channel_id` metadata filter passed to Pinecone.
channel_map = {
    'James Briggs': 'UCv83tO5cePwHMt1952IVVHw',
    'Daniel Bourke': 'UCr8O8l5cCX85Oem1d18EezQ',
    'Yannic Kilcher': 'UCZHmQk67mSJgfCCTn7xBfew',
    'AI Coffee Break with Letitia': 'UCobqgqE4i5Kf7wrxRxhToQA',
    'sentdex': 'UCfzlCWGWYyIQ0aLC5w48gBQ'
}
|
98 |
+
|
99 |
+
# --- Page layout -----------------------------------------------------------

# Page title.
st.write("""
# YouTube Q&A
""")

# Short intro / scope note.
st.info("""
YouTube search built as [explained here](https://pinecone.io/learn/openai-whisper)!
*The current search scope is limited to a few videos talking about ML, NLP, and vector search*. Add requests for channels to include in the [*Community* tab](https://huggingface.co/spaces/jamescalam/ask-youtube/discussions).
""")

# Bootstrap CSS: required by the grid classes used in card()'s HTML.
st.markdown("""
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@4.0.0/dist/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
""", unsafe_allow_html=True)

# Main search box; empty string means "no query yet".
query = st.text_input("Search!", "")

# Channel filter; all channels are selected by default.
with st.expander("Advanced Options"):
    channel_options = st.multiselect(
        'Channels to Search',
        ['James Briggs', 'Daniel Bourke', 'Yannic Kilcher', 'AI Coffee Break with Letitia', 'sentdex'],
        ['James Briggs', 'Daniel Bourke', 'Yannic Kilcher', 'AI Coffee Break with Letitia', 'sentdex']
    )
|
120 |
+
|
121 |
+
# --- Search + results ------------------------------------------------------
if query != "":
    # Translate the selected channel names into channel IDs for the filter.
    channels = [channel_map[name] for name in channel_options]
    print(f"query: {query}")
    matches = make_query(
        query, retriever, top_k=5,
        filter={
            'channel_id': {'$in': channels}
        }
    )

    # Group matches by video, preserving first-seen order so the video whose
    # snippet ranked highest is rendered first.
    results = {}
    order = []
    for context in matches:
        # The video ID is the last path segment of the watch URL.
        video_id = context['metadata']['url'].split('/')[-1]
        if video_id not in results:
            results[video_id] = {
                'title': context['metadata']['title'],
                # Deep link straight to the snippet via the ?t= offset.
                'urls': [f"{context['metadata']['url']}?t={int(context['metadata']['start'])}"],
                'contexts': [context['metadata']['text']],
                'starts': [int(context['metadata']['start'])],
                'ends': [int(context['metadata']['end'])]
            }
            order.append(video_id)
        else:
            results[video_id]['urls'].append(
                f"{context['metadata']['url']}?t={int(context['metadata']['start'])}"
            )
            results[video_id]['contexts'].append(
                context['metadata']['text']
            )
            results[video_id]['starts'].append(int(context['metadata']['start']))
            results[video_id]['ends'].append(int(context['metadata']['end']))
    # now display cards
    for video_id in order:
        card(
            thumbnail=f"https://img.youtube.com/vi/{video_id}/maxresdefault.jpg",
            title=results[video_id]['title'],
            urls=results[video_id]['urls'],
            contexts=results[video_id]['contexts'],
            starts=results[video_id]['starts'],
            ends=results[video_id]['ends']
        )
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
transformers
|
2 |
+
sentence-transformers
|
3 |
+
pinecone-client
|
4 |
+
click==8.0
|