bejaeger jamescalam committed on
Commit
eb2a4ce
0 Parent(s):

Duplicate from jamescalam/ask-youtube

Browse files

Co-authored-by: James Briggs <jamescalam@users.noreply.huggingface.co>

Files changed (4) hide show
  1. .gitattributes +31 -0
  2. README.md +15 -0
  3. app.py +162 -0
  4. requirements.txt +4 -0
.gitattributes ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.model filter=lfs diff=lfs merge=lfs -text
11
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
12
+ *.npy filter=lfs diff=lfs merge=lfs -text
13
+ *.npz filter=lfs diff=lfs merge=lfs -text
14
+ *.onnx filter=lfs diff=lfs merge=lfs -text
15
+ *.ot filter=lfs diff=lfs merge=lfs -text
16
+ *.parquet filter=lfs diff=lfs merge=lfs -text
17
+ *.pickle filter=lfs diff=lfs merge=lfs -text
18
+ *.pkl filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pt filter=lfs diff=lfs merge=lfs -text
21
+ *.pth filter=lfs diff=lfs merge=lfs -text
22
+ *.rar filter=lfs diff=lfs merge=lfs -text
23
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
24
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
25
+ *.tflite filter=lfs diff=lfs merge=lfs -text
26
+ *.tgz filter=lfs diff=lfs merge=lfs -text
27
+ *.wasm filter=lfs diff=lfs merge=lfs -text
28
+ *.xz filter=lfs diff=lfs merge=lfs -text
29
+ *.zip filter=lfs diff=lfs merge=lfs -text
30
+ *.zst filter=lfs diff=lfs merge=lfs -text
31
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Ask YouTube
3
+ emoji: 🦾
4
+ colorFrom: purple
5
+ colorTo: blue
6
+ sdk: streamlit
7
+ sdk_version: 1.10.0
8
+ app_file: app.py
9
+ pinned: false
10
+ duplicated_from: jamescalam/ask-youtube
11
+ ---
12
+
13
+ Curious about how this works? Check out the [article](https://pinecone.io/learn/openai-whisper)!
14
+
15
+ The current version of the app has a very limited video scope. We'd love to add more, so if you'd like to see more content added, feel free to send CSV data, including video title, channel ID, and video ID (at a minimum) to *james\@pinecone.io*. Even better if you could follow a format similar to [this](https://huggingface.co/datasets/jamescalam/channel-metadata).
app.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pinecone
3
+ from sentence_transformers import SentenceTransformer
4
+ import logging
5
+
6
# Pinecone API key, read from the Streamlit secrets store (keys are issued at app.pinecone.io).
PINECONE_KEY = st.secrets["PINECONE_KEY"]
# Name of the Pinecone index that every query in this app is sent to.
INDEX_ID = "ask-youtube"
8
+
9
@st.experimental_singleton
def init_pinecone():
    """Initialise the Pinecone client and return a handle to the ``INDEX_ID`` index.

    Decorated with ``st.experimental_singleton``; the API key comes from
    ``PINECONE_KEY`` and the environment is fixed to ``us-west1-gcp``.
    """
    pinecone.init(environment="us-west1-gcp", api_key=PINECONE_KEY)
    index = pinecone.Index(INDEX_ID)
    return index
13
+
14
@st.experimental_singleton
def init_retriever():
    """Load and return the sentence-transformers model used to embed queries.

    Decorated with ``st.experimental_singleton``.
    """
    model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")
    return model
17
+
18
def make_query(query, retriever, top_k=10, include_values=True, include_metadata=True, filter=None):
    """Embed *query* and search the Pinecone index held in ``st.session_state.index``.

    Args:
        query: The user's search text.
        retriever: Object whose ``encode([text])`` result has a ``.tolist()``
            method (the sentence-transformers model in this app).
        top_k: Maximum number of matches to request.
        include_values: Forwarded to the index ``query`` call.
        include_metadata: Forwarded to the index ``query`` call.
        filter: Optional metadata filter forwarded to the index ``query`` call.

    Returns:
        The ``'matches'`` entry of the query response, or an empty list when
        every attempt failed.
    """
    xq = retriever.encode([query]).tolist()
    logging.info(f"Query: {query}")
    max_attempts = 3
    matches = []
    for attempt in range(1, max_attempts + 1):
        try:
            xc = st.session_state.index.query(
                xq,
                top_k=top_k,
                include_values=include_values,
                include_metadata=include_metadata,
                filter=filter,
            )
            matches = xc['matches']
        except Exception:
            # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt /
            # SystemExit are not swallowed; the traceback is logged so the
            # cause of a failed attempt is visible.
            logging.exception(
                "Pinecone query failed (attempt %d of %d)", attempt, max_attempts
            )
            # force reload: rebuild the client and index handle before retrying
            pinecone.init(api_key=PINECONE_KEY, environment="us-west1-gcp")
            st.session_state.index = pinecone.Index(INDEX_ID)
        else:
            break
    else:
        # Only reached when no attempt succeeded.
        logging.error("Query failed")
        return []
    if not matches:
        # A successful query with zero hits is not a failure.
        logging.info("Query returned no matches")
    return matches
42
+
43
# The index handle lives in session state because make_query() replaces it
# there when it has to reconnect after a failed query.
st.session_state.index = init_pinecone()
# Embedding model used to encode every search query.
retriever = init_retriever()
45
+
46
def _format_timestamp(seconds) -> str:
    """Format a second offset as ``MM:SS`` (minutes grow past two digits if needed)."""
    mins, secs = divmod(int(round(seconds)), 60)
    return f"{mins:02d}:{secs:02d}"


def card(thumbnail: str, title: str, urls: list, contexts: list, starts: list, ends: list):
    """Render one video's search hits as an HTML card via ``st.markdown``.

    The parallel lists describe the matched transcript segments of a single
    video; element *i* of each list belongs to the same segment.

    Args:
        thumbnail: URL of the thumbnail image.
        title: Video title shown as the card heading.
        urls: Per-segment links; ``urls[0]`` is used for the thumbnail link.
        contexts: Per-segment transcript text.
        starts: Per-segment start offsets in seconds.
        ends: Per-segment end offsets in seconds.

    Returns:
        Whatever ``st.markdown`` returns.
    """
    # Local import: the file does not import ``html`` at module level.
    from html import escape

    # Order segments chronologically (by end, then start), as tuples compare.
    segments = sorted(zip(ends, starts, urls, contexts))

    # Items are ``[text, url]`` pairs; ``None`` marks a line break between
    # non-contiguous passages.
    text_content = []
    current_start = 0
    current_end = 0
    for end, start, url, context in segments:
        timestamp = _format_timestamp(start)
        if current_start < start < current_end:
            # Continuation of the previous segment: cut the previous text where
            # this segment's opening characters begin so the overlap is not
            # shown twice.
            overlap = context[:10]
            if overlap:  # str.split("") would raise ValueError
                text_content[-1][0] = text_content[-1][0].split(overlap)[0]
            # Upper-case only the first character; str.capitalize() would also
            # lower-case the rest of the text (acronyms, names).
            shown = context[:1].upper() + context[1:]
        else:
            text_content.append(None)
            shown = context
        text_content.append([f"[{timestamp}] {shown}", url])
        current_start = start
        current_end = end

    parts = []
    for item in text_content:
        if item is None:
            parts.append("<br>")
            continue
        text, url = item
        logging.debug("card text: %s", text)
        # Text and URLs come from index metadata, so escape them before
        # placing them in HTML that is rendered with unsafe_allow_html=True.
        parts.append(
            f'<small><a href="{escape(str(url), quote=True)}">'
            f"{escape(text.strip())}... </a></small>"
        )
    html_text = "".join(parts)

    link = escape(str(urls[0]), quote=True) if urls else "#"
    thumb = escape(str(thumbnail), quote=True)
    markup = "\n".join([
        '<div class="container-fluid">',
        '<div class="row align-items-start">',
        '<div class="col-md-4 col-sm-4">',
        '<div class="position-relative">',
        f'<a href="{link}"><img src="{thumb}" class="img-fluid" '
        'style="width: 192px; height: 106px"></a>',
        '</div>',
        '</div>',
        '<div class="col-md-8 col-sm-8">',
        f'<h2>{escape(str(title))}</h2>',
        '</div>',
        '<div>',
        html_text,
        '<br><br>',
        '</div>',
        '</div>',
        '</div>',
    ])
    return st.markdown(markup, unsafe_allow_html=True)
90
+
91
# Channel display name -> YouTube channel ID. The IDs are used as values of
# the `channel_id` metadata filter when querying the index.
channel_map = {
    'James Briggs': 'UCv83tO5cePwHMt1952IVVHw',
    'Daniel Bourke': 'UCr8O8l5cCX85Oem1d18EezQ',
    'Yannic Kilcher': 'UCZHmQk67mSJgfCCTn7xBfew',
    'AI Coffee Break with Letitia': 'UCobqgqE4i5Kf7wrxRxhToQA',
    'sentdex': 'UCfzlCWGWYyIQ0aLC5w48gBQ'
}

st.write("""
# YouTube Q&A
""")

st.info("""
YouTube search built as [explained here](https://pinecone.io/learn/openai-whisper)!
*The current search scope is limited to a few videos talking about ML, NLP, and vector search*. Add requests for channels to include in the [*Community* tab](https://huggingface.co/spaces/jamescalam/ask-youtube/discussions).
""")

# Pull in Bootstrap so the classes used by the result cards are styled.
st.markdown("""
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@4.0.0/dist/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
""", unsafe_allow_html=True)

query = st.text_input("Search!", "")

# Derive the selectable names from channel_map so the widget can never offer
# a name that the later channel_map[name] lookup does not know.
channel_names = list(channel_map)

with st.expander("Advanced Options"):
    channel_options = st.multiselect(
        'Channels to Search',
        channel_names,
        list(channel_names),  # every channel is selected by default
    )
120
+
121
if query != "":
    # Translate the selected channel names into channel IDs for the filter.
    channels = [channel_map[name] for name in channel_options]
    print(f"query: {query}")
    matches = make_query(
        query, retriever, top_k=5,
        filter={'channel_id': {'$in': channels}},
    )

    # Group the matches per video. The dict keeps insertion order, so videos
    # stay in the order in which their first match was returned.
    results = {}
    for context in matches:
        metadata = context['metadata']
        video_id = metadata['url'].split('/')[-1]
        entry = results.get(video_id)
        if entry is None:
            entry = {
                'title': metadata['title'],
                'urls': [],
                'contexts': [],
                'starts': [],
                'ends': [],
            }
            results[video_id] = entry
        start_seconds = int(metadata['start'])
        entry['urls'].append(f"{metadata['url']}?t={start_seconds}")
        entry['contexts'].append(metadata['text'])
        entry['starts'].append(start_seconds)
        entry['ends'].append(int(metadata['end']))

    # now display cards
    for video_id, entry in results.items():
        card(
            thumbnail=f"https://img.youtube.com/vi/{video_id}/maxresdefault.jpg",
            title=entry['title'],
            urls=entry['urls'],
            contexts=entry['contexts'],
            starts=entry['starts'],
            ends=entry['ends'],
        )
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ transformers
2
+ sentence-transformers
3
+ pinecone-client
4
+ click==8.0