# NOTE: the lines below were Hugging Face file-viewer chrome captured by the
# scrape (author "Bilalst", commit "Update app.py", revision 4b4a2b4, 3.63 kB,
# "raw / history / blame / No virus") — commented out so the file parses.
import gradio as gr
import requests
from sentence_transformers import SentenceTransformer
from youtube_transcript_api import YouTubeTranscriptApi
import numpy as np
import huggingface_hub
import os
import faiss
# ---------------------------------------------------------------------------
# One-time setup (runs at import time): fetch every transcript in a YouTube
# playlist, embed it, and build a FAISS index for semantic search.
# ---------------------------------------------------------------------------

# Sentence embedding model. paraphrase-MiniLM-L6-v2 emits 384-dim vectors,
# which must match the FAISS index dimensionality below.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

playlist_id = 'PLD4EAA8F8C9148A1B'
# SECURITY: never hard-code API keys in source control. Prefer an environment
# variable (e.g. a Hugging Face Space secret). The literal fallback keeps
# existing deployments working, but that key is exposed and should be rotated.
api_key = os.getenv('YOUTUBE_API_KEY',
                    'AIzaSyBGuTvXcnliEh6yhTxugrAVM5YzcG9qr9U')

# Page through the playlist (the API returns at most 50 items per page) and
# collect every video ID.
url = (
    'https://www.googleapis.com/youtube/v3/playlistItems'
    f'?part=snippet&maxResults=50&playlistId={playlist_id}&key={api_key}'
)
video_ids = []
while True:
    response = requests.get(url)
    response.raise_for_status()  # fail loudly on quota/auth errors
    data = response.json()
    # `.get` guards against an error payload that carries no 'items' key.
    for item in data.get('items', []):
        video_ids.append(item['snippet']['resourceId']['videoId'])
    next_page_token = data.get('nextPageToken')
    if not next_page_token:
        break  # last page reached
    url = (
        'https://www.googleapis.com/youtube/v3/playlistItems'
        f'?part=snippet&maxResults=50&playlistId={playlist_id}'
        f'&key={api_key}&pageToken={next_page_token}'
    )

# Fetch each video's transcript. Videos without captions raise; they are
# skipped so a single failure does not abort the whole build.
transcripts = []  # transcript text, aligned 1:1 with `ids`
ids = []          # video IDs that actually yielded a transcript
for video_id in video_ids:
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as e:  # the library raises many distinct error types
        print(f"Error retrieving transcript for video {video_id}: {e}")
        continue
    transcripts.append(' '.join(t['text'] for t in transcript))
    ids.append(video_id)

# Embed all transcripts and add them to a flat (exact) L2 FAISS index.
sentence_embeddings = np.array(model.encode(transcripts))
index = faiss.IndexFlatL2(384)  # 384 = MiniLM-L6 embedding size
index.add(sentence_embeddings)
#---------------------------------------------
def get_video_links(input_text):
    """Return an HTML string of video links (thumbnail + title) whose
    transcripts best match ``input_text``.

    Relies on the module-level ``model`` (SentenceTransformer), ``index``
    (FAISS), ``ids`` (video IDs parallel to the index rows) and ``api_key``
    built at import time.
    """
    # Embed the query with the same model used for the transcripts.
    input_embedding = model.encode([input_text])[0]

    # Nearest-neighbour search: row indices of the k closest transcripts.
    k = 15  # number of nearest neighbors to retrieve
    _, neighbours = index.search(np.array([input_embedding]), k)

    video_links = []
    visited_ids = set()
    for i in neighbours[0]:
        video_id = ids[i]
        if video_id in visited_ids:
            continue  # show each video at most once
        visited_ids.add(video_id)

        # Look up title and thumbnail via the YouTube Data API.
        video_info_url = f"https://www.googleapis.com/youtube/v3/videos?part=snippet&id={video_id}&key={api_key}"
        response = requests.get(video_info_url)
        data = response.json()
        items = data.get('items')
        if not items:
            # Video deleted/private, or an API error payload: the original
            # code would raise IndexError here and blank the whole response.
            continue
        snippet = items[0]['snippet']
        video_title = snippet['title']
        video_thumbnail = snippet['thumbnails']['default']['url']

        # HTML: clickable thumbnail + title opening the video in a new tab.
        video_link = f"https://www.youtube.com/watch?v={video_id}"
        video_links.append(
            f'<a href="{video_link}" target="_blank">'
            f'<img src="{video_thumbnail}"><br>{video_title}</a><br>'
        )
    return ''.join(video_links)
# Gradio UI: free-text query in, HTML list of matching videos out.
# NOTE: `gr.inputs.Textbox` was removed in Gradio 3.x; `gr.Textbox` is the
# supported spelling and accepts the same `label` argument.
iface = gr.Interface(
    fn=get_video_links,
    inputs=gr.Textbox(label="Add what you are looking to find in Dr. Joe's testimonials!"),
    outputs="html",
    title="Dr. Joe Dispenza testimonials Search",
)

# Script entry point (Hugging Face Spaces runs this module directly).
if __name__ == '__main__':
    iface.launch()