Gradio_Youtube_Transcript_v2

Runtime error

App Files Files Community

Gradio_Youtube_Transcript_v2 / app.py

Bilalst

Update app.py

4b4a2b4 almost 2 years ago

raw

history blame contribute delete

3.63 kB

	import gradio as gr
	import requests
	from sentence_transformers import SentenceTransformer
	from youtube_transcript_api import YouTubeTranscriptApi
	import numpy as np
	import huggingface_hub
	import os
	import faiss

	# ---------------------------------------------------------------------
	# Start-up: build a FAISS index over the transcripts of every video in
	# one YouTube playlist. Everything here runs once, at import time, and
	# leaves behind the module-level names `model`, `api_key`, `ids` and
	# `index`.
	# ---------------------------------------------------------------------

	# Set up SentenceTransformer
	model = SentenceTransformer('paraphrase-MiniLM-L6-v2')


	playlist_id = 'PLD4EAA8F8C9148A1B'
	# SECURITY: prefer supplying the key through the YOUTUBE_API_KEY
	# environment variable (e.g. a Space secret). The literal fallback is kept
	# only so existing deployments keep working; it is committed to source
	# control and should be rotated and then removed.
	api_key = os.environ.get(
	    'YOUTUBE_API_KEY',
	    'AIzaSyBGuTvXcnliEh6yhTxugrAVM5YzcG9qr9U',
	)

	# Make paged requests to the YouTube Data API to retrieve the playlist items
	url = 'https://www.googleapis.com/youtube/v3/playlistItems'
	playlist_params = {
	    'part': 'snippet',
	    'maxResults': 50,
	    'playlistId': playlist_id,
	    'key': api_key,
	}
	video_ids = []

	while True:
	    # A timeout keeps start-up from hanging forever on a stalled connection.
	    response = requests.get(url, params=playlist_params, timeout=10)
	    # Fail with the real HTTP error (bad key, quota, ...) rather than a
	    # confusing KeyError on a missing 'items' field.
	    response.raise_for_status()
	    data = response.json()

	    # Extract the video IDs from the response
	    video_ids.extend(
	        item['snippet']['resourceId']['videoId']
	        for item in data.get('items', [])
	    )

	    # Follow the pagination token until the API stops returning one
	    next_page_token = data.get('nextPageToken')
	    if not next_page_token:
	        break
	    playlist_params['pageToken'] = next_page_token

	# Parallel lists: transcripts[i] is the full transcript text of ids[i]
	transcripts = []
	ids = []

	for video_id in video_ids:
	    try:
	        transcript = YouTubeTranscriptApi.get_transcript(video_id)
	    except Exception as e:
	        # Best effort: a video whose transcript cannot be fetched is logged
	        # and left out of the index instead of aborting start-up.
	        print(f"Error retrieving transcript for video {video_id}: {e}")
	        continue
	    transcripts.append(' '.join(t['text'] for t in transcript))
	    ids.append(video_id)

	# Set up FAISS, sized from the model rather than a hard-coded 384
	embedding_dim = model.get_sentence_embedding_dimension()
	index = faiss.IndexFlatL2(embedding_dim)

	# Create sentence embeddings as a 2-D float32 array. The reshape keeps the
	# array 2-D even when no transcript could be retrieved.
	sentence_embeddings = np.asarray(
	    model.encode(transcripts), dtype='float32'
	).reshape(-1, embedding_dim)

	# Add sentence embeddings to FAISS index
	if len(sentence_embeddings):
	    index.add(sentence_embeddings)
	else:
	    print("No transcripts could be retrieved; the search index is empty.")


	#---------------------------------------------

	def get_video_links(input_text):
	    """Return HTML links to the indexed videos closest to *input_text*.

	    The query is embedded with the module-level ``model`` and matched
	    against the module-level FAISS ``index``; positions in the result are
	    mapped back to video IDs through ``ids``. Titles and thumbnails are
	    fetched from the YouTube Data API using ``api_key``.

	    Args:
	        input_text: Free-text search query.

	    Returns:
	        A string of concatenated ``<a>`` elements (thumbnail plus title),
	        nearest match first. Empty when the query is blank or nothing
	        matches.

	    Raises:
	        requests.HTTPError: If the YouTube Data API answers with an error
	            status.
	    """
	    import html  # local import: only needed to escape API-provided text

	    if not input_text or not input_text.strip():
	        return ''

	    # Encode input text using SentenceTransformer (2-D float32 for FAISS)
	    query = np.asarray(model.encode([input_text]), dtype='float32')

	    # Perform nearest neighbor search in FAISS index
	    k = 15  # Number of nearest neighbors to retrieve
	    _, neighbours = index.search(query, k)

	    # Collect unique video IDs in ranking order
	    ordered_ids = []
	    visited_ids = set()
	    for i in neighbours[0]:
	        # FAISS pads with -1 when fewer than k vectors are indexed; without
	        # this check ids[-1] would silently return the last video.
	        if i < 0:
	            continue
	        video_id = ids[i]
	        if video_id in visited_ids:
	            continue  # Skip if the video_id has already been visited
	        visited_ids.add(video_id)
	        ordered_ids.append(video_id)

	    if not ordered_ids:
	        return ''

	    # Retrieve details for all hits with one YouTube Data API call; the
	    # `id` parameter accepts a comma-separated list of video IDs.
	    response = requests.get(
	        'https://www.googleapis.com/youtube/v3/videos',
	        params={
	            'part': 'snippet',
	            'id': ','.join(ordered_ids),
	            'key': api_key,
	        },
	        timeout=10,
	    )
	    response.raise_for_status()
	    snippets = {
	        item['id']: item['snippet']
	        for item in response.json().get('items', [])
	    }

	    # Generate HTML code for each video link with thumbnail and title
	    video_links = []
	    for video_id in ordered_ids:
	        snippet = snippets.get(video_id)
	        if snippet is None:
	            # The API returned no entry for this ID (e.g. the video was
	            # removed or made private), so there is nothing to show.
	            continue
	        # Escape API-provided text before embedding it in markup.
	        video_title = html.escape(snippet['title'])
	        video_thumbnail = html.escape(
	            snippet['thumbnails']['default']['url'], quote=True
	        )
	        video_link = f"https://www.youtube.com/watch?v={video_id}"
	        video_html = (
	            f'<a href="{video_link}" target="_blank">'
	            f'<img src="{video_thumbnail}"><br>{video_title}</a><br>'
	        )
	        video_links.append(video_html)

	    return ''.join(video_links)

	# Create Gradio interface with "html" output type.
	# `gr.Textbox` replaces the legacy `gr.inputs.Textbox`, which newer Gradio
	# releases no longer provide.
	iface = gr.Interface(
	    fn=get_video_links,
	    inputs=[
	        gr.Textbox(
	            label="Add what you are looking to find in Dr. Joe's testimonials!"
	        )
	    ],
	    outputs="html",
	    title="Dr. Joe Dispenza testimonials Search",
	)


	# Launch the Gradio interface on Hugging Face Spaces
	if __name__ == '__main__':
	    iface.launch()