import io
import os
import re
import string
import time
from collections import defaultdict

import gradio as gr
import nltk
import numpy as np
from gtts import gTTS
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from pytube import YouTube
from youtube_transcript_api import YouTubeTranscriptApi

# Models needed for tokenization, POS tagging, and lemmatization.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

"""## Transcript Summary Module"""

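# summarize_text implements an extractive pipeline: fetch the transcript,
# tokenize and lemmatize it, pull out candidate keyword phrases with a
# RAKE-style degree/frequency score, score sentences by the keywords they
# contain, and return the highest-scoring sentences in original order.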
def summarize_text(url, percent):
    try:
        youtube = YouTube(url)
    except Exception:
        raise gr.Error("Invalid YouTube URL")

    try:
        transcript = YouTubeTranscriptApi.get_transcript(youtube.video_id)
        raw_text = ' '.join(entry['text'] for entry in transcript)
    except Exception:
        raise gr.Error("Could not retrieve the video's transcript. Please try another video")

    # Keep only alphanumerics, periods, underscores, and hyphens; periods
    # are preserved because they delimit sentences below.
    cleaned_text = re.sub(r'[^a-zA-Z0-9\._-]', ' ', raw_text)
    tokens = word_tokenize(cleaned_text)
    lowercase_tokens = word_tokenize(cleaned_text.lower())

    # Rebuild sentences from the token stream, splitting on periods.
    sentences = []
    tokenized_sentences = []
    sentence = ""
    for word in tokens:
        if word != '.':
            sentence += str(word) + " "
        else:
            sentences.append(sentence.strip())
            tokenized_sentences.append(word_tokenize(sentence.lower().strip()))
            sentence = ""

    def lemmatize(POS_tagged_text):
        # WordNet only lemmatizes adjectives when pos="a" is passed;
        # all other tokens fall back to the default noun lemmatization.
        wordnet_lemmatizer = WordNetLemmatizer()
        adjective_tags = ['JJ', 'JJR', 'JJS']
        lemmatized_text = []

        for word, tag in POS_tagged_text:
            if tag in adjective_tags:
                lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word, pos="a")))
            else:
                lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word)))

        return lemmatized_text

    POS_tagged_text = nltk.pos_tag(lowercase_tokens)
    lemmatized_text = lemmatize(POS_tagged_text)
    processed_text = nltk.pos_tag(lemmatized_text)

    def generate_stopwords(POS_tagged_text):
        # Every token that is not a noun, adjective, or foreign word is a
        # stopword, extended with punctuation and an external stopword list.
        stopwords = []
        wanted_POS = ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS', 'FW']

        for word, tag in POS_tagged_text:
            if tag not in wanted_POS:
                stopwords.append(word)

        stopwords += list(string.punctuation)

        with open("long_stopwords.txt", "r") as stopword_file:
            for line in stopword_file:
                stopwords.append(line.strip())

        return set(stopwords)

    stopwords = generate_stopwords(processed_text)

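    # RAKE candidate extraction: stopwords act as delimiters, so the
    # remaining runs of content words become the candidate phrases.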
    def partition_phrases(text, delimiters):
        phrases = []
        phrase = ""
        for word in text:
            if word in delimiters:
                if phrase.strip():
                    phrases.append(phrase.split())
                phrase = ""
            else:
                phrase += str(word) + " "
        # Keep a trailing phrase if the text does not end with a delimiter.
        if phrase.strip():
            phrases.append(phrase.split())
        return phrases

    phrase_list = partition_phrases(lemmatized_text, stopwords)

    phrase_partitioned_sentences = []

    for sentence in tokenized_sentences:
        POS_tagged_sentence = nltk.pos_tag(sentence)
        lemmatized_sentence = lemmatize(POS_tagged_sentence)
        phrase_partitioned_sentences.append(partition_phrases(lemmatized_sentence, stopwords))

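    # RAKE word scoring: frequency counts a word's occurrences, degree sums
    # the lengths of the phrases it occurs in, and the score degree/frequency
    # favors words that appear in long, content-dense phrases.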
    frequency = defaultdict(int)
    degree = defaultdict(int)
    word_score = defaultdict(float)

    vocabulary = []

    for phrase in phrase_list:
        for word in phrase:
            frequency[word] += 1
            degree[word] += len(phrase)
            if word not in vocabulary:
                vocabulary.append(word)

    for word in vocabulary:
        word_score[word] = degree[word] / frequency[word]

    # Score each unique phrase as the sum of its word scores, keeping the
    # phrase's string form as the keyword; both lists stay index-aligned.
    phrase_scores = []
    keywords = []
    phrase_vocabulary = []

    for phrase in phrase_list:
        if phrase not in phrase_vocabulary:
            phrase_vocabulary.append(phrase)
            phrase_scores.append(sum(word_score[word] for word in phrase))
            keywords.append(" ".join(phrase))

    # Keyword indices, highest score first.
    sorted_index = np.flip(np.argsort(phrase_scores), 0)

    tokenized_keywords = []
    sorted_keywords = []

    # Keep at most the 50 highest-scoring keywords.
    threshold = 50
    keywords_num = min(len(keywords), threshold)

    for i in range(keywords_num):
        sorted_keywords.append(keywords[sorted_index[i]])
        tokenized_keywords.append(sorted_keywords[i].split())

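    # A sentence's score is the summed score of every top keyword that
    # appears in it as a phrase.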
    sentence_scores = np.zeros(len(sentences), np.float32)

    for i, sentence in enumerate(phrase_partitioned_sentences):
        for phrase in sentence:
            if phrase in tokenized_keywords:
                matched_index = tokenized_keywords.index(phrase)
                sentence_scores[i] += phrase_scores[sorted_index[matched_index]]

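    # Assemble the summary: keep the requested percentage of sentences (at
    # least one), always in their original order.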
    summary_size = max(int((percent / 100) * len(sentences)), 1)

    sorted_sentence_score_indices = np.flip(np.argsort(sentence_scores), 0)
    indices_for_summary_results = sorted_sentence_score_indices[0:summary_size]

    summary = ""
    current_size = 0

    # Lead with the first sentence for context when it was not selected,
    # unless the summary is only one sentence long.
    if 0 not in indices_for_summary_results and summary_size != 1:
        summary += sentences[0] + ".\n\n"
        current_size += 1

    for i in range(len(sentences)):
        if i in indices_for_summary_results:
            summary += sentences[i] + ".\n\n"
            current_size += 1
            if current_size == summary_size:
                break

    video_html = (
        '<div id="video-container" style="position: relative; width: 100%; padding-bottom: 56.25%;">'
        f'<iframe id="video" style="position: absolute; width: 100%; height: 100%;" src="{youtube.embed_url}" '
        'frameborder="0" allowfullscreen></iframe></div>'
    )

    if summary == "":
        raise gr.Error("Could not generate a summary. Please try another video")

    return summary, video_html

"""## Text-to-Speech Module""" |
|
|
|
AUDIO_DIR = 'audio_files' |
|
MAX_FILE_AGE = 60 * 60 |
|
|
|
def delete_old_audio_files(): |
|
|
|
now = time.time() |
|
for file_name in os.listdir(AUDIO_DIR): |
|
file_path = os.path.join(AUDIO_DIR, file_name) |
|
if now - os.path.getmtime(file_path) > MAX_FILE_AGE: |
|
os.remove(file_path) |
|
|
|
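# text_to_speech synthesizes the summary with gTTS in memory, writes the
# MP3 to AUDIO_DIR so Gradio can serve it, and prunes stale files.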
def text_to_speech(input_text):
    if not input_text:
        raise gr.Error("Nothing to read. Generate a summary first")

    tts = gTTS(input_text, lang='en', slow=False)
    fp = io.BytesIO()
    tts.write_to_fp(fp)
    fp.seek(0)

    os.makedirs(AUDIO_DIR, exist_ok=True)

    # gTTS produces MP3 data, so name the file accordingly.
    file_name = str(time.time()) + '.mp3'
    file_path = os.path.join(AUDIO_DIR, file_name)

    with open(file_path, 'wb') as f:
        f.write(fp.read())

    delete_old_audio_files()

    return file_path

theme = gr.themes.Soft(
    primary_hue="yellow",
    secondary_hue=gr.themes.Color(c50="#faf0e4", c100="#f8f8f8", c200="#d9d9d9", c300="#a5b4fc", c400="#818cf8", c500="#6366f1", c600="#4f46e5", c700="#4338ca", c800="#3730a3", c900="#312e81", c950="#2b2c5e"),
    neutral_hue="zinc",
).set(
    body_background_fill='*secondary_50',
    block_label_background_fill='*primary_50',
    block_label_background_fill_dark='*body_background_fill',
)

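# Gradio UI: a URL box and summary-length slider on the left; the summary
# text, embedded player, and text-to-speech audio on the right.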
with gr.Blocks(theme=theme) as demo:
    gr.Markdown(
        '''
        <h1 align="center">Educational Video Transcript Summarizer</h1>

        <h6 align="center">Welcome to SnipSnap! Input a YouTube URL to get started.</h6>
        '''
    )

    with gr.Row():
        with gr.Column():
            url_input = gr.Textbox(label="URL", placeholder="Ex: https://youtu.be/JOiGEI9pQBs", info="Input YouTube URL")
            slider = gr.Slider(5, 100, value=20, step=5, label="Percent", info="Choose summary length (the lower the number, the shorter the summary)")

            with gr.Row():
                summarize_btn = gr.Button(variant="primary", value="Summarize")
                clear_btn = gr.ClearButton()

            video_preview = gr.HTML(label="Video Preview")
            examples = gr.Examples([['https://youtu.be/libKVRa01L8'], ['https://youtu.be/v6Agqm4K7Ok'], ['https://youtu.be/HpcTJW4ur54'], ['https://youtu.be/gjVX47dLlN8']], inputs=url_input)

        with gr.Column():
            summary_output = gr.Textbox(label="Summary", interactive=False, show_copy_button=True)
            tts_btn = gr.Button(variant="primary", value="Text-to-Speech")
            summary_tts = gr.Audio(label="Audio", interactive=False)

    summarize_btn.click(summarize_text, inputs=[url_input, slider], outputs=[summary_output, video_preview])
    tts_btn.click(text_to_speech, inputs=summary_output, outputs=summary_tts)
    clear_btn.click(lambda: [None, None, None, None], outputs=[url_input, summary_output, video_preview, summary_tts])

demo.queue()
demo.launch()