import io
import os
import re
import string
import time
from collections import defaultdict

import gradio as gr
import nltk
import numpy as np
from gtts import gTTS
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from pytube import YouTube
from youtube_transcript_api import YouTubeTranscriptApi

# Models needed for tokenization, POS tagging, and lemmatization.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

"""## Transcript Summary Module"""

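# summarize_text implements an extractive pipeline: fetch the transcript,
# tokenize and lemmatize it, pull out candidate keyword phrases with a
# RAKE-style degree/frequency score, score sentences by the keywords they
# contain, and return the highest-scoring sentences in original order.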
def summarize_text(url, percent):
    try:
        youtube = YouTube(url)
    except Exception:
        raise gr.Error("Invalid YouTube URL")

    try:
        transcript = YouTubeTranscriptApi.get_transcript(youtube.video_id)
        raw_text = ' '.join(entry['text'] for entry in transcript)
    except Exception:
        raise gr.Error("Could not retrieve the video's transcript. Please try another video")

    # Keep only alphanumerics, periods, underscores, and hyphens; periods
    # are preserved because they delimit sentences below.
    cleaned_text = re.sub(r'[^a-zA-Z0-9\._-]', ' ', raw_text)
    tokens = word_tokenize(cleaned_text)
    lowercase_tokens = word_tokenize(cleaned_text.lower())

    # Rebuild sentences from the token stream, splitting on periods.
    sentences = []
    tokenized_sentences = []
    sentence = ""
    for word in tokens:
        if word != '.':
            sentence += str(word) + " "
        else:
            sentences.append(sentence.strip())
            tokenized_sentences.append(word_tokenize(sentence.lower().strip()))
            sentence = ""

    def lemmatize(POS_tagged_text):
        # WordNet only lemmatizes adjectives when pos="a" is passed;
        # all other tokens fall back to the default noun lemmatization.
        wordnet_lemmatizer = WordNetLemmatizer()
        adjective_tags = ['JJ', 'JJR', 'JJS']
        lemmatized_text = []

        for word, tag in POS_tagged_text:
            if tag in adjective_tags:
                lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word, pos="a")))
            else:
                lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word)))

        return lemmatized_text

    POS_tagged_text = nltk.pos_tag(lowercase_tokens)
    lemmatized_text = lemmatize(POS_tagged_text)
    processed_text = nltk.pos_tag(lemmatized_text)

    def generate_stopwords(POS_tagged_text):
        # Every token that is not a noun, adjective, or foreign word is a
        # stopword, extended with punctuation and an external stopword list.
        stopwords = []
        wanted_POS = ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS', 'FW']

        for word, tag in POS_tagged_text:
            if tag not in wanted_POS:
                stopwords.append(word)

        stopwords += list(string.punctuation)

        with open("long_stopwords.txt", "r") as stopword_file:
            for line in stopword_file:
                stopwords.append(line.strip())

        return set(stopwords)

    stopwords = generate_stopwords(processed_text)

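    # RAKE candidate extraction: stopwords act as delimiters, so the
    # remaining runs of content words become the candidate phrases.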
    def partition_phrases(text, delimiters):
        phrases = []
        phrase = ""
        for word in text:
            if word in delimiters:
                if phrase.strip():
                    phrases.append(phrase.split())
                phrase = ""
            else:
                phrase += str(word) + " "
        # Keep a trailing phrase if the text does not end with a delimiter.
        if phrase.strip():
            phrases.append(phrase.split())
        return phrases

    phrase_list = partition_phrases(lemmatized_text, stopwords)

    phrase_partitioned_sentences = []

    for sentence in tokenized_sentences:
        POS_tagged_sentence = nltk.pos_tag(sentence)
        lemmatized_sentence = lemmatize(POS_tagged_sentence)
        phrase_partitioned_sentences.append(partition_phrases(lemmatized_sentence, stopwords))

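    # RAKE word scoring: frequency counts a word's occurrences, degree sums
    # the lengths of the phrases it occurs in, and the score degree/frequency
    # favors words that appear in long, content-dense phrases.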
    frequency = defaultdict(int)
    degree = defaultdict(int)
    word_score = defaultdict(float)

    vocabulary = []

    for phrase in phrase_list:
        for word in phrase:
            frequency[word] += 1
            degree[word] += len(phrase)
            if word not in vocabulary:
                vocabulary.append(word)

    for word in vocabulary:
        word_score[word] = degree[word] / frequency[word]

    # Score each unique phrase as the sum of its word scores, keeping the
    # phrase's string form as the keyword; both lists stay index-aligned.
    phrase_scores = []
    keywords = []
    phrase_vocabulary = []

    for phrase in phrase_list:
        if phrase not in phrase_vocabulary:
            phrase_vocabulary.append(phrase)
            phrase_scores.append(sum(word_score[word] for word in phrase))
            keywords.append(" ".join(phrase))

    # Keyword indices, highest score first.
    sorted_index = np.flip(np.argsort(phrase_scores), 0)

    tokenized_keywords = []
    sorted_keywords = []

    # Keep at most the 50 highest-scoring keywords.
    threshold = 50
    keywords_num = min(len(keywords), threshold)

    for i in range(keywords_num):
        sorted_keywords.append(keywords[sorted_index[i]])
        tokenized_keywords.append(sorted_keywords[i].split())

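    # A sentence's score is the summed score of every top keyword that
    # appears in it as a phrase.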
    sentence_scores = np.zeros(len(sentences), np.float32)

    for i, sentence in enumerate(phrase_partitioned_sentences):
        for phrase in sentence:
            if phrase in tokenized_keywords:
                matched_index = tokenized_keywords.index(phrase)
                sentence_scores[i] += phrase_scores[sorted_index[matched_index]]

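    # Assemble the summary: keep the requested percentage of sentences (at
    # least one), always in their original order.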
    summary_size = max(int((percent / 100) * len(sentences)), 1)

    sorted_sentence_score_indices = np.flip(np.argsort(sentence_scores), 0)
    indices_for_summary_results = sorted_sentence_score_indices[0:summary_size]

    summary = ""
    current_size = 0

    # Lead with the first sentence for context when it was not selected,
    # unless the summary is only one sentence long.
    if 0 not in indices_for_summary_results and summary_size != 1:
        summary += sentences[0] + ".\n\n"
        current_size += 1

    for i in range(len(sentences)):
        if i in indices_for_summary_results:
            summary += sentences[i] + ".\n\n"
            current_size += 1
            if current_size == summary_size:
                break

    video_html = (
        '<div id="video-container" style="position: relative; width: 100%; padding-bottom: 56.25%;">'
        f'<iframe id="video" style="position: absolute; width: 100%; height: 100%;" src="{youtube.embed_url}" '
        'frameborder="0" allowfullscreen></iframe></div>'
    )

    if summary == "":
        raise gr.Error("Could not generate a summary. Please try another video")

    return summary, video_html

"""## Text-to-Speech Module""" |
|
|
|
AUDIO_DIR = 'audio_files' |
|
MAX_FILE_AGE = 60 * 60 |
|
|
|
def delete_old_audio_files(): |
|
|
|
now = time.time() |
|
for file_name in os.listdir(AUDIO_DIR): |
|
file_path = os.path.join(AUDIO_DIR, file_name) |
|
if now - os.path.getmtime(file_path) > MAX_FILE_AGE: |
|
os.remove(file_path) |
|
|
|
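# text_to_speech synthesizes the summary with gTTS in memory, writes the
# MP3 to AUDIO_DIR so Gradio can serve it, and prunes stale files.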
def text_to_speech(input_text):
    if not input_text:
        raise gr.Error("Nothing to read. Generate a summary first")

    tts = gTTS(input_text, lang='en', slow=False)
    fp = io.BytesIO()
    tts.write_to_fp(fp)
    fp.seek(0)

    os.makedirs(AUDIO_DIR, exist_ok=True)

    # gTTS produces MP3 data, so name the file accordingly.
    file_name = str(time.time()) + '.mp3'
    file_path = os.path.join(AUDIO_DIR, file_name)

    with open(file_path, 'wb') as f:
        f.write(fp.read())

    delete_old_audio_files()

    return file_path

theme = gr.themes.Soft(
    primary_hue="yellow",
    secondary_hue=gr.themes.Color(c50="#faf0e4", c100="#f8f8f8", c200="#d9d9d9", c300="#a5b4fc", c400="#818cf8", c500="#6366f1", c600="#4f46e5", c700="#4338ca", c800="#3730a3", c900="#312e81", c950="#2b2c5e"),
    neutral_hue="zinc",
).set(
    body_background_fill='*secondary_50',
    block_label_background_fill='*primary_50',
    block_label_background_fill_dark='*body_background_fill',
)

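# Gradio UI: a URL box and summary-length slider on the left; the summary
# text, embedded player, and text-to-speech audio on the right.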
with gr.Blocks(theme=theme) as demo:
    gr.Markdown(
        '''
        <h1 align="center">Educational Video Transcript Summarizer</h1>

        <h6 align="center">Welcome to SnipSnap! Input a YouTube URL to get started.</h6>
        '''
    )

    with gr.Row():
        with gr.Column():
            url_input = gr.Textbox(label="URL", placeholder="Ex: https://youtu.be/JOiGEI9pQBs", info="Input YouTube URL")
            slider = gr.Slider(5, 100, value=20, step=5, label="Percent", info="Choose summary length (the lower the number, the shorter the summary)")

            with gr.Row():
                summarize_btn = gr.Button(variant="primary", value="Summarize")
                clear_btn = gr.ClearButton()

            video_preview = gr.HTML(label="Video Preview")
            examples = gr.Examples([['https://youtu.be/libKVRa01L8'], ['https://youtu.be/v6Agqm4K7Ok'], ['https://youtu.be/HpcTJW4ur54'], ['https://youtu.be/gjVX47dLlN8']], inputs=url_input)

        with gr.Column():
            summary_output = gr.Textbox(label="Summary", interactive=False, show_copy_button=True)
            tts_btn = gr.Button(variant="primary", value="Text-to-Speech")
            summary_tts = gr.Audio(label="Audio", interactive=False)

    summarize_btn.click(summarize_text, inputs=[url_input, slider], outputs=[summary_output, video_preview])
    tts_btn.click(text_to_speech, inputs=summary_output, outputs=summary_tts)
    clear_btn.click(lambda: [None, None, None, None], outputs=[url_input, summary_output, video_preview, summary_tts])

demo.queue()
demo.launch()