|
import subprocess
import sys

# Runtime dependency bootstrap.
# NOTE(review): installing packages at import time is fragile (slow startup,
# needs network, may fail mid-run); prefer a requirements.txt. Kept here for
# parity with the original notebook-style script.
_REQUIREMENTS = [
    'wordcloud',
    'git+https://github.com/openai/whisper.git',
    'transformers',
    'imageio==2.4.1',
    'moviepy',
    'keybert',
    'pytube',
]

# Use the current interpreter's pip (a bare 'pip' on PATH may belong to a
# different Python installation), and install everything in one call.
subprocess.check_call([sys.executable, '-m', 'pip', 'install', *_REQUIREMENTS])
|
|
|
import streamlit as st |
|
import os |
|
from wordcloud import WordCloud |
|
from keybert import KeyBERT |
|
import pandas as pd |
|
import matplotlib.pyplot as plt |
|
|
|
|
|
|
|
from moviepy.editor import * |
|
from tqdm import tqdm |
|
import os |
|
import math |
|
import nltk |
|
nltk.download('punkt') |
|
import whisper |
|
from transformers import pipeline |
|
|
|
from pytube import YouTube |
|
def process_video(path):
    """Download a YouTube video, transcribe it in 60-second chunks with
    Whisper, then summarize, sentiment-score, and keyword-extract the
    full transcript.

    Parameters
    ----------
    path : str
        URL of a YouTube video.

    Returns
    -------
    tuple
        (tot_text, tot_summary, tot_keywords): the full transcript string,
        the concatenated chunk summaries, and a list of keyword strings.

    Side effects: downloads 'meeting.mp4' into the working directory,
    creates and removes temporary 'vid_to_aud<i>.mp3' files, and writes
    per-chunk results to 'output2.csv'.
    """
    whisper_model = whisper.load_model("base")

    def speech_to_text_eng(aud_path):
        """Transcribe one audio file with Whisper and return the text."""
        result = whisper_model.transcribe(aud_path)
        return result["text"]

    def run_range(duration):
        """Number of 60-second chunks needed to cover `duration` seconds."""
        return math.ceil(duration / 60)

    time_range = 60  # chunk length in seconds

    def audio_generator(path, aud=0, vid=0):
        """Split the media file at `path` into 60-second mp3 chunks named
        'vid_to_aud<i>.mp3' and return how many chunks were attempted.

        vid=1 treats `path` as a video file; aud=1 treats it as audio.
        """
        n_chunks = 0
        if vid == 1:
            clip = VideoFileClip(path)
            n_chunks = run_range(clip.duration)
            for i in range(n_chunks):
                left = i * time_range
                # Clamp the final chunk to the clip's real end; the original
                # could request a subclip past the end of the video.
                right = min(left + time_range, clip.duration)
                crop_clip = clip.subclip(left, right)
                try:
                    crop_clip.audio.write_audiofile("vid_to_aud" + str(i) + ".mp3")
                except Exception:
                    # Best-effort: skip chunks whose audio cannot be written
                    # (e.g. a segment with no audio track).
                    pass
        if aud == 1:
            audio_clip = AudioFileClip(path)
            n_chunks = run_range(audio_clip.duration)
            for i in range(n_chunks):
                left = i * time_range
                right = min(left + time_range, audio_clip.duration)
                crop_clip = audio_clip.subclip(left, right)
                try:
                    crop_clip.write_audiofile("vid_to_aud" + str(i) + ".mp3")
                except Exception:
                    pass
        return n_chunks

    # Lowest resolution keeps the download small; we only need the audio.
    yt = YouTube(path)
    stream = yt.streams.get_lowest_resolution()
    stream.download(filename='meeting.mp4')

    # BUG FIX: the original initialized clip_run_range = 0 in this scope,
    # but audio_generator rebound it as a *local*, so the transcription
    # loop below always ran zero times and produced an empty transcript.
    # audio_generator now returns the chunk count explicitly.
    clip_run_range = audio_generator("./meeting.mp4", vid=1)

    transcribed_lit = []
    for i in tqdm(range(clip_run_range)):
        aud_file = "./vid_to_aud" + str(i) + ".mp3"
        if not os.path.exists(aud_file):
            continue  # chunk may have been skipped by audio_generator
        transcribed_lit.append(speech_to_text_eng(aud_file))
        os.remove(aud_file)

    data = pd.DataFrame({'transcriptions': transcribed_lit})

    summarizer = pipeline("summarization")
    sentiment_analyzer = pipeline("sentiment-analysis")

    summarized_lit = []
    sentiment_lit = []
    for i in tqdm(range(len(data))):
        chunk_text = data.iloc[i, 0]
        summarized_lit.append(
            summarizer(chunk_text, min_length=75, max_length=300)[0]['summary_text'])
        sentiment_lit.append(sentiment_analyzer(chunk_text)[0]['label'])

    data['summary'] = summarized_lit
    data['sentiment'] = sentiment_lit
    data.to_csv('output2.csv', index=False)

    # Full transcript: join instead of repeated += (linear, not quadratic).
    tot_text = "".join(transcribed_lit)

    key_model = KeyBERT('distilbert-base-nli-mean-tokens')

    def extract_keywords(text, top_n=50):
        """Return the top_n KeyBERT keyword strings (scores discarded)."""
        return [kw for kw, _score in key_model.extract_keywords(text, top_n=top_n)]

    tot_keywords = extract_keywords(tot_text)

    def summarize_text(text):
        """Summarize `text` in 500-word chunks and concatenate the summaries."""
        chunk_size = 500
        words = text.split()
        num_chunks = len(words) // chunk_size + 1
        total_summary = ""
        for i in tqdm(range(num_chunks)):
            chunk = " ".join(words[i * chunk_size:(i + 1) * chunk_size])
            if not chunk:
                continue  # guard the empty trailing chunk
            total_summary += summarizer(
                chunk, min_length=75, max_length=200)[0]['summary_text']
        return total_summary

    tot_summary = summarize_text(tot_text)
    return tot_text, tot_summary, tot_keywords
|
|
|
|
|
|
|
|
|
|
|
def generate_word_cloud(text):
    """Build a word cloud from `text` and render it in the Streamlit app."""
    cloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
    ).generate(text)

    figure, axes = plt.subplots(figsize=(10, 5))
    axes.imshow(cloud, interpolation='bilinear')
    axes.axis('off')

    st.pyplot(figure)
|
|
|
|
|
def main():
    """Streamlit entry point: take a YouTube URL, run the processing
    pipeline, and render the transcript, per-chunk labels, word cloud,
    overall summary, and keywords."""
    st.title("Meeting Summary Web App")

    youtube_url = st.text_input("Enter the YouTube video link")

    if st.button("Process Video"):
        if youtube_url:
            tot_text, tot_summary, tot_keywords = process_video(youtube_url)

            if os.path.exists("output2.csv"):
                output_df = pd.read_csv("output2.csv")

                st.subheader("Transcriptions:")
                st.write(output_df["transcriptions"])

                # BUG FIX: the CSV written by process_video has columns
                # 'transcriptions', 'summary', and 'sentiment' — there is
                # no 'labels' column, so the original raised KeyError here.
                st.subheader("Labels:")
                st.write(output_df["sentiment"])

                st.subheader("Word Cloud:")
                generate_word_cloud(output_df["transcriptions"].str.cat(sep=' '))

                st.subheader("tot_text:")
                st.write(tot_text)

                st.subheader("tot_summary:")
                st.write(tot_summary)

                st.subheader("tot_keywords:")
                st.write(tot_keywords)
            else:
                st.write("No output file found.")
|
|
|
# Script entry point (run via `streamlit run <this file>`).
if __name__ == "__main__":

    main()