Spaces:

Tihsrah-CD
/

Meetings

Sleeping

File size: 6,637 Bytes

import subprocess
import sys

# Install the runtime dependencies before the third-party imports below run.
# `sys.executable -m pip` targets the interpreter that is executing this
# script; a bare `pip` on PATH may belong to a different environment.
# Packages are installed one per call, in this order, so the pinned imageio
# version is in place before moviepy is installed.
for requirement in (
    'wordcloud',
    'git+https://github.com/openai/whisper.git',
    'transformers',
    'imageio==2.4.1',
    'moviepy',
    'keybert',
    'pytube',
):
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', requirement])

import streamlit as st
import os
from wordcloud import WordCloud
from keybert import KeyBERT
import pandas as pd
import matplotlib.pyplot as plt
# //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////


from moviepy.editor import *
from tqdm import tqdm
import os
import math
import nltk
nltk.download('punkt')
import whisper
from transformers import pipeline

from pytube import YouTube
def process_video(path):
    """Download a YouTube video, transcribe it and summarise the transcript.

    The video is downloaded to ``meeting.mp4``, its audio is split into
    60-second mp3 chunks, and each chunk is transcribed with the Whisper
    "base" model. Every chunk transcript is then summarised and labelled by
    the default ``transformers`` summarization and sentiment-analysis
    pipelines, and the per-chunk results are written to ``output2.csv``
    (columns ``transcriptions``, ``summary``, ``sentiment``).

    Args:
        path: URL of the YouTube video to process.

    Returns:
        A tuple ``(tot_text, tot_summary, tot_keywords)``: the concatenated
        transcript, a summary built from 500-word slices of that transcript,
        and a list of up to 50 keywords extracted with KeyBERT.
    """
    time_range = 60  # length of one audio chunk, in seconds
    whisper_model = whisper.load_model("base")

    def SpeechToTextEng(aud_path):
        """Return the Whisper transcription text of one audio file."""
        return whisper_model.transcribe(aud_path)["text"]

    def run_range(duration):
        """Return how many `time_range`-second chunks cover `duration` seconds."""
        return math.ceil(duration / time_range)

    def audio_generator(path, aud=0, vid=0):
        """Split the media file into mp3 chunks and return the written paths.

        Pass ``vid=1`` for a video file or ``aud=1`` for an audio file.
        Chunks that cannot be exported are reported and skipped, so the
        returned list only names files that were actually written.
        """
        if vid == 1:
            clip = VideoFileClip(path)
        elif aud == 1:
            clip = AudioFileClip(path)
        else:
            return []

        written = []
        try:
            duration = clip.duration
            for i in range(run_range(duration)):
                left = i * time_range
                # Clamp the final chunk so it never extends past the clip.
                right = min(left + time_range, duration)
                chunk_path = "vid_to_aud" + str(i) + ".mp3"
                try:
                    piece = clip.subclip(left, right)
                    track = piece.audio if vid == 1 else piece
                    track.write_audiofile(chunk_path)
                except Exception as exc:
                    # Best effort: keep going, but say which chunk was lost.
                    print("Skipping audio chunk " + str(i) + ": " + str(exc))
                    continue
                written.append(chunk_path)
        finally:
            clip.close()
        return written

    def extract_keywords(text, top_n=50):
        """Return the `top_n` KeyBERT keywords of `text` as plain strings."""
        key_model = KeyBERT('distilbert-base-nli-mean-tokens')
        keywords = key_model.extract_keywords(text, top_n=top_n)
        return [keyword[0] for keyword in keywords]

    def summarize_text(text, chunk_size=500):
        """Summarise `text` in slices of `chunk_size` words and join the results."""
        words = text.split()
        summaries = []
        # Stepping by chunk_size never yields an empty slice, including when
        # the word count is zero or an exact multiple of chunk_size.
        for start in tqdm(range(0, len(words), chunk_size)):
            chunk = " ".join(words[start:start + chunk_size])
            chunk_summary = summarizer(chunk, min_length=75, max_length=200)[0]['summary_text']
            summaries.append(chunk_summary.strip())
        return " ".join(summaries)

    # Download the lowest-resolution stream: only the audio track is used.
    yt = YouTube(path)
    stream = yt.streams.get_lowest_resolution()
    stream.download(filename='meeting.mp4')

    chunk_paths = audio_generator("./meeting.mp4", vid=1)

    transcribed_lit = []
    for chunk_path in tqdm(chunk_paths):
        try:
            transcribed_lit.append(SpeechToTextEng(chunk_path))
        finally:
            # The chunk file is temporary; remove it even if transcription fails.
            os.remove(chunk_path)

    summarizer = pipeline("summarization")
    sentiment_analyzer = pipeline("sentiment-analysis")

    sumarized_lit = []
    sentiment_lit = []
    for transcript in tqdm(transcribed_lit):
        sumarized_lit.append(
            summarizer(transcript, min_length=75, max_length=300)[0]['summary_text']
        )
        sentiment_lit.append(sentiment_analyzer(transcript)[0]['label'])

    data = pd.DataFrame(
        {
            'transcriptions': transcribed_lit,
            'summary': sumarized_lit,
            'sentiment': sentiment_lit,
        }
    )
    data.to_csv('output2.csv', index=False)

    tot_text = "".join(transcribed_lit)

    # Nothing was transcribed: skip keyword extraction on an empty document.
    tot_keywords = extract_keywords(tot_text) if tot_text.strip() else []
    tot_summary = summarize_text(tot_text)
    return tot_text, tot_summary, tot_keywords




# //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
def generate_word_cloud(text):
    """Render a word cloud of `text` in the Streamlit page.

    Args:
        text: The text whose words are drawn. When it is empty or only
            whitespace, a short notice is shown instead of a figure.
    """
    if not text or not text.strip():
        st.write("No text available to build a word cloud.")
        return

    # Create a WordCloud object
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

    # Plot the word cloud on its own figure, without axis ticks
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    st.pyplot(fig)

    # Figures made with plt.subplots stay registered with pyplot until they
    # are closed; close this one so repeated runs do not accumulate figures.
    plt.close(fig)


def main():
    """Render the Streamlit page and run the pipeline for a submitted URL.

    Shows a text input for a YouTube link and a "Process Video" button. When
    the button is pressed with a non-empty link, the video is processed with
    ``process_video`` and the per-chunk results stored in ``output2.csv`` are
    displayed together with the full transcript, summary and keywords.
    """
    st.title("Meeting Summary Web App")

    # YouTube link input
    youtube_url = st.text_input("Enter the YouTube video link")

    if not st.button("Process Video"):
        return

    if not youtube_url:
        st.warning("Please enter a YouTube video link.")
        return

    # Process the YouTube video
    tot_text, tot_summary, tot_keywords = process_video(youtube_url)

    # Display the output
    if not os.path.exists("output2.csv"):
        st.write("No output file found.")
        return

    output_df = pd.read_csv("output2.csv")
    st.subheader("Transcriptions:")
    st.write(output_df["transcriptions"])

    # The per-chunk labels are stored in the CSV's "sentiment" column.
    st.subheader("Labels:")
    st.write(output_df["sentiment"])

    st.subheader("Word Cloud:")
    generate_word_cloud(output_df["transcriptions"].str.cat(sep=' '))

    st.subheader("tot_text:")
    st.write(tot_text)

    st.subheader("tot_summary:")
    st.write(tot_summary)

    st.subheader("tot_keywords:")
    st.write(tot_keywords)

# Start the app only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()