import re

import streamlit as st
import torch
# Transcript fetching
from youtube_transcript_api import YouTubeTranscriptApi
# Summarization model and tokenizer
from transformers import pipeline, AutoTokenizer


def fetch_transcript(video_url):
    """Fetch and join the transcript text for a YouTube video, or return None on failure."""
    try:
        # Extract the video ID from the URL
        video_id = extract_video_id(video_url)
        if video_id is None:
            return None
        # Fetch the transcript for the video
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        # Join the transcript entries into a single text block
        return "\n".join(entry['text'] for entry in transcript)
    except Exception:
        # Transcripts may be disabled or unavailable for this video;
        # returning None (not the error string) lets callers test truthiness safely
        return None

def clean_transcript(transcript):
    # Remove bracketed non-speech annotations (e.g., [Laughter], [Music])
    # and inline timestamps such as [0:01:23]
    transcript = re.sub(r'\[.*?\]', '', transcript)

    # (Optional) Spelling and grammar correction could be inserted here,
    # e.g., with a dedicated tool such as language_tool_python; plain
    # tokenization (NLTK/spaCy) alone does not correct anything.

    # Normalize punctuation and formatting
    transcript = transcript.replace('\n', ' ')  # Remove line breaks
    transcript = re.sub(r'\s+', ' ', transcript)  # Collapse extra whitespace

    # Handle speaker identification (if present)
    # Example: transcript = re.sub(r'Speaker\d+:', '', transcript)

    # Remove filler words and phrases, matching whole words only so that
    # e.g. "likely" is not mangled by removing "like"
    filler_words = ['like', 'you know', 'sort of']  # Add more as needed
    for phrase in filler_words:
        transcript = re.sub(r'\b' + re.escape(phrase) + r'\b', '', transcript)
    transcript = re.sub(r'\s+', ' ', transcript)  # Re-collapse whitespace

    # Replace common contractions with their expanded forms.
    # Note: these are crude textual substitutions; "'s" also matches
    # possessives ("John's" -> "John is"), and "'d" can also mean "had".
    transcript = transcript.replace("won't", "will not")
    transcript = transcript.replace("can't", "cannot")
    transcript = transcript.replace("n't", " not")
    transcript = transcript.replace("'ll", " will")
    transcript = transcript.replace("'ve", " have")
    transcript = transcript.replace("'re", " are")
    transcript = transcript.replace("'d", " would")
    transcript = transcript.replace("'s", " is")

    return transcript.strip()  # Trim leading/trailing whitespace

def extract_video_id(url):
    """Extracts the 11-character YouTube video ID from a watch URL or a youtu.be short link."""
    match = re.search(r"(?:v=|youtu\.be/)([\w-]{11})", url)
    if match:
        return match.group(1)
    return None
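
# Illustrative usage (the video ID below is arbitrary):
#   extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ")  # -> "dQw4w9WgXcQ"
#   extract_video_id("https://youtu.be/dQw4w9WgXcQ")                 # -> "dQw4w9WgXcQ"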


def summarize_transcript(text, llama_pipeline):
    def summarize_text(llama_pipeline, system_prompt, text):
        # Format the input text with Llama-2 chat special tokens
        text = f"""
        <s>[INST] <<SYS>>
        {system_prompt}
        <</SYS>>
        {text}[/INST]
        """
        # Generate sequences using the pipeline with its configured parameters
        sequences = llama_pipeline(text)
        # Extract the generated text from the sequences
        generated_text = sequences[0]["generated_text"]
        # Keep only the completion after the instruction marker
        generated_text = generated_text[generated_text.find('[/INST]') + len('[/INST]'):]
        return generated_text.strip()

    # Maximum number of characters summarized per iteration
    input_len = 1000
    # Iteratively fold the running summary into the remaining text until the
    # entire transcript has been consumed
    while True:
        # Summarize the first 'input_len' characters of the remaining text
        summary = summarize_text(llama_pipeline, "", "Summarize the following: " + text[:input_len])
        # Stop only once the whole remaining text fit into this chunk;
        # returning earlier would silently drop the unprocessed remainder
        if len(text) <= input_len:
            return summary
        # Carry the summary forward, prepended to the unread remainder
        text = summary + " " + text[input_len:]
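
# Note: the chunking above is character-based rather than token-based. As a
# rough rule of thumb, English text runs about 4 characters per token, so a
# 1000-character chunk stays well inside Llama-2's 4096-token context window.
# A more precise variant would split on token counts using the tokenizer.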

# Load the model and tokenizer (cached across Streamlit reruns)
@st.cache_resource()
def load_model():
    # Llama-2 7B chat model (gated on the Hugging Face Hub)
    model_name = "meta-llama/Llama-2-7b-chat-hf"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    pipeline_llama2 = pipeline(
        "text-generation",  # task
        model=model_name,
        tokenizer=tokenizer,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        max_new_tokens=512,  # bound the length of each generated summary (tunable)
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )
    return pipeline_llama2
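
# Note: loading the 7B model in bfloat16 needs roughly 14 GB of accelerator
# memory (7B parameters x 2 bytes each); with device_map="auto", layers that
# do not fit on the GPU are offloaded to CPU at the cost of slower inference.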

def main():
    st.title("YouTube Video Preview")

    with st.spinner('Loading Llama-2 checkpoint shards...'):
        pipeline_llama2 = load_model()
    st.success('Done!')

    # Input field for the YouTube video link
    youtube_url = st.text_input("Paste YouTube Video Link:")

    # Extract the video ID from the URL
    video_id = extract_video_id(youtube_url)

    # Display the video preview if a video ID was found
    if video_id:
        video_url = f"https://www.youtube.com/watch?v={video_id}"
        st.video(video_url)
        video_transcript = fetch_transcript(video_url)
        if video_transcript:
            video_transcript = clean_transcript(video_transcript)
            # Display the transcript and summary side by side
            col1, col2 = st.columns(2)
            with col1:
                st.subheader("Transcript:")
                st.text_area("Transcript", video_transcript, height=400,
                             label_visibility="collapsed")
            with col2:
                st.subheader("Summary:")
                video_summary = summarize_transcript(video_transcript, pipeline_llama2)
                st.text_area("Summary", video_summary, height=400,
                             label_visibility="collapsed")
        else:
            st.error("Failed to fetch the video transcript. The video may not "
                     "have captions, or transcripts may be disabled for it.")
    elif youtube_url:
        st.warning("Invalid YouTube Video Link")

if __name__ == "__main__":
    main()
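
# To run the app locally (assuming this file is saved as app.py):
#   streamlit run app.py
# Note: meta-llama/Llama-2-7b-chat-hf is a gated model on the Hugging Face Hub;
# you must request access and authenticate (e.g., via `huggingface-cli login`)
# before the pipeline can download the weights.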