Spaces:

KevlarVK
/

content_summarizer

Runtime error

App Files Files Community

KevlarVK committed on Mar 26, 2023

Commit

3bb118d

•

1 Parent(s): 7a1d9d9

Media support, Code cleanup for summarization, Support for chunk and auto chapters summarize

Browse files

Files changed (5) hide show

Utils.py +87 -21
app.py +111 -9
process_media.py +72 -0
summarize.py +0 -69
summarizer.py +88 -0

Utils.py CHANGED Viewed

@@ -1,7 +1,13 @@
 import requests
 from bs4 import BeautifulSoup
-import string
 def fetch_article_text(url: str):
     r = requests.get(url)
@@ -9,27 +15,87 @@ def fetch_article_text(url: str):
     results = soup.find_all(["h1", "p"])
     text = [result.text for result in results]
     ARTICLE = " ".join(text)
-    ARTICLE = ARTICLE.replace(".", ".<eos>")
-    ARTICLE = ARTICLE.replace("!", "!<eos>")
-    ARTICLE = ARTICLE.replace("?", "?<eos>")
-    sentences = ARTICLE.split("<eos>")
-    current_chunk = 0
-    chunks = []
-    for sentence in sentences:
-        if len(chunks) == current_chunk + 1:
-            if len(chunks[current_chunk]) + len(sentence.split(" ")) <= 500:
-                chunks[current_chunk].extend(sentence.split(" "))
-            else:
-                current_chunk += 1
-                chunks.append(sentence.split(" "))
-        else:
-            print(current_chunk)
-            chunks.append(sentence.split(" "))
-    for chunk_id in range(len(chunks)):
-        chunks[chunk_id] = " ".join(chunks[chunk_id])
-    return ARTICLE, chunks
 def count_tokens(text: str):
     """Return how many pieces *text* yields when split on single spaces."""
     # Every space adds one more piece, so pieces = spaces + 1 (also for "").
     return text.count(" ") + 1

 import requests
 from bs4 import BeautifulSoup
+from nltk.tokenize import sent_tokenize
+import nltk
+import re
+import streamlit as st
+from youtube_transcript_api import YouTubeTranscriptApi
+import spacy
+@st.cache_data
 def fetch_article_text(url: str):
     r = requests.get(url)
     results = soup.find_all(["h1", "p"])
     text = [result.text for result in results]
     ARTICLE = " ".join(text)
+    return ARTICLE
 def count_tokens(text: str):
     """Return how many pieces *text* yields when split on single spaces."""
     # Every space adds one more piece, so pieces = spaces + 1 (also for "").
     return text.count(" ") + 1
+@st.cache_data
+def get_text_from_youtube_url(url: str):
+    id = url.split("=")[1]
+    try:
+        transcript = YouTubeTranscriptApi.get_transcript(id)
+    except:
+        transcript = YouTubeTranscriptApi.find_transcript(["en"])
+    script = ""
+    for text in transcript:
+        t = text["text"]
+        if t != '[Music]':
+            script += t.lower() + " "
+    return add_punctuation(script)
+def add_punctuation(text: str):
+    # try:
+    nlp = spacy.load("en_core_web_sm")
+    # except:
+    #     import spacy.cli
+    #     spacy.cli.download("en_core_web_sm")
+    #     nlp = spacy.load("en_core_web_sm")
+    doc = nlp(text)
+    punctuation = [".", ",", ";", ":", "?", "!"]
+    sentences = []
+    for sentence in doc.sents:
+        last_token = sentence[-1]
+        if last_token.text in punctuation:
+            sentence = sentence[:-1]
+        last_word = sentence[-1]
+        if last_word.pos_ == "NOUN":
+            sentence = sentence.text + "."
+        elif last_word.pos_ == "VERB":
+            sentence = sentence.text + "?"
+        else:
+            sentence = sentence.text + "."
+        sentence = sentence[0].upper() + sentence[1:]
+        sentences.append(sentence)
+    text_with_punctuation = " ".join(sentences)
+    return text_with_punctuation
+def get_input_chunks(text: str, max_length: int = 500):
+    try:
+        sentences = sent_tokenize(text)
+    except:
+        nltk.download('punkt')
+        sentences = sent_tokenize(text)
+    sentences = [re.sub(r'\[[0-9]*\]', ' ', sentence) for sentence in sentences if len(sentence.strip()) > 0 and count_tokens(sentence) > 4]
+    input_chunks = []
+    temp_sentences = ""
+    tokens = 0
+    for sentence in sentences:
+        if tokens + count_tokens(sentence) < max_length:
+            temp_sentences += sentence
+            tokens += count_tokens(sentence)
+        else:
+            input_chunks.append(temp_sentences)
+            tokens = count_tokens(sentence)
+            temp_sentences = sentence
+    if len(temp_sentences) > 0:
+        input_chunks.append(temp_sentences)
+    return input_chunks

app.py CHANGED Viewed

@@ -1,13 +1,115 @@
 import streamlit as st
-from summarize import bart_summarize
-# Create a text field
-text = st.text_input("Enter text here")
-# Create a button
-button = st.button("Click here")
-# get text from text field and print it
-if button:
-    summary = bart_summarize(text)
-    st.write(summary)

+import html
+import io
+import time
+import wave
 import streamlit as st
+from pydub import AudioSegment
+from process_media import MediaProcessor
+from summarizer import BARTSummarizer
+from  Utils import fetch_article_text, get_text_from_youtube_url
+# Inject page-wide CSS: padding for radio labels in the sidebar and the main
+# section, and a margin for sidebar h2 headings.
+st.markdown(
+"""
+<style>
+section[data-testid="stSidebar"]  div[role="radiogroup"] label {
+    padding: 0px 0px 20px 20px;
+}
+section[data-testid="stSidebar"] h2 {
+    margin: 10px;
+}
+section.main div[role="radiogroup"] label {
+    padding: 10px 10px 10px 0px;
+}
+</style>
+""",
+unsafe_allow_html=True,
+)
+# Sidebar selector: the chosen input type ("Text" or "Media") decides which
+# input widgets the rest of the script shows.
+with st.sidebar:
+    st.header("CHOOSE INPUT TYPE")
+    input_type = st.radio("", ["Text", "Media"], label_visibility = "hidden")
+text_to_summarize = None
+if input_type == "Text":
+    st.header("Summarize from text or URL")
+    text_type = st.radio("", ["Raw Text", "URL", "Document"], key="text_type", horizontal=True, label_visibility = "hidden")
+    if text_type == "Raw Text":
+        text = st.text_area("Enter raw text here", height=240, max_chars=10000, placeholder="Enter a paragraph to summarize")
+        if text:
+            text_to_summarize = text
+    elif text_type == "URL":
+        url = st.text_input("Enter URL here", placeholder="Enter URL to an article, blog post, etc.")
+        if url:
+            article_text = fetch_article_text(url)
+            if article_text:
+                st.markdown("#### Text from url:")
+                st.write(article_text)
+                text_to_summarize = article_text
+    else:
+        ## TODO: Add file upload option
+        pass
+elif input_type == "Media":
+    st.header("Summarize from file or YouTube URL")
+    media_type = st.radio("", ["Audio file", "Video file", "Youtube video link"], key="media_type", horizontal=True, label_visibility = "hidden")
+    if media_type == "Audio file":
+        audio_file = st.file_uploader("Upload an audio file", type=["mp3", "wav"], label_visibility="visible")
+        if audio_file is not None:
+            with st.spinner("Fetching text from audio..."):
+                # print(audio_file.read())
+                wav_bytes = None
+                media_processor = MediaProcessor()
+                if audio_file.type == "audio/mpeg":
+                    wav_bytes = media_processor.get_wav_from_audio(audio_file.read())
+                else:
+                    wav_bytes = audio_file.read()
+                text = media_processor.process_audio(wav_bytes)
+                st.markdown("#### Text from audio:")
+                st.write(text)
+    elif media_type == "Video file":
+        video_file = st.file_uploader("Upload a video file", type=["mp4"], label_visibility="visible")
+        if video_file is not None:
+            with st.spinner("Fetching text from video..."):
+                media_processor = MediaProcessor()
+                text = media_processor.process_video(video_file.read())
+                st.markdown("#### Text from video:")
+                st.write(text)
+    else:
+        youtube_url = st.text_input("Enter YouTube URL here", placeholder="Enter URL to an YouTube video", label_visibility="visible")
+        if youtube_url:
+            with st.spinner("Fetching text from video..."):
+                try:
+                    text_to_summarize = get_text_from_youtube_url(youtube_url)
+                    st.markdown("#### Text from video:")
+                    st.markdown('<div style="height: 300px; overflow: auto; margin-bottom: 20px;">' + text_to_summarize + '</div>', unsafe_allow_html=True)
+                except:
+                    st.error("Unable to fetch text from video. Please try a different video.")
+                    text_to_summarize = None
+if text_to_summarize is not None:
+    overall_summary = st.button("Overall summary")
+    auto_chapters_summary = st.button("Auto Chapters summary")
+    if overall_summary:
+        with st.spinner("Summarizing..."):
+            # time.sleep(2)
+            # st.write(text_to_summarize)
+            summarizer = BARTSummarizer()
+            summary = summarizer.chunk_summarize(text_to_summarize)
+            st.markdown("#### Summary:")
+            st.write(summary)
+    elif auto_chapters_summary:
+        with st.spinner("Summarizing..."):
+            # time.sleep(2)
+            # st.write(text_to_summarize)
+            summarizer = BARTSummarizer()
+            summary = summarizer.auto_chapters_summarize(text_to_summarize)
+            st.markdown("#### Summary:")
+            st.write(summary)

process_media.py ADDED Viewed

	@@ -0,0 +1,72 @@

+import io
+import wave
+import tensorflow as tf
+import tensorflow_io as tfio
+import moviepy.editor as mp
+import numpy as np
+from pydub import AudioSegment
+from transformers import AutoProcessor, TFWhisperForConditionalGeneration
+from moviepy.video.io.VideoFileClip import VideoFileClip
+# tf.config.run_functions_eagerly(True)
+class MediaProcessor:
+    def __init__(self):
+        self.processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
+        self.model = TFWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
+    def load_wav_16k_mono(self, file_bytes):
+        """ Load a WAV file, convert it to a float tensor, resample to 16 kHz single-channel audio. """
+        wav, sample_rate = tf.audio.decode_wav(
+            file_bytes,
+            desired_channels=1)
+        wav = tf.squeeze(wav, axis=-1)
+        sample_rate = tf.cast(sample_rate, dtype=tf.int64)
+        wav = tfio.audio.resample(wav, rate_in=sample_rate, rate_out=16000)
+        return wav.numpy()
+    def get_text_from_audio(self, resampled_audio_data):
+        # Split the resampled audio data into 30-second chunks
+        chunk_size = 30 * 16000
+        audio_chunks = [resampled_audio_data[i:i+chunk_size] for i in range(0, len(resampled_audio_data), chunk_size)]
+        text = []
+        for chunk in audio_chunks:
+            inputs = self.processor(chunk, sampling_rate=16000, return_tensors="tf").input_features
+            predicted_ids = self.model.generate(inputs, max_new_tokens=500)
+            transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)
+            text.append(transcription[0])
+        return " ".join(text)
+    def get_audio_from_video(self, video_buffer):
+        buffer = io.BytesIO(video_buffer)
+        video_file = AudioSegment.from_file(buffer)
+        audio = video_file.set_channels(1)
+        with io.BytesIO() as wav_buffer:
+            audio.export(wav_buffer, format="wav")
+            wav_bytes = wav_buffer.getvalue()
+        return wav_bytes
+    def get_wav_from_audio(self, audio_buffer):
+        buffer = io.BytesIO(audio_buffer)
+        audio_file = AudioSegment.from_mp3(buffer)
+        raw_data = audio_file.raw_data
+        with io.BytesIO() as wav_buffer:
+            with wave.open(wav_buffer, "wb") as wav_file:
+                wav_file.setnchannels(audio_file.channels)
+                wav_file.setsampwidth(audio_file.sample_width)
+                wav_file.setframerate(audio_file.frame_rate)
+                wav_file.writeframes(raw_data)
+            wav_bytes = wav_buffer.getvalue()
+        return wav_bytes
+    def process_audio(self, audio_bytes):
+        resampled_audio_data = self.load_wav_16k_mono(audio_bytes)
+        return self.get_text_from_audio(resampled_audio_data)
+    def process_video(self, buffer):
+        audio_bytes = self.get_audio_from_video(buffer)
+        return self.process_audio(audio_bytes)

summarize.py DELETED Viewed

@@ -1,69 +0,0 @@
-from datetime import datetime
-import multiprocessing
-from transformers import BartTokenizer, TFBartForConditionalGeneration, pipeline
-from Utils import fetch_article_text, count_tokens
-import re
-from nltk.tokenize import sent_tokenize
-import nltk
-import threading
-tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
-model = TFBartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
-max_length = model.config.max_position_embeddings
-summaries = []
-def generate_summary(text: str):
-    encoded_input = tokenizer.encode(text, max_length=max_length, return_tensors='tf')
-    # generate summary for the input chunk
-    summary_ids = model.generate(encoded_input, max_length=300, num_beams=4, early_stopping=True)
-    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-    # add the summary to the list of summaries
-    summaries.append(summary)
-def bart_summarize(text: str):
-    try:
-        sentences = sent_tokenize(text)
-    except:
-        nltk.download('punkt')
-        sentences = sent_tokenize(text)
-    sentences = [sentence for sentence in sentences if len(sentence.strip()) > 0 and len(sentence.split(" ")) > 4]
-    input_chunks = []
-    temp_sentences = ""
-    tokens = 0
-    for sentence in sentences:
-        if tokens + count_tokens(sentence) < max_length:
-            temp_sentences += sentence
-            tokens += count_tokens(sentence)
-        else:
-            input_chunks.append(temp_sentences)
-            tokens = count_tokens(sentence)
-            temp_sentences = sentence
-    if len(temp_sentences) > 0:
-        input_chunks.append(temp_sentences)
-    # summarize each input chunk separately
-    results = []
-    print(datetime.now().strftime("%H:%M:%S"))
-    for chunk in input_chunks:
-        result_t = multiprocessing.Process(target=generate_summary, args=(chunk,))
-        results.append(result_t)
-    for result in results:
-        result.start()
-    for result in results:
-        result.join()
-    # # combine the summaries to get the final summary for the entire input
-    final_summary = " ".join(summaries)
-    print(datetime.now().strftime("%H:%M:%S"))
-    return final_summary

summarizer.py ADDED Viewed

	@@ -0,0 +1,88 @@

+from datetime import datetime
+from transformers import BartTokenizer, TFBartForConditionalGeneration
+from Utils import get_input_chunks
+import networkx as nx
+from nltk.tokenize import sent_tokenize
+from sklearn.feature_extraction.text import TfidfVectorizer
+import community
+class BARTSummarizer:
+    def __init__(self, model_name: str = 'facebook/bart-large-cnn'):
+        self.model_name = model_name
+        self.tokenizer = BartTokenizer.from_pretrained(model_name)
+        self.model = TFBartForConditionalGeneration.from_pretrained(model_name)
+        self.max_length = self.model.config.max_position_embeddings
+    def summarize(self, text: str):
+        encoded_input = self.tokenizer.encode(text, max_length=self.max_length, return_tensors='tf', truncation=True)
+        summary_ids = self.model.generate(encoded_input, max_length=300, num_beams=4, early_stopping=True)
+        summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+        return summary
+    def chunk_summarize(self, text: str):
+        # split the input into chunks
+        summaries = []
+        input_chunks = get_input_chunks(text, self.max_length)
+        # summarize each input chunk separately
+        print(datetime.now().strftime("%H:%M:%S"))
+        for chunk in input_chunks:
+            summaries.append(self.summarize(chunk))
+        # # combine the summaries to get the final summary for the entire input
+        final_summary = " ".join(summaries)
+        print(datetime.now().strftime("%H:%M:%S"))
+        return final_summary
+    def preprocess_for_auto_chapters(self, text: str):
+        # Tokenize the text into sentences
+        sentences = sent_tokenize(text)
+        # Filter out empty sentences and sentences with less than 5 words
+        sentences = [sentence for sentence in sentences if len(sentence.strip()) > 0 and len(sentence.split(" ")) > 4]
+        # Combine every 5 sentences into a single sentence
+        sentences = [' '.join(sentences[i:i + 5]) for i in range(0, len(sentences), 5)]
+        return sentences
+    def auto_chapters_summarize(self, text: str):
+        sentences = self.preprocess_for_auto_chapters(text)
+        vectorizer = TfidfVectorizer(stop_words='english')
+        X = vectorizer.fit_transform(sentences)
+        # Compute the similarity matrix using cosine similarity
+        similarity_matrix = X * X.T
+        # Convert the similarity matrix to a graph
+        graph = nx.from_scipy_sparse_array(similarity_matrix)
+        # Apply the Louvain algorithm to identify communities
+        partition = community.best_partition(graph, resolution=0.7, random_state=42)
+        # Cluster the sentences
+        clustered_sentences = []
+        for cluster in set(partition.values()):
+            sentences_to_print = []
+            for i, sentence in enumerate(sentences):
+                if partition[i] == cluster:
+                    sentences_to_print.append(sentence)
+            if len(sentences_to_print) > 1:
+                clustered_sentences.append(" ".join(sentences_to_print))
+        # Summarize each cluster
+        summaries = []
+        for cluster in clustered_sentences:
+            summaries.append(self.chunk_summarize(cluster))
+        # Combine the summaries to get the final summary for the entire input
+        final_summary = "\n\n".join(summaries)