kwmr committed
Commit 8ad5dc2
1 Parent(s): e0afab2
Files changed (6)
  1. __pycache__/utils.cpython-39.pyc +0 -0
  2. app.py +17 -183
  3. images/icon.png +0 -0
  4. images/logo.png +0 -0
  5. model.py +151 -0
  6. utils.py +45 -0
__pycache__/utils.cpython-39.pyc ADDED
Binary file (5.56 kB).
 
app.py CHANGED
@@ -1,181 +1,9 @@
- import copy
- import subprocess
-
- from pytube import YouTube
- from scipy.signal import resample
  import gradio as gr
- import numpy as np
- import pytsmod as tsm
-
- from moviepy.audio.AudioClip import AudioArrayClip
- from moviepy.editor import *
- from moviepy.video.fx.speedx import speedx
-
- from sentence_transformers import SentenceTransformer, util
- from transformers import pipeline, BertTokenizer, BertForNextSentencePrediction
- import torch
- import whisper
 
- subprocess.run(['apt-get', '-y', 'install', 'imagemagick'])
-
- transcriber = whisper.load_model("medium")
- sentence_transformer = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
- tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
- next_sentence_predict = BertForNextSentencePrediction.from_pretrained("bert-base-cased").eval()
- summarizer = pipeline("summarization", model="philschmid/bart-large-cnn-samsum")
+ from model import summarize_video
 
  root_dir = '/home/user/app/video'
 
-
- def get_youtube(video_url):
-     # Download the video from YouTube
-     print("Start download video")
-     yt = YouTube(video_url)
-     abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download(filename='download.mp4', output_path='movies/')
-     print("Success download video")
-     print(abs_video_path)
-     return abs_video_path
-
- def two_chnnel_to_one_channel(sample):
-     # Convert 2-channel (stereo) audio to 1-channel (mono)
-     left_channel = sample[:, 0]
-     right_channel = sample[:, 1]
-     mono_sample = (left_channel + right_channel) / 2
-     return mono_sample
-
- def convert_sample_rate(data, original_sr, target_sr):
-     # Resample the audio data to a different sampling rate
-     target_length = int(len(data) * target_sr / original_sr)
-     return resample(data, target_length)
-
- def summarize_video(video_path, ratio_sum, playback_speed):
-     print("Start summarize video")
-     output_path = os.path.join(os.path.dirname(video_path), 'output.mp4')
-     movie_clip = VideoFileClip(video_path)
-     audio_sampling_rate = movie_clip.audio.fps
-     clip_audio = np.array(movie_clip.audio.to_soundarray())
-
-     # Transcribe the audio
-     print("Start transcribing text")
-     audio_fp32 = convert_sample_rate(clip_audio, audio_sampling_rate, 16000)
-     audio_fp32 = two_chnnel_to_one_channel(audio_fp32).astype(np.float32)
-     transcription_results = transcriber.transcribe(audio_fp32)
-
-     # Group text and speech times by sentence boundary
-     print("Start summarizing text/speech time")
-     periods = ('.', '!', '?')
-     clip_sentences = []
-     head_sentence = True
-     for r in transcription_results['segments']:
-         if head_sentence:
-             start_time = r['start']
-             clip_sentences.append({'sentence':'', 'sentences':[], 'duration':[r['start'], None], 'durations':[]})
-             head_sentence = False
-         clip_sentences[-1]['sentence'] += r['text']
-         clip_sentences[-1]['sentences'].append(r['text'])
-         clip_sentences[-1]['durations'].append([r['start'], r['end']])
-         if r['text'].endswith(periods):
-             clip_sentences[-1]['duration'][1] = r['end']
-             head_sentence = True
-
-     # Summarize the transcription
-     print("Start summarizing sentences")
-     transcription = transcription_results['text']
-     summary_text = summarizer(transcription, max_length=int(len(transcription)*0.1), min_length=int(len(transcription)*0.05), do_sample=False)[0]['summary_text']
-     print(summary_text)
-
-     # Identify the sentences that match the summary
-     print("Start deleting sentences that match the summary sentence")
-     summary_embedings = [sentence_transformer.encode(s, convert_to_tensor=True) for s in summary_text.split('.')]
-     important_sentence_idxs = [False]*len(clip_sentences)
-     for s, clip_sentence in enumerate(clip_sentences):
-         embedding = sentence_transformer.encode(clip_sentence['sentence'], convert_to_tensor=True)
-         for s_e in summary_embedings:
-             if util.pytorch_cos_sim(embedding, s_e) > ratio_sum:
-                 important_sentence_idxs[s] = True
-
-     # Identify sentences that connect to the following sentence
-     print("Start identifying sentences that are connected to the sentence next to it")
-     def next_prob(prompt, next_sentence, b=1.2):
-         encoding = tokenizer(prompt, next_sentence, return_tensors="pt")
-         logits = next_sentence_predict(**encoding, labels=torch.LongTensor([1])).logits
-         pos = b ** logits[0, 0]
-         neg = b ** logits[0, 1]
-         return float(pos / (pos + neg))
-
-     connection_idxs = [False]*(len(clip_sentences)-1)
-     for s in range(len(clip_sentences)-1):
-         if next_prob(clip_sentences[s]['sentence'], clip_sentences[s+1]['sentence']) > 0.88:
-             connection_idxs[s] = True
-
-     # Keep only the sentences selected for the summary
-     def combine_arrays(A, B):
-         C = copy.deepcopy(A)
-         for i in range(len(A)):
-             if A[i]:
-                 j = i
-                 while j < len(B) and B[j]:
-                     C[j+1] = True
-                     j += 1
-                 j = i
-                 while j > 0 and B[j-1]:
-                     C[j] = True
-                     j -= 1
-         return C
-
-     important_idxs = combine_arrays(important_sentence_idxs, connection_idxs)
-
-     # Visualize which sentences survived the summarization
-     html_text = "<h1 class='title'>Full Transcription</h1>"
-     for idx in range(len(important_sentence_idxs)):
-         seconds = clip_sentences[idx]['duration'][0] * (1/playback_speed)
-         minutes, seconds = divmod(seconds, 60)
-         if important_idxs[idx]:
-             html_text += '<p> <b>' + f"{int(minutes)}:{int(seconds):02} | {clip_sentences[idx]['sentence']} </b> </p>"
-         else:
-             html_text += f"{int(minutes)}:{int(seconds):02} | {clip_sentences[idx]['sentence']}</p>"
-     print(html_text)
-
-     # Cut out and join the selected video segments
-     print("Start combine movies")
-     clips = []
-     for i in range(len(important_idxs)):
-         if important_idxs[i]:
-             tmp_clips = []
-             for j in range(len(clip_sentences[i]['sentences'])):
-                 start_time, end_time = clip_sentences[i]['durations'][j][0], clip_sentences[i]['durations'][j][1]
-                 if end_time > movie_clip.duration:
-                     end_time = movie_clip.duration
-                 if start_time > movie_clip.duration:
-                     continue
-                 clip = movie_clip.subclip(start_time, end_time)
-                 clip = clip.set_pos("center").set_duration(end_time-start_time)
-                 tmp_clips.append(clip)
-             clips.append(concatenate_videoclips(tmp_clips))
-
-     # Join the clips with a cross-dissolve
-     # for c in range(len(clips)-1):
-     #     fade_duration = 2
-     #     clips[c] = clips[c].crossfadeout(fade_duration).audio_fadeout(fade_duration)
-     #     clips[c+1] = clips[c+1].crossfadein(fade_duration).audio_fadein(fade_duration)
-
-     # Concatenate the clips and change the playback speed
-     final_video = concatenate_videoclips(clips, method="chain")
-     final_video_audio = np.array(final_video.audio.to_soundarray(fps=audio_sampling_rate))
-     if playback_speed != 1:
-         final_video_audio_fixed = tsm.wsola(final_video_audio, 1/playback_speed).T
-     else:
-         final_video_audio_fixed = final_video_audio
-     final_video = speedx(final_video, factor=playback_speed)
-     final_video = final_video.set_audio(AudioArrayClip(final_video_audio_fixed, fps=audio_sampling_rate))
-     # if final_video.duration > 30:
-     #     final_video = final_video.subclip(0, 30)
-     final_video.write_videofile(output_path)
-     print(output_path)
-     print("Success summarize video")
-     return output_path, summary_text, html_text
-
-
  # ---- Gradio Layout -----
  youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
  video_in = gr.Video(label="Input Video", mirror_webcam=False, interactive=True)
@@ -186,19 +14,25 @@ demo = gr.Blocks()
  demo.encrypt = False
 
  with demo:
-     gr.Markdown('''
-         <div style="text-align: center">
-             <h1 style='text-align: center'>FastPerson: Video summarization applied with transcription and text summarization</h1>
-             <img src="https://user-images.githubusercontent.com/33136532/215362410-97727904-e1ca-408d-967e-f5798671405e.png" alt="Video Summarization">
+     with gr.Column():
+         gr.Markdown('''
+             <div style="text-align: center">
+                 <h1 style='text-align: center'>Video Summarization</h1>
+             </div>
+         ''')
+     with gr.Column():
+         gr.Markdown('''
+             <div class="center">
+                 <img src="https://user-images.githubusercontent.com/33136532/229133078-22cb84d6-b120-4a72-b1cf-b4b3ea47ed7d.png" width="500" height="300">
          </div>
      ''')
      with gr.Row():
          gr.Markdown('''
              ### Summarize video
-             ##### Step 1a. Download video from youtube
-             ##### Step 1b. You also can upload video directly
-             ##### Step 2. Enter summary rate and playback speed
-             ##### Step 3. Generating summarized video.
+             #### Step 1: Download a video from YouTube (select one of the examples and press the Download button)
+             #### Step 2: Select the summary rate and playback speed
+             #### Step 3: Generate the summarized video (press the Summarize button)
+             The summarized video appears to the right of the original video, together with the summary text and the full transcription.
          ''')
      with gr.Row():
          gr.Markdown('''
@@ -228,5 +62,5 @@ with demo:
      with gr.Row():
          transcription_text.render()
 
- demo.launch(debug=True)
- # demo.launch(debug=True, share=True)
+ # demo.launch(debug=True)
+ demo.launch(debug=True, share=True)
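This commit splits the former monolithic app.py into model.py (the summarization pipeline) and utils.py (download and audio helpers), leaving app.py as the Gradio front end. The hunks that wire the buttons to those functions are not shown above; the sketch below is a hypothetical reconstruction of that wiring, in the commit's own component style. Every name not visible in the diff (video_out, summary_text, ratio_sum, playback_speed, the buttons, the slider ranges, and the from utils import get_youtube line) is an assumption for illustration only.

import gradio as gr

from model import summarize_video
from utils import get_youtube  # assumed import; the corresponding app.py hunk is not shown

# Components defined up front and .render()-ed inside the layout, matching
# the transcription_text.render() pattern visible in the diff. Names are illustrative.
youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
video_in = gr.Video(label="Input Video", mirror_webcam=False, interactive=True)
video_out = gr.Video(label="Output Video")
summary_text = gr.Textbox(label="Summary")
transcription_text = gr.HTML()

demo = gr.Blocks()
with demo:
    with gr.Row():
        youtube_url_in.render()
        download_btn = gr.Button("Download")
    with gr.Row():
        video_in.render()
        video_out.render()
    with gr.Row():
        ratio_sum = gr.Slider(0, 1, value=0.5, label="Summary rate")             # assumed range
        playback_speed = gr.Slider(0.5, 2.0, value=1.0, label="Playback speed")  # assumed range
        summarize_btn = gr.Button("Summarize")
    with gr.Row():
        summary_text.render()
    with gr.Row():
        transcription_text.render()

    # Download fetches the YouTube video into the input component; Summarize runs
    # the pipeline and fills the output video, summary text, and transcription HTML.
    download_btn.click(get_youtube, inputs=[youtube_url_in], outputs=[video_in])
    summarize_btn.click(summarize_video,
                        inputs=[video_in, ratio_sum, playback_speed],
                        outputs=[video_out, summary_text, transcription_text])

demo.launch(debug=True, share=True)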
images/icon.png DELETED
Binary file (73.3 kB)
 
images/logo.png ADDED
model.py ADDED
@@ -0,0 +1,151 @@
+ import copy
+ import subprocess
+
+ import numpy as np
+ import pytsmod as tsm
+
+ from moviepy.audio.AudioClip import AudioArrayClip
+ from moviepy.editor import *  # star import; also brings os, VideoFileClip and concatenate_videoclips into scope
+ from moviepy.video.fx.speedx import speedx
+
+ from sentence_transformers import SentenceTransformer, util
+ from transformers import pipeline, BertTokenizer, BertForNextSentencePrediction
+ import torch
+ import whisper
+
+ from utils import two_chnnel_to_one_channel, convert_sample_rate
+
+ subprocess.run(['apt-get', '-y', 'install', 'imagemagick'])
+
+ transcriber = whisper.load_model("medium")
+ sentence_transformer = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+ tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+ next_sentence_predict = BertForNextSentencePrediction.from_pretrained("bert-base-cased").eval()
+ summarizer = pipeline("summarization", model="philschmid/bart-large-cnn-samsum")
+
+ def summarize_video(video_path, ratio_sum, playback_speed):
+     print("Start summarize video")
+     output_path = os.path.join(os.path.dirname(video_path), 'output.mp4')
+     movie_clip = VideoFileClip(video_path)
+     audio_sampling_rate = movie_clip.audio.fps
+     clip_audio = np.array(movie_clip.audio.to_soundarray())
+
+     # Transcribe the audio
+     print("Start transcribing text")
+     audio_fp32 = convert_sample_rate(clip_audio, audio_sampling_rate, 16000)
+     audio_fp32 = two_chnnel_to_one_channel(audio_fp32).astype(np.float32)
+     transcription_results = transcriber.transcribe(audio_fp32)
+
+     # Group text and speech times by sentence boundary
+     print("Start summarizing text/speech time")
+     periods = ('.', '!', '?')
+     clip_sentences = []
+     head_sentence = True
+     for r in transcription_results['segments']:
+         if head_sentence:
+             start_time = r['start']
+             clip_sentences.append({'sentence':'', 'sentences':[], 'duration':[r['start'], None], 'durations':[]})
+             head_sentence = False
+         clip_sentences[-1]['sentence'] += r['text']
+         clip_sentences[-1]['sentences'].append(r['text'])
+         clip_sentences[-1]['durations'].append([r['start'], r['end']])
+         if r['text'].endswith(periods):
+             clip_sentences[-1]['duration'][1] = r['end']
+             head_sentence = True
+
+     # Summarize the transcription
+     print("Start summarizing sentences")
+     transcription = transcription_results['text']
+     summary_text = summarizer(transcription, max_length=int(len(transcription)*0.1), min_length=int(len(transcription)*0.05), do_sample=False)[0]['summary_text']
+     print(summary_text)
+
+     # Identify the sentences that match the summary
+     print("Start deleting sentences that match the summary sentence")
+     summary_embedings = [sentence_transformer.encode(s, convert_to_tensor=True) for s in summary_text.split('.')]
+     important_sentence_idxs = [False]*len(clip_sentences)
+     for s, clip_sentence in enumerate(clip_sentences):
+         embedding = sentence_transformer.encode(clip_sentence['sentence'], convert_to_tensor=True)
+         for s_e in summary_embedings:
+             if util.pytorch_cos_sim(embedding, s_e) > ratio_sum:
+                 important_sentence_idxs[s] = True
+
+     # Identify sentences that connect to the following sentence
+     print("Start identifying sentences that are connected to the sentence next to it")
+     def next_prob(prompt, next_sentence, b=1.2):
+         encoding = tokenizer(prompt, next_sentence, return_tensors="pt")
+         logits = next_sentence_predict(**encoding, labels=torch.LongTensor([1])).logits
+         pos = b ** logits[0, 0]
+         neg = b ** logits[0, 1]
+         return float(pos / (pos + neg))
+
+     connection_idxs = [False]*(len(clip_sentences)-1)
+     for s in range(len(clip_sentences)-1):
+         if next_prob(clip_sentences[s]['sentence'], clip_sentences[s+1]['sentence']) > 0.88:
+             connection_idxs[s] = True
+
+     # Keep only the sentences selected for the summary
+     def combine_arrays(A, B):
+         C = copy.deepcopy(A)
+         for i in range(len(A)):
+             if A[i]:
+                 j = i
+                 while j < len(B) and B[j]:
+                     C[j+1] = True
+                     j += 1
+                 j = i
+                 while j > 0 and B[j-1]:
+                     C[j] = True
+                     j -= 1
+         return C
+
+     important_idxs = combine_arrays(important_sentence_idxs, connection_idxs)
+
+     # Visualize which sentences survived the summarization
+     html_text = "<h1 class='title'>Full Transcription</h1>"
+     for idx in range(len(important_sentence_idxs)):
+         seconds = clip_sentences[idx]['duration'][0] * (1/playback_speed)
+         minutes, seconds = divmod(seconds, 60)
+         if important_idxs[idx]:
+             html_text += '<p> <b>' + f"{int(minutes)}:{int(seconds):02} | {clip_sentences[idx]['sentence']} </b> </p>"
+         else:
+             html_text += f"{int(minutes)}:{int(seconds):02} | {clip_sentences[idx]['sentence']}</p>"
+     print(html_text)
+
+     # Cut out and join the selected video segments
+     print("Start combine movies")
+     clips = []
+     for i in range(len(important_idxs)):
+         if important_idxs[i]:
+             tmp_clips = []
+             for j in range(len(clip_sentences[i]['sentences'])):
+                 start_time, end_time = clip_sentences[i]['durations'][j][0], clip_sentences[i]['durations'][j][1]
+                 if end_time > movie_clip.duration:
+                     end_time = movie_clip.duration
+                 if start_time > movie_clip.duration:
+                     continue
+                 clip = movie_clip.subclip(start_time, end_time)
+                 clip = clip.set_pos("center").set_duration(end_time-start_time)
+                 tmp_clips.append(clip)
+             clips.append(concatenate_videoclips(tmp_clips))
+
+     # Join the clips with a cross-dissolve
+     # for c in range(len(clips)-1):
+     #     fade_duration = 2
+     #     clips[c] = clips[c].crossfadeout(fade_duration).audio_fadeout(fade_duration)
+     #     clips[c+1] = clips[c+1].crossfadein(fade_duration).audio_fadein(fade_duration)
+
+     # Concatenate the clips and change the playback speed
+     final_video = concatenate_videoclips(clips, method="chain")
+     final_video_audio = np.array(final_video.audio.to_soundarray(fps=audio_sampling_rate))
+     if playback_speed != 1:
+         final_video_audio_fixed = tsm.wsola(final_video_audio, 1/playback_speed).T
+     else:
+         final_video_audio_fixed = final_video_audio
+     final_video = speedx(final_video, factor=playback_speed)
+     final_video = final_video.set_audio(AudioArrayClip(final_video_audio_fixed, fps=audio_sampling_rate))
+     # if final_video.duration > 30:
+     #     final_video = final_video.subclip(0, 30)
+     final_video.write_videofile(output_path)
+     print(output_path)
+     print("Success summarize video")
+     return output_path, summary_text, html_text
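For reference, a minimal sketch of calling the new module directly, outside Gradio. Note how the two thresholds behave in the code above: ratio_sum is compared against sentence-embedding cosine similarity (util.pytorch_cos_sim(...) > ratio_sum), and next_prob is a base-1.2 softmax over BERT's next-sentence-prediction logits with a fixed 0.88 cut-off. Playback speed is applied to the video with speedx while the audio is time-stretched with WSOLA so the pitch is preserved. The file path and parameter values below are illustrative, not from the commit.

from model import summarize_video

output_path, summary, html = summarize_video(
    video_path='movies/download.mp4',  # e.g. a file fetched by utils.get_youtube
    ratio_sum=0.5,                     # cosine-similarity threshold for keeping a sentence
    playback_speed=1.5,                # >1 speeds up the cut; audio is WSOLA-stretched to match
)
print(output_path)  # movies/output.mp4, written next to the input video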
utils.py ADDED
@@ -0,0 +1,45 @@
+ from pytube import YouTube
+ from scipy.signal import resample
+
+
+ def get_youtube(video_url):
+     # Download the video from YouTube
+     print("Start download video")
+     yt = YouTube(video_url)
+     abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download(filename='download.mp4', output_path='movies/')
+     print("Success download video")
+     print(abs_video_path)
+     return abs_video_path
+
+
+ def two_chnnel_to_one_channel(sample):
+     # Convert 2-channel (stereo) audio to 1-channel (mono) by averaging the channels
+     left_channel = sample[:, 0]
+     right_channel = sample[:, 1]
+     mono_sample = (left_channel + right_channel) / 2
+     return mono_sample
+
+
+ def convert_sample_rate(data, original_sr, target_sr):
+     # Resample the audio data from original_sr to target_sr
+     target_length = int(len(data) * target_sr / original_sr)
+     return resample(data, target_length)
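Finally, a self-contained sketch of how model.py uses these helpers to prepare Whisper's input: the clip's stereo float audio is resampled to 16 kHz (scipy's resample works along axis 0, so the channel dimension is preserved) and then averaged down to mono float32. The synthetic array below stands in for movie_clip.audio.to_soundarray().

import numpy as np
from utils import convert_sample_rate, two_chnnel_to_one_channel

# Two seconds of synthetic stereo audio at 44.1 kHz, shaped (n_samples, 2)
# like the output of movie_clip.audio.to_soundarray().
stereo_44k = np.random.randn(44100 * 2, 2)

audio_16k = convert_sample_rate(stereo_44k, 44100, 16000)            # still 2-channel
mono_16k = two_chnnel_to_one_channel(audio_16k).astype(np.float32)   # averaged to mono

assert mono_16k.shape == (16000 * 2,)  # mono, 16 kHz, ready for transcriber.transcribe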