kwmr committed
Commit 8ad5dc2
1 Parent(s): e0afab2
Files changed (6)
  1. __pycache__/utils.cpython-39.pyc +0 -0
  2. app.py +17 -183
  3. images/icon.png +0 -0
  4. images/logo.png +0 -0
  5. model.py +151 -0
  6. utils.py +45 -0
__pycache__/utils.cpython-39.pyc ADDED
Binary file (5.56 kB).
 
app.py CHANGED
@@ -1,181 +1,9 @@
- import copy
- import subprocess
-
- from pytube import YouTube
- from scipy.signal import resample
  import gradio as gr
- import numpy as np
- import pytsmod as tsm
-
- from moviepy.audio.AudioClip import AudioArrayClip
- from moviepy.editor import *
- from moviepy.video.fx.speedx import speedx
-
- from sentence_transformers import SentenceTransformer, util
- from transformers import pipeline, BertTokenizer, BertForNextSentencePrediction
- import torch
- import whisper
 
- subprocess.run(['apt-get', '-y', 'install', 'imagemagick'])
-
- transcriber = whisper.load_model("medium")
- sentence_transformer = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
- tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
- next_sentence_predict = BertForNextSentencePrediction.from_pretrained("bert-base-cased").eval()
- summarizer = pipeline("summarization", model="philschmid/bart-large-cnn-samsum")
+ from model import summarize_video
 
  root_dir = '/home/user/app/video'
 
-
- def get_youtube(video_url):
-     # Download the video from YouTube
-     print("Start download video")
-     yt = YouTube(video_url)
-     abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download(filename='download.mp4', output_path='movies/')
-     print("Success download video")
-     print(abs_video_path)
-     return abs_video_path
-
- def two_chnnel_to_one_channel(sample):
-     # Convert 2-channel (stereo) audio to 1-channel (mono)
-     left_channel = sample[:, 0]
-     right_channel = sample[:, 1]
-     mono_sample = (left_channel + right_channel) / 2
-     return mono_sample
-
- def convert_sample_rate(data, original_sr, target_sr):
-     # Resample the audio data to a different sampling rate
-     target_length = int(len(data) * target_sr / original_sr)
-     return resample(data, target_length)
-
- def summarize_video(video_path, ratio_sum, playback_speed):
-     print("Start summarize video")
-     output_path = os.path.join(os.path.dirname(video_path), 'output.mp4')
-     movie_clip = VideoFileClip(video_path)
-     audio_sampling_rate = movie_clip.audio.fps
-     clip_audio = np.array(movie_clip.audio.to_soundarray())
-
-     # Transcribe the audio
-     print("Start transcribing text")
-     audio_fp32 = convert_sample_rate(clip_audio, audio_sampling_rate, 16000)
-     audio_fp32 = two_chnnel_to_one_channel(audio_fp32).astype(np.float32)
-     transcription_results = transcriber.transcribe(audio_fp32)
-
-     # Group text and speech times by sentence boundary
-     print("Start summarizing text/speech time")
-     periods = ('.', '!', '?')
-     clip_sentences = []
-     head_sentence = True
-     for r in transcription_results['segments']:
-         if head_sentence:
-             start_time = r['start']
-             clip_sentences.append({'sentence':'', 'sentences':[], 'duration':[r['start'], None], 'durations':[]})
-             head_sentence = False
-         clip_sentences[-1]['sentence'] += r['text']
-         clip_sentences[-1]['sentences'].append(r['text'])
-         clip_sentences[-1]['durations'].append([r['start'], r['end']])
-         if r['text'].endswith(periods):
-             clip_sentences[-1]['duration'][1] = r['end']
-             head_sentence = True
-
-     # Summarize the transcription
-     print("Start summarizing sentences")
-     transcription = transcription_results['text']
-     summary_text = summarizer(transcription, max_length=int(len(transcription)*0.1), min_length=int(len(transcription)*0.05), do_sample=False)[0]['summary_text']
-     print(summary_text)
-
-     # Identify the sentences that match the summary
-     print("Start deleting sentences that match the summary sentence")
-     summary_embedings = [sentence_transformer.encode(s, convert_to_tensor=True) for s in summary_text.split('.')]
-     important_sentence_idxs = [False]*len(clip_sentences)
-     for s, clip_sentence in enumerate(clip_sentences):
-         embedding = sentence_transformer.encode(clip_sentence['sentence'], convert_to_tensor=True)
-         for s_e in summary_embedings:
-             if util.pytorch_cos_sim(embedding, s_e) > ratio_sum:
-                 important_sentence_idxs[s] = True
-
-     # Identify sentences that connect to the following sentence
-     print("Start identifying sentences that are connected to the sentence next to it")
-     def next_prob(prompt, next_sentence, b=1.2):
-         encoding = tokenizer(prompt, next_sentence, return_tensors="pt")
-         logits = next_sentence_predict(**encoding, labels=torch.LongTensor([1])).logits
-         pos = b ** logits[0, 0]
-         neg = b ** logits[0, 1]
-         return float(pos / (pos + neg))
-
-     connection_idxs = [False]*(len(clip_sentences)-1)
-     for s in range(len(clip_sentences)-1):
-         if next_prob(clip_sentences[s]['sentence'], clip_sentences[s+1]['sentence']) > 0.88:
-             connection_idxs[s] = True
-
-     # Keep only the sentences selected for the summary
-     def combine_arrays(A, B):
-         C = copy.deepcopy(A)
-         for i in range(len(A)):
-             if A[i]:
-                 j = i
-                 while j < len(B) and B[j]:
-                     C[j+1] = True
-                     j += 1
-                 j = i
-                 while j > 0 and B[j-1]:
-                     C[j] = True
-                     j -= 1
-         return C
-
-     important_idxs = combine_arrays(important_sentence_idxs, connection_idxs)
-
-     # Visualize which sentences survived the summarization
-     html_text = "<h1 class='title'>Full Transcription</h1>"
-     for idx in range(len(important_sentence_idxs)):
-         seconds = clip_sentences[idx]['duration'][0] * (1/playback_speed)
-         minutes, seconds = divmod(seconds, 60)
-         if important_idxs[idx]:
-             html_text += '<p> <b>' + f"{int(minutes)}:{int(seconds):02} | {clip_sentences[idx]['sentence']} </b> </p>"
-         else:
-             html_text += f"{int(minutes)}:{int(seconds):02} | {clip_sentences[idx]['sentence']}</p>"
-     print(html_text)
-
-     # Cut out and join the selected video segments
-     print("Start combine movies")
-     clips = []
-     for i in range(len(important_idxs)):
-         if important_idxs[i]:
-             tmp_clips = []
-             for j in range(len(clip_sentences[i]['sentences'])):
-                 start_time, end_time = clip_sentences[i]['durations'][j][0], clip_sentences[i]['durations'][j][1]
-                 if end_time > movie_clip.duration:
-                     end_time = movie_clip.duration
-                 if start_time > movie_clip.duration:
-                     continue
-                 clip = movie_clip.subclip(start_time, end_time)
-                 clip = clip.set_pos("center").set_duration(end_time-start_time)
-                 tmp_clips.append(clip)
-             clips.append(concatenate_videoclips(tmp_clips))
-
-     # Join the clips with a cross-dissolve
-     # for c in range(len(clips)-1):
-     #     fade_duration = 2
-     #     clips[c] = clips[c].crossfadeout(fade_duration).audio_fadeout(fade_duration)
-     #     clips[c+1] = clips[c+1].crossfadein(fade_duration).audio_fadein(fade_duration)
-
-     # Concatenate the clips and change the playback speed
-     final_video = concatenate_videoclips(clips, method="chain")
-     final_video_audio = np.array(final_video.audio.to_soundarray(fps=audio_sampling_rate))
-     if playback_speed != 1:
-         final_video_audio_fixed = tsm.wsola(final_video_audio, 1/playback_speed).T
-     else:
-         final_video_audio_fixed = final_video_audio
-     final_video = speedx(final_video, factor=playback_speed)
-     final_video = final_video.set_audio(AudioArrayClip(final_video_audio_fixed, fps=audio_sampling_rate))
-     # if final_video.duration > 30:
-     #     final_video = final_video.subclip(0, 30)
-     final_video.write_videofile(output_path)
-     print(output_path)
-     print("Success summarize video")
-     return output_path, summary_text, html_text
-
-
  # ---- Gradio Layout -----
  youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
  video_in = gr.Video(label="Input Video", mirror_webcam=False, interactive=True)
@@ -186,19 +14,25 @@ demo = gr.Blocks()
  demo.encrypt = False
 
  with demo:
-     gr.Markdown('''
-         <div style="text-align: center">
-             <h1 style='text-align: center'>FastPerson: Video summarization applied with transcription and text summarization</h1>
-             <img src="https://user-images.githubusercontent.com/33136532/215362410-97727904-e1ca-408d-967e-f5798671405e.png" alt="Video Summarization">
+     with gr.Column():
+         gr.Markdown('''
+             <div style="text-align: center">
+                 <h1 style='text-align: center'>Video Summarization</h1>
+             </div>
+         ''')
+     with gr.Column():
+         gr.Markdown('''
+             <div class="center">
+                 <img src="https://user-images.githubusercontent.com/33136532/229133078-22cb84d6-b120-4a72-b1cf-b4b3ea47ed7d.png" width="500" height="300">
          </div>
      ''')
      with gr.Row():
          gr.Markdown('''
              ### Summarize video
-             ##### Step 1a. Download video from youtube
-             ##### Step 1b. You also can upload video directly
-             ##### Step 2. Enter summary rate and playback speed
-             ##### Step 3. Generating summarized video.
+             #### Step 1: Download a video from YouTube (select one of the examples and press the Download button)
+             #### Step 2: Select the summary rate and playback speed
+             #### Step 3: Generate the summarized video (press the Summarize button)
+             The summarized video appears to the right of the original video, together with the summary text and the full transcription.
          ''')
      with gr.Row():
          gr.Markdown('''
@@ -228,5 +62,5 @@ with demo:
      with gr.Row():
          transcription_text.render()
 
- demo.launch(debug=True)
- # demo.launch(debug=True, share=True)
+ # demo.launch(debug=True)
+ demo.launch(debug=True, share=True)
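This commit splits the former monolithic app.py into model.py (the summarization pipeline) and utils.py (download and audio helpers), leaving app.py as the Gradio front end. The hunks that wire the buttons to those functions are not shown above; the sketch below is a hypothetical reconstruction of that wiring, in the commit's own component style. Every name not visible in the diff (video_out, summary_text, ratio_sum, playback_speed, the buttons, the slider ranges, and the from utils import get_youtube line) is an assumption for illustration only.

import gradio as gr

from model import summarize_video
from utils import get_youtube  # assumed import; the corresponding app.py hunk is not shown

# Components defined up front and .render()-ed inside the layout, matching
# the transcription_text.render() pattern visible in the diff. Names are illustrative.
youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
video_in = gr.Video(label="Input Video", mirror_webcam=False, interactive=True)
video_out = gr.Video(label="Output Video")
summary_text = gr.Textbox(label="Summary")
transcription_text = gr.HTML()

demo = gr.Blocks()
with demo:
    with gr.Row():
        youtube_url_in.render()
        download_btn = gr.Button("Download")
    with gr.Row():
        video_in.render()
        video_out.render()
    with gr.Row():
        ratio_sum = gr.Slider(0, 1, value=0.5, label="Summary rate")             # assumed range
        playback_speed = gr.Slider(0.5, 2.0, value=1.0, label="Playback speed")  # assumed range
        summarize_btn = gr.Button("Summarize")
    with gr.Row():
        summary_text.render()
    with gr.Row():
        transcription_text.render()

    # Download fetches the YouTube video into the input component; Summarize runs
    # the pipeline and fills the output video, summary text, and transcription HTML.
    download_btn.click(get_youtube, inputs=[youtube_url_in], outputs=[video_in])
    summarize_btn.click(summarize_video,
                        inputs=[video_in, ratio_sum, playback_speed],
                        outputs=[video_out, summary_text, transcription_text])

demo.launch(debug=True, share=True)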
images/icon.png DELETED
Binary file (73.3 kB)
 
images/logo.png ADDED
model.py ADDED
@@ -0,0 +1,151 @@
+ import copy
+ import subprocess
+
+ import numpy as np
+ import pytsmod as tsm
+
+ from moviepy.audio.AudioClip import AudioArrayClip
+ from moviepy.editor import *  # star import; also brings os, VideoFileClip and concatenate_videoclips into scope
+ from moviepy.video.fx.speedx import speedx
+
+ from sentence_transformers import SentenceTransformer, util
+ from transformers import pipeline, BertTokenizer, BertForNextSentencePrediction
+ import torch
+ import whisper
+
+ from utils import two_chnnel_to_one_channel, convert_sample_rate
+
+ subprocess.run(['apt-get', '-y', 'install', 'imagemagick'])
+
+ transcriber = whisper.load_model("medium")
+ sentence_transformer = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+ tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+ next_sentence_predict = BertForNextSentencePrediction.from_pretrained("bert-base-cased").eval()
+ summarizer = pipeline("summarization", model="philschmid/bart-large-cnn-samsum")
+
+ def summarize_video(video_path, ratio_sum, playback_speed):
+     print("Start summarize video")
+     output_path = os.path.join(os.path.dirname(video_path), 'output.mp4')
+     movie_clip = VideoFileClip(video_path)
+     audio_sampling_rate = movie_clip.audio.fps
+     clip_audio = np.array(movie_clip.audio.to_soundarray())
+
+     # Transcribe the audio
+     print("Start transcribing text")
+     audio_fp32 = convert_sample_rate(clip_audio, audio_sampling_rate, 16000)
+     audio_fp32 = two_chnnel_to_one_channel(audio_fp32).astype(np.float32)
+     transcription_results = transcriber.transcribe(audio_fp32)
+
+     # Group text and speech times by sentence boundary
+     print("Start summarizing text/speech time")
+     periods = ('.', '!', '?')
+     clip_sentences = []
+     head_sentence = True
+     for r in transcription_results['segments']:
+         if head_sentence:
+             start_time = r['start']
+             clip_sentences.append({'sentence':'', 'sentences':[], 'duration':[r['start'], None], 'durations':[]})
+             head_sentence = False
+         clip_sentences[-1]['sentence'] += r['text']
+         clip_sentences[-1]['sentences'].append(r['text'])
+         clip_sentences[-1]['durations'].append([r['start'], r['end']])
+         if r['text'].endswith(periods):
+             clip_sentences[-1]['duration'][1] = r['end']
+             head_sentence = True
+
+     # Summarize the transcription
+     print("Start summarizing sentences")
+     transcription = transcription_results['text']
+     summary_text = summarizer(transcription, max_length=int(len(transcription)*0.1), min_length=int(len(transcription)*0.05), do_sample=False)[0]['summary_text']
+     print(summary_text)
+
+     # Identify the sentences that match the summary
+     print("Start deleting sentences that match the summary sentence")
+     summary_embedings = [sentence_transformer.encode(s, convert_to_tensor=True) for s in summary_text.split('.')]
+     important_sentence_idxs = [False]*len(clip_sentences)
+     for s, clip_sentence in enumerate(clip_sentences):
+         embedding = sentence_transformer.encode(clip_sentence['sentence'], convert_to_tensor=True)
+         for s_e in summary_embedings:
+             if util.pytorch_cos_sim(embedding, s_e) > ratio_sum:
+                 important_sentence_idxs[s] = True
+
+     # Identify sentences that connect to the following sentence
+     print("Start identifying sentences that are connected to the sentence next to it")
+     def next_prob(prompt, next_sentence, b=1.2):
+         encoding = tokenizer(prompt, next_sentence, return_tensors="pt")
+         logits = next_sentence_predict(**encoding, labels=torch.LongTensor([1])).logits
+         pos = b ** logits[0, 0]
+         neg = b ** logits[0, 1]
+         return float(pos / (pos + neg))
+
+     connection_idxs = [False]*(len(clip_sentences)-1)
+     for s in range(len(clip_sentences)-1):
+         if next_prob(clip_sentences[s]['sentence'], clip_sentences[s+1]['sentence']) > 0.88:
+             connection_idxs[s] = True
+
+     # Keep only the sentences selected for the summary
+     def combine_arrays(A, B):
+         C = copy.deepcopy(A)
+         for i in range(len(A)):
+             if A[i]:
+                 j = i
+                 while j < len(B) and B[j]:
+                     C[j+1] = True
+                     j += 1
+                 j = i
+                 while j > 0 and B[j-1]:
+                     C[j] = True
+                     j -= 1
+         return C
+
+     important_idxs = combine_arrays(important_sentence_idxs, connection_idxs)
+
+     # Visualize which sentences survived the summarization
+     html_text = "<h1 class='title'>Full Transcription</h1>"
+     for idx in range(len(important_sentence_idxs)):
+         seconds = clip_sentences[idx]['duration'][0] * (1/playback_speed)
+         minutes, seconds = divmod(seconds, 60)
+         if important_idxs[idx]:
+             html_text += '<p> <b>' + f"{int(minutes)}:{int(seconds):02} | {clip_sentences[idx]['sentence']} </b> </p>"
+         else:
+             html_text += f"{int(minutes)}:{int(seconds):02} | {clip_sentences[idx]['sentence']}</p>"
+     print(html_text)
+
+     # Cut out and join the selected video segments
+     print("Start combine movies")
+     clips = []
+     for i in range(len(important_idxs)):
+         if important_idxs[i]:
+             tmp_clips = []
+             for j in range(len(clip_sentences[i]['sentences'])):
+                 start_time, end_time = clip_sentences[i]['durations'][j][0], clip_sentences[i]['durations'][j][1]
+                 if end_time > movie_clip.duration:
+                     end_time = movie_clip.duration
+                 if start_time > movie_clip.duration:
+                     continue
+                 clip = movie_clip.subclip(start_time, end_time)
+                 clip = clip.set_pos("center").set_duration(end_time-start_time)
+                 tmp_clips.append(clip)
+             clips.append(concatenate_videoclips(tmp_clips))
+
+     # Join the clips with a cross-dissolve
+     # for c in range(len(clips)-1):
+     #     fade_duration = 2
+     #     clips[c] = clips[c].crossfadeout(fade_duration).audio_fadeout(fade_duration)
+     #     clips[c+1] = clips[c+1].crossfadein(fade_duration).audio_fadein(fade_duration)
+
+     # Concatenate the clips and change the playback speed
+     final_video = concatenate_videoclips(clips, method="chain")
+     final_video_audio = np.array(final_video.audio.to_soundarray(fps=audio_sampling_rate))
+     if playback_speed != 1:
+         final_video_audio_fixed = tsm.wsola(final_video_audio, 1/playback_speed).T
+     else:
+         final_video_audio_fixed = final_video_audio
+     final_video = speedx(final_video, factor=playback_speed)
+     final_video = final_video.set_audio(AudioArrayClip(final_video_audio_fixed, fps=audio_sampling_rate))
+     # if final_video.duration > 30:
+     #     final_video = final_video.subclip(0, 30)
+     final_video.write_videofile(output_path)
+     print(output_path)
+     print("Success summarize video")
+     return output_path, summary_text, html_text
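For reference, a minimal sketch of calling the new module directly, outside Gradio. Note how the two thresholds behave in the code above: ratio_sum is compared against sentence-embedding cosine similarity (util.pytorch_cos_sim(...) > ratio_sum), and next_prob is a base-1.2 softmax over BERT's next-sentence-prediction logits with a fixed 0.88 cut-off. Playback speed is applied to the video with speedx while the audio is time-stretched with WSOLA so the pitch is preserved. The file path and parameter values below are illustrative, not from the commit.

from model import summarize_video

output_path, summary, html = summarize_video(
    video_path='movies/download.mp4',  # e.g. a file fetched by utils.get_youtube
    ratio_sum=0.5,                     # cosine-similarity threshold for keeping a sentence
    playback_speed=1.5,                # >1 speeds up the cut; audio is WSOLA-stretched to match
)
print(output_path)  # movies/output.mp4, written next to the input video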
utils.py ADDED
@@ -0,0 +1,45 @@
+ from pytube import YouTube
+ from scipy.signal import resample
+
+
+ def get_youtube(video_url):
+     # Download the video from YouTube
+     print("Start download video")
+     yt = YouTube(video_url)
+     abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download(filename='download.mp4', output_path='movies/')
+     print("Success download video")
+     print(abs_video_path)
+     return abs_video_path
+
+
+ def two_chnnel_to_one_channel(sample):
+     # Convert 2-channel (stereo) audio to 1-channel (mono) by averaging the channels
+     left_channel = sample[:, 0]
+     right_channel = sample[:, 1]
+     mono_sample = (left_channel + right_channel) / 2
+     return mono_sample
+
+
+ def convert_sample_rate(data, original_sr, target_sr):
+     # Resample the audio data from original_sr to target_sr
+     target_length = int(len(data) * target_sr / original_sr)
+     return resample(data, target_length)
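Finally, a self-contained sketch of how model.py uses these helpers to prepare Whisper's input: the clip's stereo float audio is resampled to 16 kHz (scipy's resample works along axis 0, so the channel dimension is preserved) and then averaged down to mono float32. The synthetic array below stands in for movie_clip.audio.to_soundarray().

import numpy as np
from utils import convert_sample_rate, two_chnnel_to_one_channel

# Two seconds of synthetic stereo audio at 44.1 kHz, shaped (n_samples, 2)
# like the output of movie_clip.audio.to_soundarray().
stereo_44k = np.random.randn(44100 * 2, 2)

audio_16k = convert_sample_rate(stereo_44k, 44100, 16000)            # still 2-channel
mono_16k = two_chnnel_to_one_channel(audio_16k).astype(np.float32)   # averaged to mono

assert mono_16k.shape == (16000 * 2,)  # mono, 16 kHz, ready for transcriber.transcribe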