kwmr committed on
Commit 958c599
1 Parent(s): 64b3dd3

add first files

app.py CHANGED
@@ -1,7 +1,224 @@
- import gradio as gr
-
- def greet(name):
-     return "Hello " + name + "!!"
-
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
- iface.launch()
+ import copy
+
+ from pytube import YouTube
+ from scipy.signal import resample
+ import gradio as gr
+ import numpy as np
+ import pytsmod as tsm
+
+ from moviepy.audio.AudioClip import AudioArrayClip
+ from moviepy.editor import *
+ from moviepy.video.fx.speedx import speedx
+
+ from sentence_transformers import SentenceTransformer, util
+ from transformers import pipeline, BertTokenizer, BertForNextSentencePrediction
+ import torch
+ import whisper
+
+
+ transcriber = whisper.load_model("medium")
+ sentence_transformer = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+ tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+ next_sentence_predict = BertForNextSentencePrediction.from_pretrained("bert-base-cased").eval()
+ summarizer = pipeline("summarization", model="philschmid/bart-large-cnn-samsum")
+
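+ # Model roles in this app: Whisper transcribes the audio, BART
+ # ("philschmid/bart-large-cnn-samsum") generates an abstractive summary,
+ # MiniLM sentence embeddings match transcript sentences against that summary,
+ # and BERT next-sentence prediction scores how strongly adjacent sentences
+ # connect.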
+ def get_youtube(video_url):
+     # Download the YouTube video
+     print("Start downloading video")
+     yt = YouTube(video_url)
+     abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download(filename='download.mp4', output_path='./movies/')
+     print("Finished downloading video")
+     print(abs_video_path)
+
+     return abs_video_path
+
+ def two_channel_to_one_channel(sample):
+     # Convert the audio from two channels (stereo) to one channel (mono)
+     left_channel = sample[:, 0]
+     right_channel = sample[:, 1]
+     mono_sample = (left_channel + right_channel) / 2
+     return mono_sample
+
+ def convert_sample_rate(data, original_sr, target_sr):
+     # Resample the audio data to the target sampling rate
+     target_length = int(len(data) * target_sr / original_sr)
+     return resample(data, target_length)
+
+ def summarize_video(video_path, ratio_sum, playback_speed):
+     print("Start summarizing video")
+     output_path = "./movies/output.mp4"
+
+     movie_clip = VideoFileClip(video_path)
+
+     audio_sampling_rate = movie_clip.audio.fps
+     clip_audio = np.array(movie_clip.audio.to_soundarray())
+
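+     # Whisper expects 16 kHz mono float32 audio, hence the resampling and
+     # stereo-to-mono downmix below.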
+     # Transcribe the speech
+     audio_fp32 = convert_sample_rate(clip_audio, audio_sampling_rate, 16000)
+     audio_fp32 = two_channel_to_one_channel(audio_fp32).astype(np.float32)
+     transcription_results = transcriber.transcribe(audio_fp32)
+
+     # Group the text and utterance times at each sentence boundary
+     periods = ('.', '!', '?')
+     clip_sentences = []
+     head_sentence = True
+     for r in transcription_results['segments']:
+         if head_sentence:
+             start_time = r['start']
+             clip_sentences.append({'sentence':'', 'sentences':[], 'duration':[r['start'], None], 'durations':[]})
+             head_sentence = False
+         clip_sentences[-1]['sentence'] += r['text']
+         clip_sentences[-1]['sentences'].append(r['text'])
+         clip_sentences[-1]['durations'].append([r['start'], r['end']])
+         if r['text'].endswith(periods):
+             clip_sentences[-1]['duration'][1] = r['end']
+             head_sentence = True
+
+     # Summarize the transcription
+     transcription = transcription_results['text']
+     summary_text = summarizer(transcription, max_length=int(len(transcription)*0.1), min_length=int(len(transcription)*0.05), do_sample=False)[0]['summary_text']
+     print(summary_text)
+
+     # Identify the transcript sentences that match the summary
+     summary_embeddings = [sentence_transformer.encode(s, convert_to_tensor=True) for s in summary_text.split('.')]
+     important_sentence_idxs = [False]*len(clip_sentences)
+     for s, clip_sentence in enumerate(clip_sentences):
+         embedding = sentence_transformer.encode(clip_sentence['sentence'], convert_to_tensor=True)
+         for s_e in summary_embeddings:
+             if util.pytorch_cos_sim(embedding, s_e) > ratio_sum:
+                 important_sentence_idxs[s] = True
+
+     # Identify sentences that connect to their neighboring sentence
+     def next_prob(prompt, next_sentence, b=1.2):
+         encoding = tokenizer(prompt, next_sentence, return_tensors="pt")
+         logits = next_sentence_predict(**encoding, labels=torch.LongTensor([1])).logits
+         pos = b ** logits[0, 0]
+         neg = b ** logits[0, 1]
+         return float(pos / (pos + neg))
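+     # The NSP head returns [isNext, notNext] logits; exponentiating with base
+     # b=1.2 instead of e is effectively a temperature-scaled softmax, so the
+     # resulting probability varies more gradually around the 0.88 threshold
+     # used below.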
+
+     connection_idxs = [False]*(len(clip_sentences)-1)
+     for s in range(len(clip_sentences)-1):
+         if next_prob(clip_sentences[s]['sentence'], clip_sentences[s+1]['sentence']) > 0.88:
+             connection_idxs[s] = True
+
+     # Keep only the sentences that belong in the summary
+     def combine_arrays(A, B):
+         C = copy.deepcopy(A)
+         for i in range(len(A)):
+             if A[i]:
+                 j = i
+                 while j < len(B) and B[j]:
+                     C[j+1] = True
+                     j += 1
+                 j = i
+                 while j > 0 and B[j-1]:
+                     C[j] = True
+                     j -= 1
+         return C
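+     # combine_arrays spreads each important flag in A forwards and backwards
+     # through consecutive True entries of B (sentence-to-sentence connections),
+     # so sentences that lead into or out of an important sentence are kept too.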
+
+     important_idxs = combine_arrays(important_sentence_idxs, connection_idxs)
+
+     # Visualize where the summarized sentences sit in the full transcription
+     html_text = "<h1 class='title'>Full Transcription</h1>"
+     for idx in range(len(important_sentence_idxs)):
+         # Scale timestamps by 1/playback_speed so they match the output video
+         seconds = clip_sentences[idx]['duration'][0] * (1/playback_speed)
+         minutes = int(seconds // 60)
+         remaining_seconds = int(seconds % 60)
+         if important_idxs[idx]:
+             html_text += '<p> <font color="#dc974e">' + f"{minutes}:{remaining_seconds:02d} | {clip_sentences[idx]['sentence']}</font> </p>"
+         else:
+             html_text += f"<p>{minutes}:{remaining_seconds:02d} | {clip_sentences[idx]['sentence']}</p>"
+
+     # Join the video clips for the selected sentences
+     clips = []
+     for i in range(len(important_idxs)):
+         if important_idxs[i]:
+             tmp_clips = []
+             for j in range(len(clip_sentences[i]['sentences'])):
+                 start_time, end_time = clip_sentences[i]['durations'][j][0], clip_sentences[i]['durations'][j][1]
+                 if end_time > movie_clip.duration:
+                     end_time = movie_clip.duration
+                 clip = movie_clip.subclip(start_time, end_time)
+                 clip = clip.set_pos("center").set_duration(end_time-start_time)
+                 txt_clip = TextClip(clip_sentences[i]['sentences'][j], fontsize=int(movie_clip.w/40), color='white', bg_color='black', font='./fonts/Muller-Trial-Medium.ttf')
+                 txt_clip = txt_clip.set_duration(end_time-start_time).set_position(("center", "bottom"))
+                 clip = CompositeVideoClip([clip, txt_clip])
+                 tmp_clips.append(clip)
+             clips.append(concatenate_videoclips(tmp_clips))
+
+     # Join the clips with a cross-dissolve (currently disabled)
+     # for c in range(len(clips)-1):
+     #     fade_duration = 2
+     #     clips[c] = clips[c].crossfadeout(fade_duration).audio_fadeout(fade_duration)
+     #     clips[c+1] = clips[c+1].crossfadein(fade_duration).audio_fadein(fade_duration)
+
+     # Concatenate the clips and change the playback speed
+     final_video = concatenate_videoclips(clips, method="chain")
+     final_video_audio = np.array(final_video.audio.to_soundarray(fps=audio_sampling_rate))
+     if playback_speed != 1:
+         final_video_audio_fixed = tsm.wsola(final_video_audio, 1/playback_speed).T
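+         # WSOLA time-stretches the audio without shifting its pitch; speedx
+         # below only retimes the video frames, so audio and video stay aligned.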
+     else:
+         final_video_audio_fixed = final_video_audio
+     final_video = speedx(final_video, factor=playback_speed)
+     final_video = final_video.set_audio(AudioArrayClip(final_video_audio_fixed, fps=audio_sampling_rate))
+     # if final_video.duration > 30:
+     #     final_video = final_video.subclip(0, 30)
+     final_video.write_videofile(output_path)
+     print(output_path)
+     print("Finished summarizing video")
+     return output_path, summary_text, html_text
+
+
+ # ---- Gradio Layout -----
+ youtube_url_in = gr.Textbox(label="YouTube URL", lines=1, interactive=True)
+ video_in = gr.Video(label="Input Video", mirror_webcam=False, interactive=True)
+ video_out = gr.Video(label="Output Video")
+ summary_text = gr.Textbox(label="Video Transcription Summary")
+ transcription_text = gr.HTML(label="Full Transcription")
+ demo = gr.Blocks()
+ demo.encrypt = False
+
+ with demo:
+     gr.Markdown('''
+     <div style="text-align: center">
+         <h1 style='text-align: center'>FastPerson: Video summarization with transcription and text summarization</h1>
+         <img src="https://user-images.githubusercontent.com/33136532/215362410-97727904-e1ca-408d-967e-f5798671405e.png" alt="Video Summarization">
+     </div>
+     ''')
+     with gr.Row():
+         gr.Markdown('''
+         ### Summarize video
+         ##### Step 1a. Download a video from YouTube.
+         ##### Step 1b. Alternatively, upload a video file directly.
+         ##### Step 2. Set the summary ratio and playback speed.
+         ##### Step 3. Generate the summarized video.
+         ''')
+     with gr.Row():
+         gr.Markdown('''
+         ### You can test with the following examples:
+         ''')
+         examples = gr.Examples(examples=
+             ["https://www.youtube.com/watch?v=QghjaS0WQQU",
+              "https://www.youtube.com/watch?v=cUS_22_lDiM",
+              "https://www.youtube.com/watch?v=80yqL2KzBVw"],
+             label="Examples", inputs=[youtube_url_in])
+     with gr.Column():
+         youtube_url_in.render()
+         download_youtube_btn = gr.Button("Download YouTube video")
+         download_youtube_btn.click(get_youtube, [youtube_url_in], [video_in])
+         print(video_in)
+     with gr.Row():
+         ratio_sum = gr.Slider(label="Summary Ratio", minimum=0.3, maximum=0.8, step=0.05, value=0.6)
+         playback_speed = gr.Slider(label="Playback Speed", minimum=0.5, maximum=2.0, step=0.25, value=1.0)
+     with gr.Row():
+         upload_output_video_btn = gr.Button("Summarize Video")
+         upload_output_video_btn.click(summarize_video, [video_in, ratio_sum, playback_speed], [video_out, summary_text, transcription_text])
+     with gr.Row():
+         video_in.render()
+         video_out.render()
+     with gr.Row():
+         summary_text.render()
+     with gr.Row():
+         transcription_text.render()
+
+ demo.launch(debug=True, share=True)
fonts/Muller-Trial-Medium.ttf ADDED
Binary file (870 kB).
 
images/icon.png ADDED
movies/.__init__.py ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,95 @@
+ aiofiles==22.1.0
+ aiohttp==3.8.3
+ aiosignal==1.3.1
+ altair==4.2.2
+ anyio==3.6.2
+ async-timeout==4.0.2
+ attrs==22.2.0
+ certifi @ file:///croot/certifi_1671487769961/work/certifi
+ cffi==1.15.1
+ charset-normalizer==2.1.1
+ click==8.1.3
+ contourpy==1.0.7
+ cycler==0.11.0
+ decorator==4.4.2
+ entrypoints==0.4
+ fastapi==0.89.1
+ ffmpeg-python==0.2.0
+ ffmpy==0.3.0
+ filelock==3.9.0
+ fonttools==4.38.0
+ frozenlist==1.3.3
+ fsspec==2023.1.0
+ future==0.18.3
+ gradio==3.16.2
+ h11==0.14.0
+ httpcore==0.16.3
+ httpx==0.23.3
+ huggingface-hub==0.12.0
+ idna==3.4
+ imageio==2.25.0
+ imageio-ffmpeg==0.4.8
+ Jinja2==3.1.2
+ joblib==1.2.0
+ jsonschema==4.17.3
+ kiwisolver==1.4.4
+ linkify-it-py==1.0.3
+ markdown-it-py==2.1.0
+ MarkupSafe==2.1.2
+ matplotlib==3.6.3
+ mdit-py-plugins==0.3.3
+ mdurl==0.1.2
+ more-itertools==9.0.0
+ moviepy==1.0.3
+ multidict==6.0.4
+ nltk==3.8.1
+ numpy==1.24.1
+ nvidia-cublas-cu11==11.10.3.66
+ nvidia-cuda-nvrtc-cu11==11.7.99
+ nvidia-cuda-runtime-cu11==11.7.99
+ nvidia-cudnn-cu11==8.5.0.96
+ openai-whisper @ git+https://github.com/openai/whisper.git@5c1a8c10e762bf9c29fcf6b3e40f17bc8ab09864
+ orjson==3.8.5
+ packaging==23.0
+ pandas==1.5.3
+ Pillow==9.4.0
+ proglog==0.1.10
+ pycparser==2.21
+ pycryptodome==3.17
+ pydantic==1.10.4
+ pydub==0.25.1
+ pyparsing==3.0.9
+ pyrsistent==0.19.3
+ python-dateutil==2.8.2
+ python-multipart==0.0.5
+ pytsmod==0.3.6
+ pytube==12.1.0
+ pytz==2022.7.1
+ PyYAML==6.0
+ regex==2022.10.31
+ requests==2.28.2
+ rfc3986==1.5.0
+ scikit-learn==1.2.1
+ scipy==1.10.0
+ sentence-transformers==2.2.2
+ sentencepiece==0.1.97
+ six==1.16.0
+ sniffio==1.3.0
+ soundfile==0.11.0
+ starlette==0.22.0
+ threadpoolctl==3.1.0
+ tokenizers==0.13.2
+ toolz==0.12.0
+ torch==1.13.1
+ torchaudio==0.13.1
+ torchvision==0.14.1
+ tqdm==4.64.1
+ transformers==4.26.0
+ typing_extensions==4.4.0
+ uc-micro-py==1.0.1
+ urllib3==1.26.14
+ uvicorn==0.20.0
+ watchdog==2.2.1
+ websockets==10.4
+ whisper==1.1.10
+ yarl==1.8.2