kwmr committed
Commit 9504de3
1 Parent(s): 3cb4906
Files changed (4):
  1. app.py +4 -4
  2. model.py +81 -24
  3. requirements.txt +1 -0
  4. utils.py +26 -22
app.py CHANGED
@@ -49,14 +49,14 @@ with demo:
         with gr.Column():
             youtube_url_in.render()
             download_youtube_btn = gr.Button("Download Youtube video")
-            download_youtube_btn.click(get_youtube, [youtube_url_in], [video_in])
+            download_youtube_btn.click(get_youtube, [user_id, youtube_url_in], [video_in])
             print(video_in)
     with gr.Row():
-        ratio_sum = gr.Slider(label="Summarize Ratio", minimum=0.3, maximum=0.8, step=0.05, value=0.6)
+        sum_ratio = gr.Slider(label="Summarize Ratio", minimum=0.3, maximum=0.8, step=0.05, value=0.6)
         playback_speed = gr.Slider(label="Playback Speed", minimum=0.5, maximum=2.0, step=0.25, value=1.0)
     with gr.Row():
         upload_output_video_btn = gr.Button("Summarize Video")
-        upload_output_video_btn.click(summarize_video, [video_in, ratio_sum, playback_speed], [video_out, summary_text, transcription_text])
+        upload_output_video_btn.click(summarize_video, [user_id, video_in, sum_ratio, playback_speed], [video_out, summary_text, transcription_text])
     with gr.Row():
         video_in.render()
         video_out.render()
@@ -65,5 +65,5 @@ with demo:
     with gr.Row():
         transcription_text.render()
 
-# demo.launch(debug=True)
+# demo.launch(debug=True, share)
 demo.launch(debug=True)
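Note: both click handlers now take a `user_id` component as their first input, but its definition sits outside these hunks. A minimal sketch of one way such a component could be wired, assuming a per-session `gr.State` value (hypothetical; the actual app may define `user_id` differently):

import uuid
import gradio as gr
from utils import get_youtube

with gr.Blocks() as demo:
    # Hypothetical: hold a per-session identifier in gr.State so it can be
    # passed as the first input of each click handler, as in the diff above.
    user_id = gr.State(str(uuid.uuid4()))
    youtube_url_in = gr.Textbox(label="YouTube URL")
    video_in = gr.Video()
    download_youtube_btn = gr.Button("Download Youtube video")
    download_youtube_btn.click(get_youtube, [user_id, youtube_url_in], [video_in])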
model.py CHANGED
@@ -13,34 +13,63 @@ from transformers import pipeline, BertTokenizer, BertForNextSentencePrediction
 import torch
 import whisper
 
-from utils import two_chnnel_to_one_channel, convert_sample_rate
+from utils import two_chnnel_to_one_channel, convert_sample_rate, log_firestore
 
 subprocess.run(['apt-get', '-y', 'install', 'imagemagick'])
 
+# Speech recognition model
 transcriber = whisper.load_model("medium")
+# Model that generates sentence embeddings
 sentence_transformer = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+# BERT tokenizer
 tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+# Model that judges whether two sentences are consecutive
 next_sentence_predict = BertForNextSentencePrediction.from_pretrained("bert-base-cased").eval()
+# Text summarization model
 summarizer = pipeline("summarization", model="philschmid/bart-large-cnn-samsum")
 
-def summarize_video(video_path, ratio_sum, playback_speed):
+def summarize_video(user_id, video_path, sim_thr, playback_speed):
+    """
+    Summarize a video.
+
+    Parameters:
+        user_id (str): user identifier
+        video_path (str): path to the video file
+        sim_thr (float): similarity threshold for matching the summary sentences
+        playback_speed (float): playback speed
+
+    Returns:
+        output_path (str): path to the output video file
+        summary_text (str): the summarized text
+        full_textt (str): the original text, with the extracted parts highlighted
+    """
+
     print("Start summarize video")
+    ## Set the output path for the video
     output_path = os.path.join(os.path.dirname(video_path), 'output.mp4')
+    ## Create the video clip
     movie_clip = VideoFileClip(video_path)
+    ## Get the audio sampling rate
     audio_sampling_rate = movie_clip.audio.fps
+    ## Convert the audio to a numpy array
     clip_audio = np.array(movie_clip.audio.to_soundarray())
 
     # Transcribe the audio
     print("Start transcribing text")
+    ## Change the sampling rate
     audio_fp32 = convert_sample_rate(clip_audio, audio_sampling_rate, 16000)
     audio_fp32 = two_chnnel_to_one_channel(audio_fp32).astype(np.float32)
+    ## Get the transcription results
     transcription_results = transcriber.transcribe(audio_fp32)
 
     # Group the text and utterance times by sentence break
     print("Start summarizing text/speech time")
+    ## Sentence-ending punctuation
     periods = ('.', '!', '?')
+    ## Initialize the list of per-sentence text and times
     clip_sentences = []
+    ## Initialize the flag marking the head of a sentence
     head_sentence = True
+    ## Store the text and times of each sentence
     for r in transcription_results['segments']:
         if head_sentence:
             start_time = r['start']
@@ -53,20 +82,26 @@ def summarize_video(video_path, ratio_sum, playback_speed):
             clip_sentences[-1]['duration'][1] = r['end']
             head_sentence = True
 
-    # Summarize the characters
+    # Summarize the sentences
     print("Start summarizing sentences")
+    ## Get the transcription text
     transcription = transcription_results['text']
+    ## Generate the summary
     summary_text = summarizer(transcription, max_length=int(len(transcription)*0.1), min_length=int(len(transcription)*0.05), do_sample=False)[0]['summary_text']
+    ## Print the summarized text
     print(summary_text)
 
     # Identify sentences that match the summary
     print("Start deleting sentences that match the summary sentence")
+    ## Generate an embedding for each sentence of the summary
     summary_embedings = [sentence_transformer.encode(s, convert_to_tensor=True) for s in summary_text.split('.')]
+    ## Initialize the list of important-sentence flags
    important_sentence_idxs = [False]*len(clip_sentences)
+    ## Embed each sentence and mark it as important when its similarity to a summary sentence exceeds the threshold
     for s, clip_sentence in enumerate(clip_sentences):
         embedding = sentence_transformer.encode(clip_sentence['sentence'], convert_to_tensor=True)
         for s_e in summary_embedings:
-            if util.pytorch_cos_sim(embedding, s_e) > ratio_sum:
+            if util.pytorch_cos_sim(embedding, s_e) > sim_thr:
                 important_sentence_idxs[s] = True
 
     # Determine which sentences connect to the neighboring sentence
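Note: the rename from `ratio_sum` to `sim_thr` makes explicit that the slider value is compared against a cosine similarity, not a summarization ratio. A standalone sketch of that matching step, using the same model and `util.pytorch_cos_sim` call as the hunk above (the sample sentences are made up):

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
sim_thr = 0.6  # the "Summarize Ratio" slider value from app.py

# Embed one transcript sentence and one summary sentence, then compare.
a = model.encode("The rocket reached orbit after a two-minute burn.", convert_to_tensor=True)
b = model.encode("The rocket successfully reached orbit.", convert_to_tensor=True)
score = util.pytorch_cos_sim(a, b)  # 1x1 tensor; cosine similarity in [-1, 1]
print(float(score) > sim_thr)       # True marks the transcript sentence as important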
 
 
@@ -77,43 +112,59 @@ def summarize_video(video_path, ratio_sum, playback_speed):
         pos = b ** logits[0, 0]
         neg = b ** logits[0, 1]
         return float(pos / (pos + neg))
-
+    ## Initialize the list of flags for whether adjacent sentences connect
     connection_idxs = [False]*(len(clip_sentences)-1)
+    ## Judge whether each pair of sentences is consecutive; set the flag to True when they connect
     for s in range(len(clip_sentences)-1):
         if next_prob(clip_sentences[s]['sentence'], clip_sentences[s+1]['sentence']) > 0.88:
             connection_idxs[s] = True
 
     # Keep only the summarized sentences
-    def combine_arrays(A, B):
-        C = copy.deepcopy(A)
-        for i in range(len(A)):
-            if A[i]:
-                j = i
-                while j < len(B) and B[j]:
-                    C[j+1] = True
+    def get_important_sentences(important_sentence_idxs, connection_idxs):
+        """
+        Return the list of important-sentence flags.
+
+        Parameters:
+            important_sentence_idxs (List[bool]): flags for sentences that match the summary
+            connection_idxs (List[bool]): flags for whether each sentence connects to its neighbor
+
+        Returns:
+            important_idxs (List[bool]): flags for the important sentences
+        """
+        for i, val in enumerate(important_sentence_idxs):
+            if val:
+                # Check and update the elements to the right
+                j = i
+                while j < len(connection_idxs) and connection_idxs[j]:
+                    important_sentence_idxs[j + 1] = True
                     j += 1
-                j = i
-                while j > 0 and B[j-1]:
-                    C[j] = True
+
+                # Check and update the elements to the left
+                j = i - 1
+                while j >= 0 and connection_idxs[j]:
+                    important_sentence_idxs[j] = True
                     j -= 1
-        return C
+        important_idxs = important_sentence_idxs
+        return important_idxs
 
-    important_idxs = combine_arrays(important_sentence_idxs, connection_idxs)
+    important_idxs = get_important_sentences(important_sentence_idxs, connection_idxs)
 
-    # Visualize where the summarized sentences are
-    html_text = "<h1 class='title'>Full Transcription</h1>"
+    # Visualize which parts of the original text the summary extracted
+    full_textt = "<h1 class='title'>Full Transcription</h1>"
+    ## Build HTML, rendering important sentences in bold and the rest in a normal font
     for idx in range(len(important_sentence_idxs)):
         seconds = clip_sentences[idx]['duration'][0] * (1/playback_speed)
         minutes, seconds = divmod(seconds, 60)
         if important_idxs[idx]:
-            html_text += '<p> <b>' + f"{int(minutes)}:{int(seconds):02} | {clip_sentences[idx]['sentence']} </b> </p>"
+            full_textt += '<p> <b>' + f"{int(minutes)}:{int(seconds):02} | {clip_sentences[idx]['sentence']} </b> </p>"
         else:
-            html_text += f"{int(minutes)}:{int(seconds):02} | {clip_sentences[idx]['sentence']}</p>"
-    print(html_text)
+            full_textt += f"{int(minutes)}:{int(seconds):02} | {clip_sentences[idx]['sentence']}</p>"
+    print(full_textt)
 
     # Combine the videos
     print("Start combine movies")
-    clips = []
+    clips = []
+    ## For each important sentence, create a clip from its start and end times and append it to the list
     for i in range(len(important_idxs)):
         if important_idxs[i]:
            tmp_clips = []
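Note: unlike the old `combine_arrays`, which worked on a deep copy, `get_important_sentences` mutates `important_sentence_idxs` in place and returns it. A quick standalone check of the neighbor-merging behavior, with the function body copied out of `summarize_video`:

def get_important_sentences(important_sentence_idxs, connection_idxs):
    for i, val in enumerate(important_sentence_idxs):
        if val:
            # Pull in connected sentences to the right
            j = i
            while j < len(connection_idxs) and connection_idxs[j]:
                important_sentence_idxs[j + 1] = True
                j += 1
            # Pull in connected sentences to the left
            j = i - 1
            while j >= 0 and connection_idxs[j]:
                important_sentence_idxs[j] = True
                j -= 1
    return important_sentence_idxs

# Sentence 2 matched the summary; sentences 0-1-2 form a connected run,
# so the whole run is kept while sentence 3 stays out.
print(get_important_sentences([False, False, True, False], [True, True, False]))
# -> [True, True, True, False]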
 
@@ -135,17 +186,23 @@ def summarize_video(video_path, ratio_sum, playback_speed):
             # clips[c+1] = clips[c+1].crossfadein(fade_duration).audio_fadein(fade_duration)
 
     # Concatenate the videos and change the playback speed
+    ## Concatenate the clips
     final_video = concatenate_videoclips(clips, method="chain")
+    ## Convert the audio to a numpy array
     final_video_audio = np.array(final_video.audio.to_soundarray(fps=audio_sampling_rate))
+    ## Change the playback speed
     if playback_speed != 1:
         final_video_audio_fixed = tsm.wsola(final_video_audio, 1/playback_speed).T
     else:
         final_video_audio_fixed = final_video_audio
+    ## Change the video's playback speed and set its audio
     final_video = speedx(final_video, factor=playback_speed)
     final_video = final_video.set_audio(AudioArrayClip(final_video_audio_fixed, fps=audio_sampling_rate))
     # if final_video.duration > 30:
     #     final_video = final_video.subclip(0, 30)
+    ## Write the video to a file
     final_video.write_videofile(output_path)
     print(output_path)
     print("Success summarize video")
-    return output_path, summary_text, html_text
+    log_firestore(user_id, f'Summarize Ratio:{sim_thr},Playback Speed:{playback_speed}')
+    return output_path, summary_text, full_textt
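Note: only the tail of `next_prob` appears in the diff; it scores whether one sentence follows another with BERT's next-sentence-prediction head by exponentiating the two logits with some base `b` and normalizing. A sketch of the full helper, assuming the undiffed lines encode the pair in the usual way and that `b` is a base such as e (both are assumptions, since those lines are outside the hunks):

import math
import torch
from transformers import BertTokenizer, BertForNextSentencePrediction

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
next_sentence_predict = BertForNextSentencePrediction.from_pretrained("bert-base-cased").eval()

def next_prob(sentence_a, sentence_b, b=math.e):
    # Assumption: the lines hidden from the diff encode the pair like this.
    inputs = tokenizer(sentence_a, sentence_b, return_tensors="pt")
    with torch.no_grad():
        logits = next_sentence_predict(**inputs).logits  # [is_next, not_next]
    pos = b ** logits[0, 0]
    neg = b ** logits[0, 1]
    return float(pos / (pos + neg))

# Pairs scoring above 0.88 are treated as connected in summarize_video.
print(next_prob("He opened the door.", "Then he stepped inside."))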
requirements.txt CHANGED
@@ -93,3 +93,4 @@ watchdog==2.2.1
 websockets==10.4
 whisper==1.1.10
 yarl==1.8.2
+firebase-admin==6.1.0
utils.py CHANGED
@@ -1,30 +1,34 @@
-import copy
-import subprocess
+import os
+import json
+import base64
 
+import firebase_admin
+from firebase_admin import credentials
+from firebase_admin import firestore
 from pytube import YouTube
 from scipy.signal import resample
-import numpy as np
-import pytsmod as tsm
 
-from moviepy.audio.AudioClip import AudioArrayClip
-from moviepy.editor import *
-from moviepy.video.fx.speedx import speedx
-
-from sentence_transformers import SentenceTransformer, util
-from transformers import pipeline, BertTokenizer, BertForNextSentencePrediction
-import torch
-import whisper
-
-subprocess.run(['apt-get', '-y', 'install', 'imagemagick'])
-
-transcriber = whisper.load_model("medium")
-sentence_transformer = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
-tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
-next_sentence_predict = BertForNextSentencePrediction.from_pretrained("bert-base-cased").eval()
-summarizer = pipeline("summarization", model="philschmid/bart-large-cnn-samsum")
-
-def get_youtube(video_url):
+# Read the secret key from an environment variable
+encoded_key = os.environ["FIREBASE_CREDENTIALS_BASE64"]
+# Decode the Base64-encoded secret key
+decoded_key = base64.b64decode(encoded_key)
+# Build a Credential object from the decoded key
+cred = credentials.Certificate(json.loads(decoded_key))
+# Initialize the Firebase Admin SDK
+firebase_admin.initialize_app(cred)
+# Create the Firestore client (requires initialize_app to have run first)
+db = firestore.client()
+
+def log_firestore(user_id="000000", message="test"):
+    doc_ref = db.collection("button_clicks").document()
+    doc_ref.set({
+        "user_id": user_id,
+        "message": message,
+        "timestamp": firestore.SERVER_TIMESTAMP
+    })
+
+def get_youtube(user_id, video_url):
     # Download the YouTube video
+    log_firestore(user_id=user_id, message=f'Download Video:{video_url}')
     print("Start download video")
     yt = YouTube(video_url)
     abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download(filename='download.mp4', output_path='movies/')
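Note: utils.py now expects the Firebase service-account key in the FIREBASE_CREDENTIALS_BASE64 environment variable. One way to produce that value, assuming a key file downloaded from the Firebase console (the filename here is a placeholder):

import base64

# Placeholder path to a Firebase service-account key JSON.
with open("service-account.json", "rb") as f:
    print(base64.b64encode(f.read()).decode("ascii"))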