Spaces:
Sleeping
Sleeping
File size: 6,799 Bytes
8ad5dc2 1f5c279 8ad5dc2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
import copy
import os
import subprocess

import numpy as np
import pytsmod as tsm
import torch
import whisper
from moviepy.audio.AudioClip import AudioArrayClip
from moviepy.editor import *
from moviepy.video.fx.speedx import speedx
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline, BertTokenizer, BertForNextSentencePrediction

from utils import two_chnnel_to_one_channel, convert_sample_rate
# --- Import-time setup -------------------------------------------------------
# Everything below runs once, when this module is imported.

# Install ImageMagick through apt-get.
# NOTE(review): the return code is not checked, so a failed install goes
# unnoticed here; presumably this targets a Debian-based container where the
# process may run apt-get -- confirm.
subprocess.run(['apt-get', '-y', 'install', 'imagemagick'])

# Whisper "medium" checkpoint; summarize_video() calls its .transcribe().
transcriber = whisper.load_model("medium")

# Sentence-embedding model; summarize_video() uses it to compare transcript
# sentences with the sentences of the generated summary.
sentence_transformer = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Tokenizer and next-sentence-prediction head from the same "bert-base-cased"
# checkpoint; .eval() puts the model in inference mode.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
next_sentence_predict = BertForNextSentencePrediction.from_pretrained("bert-base-cased").eval()

# Summarization pipeline applied to the full transcript text.
summarizer = pipeline("summarization", model="philschmid/bart-large-cnn-samsum")
def _group_segments_into_sentences(segments, periods=('.', '!', '?')):
    """Merge consecutive transcription segments into sentence records.

    Args:
        segments: Iterable of dicts with ``'text'``, ``'start'`` and ``'end'``
            keys (the ``'segments'`` list of the transcription result).
        periods: Characters that terminate a sentence.

    Returns:
        A list of dicts, one per sentence, with keys ``'sentence'`` (joined
        text), ``'sentences'`` (segment texts), ``'duration'``
        (``[start, end]`` of the whole sentence) and ``'durations'``
        (``[start, end]`` of every segment).
    """
    sentences = []
    at_sentence_start = True
    for segment in segments:
        text = segment['text']
        if at_sentence_start:
            sentences.append({
                'sentence': '',
                'sentences': [],
                'duration': [segment['start'], None],
                'durations': [],
            })
        current = sentences[-1]
        current['sentence'] += text
        current['sentences'].append(text)
        current['durations'].append([segment['start'], segment['end']])
        # Update the end on every segment so that a trailing sentence without
        # terminal punctuation never keeps ``None`` as its end time.
        current['duration'][1] = segment['end']
        # Tolerate trailing whitespace after the terminating punctuation.
        at_sentence_start = text.rstrip().endswith(periods)
    return sentences


def _next_sentence_probability(prompt, next_sentence, b=1.2):
    """Score how likely ``next_sentence`` continues ``prompt``.

    The two next-sentence-prediction logits are turned into a probability
    with a softmax of base ``b`` (a base below ``e`` flattens the
    distribution). ``logits[0, 0]`` is treated as the "continues" score and
    ``logits[0, 1]`` as the "does not continue" score.

    Returns:
        A float in ``[0, 1]``.
    """
    # Truncate so that a very long sentence pair cannot exceed the model's
    # maximum input length.
    encoding = tokenizer(prompt, next_sentence, return_tensors="pt", truncation=True)
    # Inference only: no autograd graph and no labels (labels would merely
    # trigger an unused loss computation).
    with torch.no_grad():
        logits = next_sentence_predict(**encoding).logits
    pos = b ** logits[0, 0]
    neg = b ** logits[0, 1]
    return float(pos / (pos + neg))


def _propagate_importance(important, connected):
    """Extend importance flags along chains of connected neighbours.

    Args:
        important: ``important[i]`` is True when sentence ``i`` matched the
            summary.
        connected: ``connected[i]`` is True when sentence ``i`` and sentence
            ``i + 1`` belong together (one element shorter than
            ``important``).

    Returns:
        A new list in which every sentence reachable from an important
        sentence through an unbroken run of connections is also True.
    """
    result = list(important)
    for i, flag in enumerate(important):
        if not flag:
            continue
        # Walk forward: connected[j] links sentence j to sentence j + 1.
        j = i
        while j < len(connected) and connected[j]:
            result[j + 1] = True
            j += 1
        # Walk backward: connected[j - 1] links sentence j - 1 to sentence j,
        # so the sentence to mark is j - 1.
        j = i
        while j > 0 and connected[j - 1]:
            result[j - 1] = True
            j -= 1
    return result


def _render_transcript_html(clip_sentences, important_idxs, playback_speed):
    """Build the HTML transcript, with kept sentences rendered in bold.

    Each line is prefixed with the sentence start time divided by
    ``playback_speed``, formatted as ``m:ss``.
    """
    parts = ["<h1 class='title'>Full Transcription</h1>"]
    for sentence, keep in zip(clip_sentences, important_idxs):
        seconds = sentence['duration'][0] * (1/playback_speed)
        minutes, seconds = divmod(seconds, 60)
        stamp = f"{int(minutes)}:{int(seconds):02} | {sentence['sentence']}"
        if keep:
            parts.append('<p> <b>' + f"{stamp} </b> </p>")
        else:
            # Open the paragraph that is closed at the end of the line.
            parts.append('<p>' + f"{stamp}</p>")
    return "".join(parts)


def summarize_video(video_path, ratio_sum, playback_speed):
    """Cut a video down to the sentences that carry its summary.

    Pipeline: transcribe the audio track, group the segments into sentences,
    summarize the transcript, keep every sentence whose embedding is similar
    enough to a summary sentence (plus neighbours the next-sentence model
    considers connected), then concatenate the matching clips and apply the
    playback speed with pitch-preserving time stretching (WSOLA).

    Args:
        video_path: Path of the input video. The result is written as
            ``output.mp4`` into the same directory.
        ratio_sum: Cosine-similarity threshold above which a transcript
            sentence counts as matching a summary sentence.
        playback_speed: Speed factor applied to the final video (1 keeps the
            original speed).

    Returns:
        Tuple ``(output_path, summary_text, html_text)``.

    Raises:
        ValueError: If no part of the video was selected for the summary.
    """
    print("Start summarize video")
    output_path = os.path.join(os.path.dirname(video_path), 'output.mp4')
    movie_clip = VideoFileClip(video_path)
    try:
        audio_sampling_rate = movie_clip.audio.fps
        clip_audio = np.array(movie_clip.audio.to_soundarray())

        # Transcription: resample to 16 kHz, mix down to one channel and cast
        # to float32 before handing the audio to the transcriber.
        print("Start transcribing text")
        audio_fp32 = convert_sample_rate(clip_audio, audio_sampling_rate, 16000)
        audio_fp32 = two_chnnel_to_one_channel(audio_fp32).astype(np.float32)
        transcription_results = transcriber.transcribe(audio_fp32)

        # Collect text and speech times per sentence.
        print("Start summarizing text/speech time")
        clip_sentences = _group_segments_into_sentences(transcription_results['segments'])

        # Summarize the transcript. truncation=True keeps transcripts longer
        # than the model's input limit from crashing the pipeline; only the
        # part that fits is summarized in that case.
        print("Start summarizing sentences")
        transcription = transcription_results['text']
        summary_text = summarizer(
            transcription,
            max_length=int(len(transcription)*0.1),
            min_length=int(len(transcription)*0.05),
            do_sample=False,
            truncation=True,
        )[0]['summary_text']
        print(summary_text)

        # Find the transcript sentences that match a summary sentence.
        print("Start deleting sentences that match the summary sentence")
        # Splitting on '.' leaves empty/whitespace fragments (e.g. after the
        # final period); skip them so they cannot produce spurious matches.
        summary_embedings = [
            sentence_transformer.encode(fragment, convert_to_tensor=True)
            for fragment in summary_text.split('.')
            if fragment.strip()
        ]
        important_sentence_idxs = []
        for clip_sentence in clip_sentences:
            embedding = sentence_transformer.encode(clip_sentence['sentence'], convert_to_tensor=True)
            important_sentence_idxs.append(any(
                util.pytorch_cos_sim(embedding, s_e).item() > ratio_sum
                for s_e in summary_embedings
            ))

        # Find the sentences that are connected to their successor.
        print("Start identifying sentences that are connected to the sentence next to it")
        connection_idxs = [
            _next_sentence_probability(current['sentence'], following['sentence']) > 0.88
            for current, following in zip(clip_sentences, clip_sentences[1:])
        ]

        # Keep the matching sentences plus everything connected to them.
        important_idxs = _propagate_importance(important_sentence_idxs, connection_idxs)

        # Visualize which sentences survive the summarization.
        html_text = _render_transcript_html(clip_sentences, important_idxs, playback_speed)
        print(html_text)

        # Cut one clip per kept sentence.
        print("Start combine movies")
        clips = []
        for clip_sentence, keep in zip(clip_sentences, important_idxs):
            if not keep:
                continue
            tmp_clips = []
            for start_time, end_time in clip_sentence['durations']:
                # Clamp to the video length and drop segments that start past
                # the end or would be empty.
                end_time = min(end_time, movie_clip.duration)
                if start_time >= end_time:
                    continue
                clip = movie_clip.subclip(start_time, end_time)
                clip = clip.set_pos("center").set_duration(end_time-start_time)
                tmp_clips.append(clip)
            # concatenate_videoclips cannot handle an empty list.
            if tmp_clips:
                clips.append(concatenate_videoclips(tmp_clips))

        if not clips:
            raise ValueError(
                "No sentences were selected for the summary; "
                "try lowering ratio_sum."
            )

        # Join the clips and change the playback speed.
        final_video = concatenate_videoclips(clips, method="chain")
        final_video_audio = np.array(final_video.audio.to_soundarray(fps=audio_sampling_rate))
        if playback_speed != 1:
            # The sound array is (samples, channels) while pytsmod works on
            # (channels, samples): transpose explicitly instead of relying on
            # the library's fallback. atleast_2d restores the channel axis if
            # a single channel comes back as a 1-D array.
            stretched = tsm.wsola(final_video_audio.T, 1/playback_speed)
            final_video_audio_fixed = np.atleast_2d(stretched).T
        else:
            final_video_audio_fixed = final_video_audio
        final_video = speedx(final_video, factor=playback_speed)
        final_video = final_video.set_audio(AudioArrayClip(final_video_audio_fixed, fps=audio_sampling_rate))
        final_video.write_videofile(output_path)
        print(output_path)
        print("Success summarize video")
        return output_path, summary_text, html_text
    finally:
        # Release the file reader held by the source clip.
        movie_clip.close()