shixian.shi committed on
Commit
76934e1
1 Parent(s): 3c166d6
README.md CHANGED
@@ -1,13 +1,21 @@
  ---
- title: Funasr App Clipvideo
- emoji: 🐢
- colorFrom: blue
- colorTo: yellow
- sdk: gradio
- sdk_version: 3.34.0
- app_file: app.py
- pinned: false
- license: mit
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
  ---
+ # See the detailed documentation at https://modelscope.cn/docs/%E5%88%9B%E7%A9%BA%E9%97%B4%E5%8D%A1%E7%89%87
+ domain: # field: cv/nlp/audio/multi-modal/AutoML
+ - audio
+ tags: # custom tags
+ - ClipVideo
+ models: # associated models
+ - damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch
+ - damo/speech_timestamp_prediction-v1-16k-offline
+ deployspec:
+   cpu: 6
+   memory: 87000
+   gpu: 1
+   instance_type: ecs.gn6e-c12g1.3xlarge
+   image_id: mshub-registry-vpc.cn-zhangjiakou.cr.aliyuncs.com/modelscope-repo/studio-service-gpu:funasr-clipvideo-gradio-3.29.0-2
+ license: Apache License 2.0
  ---
+ #### Clone with HTTP
+ ```bash
+ git clone https://www.modelscope.cn/studios/damo/funasr_app_clipvideo.git
+ ```
app.py ADDED
@@ -0,0 +1,137 @@
+ import gradio as gr
+ from modelscope.pipelines import pipeline
+ from modelscope.utils.constant import Tasks
+ from videoclipper import VideoClipper
+
+
+ if __name__ == "__main__":
+     inference_pipeline = pipeline(
+         task=Tasks.auto_speech_recognition,
+         model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
+         vad_model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
+         punc_model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
+     )
+     audio_clipper = VideoClipper(inference_pipeline)
+
+     def audio_recog(audio_input):
+         return audio_clipper.recog(audio_input)
+
+     def audio_clip(dest_text, start_ost, end_ost, state):
+         return audio_clipper.clip(dest_text, start_ost, end_ost, state)
+
+     def video_recog(video_input):
+         return audio_clipper.video_recog(video_input)
+
+     def video_clip(dest_text, start_ost, end_ost, state):
+         return audio_clipper.video_clip(dest_text, start_ost, end_ost, state)
+
+     def video_clip_addsub(dest_text, start_ost, end_ost, state, font_size, font_color):
+         return audio_clipper.video_clip(dest_text, start_ost, end_ost, state, font_size, font_color, add_sub=True)
+
+     '''
+ top_md_1 = ("""
+ 基于达摩院自研Paraformer-长音频版的语音识别、端点检测、标点预测、时间戳功能
+
+ 准确识别,自由复制所需段落并一键裁剪、添加字幕
+
+ * Step1: 上传视频文件(或使用下方的用例体验),点击 **<font color="#f7802b">识别</font>** 按钮
+ * Step2: 复制识别结果中所需的文字至右上方,设置偏移与字幕配置(可选)
+ * Step3: 点击 **<font color="#f7802b">裁剪</font>** 按钮或 **<font color="#f7802b">裁剪并添加字幕</font>** 按钮获得结果
+ """)
+     '''
+
+     top_md_2 = ("""
+ 受到网络传输与服务资源的限制,用于体验的视频最好大小在40mb以下
+ 过大的视频可以尝试分离音轨使用音频剪辑,或 **<font color="#1785c4">通过源代码将您的ClipVideo服务部署在本地(推荐)</font>** :
+ <div align="center">
+ <div style="display:flex; gap: 0.25rem;" align="center">
+ FunASR_APP: <a href='https://github.com/alibaba/funasr-app'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
+ 🌟支持我们: <a href='https://github.com/alibaba/funasr-app/stargazers'><img src='https://img.shields.io/github/stars/alibaba/funasr-app.svg?style=social'></a>
+ </div>
+ </div>
+     """)
+
+     top_md_3 = ("""访问FunASR项目与论文能够帮助您深入了解ClipVideo中所使用的语音处理相关模型:
+ <div align="center">
+ <div style="display:flex; gap: 0.25rem;" align="center">
+ FunASR: <a href='https://github.com/alibaba-damo-academy/FunASR'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
+ FunASR Paper: <a href="https://arxiv.org/abs/2305.11013"><img src="https://img.shields.io/badge/Arxiv-2305.11013-orange"></a>
+ 🌟Star FunASR: <a href='https://github.com/alibaba-damo-academy/FunASR/stargazers'><img src='https://img.shields.io/github/stars/alibaba-damo-academy/FunASR.svg?style=social'></a>
+ </div>
+ </div>
+     """)
+
+     # gradio interface
+     with gr.Blocks() as demo:
+         # gr.Image("./examples/guide.png", show_label=False)
+         # gr.Markdown(top_md_1)
+         # gr.Markdown(top_md_2)
+         # gr.Markdown(top_md_3)
+         video_state = gr.State()
+         audio_state = gr.State()
+         with gr.Tab("🎥✂️视频裁剪 Video Clipping"):
+             with gr.Row():
+                 with gr.Column():
+                     video_input = gr.Video(label="🎥视频输入 Video Input")
+                     gr.Examples(['examples/2022云栖大会_片段2.mp4',
+                                  'examples/2022云栖大会_片段.mp4',
+                                  'examples/为什么要多读书?这是我听过最好的答案-片段.mp4',
+                                  'examples/使用chatgpt_片段.mp4'],
+                                 [video_input])
+                     recog_button2 = gr.Button("👂识别 Recognize")
+                     video_text_output = gr.Textbox(label="✏️识别结果 Recognition Result")
+                     video_srt_output = gr.Textbox(label="📖SRT字幕内容 SRT Subtitles")
+                 with gr.Column():
+                     video_text_input = gr.Textbox(label="✏️待裁剪文本 Text to Clip (多段文本使用'#'连接)")
+                     with gr.Row():
+                         video_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50, label="⏪开始位置偏移 Start Offset (ms)")
+                         video_end_ost = gr.Slider(minimum=-500, maximum=1000, value=100, step=50, label="⏩结束位置偏移 End Offset (ms)")
+                     with gr.Row():
+                         font_size = gr.Slider(minimum=10, maximum=100, value=32, step=2, label="🔠字幕字体大小 Subtitle Font Size")
+                         font_color = gr.Radio(["black", "white", "green", "red"], label="🌈字幕颜色 Subtitle Color", value='white')
+                         # font = gr.Radio(["黑体", "Alibaba Sans"], label="字体 Font")
+                     with gr.Row():
+                         clip_button2 = gr.Button("✂️裁剪\nClip")
+                         clip_button3 = gr.Button("✂️裁剪并添加字幕\nClip and Generate Subtitles")
+                     video_output = gr.Video(label="🎥裁剪结果 Video Clipped")
+                     video_mess_output = gr.Textbox(label="ℹ️裁剪信息 Clipping Log")
+                     video_srt_clip_output = gr.Textbox(label="📖裁剪部分SRT字幕内容 Clipped SRT Subtitles")
+
+         with gr.Tab("🔊✂️音频裁剪 Audio Clipping"):
+             with gr.Row():
+                 with gr.Column():
+                     audio_input = gr.Audio(label="🔊音频输入 Audio Input")
+                     gr.Examples(['examples/鲁肃采访片段1.wav'], [audio_input])
+                     recog_button1 = gr.Button("👂识别 Recognize")
+                     audio_text_output = gr.Textbox(label="✏️识别结果 Recognition Result")
+                     audio_srt_output = gr.Textbox(label="📖SRT字幕内容 SRT Subtitles")
+                 with gr.Column():
+                     audio_text_input = gr.Textbox(label="✏️待裁剪文本 Text to Clip (多段文本使用'#'连接)")
+                     with gr.Row():
+                         audio_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50, label="⏪开始位置偏移 Start Offset (ms)")
+                         audio_end_ost = gr.Slider(minimum=-500, maximum=1000, value=100, step=50, label="⏩结束位置偏移 End Offset (ms)")
+                     with gr.Row():
+                         clip_button1 = gr.Button("✂️裁剪 Clip")
+                     audio_output = gr.Audio(label="🔊裁剪结果 Audio Clipped")
+                     audio_mess_output = gr.Textbox(label="ℹ️裁剪信息 Clipping Log")
+                     audio_srt_clip_output = gr.Textbox(label="📖裁剪部分SRT字幕内容 Clipped SRT Subtitles")
+
+         recog_button1.click(audio_recog,
+                             inputs=audio_input,
+                             outputs=[audio_text_output, audio_srt_output, audio_state])
+         clip_button1.click(audio_clip,
+                            inputs=[audio_text_input, audio_start_ost, audio_end_ost, audio_state],
+                            outputs=[audio_output, audio_mess_output, audio_srt_clip_output])
+
+         recog_button2.click(video_recog,
+                             inputs=video_input,
+                             outputs=[video_text_output, video_srt_output, video_state])
+         clip_button2.click(video_clip,
+                            inputs=[video_text_input, video_start_ost, video_end_ost, video_state],
+                            outputs=[video_output, video_mess_output, video_srt_clip_output])
+         clip_button3.click(video_clip_addsub,
+                            inputs=[video_text_input, video_start_ost, video_end_ost, video_state, font_size, font_color],
+                            outputs=[video_output, video_mess_output, video_srt_clip_output])
+
+     # start the gradio service locally
+     demo.queue(concurrency_count=3).launch()
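
For readers who want to drive the same pipeline without the Gradio UI, here is a minimal sketch of the audio path using only the APIs added in this commit; the file names and the target text are placeholders, not part of the repository:

```python
# Hedged sketch: offline use of the recognize-then-clip flow from app.py.
import librosa
import soundfile as sf
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from videoclipper import VideoClipper

inference_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
    vad_model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
    punc_model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
)
clipper = VideoClipper(inference_pipeline)

# 'my_audio.wav' is a placeholder; recog expects a (sample_rate, samples) tuple.
wav, sr = librosa.load('my_audio.wav', sr=16000)
text, srt, state = clipper.recog((sr, wav))

# Keep every span whose transcript matches the target text (also a placeholder),
# apply no start/end offset, and write the concatenated result to disk.
(out_sr, out_wav), message, clip_srt = clipper.clip('需要保留的句子', 0, 0, state)
sf.write('my_audio_clip.wav', out_wav, out_sr)
print(message)
```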
examples/2022云栖大会_片段.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c4d171002592d4b87917349d75c9c4fdfd99e4186f8aa71420d6d98acbeb31ca
+ size 36077615
examples/2022云栖大会_片段2.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5e09b542f45373db284228cdf1507e28d926b4a2ab4697a20e1dca04d5ad0c88
+ size 6715481
examples/guide.png ADDED

Git LFS Details

  • SHA256: 1253633ed9a55e9cdbcb19a3ac9e9c656d5940a7d44b368d953477ed3214bf48
  • Pointer size: 131 Bytes
  • Size of remote file: 329 kB
examples/为什么要多读书?这是我听过最好的答案-片段.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3901cea588badd3ebc502428d983c3d1413edf5159db134ec82e92101535989f
+ size 46912031
examples/使用chatgpt_片段.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3b7c448b49a96c239d19a0e7e5c14a7a8f558231f700e93722cb0b3da2991cfb
+ size 12033342
examples/鲁肃采访片段1.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5b57d8e54fd1a6d56dd16c969ac89b6570e8f73eedc99b12a18a8112a13377d0
+ size 6086432
font/STHeitiMedium.ttc ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f8fa4a63e2cf500e98e64d4c73260daaba049306cf85dec9e3729bc285b7d645
+ size 55754164
subtitle_utils.py ADDED
@@ -0,0 +1,105 @@
+ def time_convert(ms):
+     ms = int(ms)
+     tail = ms % 1000
+     s = ms // 1000
+     mi = s // 60
+     s = s % 60
+     h = mi // 60
+     mi = mi % 60
+     h = "00" if h == 0 else str(h)
+     mi = "00" if mi == 0 else str(mi)
+     s = "00" if s == 0 else str(s)
+     tail = str(tail)
+     if len(h) == 1: h = '0' + h
+     if len(mi) == 1: mi = '0' + mi
+     if len(s) == 1: s = '0' + s
+     return "{}:{}:{},{}".format(h, mi, s, tail)
+
+
+ class Text2SRT():
+     def __init__(self, text_seg, ts_list, offset=0):
+         self.token_list = [i for i in text_seg.split() if len(i)]
+         self.ts_list = ts_list
+         start, end = ts_list[0][0] - offset, ts_list[-1][1] - offset
+         self.start_sec, self.end_sec = start, end
+         self.start_time = time_convert(start)
+         self.end_time = time_convert(end)
+     def text(self):
+         res = ""
+         for word in self.token_list:
+             if '\u4e00' <= word <= '\u9fff':
+                 res += word
+             else:
+                 res += " " + word
+         return res
+     def len(self):
+         return len(self.token_list)
+     def srt(self):
+         return "{} --> {}\n{}\n".format(self.start_time, self.end_time, self.text())
+     def time(self):
+         return (self.start_sec/1000, self.end_sec/1000)
+
+
+ def generate_srt(sentence_list):
+     srt_total = ''
+     for i, d in enumerate(sentence_list):
+         t2s = Text2SRT(d['text_seg'], d['ts_list'])
+         srt_total += "{}\n{}".format(i, t2s.srt())
+     return srt_total
+
+ def generate_srt_clip(sentence_list, start, end, begin_index=0):
+     start, end = int(start * 1000), int(end * 1000)
+     srt_total = ''
+     cc = 1 + begin_index
+     subs = []
+     for i, d in enumerate(sentence_list):
+         if d['ts_list'][-1][1] <= start:
+             continue
+         if d['ts_list'][0][0] >= end:
+             break
+         # sentences fully inside the clip window
+         if (d['ts_list'][-1][1] < end and d['ts_list'][0][0] > start) or (d['ts_list'][-1][1] == end and d['ts_list'][0][0] == start):
+             t2s = Text2SRT(d['text_seg'], d['ts_list'], offset=start)
+             srt_total += "{}\n{}".format(cc, t2s.srt())
+             subs.append((t2s.time(), t2s.text()))
+             cc += 1
+             continue
+         if d['ts_list'][0][0] <= start:
+             if not d['ts_list'][-1][1] > end:
+                 for j, ts in enumerate(d['ts_list']):
+                     if ts[1] > start:
+                         break
+                 _text = " ".join(d['text_seg'].split()[j:])
+                 _ts = d['ts_list'][j:]
+             else:
+                 for j, ts in enumerate(d['ts_list']):
+                     if ts[1] > start:
+                         _start = j
+                         break
+                 for j, ts in enumerate(d['ts_list']):
+                     if ts[1] > end:
+                         _end = j
+                         break
+                 _text = " ".join(d['text_seg'].split()[_start:_end])
+                 _ts = d['ts_list'][_start:_end]
+             if len(_ts):
+                 t2s = Text2SRT(_text, _ts, offset=start)
+                 srt_total += "{}\n{}".format(cc, t2s.srt())
+                 subs.append((t2s.time(), t2s.text()))
+                 cc += 1
+             continue
+         if d['ts_list'][-1][1] > end:
+             for j, ts in enumerate(d['ts_list']):
+                 if ts[1] > end:
+                     break
+             _text = " ".join(d['text_seg'].split()[:j])
+             _ts = d['ts_list'][:j]
+             if len(_ts):
+                 t2s = Text2SRT(_text, _ts, offset=start)
+                 srt_total += "{}\n{}".format(cc, t2s.srt())
+                 subs.append(
+                     (t2s.time(), t2s.text())
+                 )
+                 cc += 1
+             continue
+     return srt_total, subs, cc
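
As a quick sanity check of the helpers above, here is a small, made-up example; 'text_seg' holds the space-separated tokens of a sentence and 'ts_list' its per-token [start_ms, end_ms] pairs, matching the shape the recognition output uses:

```python
from subtitle_utils import time_convert, generate_srt

print(time_convert(3723456))   # -> "01:02:03,456"

# Two fabricated sentences with per-token millisecond timestamps.
sentences = [
    {'text_seg': '大 家 好', 'ts_list': [[0, 300], [300, 600], [600, 900]]},
    {'text_seg': '欢 迎 收 看', 'ts_list': [[1200, 1450], [1450, 1700], [1700, 1950], [1950, 2200]]},
]
print(generate_srt(sentences))
# 0
# 00:00:00,0 --> 00:00:00,900
# 大家好
# 1
# 00:00:01,200 --> 00:00:02,200
# 欢迎收看
```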
trans_utils.py ADDED
@@ -0,0 +1,57 @@
+ PUNC_LIST = [',', '。', '!', '?', '、']
+
+
+ def pre_proc(text):
+     res = ''
+     for i in range(len(text)):
+         if text[i] in PUNC_LIST:
+             continue
+         if '\u4e00' <= text[i] <= '\u9fff':
+             if len(res) and res[-1] != " ":
+                 res += ' ' + text[i]+' '
+             else:
+                 res += text[i]+' '
+         else:
+             res += text[i]
+     if res[-1] == ' ':
+         res = res[:-1]
+     return res
+
+ def proc(raw_text, timestamp, dest_text):
+     # simple matching
+     ld = len(dest_text.split())
+     mi, ts = [], []
+     offset = 0
+     while True:
+         fi = raw_text.find(dest_text, offset, len(raw_text))
+         # import pdb; pdb.set_trace()
+         ti = raw_text[:fi].count(' ')
+         if fi == -1:
+             break
+         offset = fi + ld
+         mi.append(fi)
+         ts.append([timestamp[ti][0]*16, timestamp[ti+ld-1][1]*16])
+     # import pdb; pdb.set_trace()
+     return ts
+
+
+ def write_state(output_dir, state):
+     for key in ['/recog_res_raw', '/timestamp', '/sentences']:
+         with open(output_dir+key, 'w') as fout:
+             fout.write(str(state[key[1:]]))
+
+
+ def load_state(output_dir):
+     state = {}
+     with open(output_dir+'/recog_res_raw') as fin:
+         line = fin.read()
+         state['recog_res_raw'] = line
+     with open(output_dir+'/timestamp') as fin:
+         line = fin.read()
+         state['timestamp'] = eval(line)
+     with open(output_dir+'/sentences') as fin:
+         line = fin.read()
+         state['sentences'] = eval(line)
+     return state
+
+
videoclipper.py ADDED
@@ -0,0 +1,146 @@
+ import sys
+ import copy
+ import librosa
+ import logging
+ import argparse
+ import numpy as np
+ import soundfile as sf
+ import moviepy.editor as mpy
+ from modelscope.pipelines import pipeline
+ from modelscope.utils.constant import Tasks
+ from subtitle_utils import generate_srt, generate_srt_clip
+ from trans_utils import pre_proc, proc, write_state, load_state
+
+ from moviepy.editor import *
+ from moviepy.video.tools.subtitles import SubtitlesClip
+
+
+ class VideoClipper():
+     def __init__(self, asr_pipeline):
+         logging.warning("Initializing VideoClipper.")
+         self.asr_pipeline = asr_pipeline
+
+     def recog(self, audio_input, state=None):
+         if state is None:
+             state = {}
+         state['audio_input'] = audio_input
+         _, data = audio_input
+         data = data.astype(np.float64)
+         rec_result = self.asr_pipeline(audio_in=data)
+         state['recog_res_raw'] = rec_result['text_postprocessed']
+         state['timestamp'] = rec_result['time_stamp']
+         state['sentences'] = rec_result['sentences']
+         res_text = rec_result['text']
+         res_srt = generate_srt(rec_result['sentences'])
+         return res_text, res_srt, state
+
+     def clip(self, dest_text, start_ost, end_ost, state):
+         # get from state
+         audio_input = state['audio_input']
+         recog_res_raw = state['recog_res_raw']
+         timestamp = state['timestamp']
+         sentences = state['sentences']
+         sr, data = audio_input
+         data = data.astype(np.float64)
+
+         all_ts = []
+         for _dest_text in dest_text.split('#'):
+             _dest_text = pre_proc(_dest_text)
+             ts = proc(recog_res_raw, timestamp, _dest_text)
+             for _ts in ts: all_ts.append(_ts)
+         ts = all_ts
+         srt_index = 0
+         clip_srt = ""
+         if len(ts):
+             start, end = ts[0]
+             start = min(max(0, start+start_ost*16), len(data))
+             end = min(max(0, end+end_ost*16), len(data))
+             res_audio = data[start:end]
+             start_end_info = "from {} to {}".format(start/16000, end/16000)
+             srt_clip, _, srt_index = generate_srt_clip(sentences, start/16000.0, end/16000.0, begin_index=srt_index)
+             clip_srt += srt_clip
+             for _ts in ts[1:]:  # multiple sentence input or multiple output matched
+                 start, end = _ts
+                 start = min(max(0, start+start_ost*16), len(data))
+                 end = min(max(0, end+end_ost*16), len(data))
+                 start_end_info += ", from {} to {}".format(start/16000, end/16000)
+                 res_audio = np.concatenate([res_audio, data[start:end]], -1)
+                 srt_clip, _, srt_index = generate_srt_clip(sentences, start/16000.0, end/16000.0, begin_index=srt_index-1)
+                 clip_srt += srt_clip
+         if len(ts):
+             message = "{} periods found in the speech: ".format(len(ts)) + start_end_info
+         else:
+             message = "No matching periods were found in the speech; returning the raw audio. Check the recognition result and try different target text."
+             res_audio = data  # fall back to the raw audio when nothing matched
+         return (sr, res_audio), message, clip_srt
+
+     def video_recog(self, vedio_filename):
+         clip_video_file = vedio_filename[:-4] + '_clip.mp4'
+         video = mpy.VideoFileClip(vedio_filename)
+         audio_file = vedio_filename[:-3] + 'wav'
+         video.audio.write_audiofile(audio_file)
+         wav = librosa.load(audio_file, sr=16000)[0]
+         state = {
+             'vedio_filename': vedio_filename,
+             'clip_video_file': clip_video_file,
+             'video': video,
+         }
+         return self.recog((16000, wav), state)
+
+     def video_clip(self, dest_text, start_ost, end_ost, state, font_size=32, font_color='white', add_sub=False):
+         # get from state
+         recog_res_raw = state['recog_res_raw']
+         timestamp = state['timestamp']
+         sentences = state['sentences']
+         video = state['video']
+         clip_video_file = state['clip_video_file']
+         vedio_filename = state['vedio_filename']
+
+         all_ts = []
+         srt_index = 0
+         for _dest_text in dest_text.split('#'):
+             _dest_text = pre_proc(_dest_text)
+             ts = proc(recog_res_raw, timestamp, _dest_text)
+             for _ts in ts: all_ts.append(_ts)
+         ts = all_ts
+         clip_srt = ""
+         if len(ts):
+             start, end = ts[0][0] / 16000, ts[0][1] / 16000
+             start, end = start+start_ost/1000.0, end+end_ost/1000.0
+             video_clip = video.subclip(start, end)
+             start_end_info = "from {} to {}".format(start, end)
+             srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index)
+             clip_srt += srt_clip
+             if add_sub:
+                 generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
+                 subtitles = SubtitlesClip(subs, generator)
+                 video_clip = CompositeVideoClip([video_clip, subtitles.set_pos(('center', 'bottom'))])
+             concate_clip = [video_clip]
+             for _ts in ts[1:]:
+                 start, end = _ts[0] / 16000, _ts[1] / 16000
+                 start, end = start+start_ost/1000.0, end+end_ost/1000.0
+                 _video_clip = video.subclip(start, end)
+                 start_end_info += ", from {} to {}".format(start, end)
+                 srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index-1)
+                 clip_srt += srt_clip
+                 if add_sub:
+                     generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
+                     subtitles = SubtitlesClip(subs, generator)
+                     _video_clip = CompositeVideoClip([_video_clip, subtitles.set_pos(('center', 'bottom'))])
+                 concate_clip.append(copy.copy(_video_clip))
+             message = "{} periods found in the audio: ".format(len(ts)) + start_end_info
+             logging.warning("Concatenating...")
+             if len(concate_clip) > 1:
+                 video_clip = concatenate_videoclips(concate_clip)
+             video_clip.write_videofile(clip_video_file)
+         else:
+             clip_video_file = vedio_filename
+             message = "No matching periods were found in the audio; returning the original video. Check the recognition result and try different target text."
+             srt_clip = ''
+         return clip_video_file, message, clip_srt
+
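
Finally, a hedged sketch of the video path through VideoClipper; the video file name and target text are placeholders. video_recog extracts the audio track next to the video, runs recognition, and stores the moviepy clip in the returned state; video_clip then cuts the matched spans and, with add_sub=True, burns the generated SRT subtitles into the result:

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from videoclipper import VideoClipper

clipper = VideoClipper(pipeline(
    task=Tasks.auto_speech_recognition,
    model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
    vad_model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
    punc_model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
))

# 'examples/my_video.mp4' is a placeholder path.
text, srt, state = clipper.video_recog('examples/my_video.mp4')

# Cut the spans matching the target text (placeholder), pad the end by 100 ms,
# and burn in subtitles with the bundled font.
clip_file, message, clip_srt = clipper.video_clip(
    '需要保留的句子', 0, 100, state,
    font_size=32, font_color='white', add_sub=True,
)
print(message)
print(clip_file)   # path of the written *_clip.mp4
```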