Spaces: Upload 3 files
- app.py +154 -0
- subtitle_utils.py +130 -0
- videoclipper.py +172 -0
app.py
ADDED
@@ -0,0 +1,154 @@
import gradio as gr
import os

# Update modelscope
os.system("pip install -U modelscope -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -i https://mirror.sjtu.edu.cn/pypi/web/simple")

import datetime
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from subtitle_utils import generate_srt

# Get the current Beijing time (UTC+8)
beijing_tz = datetime.timezone(datetime.timedelta(hours=8))
beijing_dt = datetime.datetime.now(beijing_tz)
formatted = beijing_dt.strftime("%Y-%m-%d_%H")
print(f"Beijing time: {beijing_dt.year}-{beijing_dt.month}-{beijing_dt.day} "
      f"{beijing_dt.hour}:{beijing_dt.minute}:{beijing_dt.second}")

# Create the directory that holds generated files
works_path = '../works_audio_video_recognize/' + formatted
if not os.path.exists(works_path):
    os.makedirs(works_path)
print('Output directory: ' + works_path)

inference_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch')

def transcript(audiofile, text_file, srt_file):
    rec_result = inference_pipeline(audio_in=audiofile)

    text_output = rec_result['text']
    with open(text_file, "w") as f:
        f.write(text_output)

    srt_output = generate_srt(rec_result['sentences'])
    with open(srt_file, "w") as f:
        f.write(srt_output)

    return text_output, srt_output

def audio_recog(audiofile):
    beijing_dt = datetime.datetime.now(beijing_tz)
    print(f"Start time: {beijing_dt.year}-{beijing_dt.month}-{beijing_dt.day} "
          f"{beijing_dt.hour}:{beijing_dt.minute}:{beijing_dt.second}")

    print("Audio file: " + audiofile)

    filename = os.path.splitext(os.path.basename(audiofile))[0]
    text_file = works_path + '/' + filename + '.txt'
    srt_file = works_path + '/' + filename + '.srt'
    text_output, srt_output = transcript(audiofile, text_file, srt_file)

    beijing_dt = datetime.datetime.now(beijing_tz)
    print(f"End time: {beijing_dt.year}-{beijing_dt.month}-{beijing_dt.day} "
          f"{beijing_dt.hour}:{beijing_dt.minute}:{beijing_dt.second}")

    return text_output, text_file, srt_output, srt_file

def video_recog(filepath):
    filename = os.path.splitext(os.path.basename(filepath))[0]
    worksfile = works_path + '/works_' + filename + '.mp4'
    print("Video file: " + filepath)

    beijing_dt = datetime.datetime.now(beijing_tz)
    formatted = beijing_dt.strftime("%Y-%m-%d_%H-%M-%S.%f")

    # Extract the audio track as mp3
    audiofile = works_path + '/' + formatted + '.mp3'
    os.system(f"ffmpeg -i '{filepath}' -vn -c:a libmp3lame -q:a 4 '{audiofile}'")

    # Recognize the extracted audio
    text_output, text_file, srt_output, srt_file = audio_recog(audiofile)

    # # Burn the subtitles into the video
    # os.system(f"ffmpeg -i {filepath} -i {srt_file} -c:s mov_text -c:v copy -c:a copy {worksfile}")
    # print("Output file: " + worksfile)

    return text_output, text_file, srt_output, srt_file

css_style = "#fixed_size_img {height: 240px;} " \
            "#overview {margin: auto;max-width: 400px; max-height: 400px;}"

title = "音视频识别 by宁侠"
description = "您只需要上传一段音频或视频文件,我们的服务会快速对其进行语音识别,然后生成相应的文字和字幕。这样,您就可以轻松地记录下重要的语音内容,或者为视频添加精准的字幕。现在就来试试我们的音视频识别服务吧,让您的生活和工作更加便捷!"

examples_path = 'examples/'
examples = [[examples_path + 'demo_shejipuhui.mp4']]

# gradio interface
with gr.Blocks(title=title, css=css_style) as demo:
    gr.HTML('''
        <div style="text-align: center; max-width: 720px; margin: 0 auto;">
          <div
            style="
              display: inline-flex;
              align-items: center;
              gap: 0.8rem;
              font-size: 1.75rem;
            "
          >
            <h1 style="font-family: PingFangSC; font-weight: 500; font-size: 36px; margin-bottom: 7px;">
              音视频识别
            </h1>
            <h1 style="font-family: PingFangSC; font-weight: 500; line-height: 1.5em; font-size: 16px; margin-bottom: 7px;">
              by宁侠
            </h1>
          </div>
        </div>
    ''')
    gr.Markdown(description)

    with gr.Tab("🔊音频识别 Audio Transcribe"):
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(label="🔊音频输入 Audio Input", type="filepath")
                gr.Examples(['examples/paddlespeech.asr-zh.wav', 'examples/demo_shejipuhui.mp3'], [audio_input])
                audio_recog_button = gr.Button("👂音频识别 Recognize")
            with gr.Column():
                audio_text_output = gr.Textbox(label="✏️识别结果 Recognition Result", max_lines=5)
                audio_text_file = gr.File(label="✏️识别结果文件 Recognition Result File")
                audio_srt_output = gr.Textbox(label="📖SRT字幕内容 SRT Subtitles", max_lines=10)
                audio_srt_file = gr.File(label="📖SRT字幕文件 SRT File")
                audio_subtitles_button = gr.Button("添加字幕\nGenerate Subtitles", visible=False)
                audio_output = gr.Audio(label="🔊音频 Audio", visible=False)

        audio_recog_button.click(audio_recog, inputs=[audio_input], outputs=[audio_text_output, audio_text_file, audio_srt_output, audio_srt_file])
        # audio_subtitles_button.click(audio_subtitles, inputs=[audio_text_input], outputs=[audio_output])

    with gr.Tab("🎥视频识别 Video Transcribe"):
        with gr.Row():
            with gr.Column():
                video_input = gr.Video(label="🎥视频输入 Video Input")
                gr.Examples(['examples/demo_shejipuhui.mp4'], [video_input], label='语音识别示例 ASR Demo')
                video_recog_button = gr.Button("👂视频识别 Recognize")
                video_output = gr.Video(label="🎥视频 Video", visible=False)
            with gr.Column():
                video_text_output = gr.Textbox(label="✏️识别结果 Recognition Result", max_lines=5)
                video_text_file = gr.File(label="✏️识别结果文件 Recognition Result File")
                video_srt_output = gr.Textbox(label="📖SRT字幕内容 SRT Subtitles", max_lines=10)
                video_srt_file = gr.File(label="📖SRT字幕文件 SRT File")
                with gr.Row(visible=False):
                    font_size = gr.Slider(minimum=10, maximum=100, value=32, step=2, label="🔠字幕字体大小 Subtitle Font Size")
                    font_color = gr.Radio(["black", "white", "green", "red"], label="🌈字幕颜色 Subtitle Color", value='white')
                video_subtitles_button = gr.Button("添加字幕\nGenerate Subtitles", visible=False)

        video_recog_button.click(video_recog, inputs=[video_input], outputs=[video_text_output, video_text_file, video_srt_output, video_srt_file])
        # video_subtitles_button.click(video_subtitles, inputs=[video_text_input], outputs=[video_output])

# Start the gradio service locally
demo.queue(api_open=False).launch(debug=True)
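For quick local testing, `audio_recog` can also be exercised without the UI. A minimal sketch (hypothetical, placed before the `demo.queue(...).launch(...)` call, using an example wav already bundled with this Space):

    # Hypothetical smoke test: transcribe a bundled example and report the
    # paths of the generated transcript and SRT files.
    text, text_file, srt, srt_file = audio_recog('examples/paddlespeech.asr-zh.wav')
    print(text_file, srt_file)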
subtitle_utils.py
ADDED
@@ -0,0 +1,130 @@
def time_convert(ms):
    """Convert a millisecond offset to an SRT timestamp (HH:MM:SS,mmm)."""
    ms = int(ms)
    tail = ms % 1000
    s = ms // 1000
    mi = s // 60
    s = s % 60
    h = mi // 60
    mi = mi % 60
    # Zero-pad every field; the milliseconds part must be three digits.
    return "{:02d}:{:02d}:{:02d},{:03d}".format(h, mi, s, tail)


class Text2SRT():
    def __init__(self, text_seg, ts_list, offset=0):
        self.token_list = [i for i in text_seg.split() if len(i)]
        self.ts_list = ts_list
        start, end = ts_list[0][0] - offset, ts_list[-1][1] - offset
        self.start_sec, self.end_sec = start, end
        self.start_time = time_convert(start)
        self.end_time = time_convert(end)

    def text(self):
        # Join tokens: CJK tokens are concatenated, others get a leading space.
        res = ""
        for word in self.token_list:
            if '\u4e00' <= word <= '\u9fff':
                res += word
            else:
                res += " " + word
        return res

    def len(self):
        return len(self.token_list)

    def srt(self, acc_ost=0.0):
        return "{} --> {}\n{}\n".format(
            time_convert(self.start_sec + acc_ost * 1000),
            time_convert(self.end_sec + acc_ost * 1000),
            self.text())

    def time(self, acc_ost=0.0):
        return (self.start_sec / 1000 + acc_ost, self.end_sec / 1000 + acc_ost)


def distribute_spk(sentence_list, sd_time_list):
    # Assign each sentence the speaker whose diarization segment overlaps it most.
    sd_sentence_list = []
    for d in sentence_list:
        sentence_start = d['ts_list'][0][0]
        sentence_end = d['ts_list'][-1][1]
        sentence_spk = 0
        max_overlap = 0
        for sd_time in sd_time_list:
            spk_st, spk_ed, spk = sd_time
            spk_st = spk_st * 1000
            spk_ed = spk_ed * 1000
            overlap = max(
                min(sentence_end, spk_ed) - max(sentence_start, spk_st), 0)
            if overlap > max_overlap:
                max_overlap = overlap
                sentence_spk = spk
        d['spk'] = sentence_spk
        sd_sentence_list.append(d)
    return sd_sentence_list


def generate_srt(sentence_list):
    srt_total = ''
    for i, d in enumerate(sentence_list):
        t2s = Text2SRT(d['text_seg'], d['ts_list'])
        # SRT cue indices are 1-based and cues are separated by a blank line.
        if 'spk' in d:
            srt_total += "{} spk{}\n{}\n".format(i + 1, d['spk'], t2s.srt())
        else:
            srt_total += "{}\n{}\n".format(i + 1, t2s.srt())
    return srt_total


def generate_srt_clip(sentence_list, start, end, begin_index=0, time_acc_ost=0.0):
    start, end = int(start * 1000), int(end * 1000)
    srt_total = ''
    cc = 1 + begin_index
    subs = []
    for i, d in enumerate(sentence_list):
        if d['ts_list'][-1][1] <= start:
            continue
        if d['ts_list'][0][0] >= end:
            break
        # Sentences that lie fully inside the clip window
        if (d['ts_list'][-1][1] <= end and d['ts_list'][0][0] > start) or (d['ts_list'][-1][1] == end and d['ts_list'][0][0] == start):
            t2s = Text2SRT(d['text_seg'], d['ts_list'], offset=start)
            srt_total += "{}\n{}\n".format(cc, t2s.srt(time_acc_ost))
            subs.append((t2s.time(time_acc_ost), t2s.text()))
            cc += 1
            continue
        # Sentences that overlap the window start (and possibly also the end)
        if d['ts_list'][0][0] <= start:
            if not d['ts_list'][-1][1] > end:
                for j, ts in enumerate(d['ts_list']):
                    if ts[1] > start:
                        break
                _text = " ".join(d['text_seg'].split()[j:])
                _ts = d['ts_list'][j:]
            else:
                for j, ts in enumerate(d['ts_list']):
                    if ts[1] > start:
                        _start = j
                        break
                for j, ts in enumerate(d['ts_list']):
                    if ts[1] > end:
                        _end = j
                        break
                _text = " ".join(d['text_seg'].split()[_start:_end])
                _ts = d['ts_list'][_start:_end]
            if len(_ts):
                t2s = Text2SRT(_text, _ts, offset=start)
                srt_total += "{}\n{}\n".format(cc, t2s.srt(time_acc_ost))
                subs.append((t2s.time(time_acc_ost), t2s.text()))
                cc += 1
            continue
        # Sentences that overlap the window end
        if d['ts_list'][-1][1] > end:
            for j, ts in enumerate(d['ts_list']):
                if ts[1] > end:
                    break
            _text = " ".join(d['text_seg'].split()[:j])
            _ts = d['ts_list'][:j]
            if len(_ts):
                t2s = Text2SRT(_text, _ts, offset=start)
                srt_total += "{}\n{}\n".format(cc, t2s.srt(time_acc_ost))
                subs.append(
                    (t2s.time(time_acc_ost), t2s.text())
                )
                cc += 1
            continue
    return srt_total, subs, cc
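These helpers assume ASR output in which every sentence carries a whitespace-tokenized `text_seg` and a token-aligned `ts_list` of `[start_ms, end_ms]` pairs. A minimal sketch of `generate_srt` on hand-made data (the timestamps are invented for illustration):

    from subtitle_utils import generate_srt

    # Two sentences with per-token millisecond timestamps (illustrative only).
    sentences = [
        {'text_seg': '大 家 好', 'ts_list': [[0, 320], [320, 640], [640, 980]]},
        {'text_seg': '欢 迎 收 看', 'ts_list': [[1200, 1450], [1450, 1700], [1700, 1950], [1950, 2300]]},
    ]
    print(generate_srt(sentences))
    # 1
    # 00:00:00,000 --> 00:00:00,980
    # 大家好
    #
    # 2
    # 00:00:01,200 --> 00:00:02,300
    # 欢迎收看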
videoclipper.py
ADDED
@@ -0,0 +1,172 @@
import sys
import copy
import librosa
import logging
import argparse
import numpy as np
import soundfile as sf
import moviepy.editor as mpy
# from modelscope.pipelines import pipeline
# from modelscope.utils.constant import Tasks
from subtitle_utils import generate_srt, generate_srt_clip, distribute_spk
from trans_utils import pre_proc, proc, write_state, load_state, proc_spk, generate_vad_data
# from argparse_tools import ArgumentParser, get_commandline_args

from moviepy.editor import *
from moviepy.video.tools.subtitles import SubtitlesClip


class VideoClipper():
    def __init__(self, asr_pipeline, sd_pipeline=None):
        logging.warning("Initializing VideoClipper.")
        self.asr_pipeline = asr_pipeline
        self.sd_pipeline = sd_pipeline

    def recog(self, audio_input, sd_switch='no', state=None):
        if state is None:
            state = {}
        sr, data = audio_input
        assert sr == 16000, "16kHz sample rate required, {} given.".format(sr)
        if len(data.shape) == 2:  # multi-channel wav input
            logging.warning("Input wav shape: {}, only first channel reserved.".format(data.shape))
            data = data[:, 0]
        state['audio_input'] = (sr, data)
        data = data.astype(np.float64)
        rec_result = self.asr_pipeline(audio_in=data)
        if sd_switch == 'yes':
            vad_data = generate_vad_data(data.astype(np.float32), rec_result['sentences'], sr)
            sd_result = self.sd_pipeline(audio=vad_data, batch_size=1)
            rec_result['sd_sentences'] = distribute_spk(rec_result['sentences'], sd_result['text'])
            res_srt = generate_srt(rec_result['sd_sentences'])
            state['sd_sentences'] = rec_result['sd_sentences']
        else:
            res_srt = generate_srt(rec_result['sentences'])
        state['recog_res_raw'] = rec_result['text_postprocessed']
        state['timestamp'] = rec_result['time_stamp']
        state['sentences'] = rec_result['sentences']
        res_text = rec_result['text']
        return res_text, res_srt, state

    def clip(self, dest_text, start_ost, end_ost, state, dest_spk=None):
        # get from state
        audio_input = state['audio_input']
        recog_res_raw = state['recog_res_raw']
        timestamp = state['timestamp']
        sentences = state['sentences']
        sr, data = audio_input
        data = data.astype(np.float64)

        all_ts = []
        if dest_spk is None or dest_spk == '' or 'sd_sentences' not in state:
            for _dest_text in dest_text.split('#'):
                _dest_text = pre_proc(_dest_text)
                ts = proc(recog_res_raw, timestamp, _dest_text)
                for _ts in ts: all_ts.append(_ts)
        else:
            for _dest_spk in dest_spk.split('#'):
                ts = proc_spk(_dest_spk, state['sd_sentences'])
                for _ts in ts: all_ts.append(_ts)
        ts = all_ts
        ts.sort()
        srt_index = 0
        clip_srt = ""
        if len(ts):
            # Offsets are given in ms; at 16 kHz one ms is 16 samples.
            start, end = ts[0]
            start = min(max(0, start + start_ost * 16), len(data))
            end = min(max(0, end + end_ost * 16), len(data))
            res_audio = data[start:end]
            start_end_info = "from {} to {}".format(start / 16000, end / 16000)
            srt_clip, _, srt_index = generate_srt_clip(sentences, start / 16000.0, end / 16000.0, begin_index=srt_index)
            clip_srt += srt_clip
            for _ts in ts[1:]:  # multiple sentence input or multiple output matched
                start, end = _ts
                start = min(max(0, start + start_ost * 16), len(data))
                end = min(max(0, end + end_ost * 16), len(data))
                start_end_info += ", from {} to {}".format(start / 16000, end / 16000)
                # start and end already include the offsets applied above
                res_audio = np.concatenate([res_audio, data[start:end]], -1)
                srt_clip, _, srt_index = generate_srt_clip(sentences, start / 16000.0, end / 16000.0, begin_index=srt_index - 1)
                clip_srt += srt_clip
        if len(ts):
            message = "{} periods found in the speech: ".format(len(ts)) + start_end_info
        else:
            message = "No period found in the speech, return raw speech. You may check the recognition result and try other destination text."
            res_audio = data
        return (sr, res_audio), message, clip_srt

    def video_recog(self, video_filename, sd_switch='no'):
        clip_video_file = video_filename[:-4] + '_clip.mp4'
        video = mpy.VideoFileClip(video_filename)
        audio_file = video_filename[:-3] + 'wav'
        video.audio.write_audiofile(audio_file)
        wav = librosa.load(audio_file, sr=16000)[0]
        state = {
            'video_filename': video_filename,
            'clip_video_file': clip_video_file,
            'video': video,
        }
        # res_text, res_srt = self.recog((16000, wav), state)
        return self.recog((16000, wav), sd_switch, state)

    def video_clip(self, dest_text, start_ost, end_ost, state, font_size=32, font_color='white', add_sub=False, dest_spk=None):
        # get from state
        recog_res_raw = state['recog_res_raw']
        timestamp = state['timestamp']
        sentences = state['sentences']
        video = state['video']
        clip_video_file = state['clip_video_file']
        video_filename = state['video_filename']

        all_ts = []
        srt_index = 0
        if dest_spk is None or dest_spk == '' or 'sd_sentences' not in state:
            for _dest_text in dest_text.split('#'):
                _dest_text = pre_proc(_dest_text)
                ts = proc(recog_res_raw, timestamp, _dest_text)
                for _ts in ts: all_ts.append(_ts)
        else:
            for _dest_spk in dest_spk.split('#'):
                ts = proc_spk(_dest_spk, state['sd_sentences'])
                for _ts in ts: all_ts.append(_ts)
        time_acc_ost = 0.0
        ts = all_ts
        ts.sort()
        clip_srt = ""
        if len(ts):
            start, end = ts[0][0] / 16000, ts[0][1] / 16000
            srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index, time_acc_ost=time_acc_ost)
            start, end = start + start_ost / 1000.0, end + end_ost / 1000.0
            video_clip = video.subclip(start, end)
            start_end_info = "from {} to {}".format(start, end)
            clip_srt += srt_clip
            if add_sub:
                generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
                subtitles = SubtitlesClip(subs, generator)
                video_clip = CompositeVideoClip([video_clip, subtitles.set_pos(('center', 'bottom'))])
            concate_clip = [video_clip]
            # Accumulate the duration of the appended clip (offsets already applied)
            time_acc_ost += end - start
            for _ts in ts[1:]:
                start, end = _ts[0] / 16000, _ts[1] / 16000
                srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index - 1, time_acc_ost=time_acc_ost)
                start, end = start + start_ost / 1000.0, end + end_ost / 1000.0
                _video_clip = video.subclip(start, end)
                start_end_info += ", from {} to {}".format(start, end)
                clip_srt += srt_clip
                if add_sub:
                    generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
                    subtitles = SubtitlesClip(subs, generator)
                    _video_clip = CompositeVideoClip([_video_clip, subtitles.set_pos(('center', 'bottom'))])
                concate_clip.append(copy.copy(_video_clip))
                time_acc_ost += end - start
            message = "{} periods found in the audio: ".format(len(ts)) + start_end_info
            logging.warning("Concatenating...")
            if len(concate_clip) > 1:
                video_clip = concatenate_videoclips(concate_clip)
            video_clip.write_videofile(clip_video_file, audio_codec="aac")
        else:
            clip_video_file = video_filename
            message = "No period found in the audio, return raw speech. You may check the recognition result and try other destination text."
        return clip_video_file, message, clip_srt
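A minimal wiring sketch for `VideoClipper`, assuming the same ModelScope Paraformer pipeline that app.py builds, and assuming `trans_utils` (imported above but not among the three uploaded files) is importable; the destination text is illustrative:

    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks
    from videoclipper import VideoClipper

    asr = pipeline(
        task=Tasks.auto_speech_recognition,
        model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch')

    clipper = VideoClipper(asr)
    # Recognize first; the returned state carries the loaded video and timestamps.
    text, srt, state = clipper.video_recog('examples/demo_shejipuhui.mp4')
    # Cut every span whose recognized text matches the (illustrative) target,
    # with zero start/end offsets and no burned-in subtitles.
    clip_file, message, clip_srt = clipper.video_clip('欢迎', 0, 0, state)
    print(message)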