R1ckShi committed on
Commit
1427ef7
1 Parent(s): 2aacd40

update to v2.0.0

__init__.py ADDED
File without changes
app.py CHANGED
@@ -1,138 +1,243 @@
1
  import gradio as gr
2
- from modelscope.pipelines import pipeline
3
- from modelscope.utils.constant import Tasks
4
  from videoclipper import VideoClipper
5
 
6
 
7
  if __name__ == "__main__":
8
- inference_pipeline = pipeline(
9
- task=Tasks.auto_speech_recognition,
10
- model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
11
- vad_model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
12
- punc_model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
13
- )
14
- audio_clipper = VideoClipper(inference_pipeline)
15
-
16
- def audio_recog(audio_input):
17
- return audio_clipper.recog(audio_input)
18
-
19
- def audio_clip(dest_text, start_ost, end_ost, state):
20
- return audio_clipper.clip(dest_text, start_ost, end_ost, state)
21
 
22
- def video_recog(video_input):
23
- return audio_clipper.video_recog(video_input)
24
 
25
- def video_clip(dest_text, start_ost, end_ost, state):
26
- return audio_clipper.video_clip(dest_text, start_ost, end_ost, state)
27
 
28
- def video_clip_addsub(dest_text, start_ost, end_ost, state, font_size, font_color):
29
- return audio_clipper.video_clip(dest_text, start_ost, end_ost, state, font_size, font_color, add_sub=True)
 
 
30
31
 
32
- top_md_1 = ("""
33
- A video clip tool based on Paraformer-long's VAD, ASR, timestamp prediction, punctuation restoration abilities.
34
-
35
- Get the video clip simply following steps:
36
-
37
- * Step1: Upload video file (or try examples below), click **<font color="#f7802b">Recognize</font>** button
38
- * Step2: Copy text segments you need to 'Text to Clip', set the subtitle settings (if you need)
39
- * Step3: Click **<font color="#f7802b">Clip</font>** button or **<font color="#f7802b">Clip and Generate Subtitles</font>** button
40
- """)
41
 
42
-
43
- top_md_2 = ("""
44
- The video had better to have size under 40Mb,
45
- For video in large size, you can split the audio from it and use 'Audio Clip',
46
- or **<font color="#1785c4">establish your own gradio service with the source code (recommended)</font>** :
47
- <div align="center">
48
- <div style="display:flex; gap: 0.25rem;" align="center">
49
- FunASR_APP: <a href='https://github.com/alibaba/funasr-app'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
50
- 🌟Support Us: <a href='https://github.com/alibaba/funasr-app/stargazers'><img src='https://img.shields.io/github/stars/alibaba/funasr-app.svg?style=social'></a>
51
- </div>
52
- </div>
53
- """)
54
-
55
- top_md_3 = ("""You may understand FunASR further with source code and paper:
56
- <div align="center">
57
- <div style="display:flex; gap: 0.25rem;" align="center">
58
- FunASR: <a href='https://github.com/alibaba-damo-academy/FunASR'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
59
- FunASR Paper: <a href="https://arxiv.org/abs/2305.11013"><img src="https://img.shields.io/badge/Arxiv-2305.11013-orange"></a>
60
- 🌟Star FunASR: <a href='https://github.com/alibaba-damo-academy/FunASR/stargazers'><img src='https://img.shields.io/github/stars/alibaba-damo-academy/FunASR.svg?style=social'></a>
61
- </div>
62
- </div>
63
- """)
64
-
65
  # gradio interface
66
- with gr.Blocks() as demo:
67
- #gr.Image("./examples/guide.png", show_label=False)
68
  gr.Markdown(top_md_1)
69
- gr.Markdown(top_md_2)
70
  gr.Markdown(top_md_3)
71
- video_state = gr.State()
72
- audio_state = gr.State()
73
- with gr.Tab("🎥✂️视频裁剪 Video Clipping"):
74
- with gr.Row():
75
- with gr.Column():
76
- video_input = gr.Video(label="🎥视频输入 Video Input")
77
- gr.Examples(['examples/2022云栖大会_片段2.mp4',
78
- 'examples/2022云栖大会_片段.mp4',
79
- 'examples/为什么要多读书?这是我听过最好的答案-片段.mp4',
80
- 'examples/使用chatgpt_片段.mp4'],
81
- [video_input])
82
- recog_button2 = gr.Button("👂识别 Recognize")
83
- video_text_output = gr.Textbox(label="✏️识别结果 Recognition Result")
84
- video_srt_output = gr.Textbox(label="📖SRT字幕内容 RST Subtitles")
85
  with gr.Column():
86
- video_text_input = gr.Textbox(label="✏️待裁剪文本 Text to Clip (多段文本使用'#'连接)")
87
  with gr.Row():
88
- video_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50, label="⏪开始位置偏移 Start Offset (ms)")
89
- video_end_ost = gr.Slider(minimum=-500, maximum=1000, value=100, step=50, label="⏩结束位置偏移 End Offset (ms)")
90
  with gr.Row():
91
- font_size = gr.Slider(minimum=10, maximum=100, value=32, step=2, label="🔠字幕字体大小 Subtitle Font Size")
92
- font_color = gr.Radio(["black", "white", "green", "red"], label="🌈字幕颜色 Subtitle Color", value='white')
93
- # font = gr.Radio(["黑体", "Alibaba Sans"], label="字体 Font")
94
- with gr.Row():
95
- clip_button2 = gr.Button("✂️裁剪\nClip")
96
- clip_button3 = gr.Button("✂️裁剪并添加字幕\nClip and Generate Subtitles")
97
- video_output = gr.Video(label="🎥裁剪结果 Audio Clipped")
98
- video_mess_output = gr.Textbox(label="ℹ️裁剪信息 Clipping Log")
99
- video_srt_clip_output = gr.Textbox(label="📖裁剪部分SRT字幕内容 Clipped RST Subtitles")
100
-
101
- with gr.Tab("🔊✂️音频裁剪 Audio Clipping"):
102
- with gr.Row():
103
- with gr.Column():
104
- audio_input = gr.Audio(label="🔊音频输入 Audio Input")
105
- gr.Examples(['examples/鲁肃采访片段1.wav'], [audio_input])
106
- recog_button1 = gr.Button("👂识别 Recognize")
107
- audio_text_output = gr.Textbox(label="✏️识别结果 Recognition Result")
108
- audio_srt_output = gr.Textbox(label="📖SRT字幕内容 RST Subtitles")
109
- with gr.Column():
110
- audio_text_input = gr.Textbox(label="✏️待裁剪文本 Text to Clip (多段文本使用'#'连接)")
111
- with gr.Row():
112
- audio_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50, label="⏪开始位置偏移 Start Offset (ms)")
113
- audio_end_ost = gr.Slider(minimum=-500, maximum=1000, value=100, step=50, label="⏩结束位置偏移 End Offset (ms)")
114
- with gr.Row():
115
- clip_button1 = gr.Button("✂️裁剪 Clip")
116
- audio_output = gr.Audio(label="🔊裁剪结果 Audio Clipped")
117
- audio_mess_output = gr.Textbox(label="ℹ️裁剪信息 Clipping Log")
118
- audio_srt_clip_output = gr.Textbox(label="📖裁剪部分SRT字幕内容 Clipped RST Subtitles")
119
-
120
- recog_button1.click(audio_recog,
121
- inputs=audio_input,
122
- outputs=[audio_text_output, audio_srt_output, audio_state])
123
- clip_button1.click(audio_clip,
124
- inputs=[audio_text_input, audio_start_ost, audio_end_ost, audio_state],
125
- outputs=[audio_output, audio_mess_output, audio_srt_clip_output])
126
-
127
- recog_button2.click(video_recog,
128
- inputs=video_input,
129
- outputs=[video_text_output, video_srt_output, video_state])
130
- clip_button2.click(video_clip,
131
- inputs=[video_text_input, video_start_ost, video_end_ost, video_state],
132
- outputs=[video_output, video_mess_output, video_srt_clip_output])
133
- clip_button3.click(video_clip_addsub,
134
- inputs=[video_text_input, video_start_ost, video_end_ost, video_state, font_size, font_color],
135
- outputs=[video_output, video_mess_output, video_srt_clip_output])
136
 
137
  # start gradio service in local
138
- demo.queue(concurrency_count=3).launch()
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- encoding: utf-8 -*-
3
+ # Copyright FunASR (https://github.com/alibaba-damo-academy/FunClip). All Rights Reserved.
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ import os
7
+ import logging
8
  import gradio as gr
9
+ from funasr import AutoModel
 
10
  from videoclipper import VideoClipper
11
+ from introduction import top_md_1, top_md_3, top_md_4
12
+ from llm.openai_api import openai_call
13
+ from llm.g4f_openai_api import g4f_openai_call
14
+ from llm.qwen_api import call_qwen_model
15
+ from utils.trans_utils import extract_timestamps
16
 
17
 
18
  if __name__ == "__main__":
19
+
20
+ funasr_model = AutoModel(model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
21
+ vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
22
+ punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
23
+ spk_model="damo/speech_campplus_sv_zh-cn_16k-common",
24
+ )
25
+ audio_clipper = VideoClipper(funasr_model)
26
 
27
+ def audio_recog(audio_input, sd_switch, hotwords, output_dir):
28
+ return audio_clipper.recog(audio_input, sd_switch, None, hotwords, output_dir=output_dir)
29
 
30
+ def video_recog(video_input, sd_switch, hotwords, output_dir):
31
+ return audio_clipper.video_recog(video_input, sd_switch, hotwords, output_dir=output_dir)
32
 
33
+ def video_clip(dest_text, video_spk_input, start_ost, end_ost, state, output_dir):
34
+ return audio_clipper.video_clip(
35
+ dest_text, start_ost, end_ost, state, dest_spk=video_spk_input, output_dir=output_dir
36
+ )
37
 
38
+ def mix_recog(video_input, audio_input, hotwords, output_dir):
39
+ output_dir = output_dir.strip()
40
+ if not len(output_dir):
41
+ output_dir = None
42
+ else:
43
+ output_dir = os.path.abspath(output_dir)
44
+ audio_state, video_state = None, None
45
+ if video_input is not None:
46
+ res_text, res_srt, video_state = video_recog(
47
+ video_input, 'No', hotwords, output_dir=output_dir)
48
+ return res_text, res_srt, video_state, None
49
+ if audio_input is not None:
50
+ res_text, res_srt, audio_state = audio_recog(
51
+ audio_input, 'No', hotwords, output_dir=output_dir)
52
+ return res_text, res_srt, None, audio_state
53
 
54
+ def mix_recog_speaker(video_input, audio_input, hotwords, output_dir):
55
+ output_dir = output_dir.strip()
56
+ if not len(output_dir):
57
+ output_dir = None
58
+ else:
59
+ output_dir = os.path.abspath(output_dir)
60
+ audio_state, video_state = None, None
61
+ if video_input is not None:
62
+ res_text, res_srt, video_state = video_recog(
63
+ video_input, 'Yes', hotwords, output_dir=output_dir)
64
+ return res_text, res_srt, video_state, None
65
+ if audio_input is not None:
66
+ res_text, res_srt, audio_state = audio_recog(
67
+ audio_input, 'Yes', hotwords, output_dir=output_dir)
68
+ return res_text, res_srt, None, audio_state
69
+
70
+ def mix_clip(dest_text, video_spk_input, start_ost, end_ost, video_state, audio_state, output_dir):
71
+ output_dir = output_dir.strip()
72
+ if not len(output_dir):
73
+ output_dir = None
74
+ else:
75
+ output_dir = os.path.abspath(output_dir)
76
+ if video_state is not None:
77
+ clip_video_file, message, clip_srt = audio_clipper.video_clip(
78
+ dest_text, start_ost, end_ost, video_state, dest_spk=video_spk_input, output_dir=output_dir)
79
+ return clip_video_file, None, message, clip_srt
80
+ if audio_state is not None:
81
+ (sr, res_audio), message, clip_srt = audio_clipper.clip(
82
+ dest_text, start_ost, end_ost, audio_state, dest_spk=video_spk_input, output_dir=output_dir)
83
+ return None, (sr, res_audio), message, clip_srt
84
+
85
+ def video_clip_addsub(dest_text, video_spk_input, start_ost, end_ost, state, output_dir, font_size, font_color):
86
+ return audio_clipper.video_clip(
87
+ dest_text, start_ost, end_ost, state,
88
+ font_size=font_size, font_color=font_color,
89
+ add_sub=True, dest_spk=video_spk_input, output_dir=output_dir
90
+ )
91
+
92
+ def llm_inference(system_content, user_content, srt_text, model, apikey):
93
+ SUPPORT_LLM_PREFIX = ['qwen', 'gpt', 'g4f', 'moonshot']
94
+ if model.startswith('qwen'):
95
+ return call_qwen_model(apikey, model, user_content=user_content+'\n'+srt_text, system_content=system_content)
96
+ if model.startswith('gpt') or model.startswith('moonshot'):
97
+ return openai_call(apikey, model, user_content=user_content+'\n'+srt_text, system_content=system_content)
98
+ elif model.startswith('g4f'):
99
+ model = "-".join(model.split('-')[1:])
100
+ return g4f_openai_call(model, user_content=user_content+'\n'+srt_text, system_content=system_content)
101
+ else:
102
+ logging.error("LLM name error, only {} are supported as LLM name prefix."
103
+ .format(SUPPORT_LLM_PREFIX))
104
+
105
+ def AI_clip(LLM_res, dest_text, video_spk_input, start_ost, end_ost, video_state, audio_state, output_dir):
106
+ timestamp_list = extract_timestamps(LLM_res)
107
+ output_dir = output_dir.strip()
108
+ if not len(output_dir):
109
+ output_dir = None
110
+ else:
111
+ output_dir = os.path.abspath(output_dir)
112
+ if video_state is not None:
113
+ clip_video_file, message, clip_srt = audio_clipper.video_clip(
114
+ dest_text, start_ost, end_ost, video_state,
115
+ dest_spk=video_spk_input, output_dir=output_dir, timestamp_list=timestamp_list)
116
+ return clip_video_file, None, message, clip_srt
117
+ if audio_state is not None:
118
+ (sr, res_audio), message, clip_srt = audio_clipper.clip(
119
+ dest_text, start_ost, end_ost, audio_state,
120
+ dest_spk=video_spk_input, output_dir=output_dir, timestamp_list=timestamp_list)
121
+ return None, (sr, res_audio), message, clip_srt
122
123
  # gradio interface
124
+ theme = gr.Theme.load("utils/theme.json")
125
+ with gr.Blocks(theme=theme) as funclip_service:
126
  gr.Markdown(top_md_1)
127
+ # gr.Markdown(top_md_2)
128
  gr.Markdown(top_md_3)
129
+ gr.Markdown(top_md_4)
130
+ video_state, audio_state = gr.State(), gr.State()
131
+ with gr.Row():
132
+ with gr.Column():
133
+ with gr.Row():
134
+ video_input = gr.Video(label="视频输入 | Video Input")
135
+ audio_input = gr.Audio(label="音频输入 | Audio Input")
136
  with gr.Column():
137
+ gr.Examples(['https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/%E4%B8%BA%E4%BB%80%E4%B9%88%E8%A6%81%E5%A4%9A%E8%AF%BB%E4%B9%A6%EF%BC%9F%E8%BF%99%E6%98%AF%E6%88%91%E5%90%AC%E8%BF%87%E6%9C%80%E5%A5%BD%E7%9A%84%E7%AD%94%E6%A1%88-%E7%89%87%E6%AE%B5.mp4',
138
+ 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/2022%E4%BA%91%E6%A0%96%E5%A4%A7%E4%BC%9A_%E7%89%87%E6%AE%B52.mp4',
139
+ 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/%E4%BD%BF%E7%94%A8chatgpt_%E7%89%87%E6%AE%B5.mp4'],
140
+ [video_input],
141
+ label='示例视频 | Demo Video')
142
+ gr.Examples(['https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/%E8%AE%BF%E8%B0%88.mp4'],
143
+ [video_input],
144
+ label='多说话人示例视频 | Multi-speaker Demo Video')
145
+ gr.Examples(['https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/%E9%B2%81%E8%82%83%E9%87%87%E8%AE%BF%E7%89%87%E6%AE%B51.wav'],
146
+ [audio_input],
147
+ label="示例音频 | Demo Audio")
148
+ with gr.Column():
149
+ # with gr.Row():
150
+ # video_sd_switch = gr.Radio(["No", "Yes"], label="👥区分说话人 Get Speakers", value='No')
151
+ hotwords_input = gr.Textbox(label="🚒 热词 | Hotwords(可以为空,多个热词使用空格分隔,仅支持中文热词)")
152
+ output_dir = gr.Textbox(label="📁 文件输出路径 | File Output Dir (可以为空,Linux, mac系统可以稳定使用)", value=" ")
153
+ with gr.Row():
154
+ recog_button = gr.Button("👂 识别 | ASR", variant="primary")
155
+ recog_button2 = gr.Button("👂👫 识别+区分说话人 | ASR+SD")
156
+ video_text_output = gr.Textbox(label="✏️ 识别结果 | Recognition Result")
157
+ video_srt_output = gr.Textbox(label="📖 SRT字幕内容 | SRT Subtitles")
158
+ with gr.Column():
159
+ with gr.Tab("🧠 LLM智能裁剪 | LLM Clipping"):
160
+ with gr.Column():
161
+ prompt_head = gr.Textbox(label="Prompt System", value=("你是一个视频srt字幕分析剪辑器,输入视频的srt字幕,"
162
+ "分析其中的精彩且尽可能连续的片段并裁剪出来,输出四条以内的片段,将片段中在时间上连续的多个句子及它们的时间戳合并为一条,"
163
+ "注意确保文字与时间戳的正确匹配。输出需严格按照如下格式:1. [开始时间-结束时间] 文本,注意其中的连接符是“-”"))
164
+ prompt_head2 = gr.Textbox(label="Prompt User", value=("这是待裁剪的视频srt字幕:"))
165
+ with gr.Column():
166
+ with gr.Row():
167
+ llm_model = gr.Dropdown(
168
+ choices=["qwen-plus",
169
+ "gpt-3.5-turbo",
170
+ "gpt-3.5-turbo-0125",
171
+ "gpt-4-turbo",
172
+ "g4f-gpt-3.5-turbo"],
173
+ value="qwen-plus",
174
+ label="LLM Model Name",
175
+ allow_custom_value=True)
176
+ apikey_input = gr.Textbox(label="APIKEY")
177
+ llm_button = gr.Button("LLM推理 | LLM Inference(首先进行识别,非g4f需配置对应apikey)", variant="primary")
178
+ llm_result = gr.Textbox(label="LLM Clipper Result")
179
+ with gr.Row():
180
+ llm_clip_button = gr.Button("🧠 LLM智能裁剪 | AI Clip", variant="primary")
181
+ # llm_clip_subti_button = gr.Button("🧠 LLM智能裁剪+字幕 | AI Clip+Subtitles")
182
+ with gr.Tab("✂️ 根据文本\说话人裁剪 | Text\Speaker Clipping"):
183
+ video_text_input = gr.Textbox(label="✏️ 待裁剪文本 | Text to Clip (多段文本使用'#'连接)")
184
+ video_spk_input = gr.Textbox(label="✏️ 待裁剪说话人 | Speaker to Clip (多个说话人使用'#'连接)")
185
  with gr.Row():
186
+ clip_button = gr.Button("✂️ 裁剪 | Clip", variant="primary")
187
+ # clip_subti_button = gr.Button("✂️ 裁剪+字幕 | Clip+Subtitles")
188
  with gr.Row():
189
+ video_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50, label=" 开始位置偏移 | Start Offset (ms)")
190
+ video_end_ost = gr.Slider(minimum=-500, maximum=1000, value=100, step=50, label=" 结束位置偏移 | End Offset (ms)")
191
+ with gr.Row():
192
+ font_size = gr.Slider(minimum=10, maximum=100, value=32, step=2, label="🔠 字幕字体大小 | Subtitle Font Size")
193
+ font_color = gr.Radio(["black", "white", "green", "red"], label="🌈 字幕颜色 | Subtitle Color", value='white')
194
+ # font = gr.Radio(["黑体", "Alibaba Sans"], label="字体 Font")
195
+ video_output = gr.Video(label="裁剪结果 | Video Clipped")
196
+ audio_output = gr.Audio(label="裁剪结果 | Audio Clipped")
197
+ clip_message = gr.Textbox(label="⚠️ 裁剪信息 | Clipping Log")
198
+ srt_clipped = gr.Textbox(label="📖 裁剪部分SRT字幕内容 | Clipped SRT Subtitles")
199
+
200
+ recog_button.click(mix_recog,
201
+ inputs=[video_input,
202
+ audio_input,
203
+ hotwords_input,
204
+ output_dir,
205
+ ],
206
+ outputs=[video_text_output, video_srt_output, video_state, audio_state],
207
+ concurrency_limit=3)
208
+ recog_button2.click(mix_recog_speaker,
209
+ inputs=[video_input,
210
+ audio_input,
211
+ hotwords_input,
212
+ output_dir,
213
+ ],
214
+ outputs=[video_text_output, video_srt_output, video_state, audio_state],
215
+ concurrency_limit=3)
216
+ clip_button.click(mix_clip,
217
+ inputs=[video_text_input,
218
+ video_spk_input,
219
+ video_start_ost,
220
+ video_end_ost,
221
+ video_state,
222
+ audio_state,
223
+ output_dir
224
+ ],
225
+ outputs=[video_output, audio_output, clip_message, srt_clipped])
226
+ llm_button.click(llm_inference,
227
+ inputs=[prompt_head, prompt_head2, video_srt_output, llm_model, apikey_input],
228
+ outputs=[llm_result])
229
+ llm_clip_button.click(AI_clip,
230
+ inputs=[llm_result,
231
+ video_text_input,
232
+ video_spk_input,
233
+ video_start_ost,
234
+ video_end_ost,
235
+ video_state,
236
+ audio_state,
237
+ output_dir
238
+ ],
239
+ outputs=[video_output, audio_output, clip_message, srt_clipped])
240
 
241
  # start gradio service in local
242
+ # funclip_service.queue(concurrency_count=5)
243
+ funclip_service.launch(max_threads=8)
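For reference, a minimal sketch of driving the v2.0.0 pipeline without the Gradio UI, based only on the call signatures used above; the file path, hotwords and clip text are placeholders.

from funasr import AutoModel
from videoclipper import VideoClipper

# Same model stack that app.py builds above.
funasr_model = AutoModel(model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
                         vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
                         punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
                         spk_model="damo/speech_campplus_sv_zh-cn_16k-common")
clipper = VideoClipper(funasr_model)

# Recognize: the second argument toggles speaker diarization ('Yes'/'No'), the third is hotwords.
text, srt, state = clipper.video_recog("my_video.mp4", "No", "", output_dir=None)

# Clip by text; multiple segments are joined with '#', offsets are in milliseconds.
clip_file, log, clip_srt = clipper.video_clip("想要保留的片段文本", 0, 100, state,
                                              dest_spk=None, output_dir=None)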
introduction.py ADDED
@@ -0,0 +1,37 @@
1
+ top_md_1 = ("""
2
+ <div align="center">
3
+ <div style="display:flex; gap: 0.25rem;" align="center">
4
+ FunClip: <a href='https://github.com/alibaba-damo-academy/FunClip'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
5
+ 🌟支持我们: <a href='https://github.com/alibaba-damo-academy/FunClip/stargazers'><img src='https://img.shields.io/github/stars/alibaba-damo-academy/FunClip.svg?style=social'></a>
6
+ </div>
7
+ </div>
8
+
9
+ 基于阿里巴巴通义实验室自研并开源的[FunASR](https://github.com/alibaba-damo-academy/FunASR)工具包及Paraformer系列模型及语音识别、端点检测、标点预测、时间戳预测、说话人区分、热词定制化开源链路
10
+
11
+ 准确识别,自由复制所需段落,或者设置说话人标识,一键裁剪、添加字幕
12
+
13
+ * Step1: 上传视频或音频文件(或使用下方的用例体验),点击 **<font color="#f7802b">识别</font>** 按钮
14
+ * Step2: 复制识别结果中所需的文字至右上方,或者在右侧设置说话人标识,设置偏移与字幕配置(可选)
15
+ * Step3: 点击 **<font color="#f7802b">裁剪</font>** 按钮或 **<font color="#f7802b">裁剪并添加字幕</font>** 按钮获得结果
16
+
17
+ 🔥 FunClip现在集成了大语言模型智能剪辑功能,选择LLM模型进行体验吧~
18
+ """)
19
+
20
+ top_md_3 = ("""访问FunASR项目与论文能够帮助您深入了解FunClip中所使用的语音处理相关模型:
21
+ <div align="center">
22
+ <div style="display:flex; gap: 0.25rem;" align="center">
23
+ FunASR: <a href='https://github.com/alibaba-damo-academy/FunASR'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
24
+ FunASR Paper: <a href="https://arxiv.org/abs/2305.11013"><img src="https://img.shields.io/badge/Arxiv-2305.11013-orange"></a>
25
+ 🌟Star FunASR: <a href='https://github.com/alibaba-damo-academy/FunASR/stargazers'><img src='https://img.shields.io/github/stars/alibaba-damo-academy/FunASR.svg?style=social'></a>
26
+ </div>
27
+ </div>
28
+ """)
29
+
30
+ top_md_4 = ("""我们在「LLM智能裁剪」模块中提供三种LLM调用方式,
31
+ 1. 选择阿里云百炼平台通过api调用qwen系列模型,此时需要您准备百炼平台的apikey,请访问[阿里云百炼](https://bailian.console.aliyun.com/#/home);
32
+ 2. 选择GPT开头的模型即为调用openai官方api,此时需要您自备sk与网络环境;
33
+ 3. [gpt4free](https://github.com/xtekky/gpt4free?tab=readme-ov-file)项目也被集成进FunClip,可以通过它免费调用gpt模型;
34
+
35
+ 其中方式1与方式2需要在界面中传入相应的apikey
36
+ 方式3则可能非常不稳定,返回时间可能很长或者结果获取失败,可以多多尝试,或者自己准备sk使用方式2
37
+ """)
llm/__pycache__/demo_prompt.cpython-311.pyc ADDED
Binary file (6.17 kB).
 
llm/__pycache__/g4f_openai_api.cpython-311.pyc ADDED
Binary file (1.46 kB).
 
llm/__pycache__/openai_api.cpython-311.pyc ADDED
Binary file (1.76 kB).
 
llm/__pycache__/qwen_api.cpython-311.pyc ADDED
Binary file (1.24 kB).
 
llm/demo_prompt.py ADDED
@@ -0,0 +1,272 @@
1
+ demo_prompt="""
2
+ 你是一个视频srt字幕剪辑工具,输入视频的srt字幕之后根据如下要求剪辑对应的片段并输出每个段落的开始与结束时间,
3
+ 剪辑出以下片段中最有意义的、尽可能连续的部分,按如下格式输出:1. [开始时间-结束时间] 文本,
4
+ 原始srt字幕如下:
5
+ 0
6
+ 00:00:00,50 --> 00:00:02,10
7
+ 读万卷书行万里路,
8
+ 1
9
+ 00:00:02,310 --> 00:00:03,990
10
+ 这里是读书三六九,
11
+ 2
12
+ 00:00:04,670 --> 00:00:07,990
13
+ 今天要和您分享的这篇文章是人民日报,
14
+ 3
15
+ 00:00:08,510 --> 00:00:09,730
16
+ 为什么要多读书?
17
+ 4
18
+ 00:00:10,90 --> 00:00:11,930
19
+ 这是我听过最好的答案,
20
+ 5
21
+ 00:00:12,310 --> 00:00:13,190
22
+ 经常有人问,
23
+ 6
24
+ 00:00:13,730 --> 00:00:14,690
25
+ 读了那么多书,
26
+ 7
27
+ 00:00:14,990 --> 00:00:17,250
28
+ 最终还不是要回到一座平凡的城,
29
+ 8
30
+ 00:00:17,610 --> 00:00:19,410
31
+ 打一份平凡的工组,
32
+ 9
33
+ 00:00:19,410 --> 00:00:20,670
34
+ 建一个平凡的家庭,
35
+ 10
36
+ 00:00:21,330 --> 00:00:25,960
37
+ 何苦折腾一个人读书的意义究竟是什么?
38
+ 11
39
+ 00:00:26,680 --> 00:00:30,80
40
+ 今天给大家分享人民日报推荐的八条理由,
41
+ 12
42
+ 00:00:30,540 --> 00:00:32,875
43
+ 告诉你人为什么要多读书?
44
+ 13
45
+ 00:00:34,690 --> 00:00:38,725
46
+ 一脚步丈量不到的地方文字可以。
47
+ 14
48
+ 00:00:40,300 --> 00:00:41,540
49
+ 钱钟书先生说过,
50
+ 15
51
+ 00:00:42,260 --> 00:00:43,140
52
+ 如果不读书,
53
+ 16
54
+ 00:00:43,520 --> 00:00:44,400
55
+ 行万里路,
56
+ 17
57
+ 00:00:44,540 --> 00:00:45,695
58
+ 也只是个邮差。
59
+ 18
60
+ 00:00:46,900 --> 00:00:47,320
61
+ 北京、
62
+ 19
63
+ 00:00:47,500 --> 00:00:47,980
64
+ 西安、
65
+ 20
66
+ 00:00:48,320 --> 00:00:51,200
67
+ 南京和洛阳少了学识的浸润,
68
+ 21
69
+ 00:00:51,600 --> 00:00:55,565
70
+ 他们只是一个个耳中熟悉又眼里陌生的地名。
71
+ 22
72
+ 00:00:56,560 --> 00:00:59,360
73
+ 故宫避暑山庄岱庙、
74
+ 23
75
+ 00:00:59,840 --> 00:01:02,920
76
+ 曲阜三孔有了文化照耀,
77
+ 24
78
+ 00:01:03,120 --> 00:01:05,340
79
+ 他们才不是被时间风化的标本。
80
+ 25
81
+ 00:01:05,820 --> 00:01:08,105
82
+ 而是活了成百上千年的生命,
83
+ 26
84
+ 00:01:09,650 --> 00:01:10,370
85
+ 不去读书,
86
+ 27
87
+ 00:01:10,670 --> 00:01:12,920
88
+ 就是一个邮差风景,
89
+ 28
90
+ 00:01:13,0 --> 00:01:13,835
91
+ 过眼就忘,
92
+ 29
93
+ 00:01:14,750 --> 00:01:17,365
94
+ 就算踏破铁鞋又有什么用处呢?
95
+ 30
96
+ 00:01:19,240 --> 00:01:22,380
97
+ 阅读不仅仅会让现实的旅行更加丰富,
98
+ 31
99
+ 00:01:23,120 --> 00:01:27,260
100
+ 更重要的是能让精神突破现实和身体的桎梏,
101
+ 32
102
+ 00:01:27,640 --> 00:01:29,985
103
+ 来一场灵魂长足的旅行。
104
+ 33
105
+ 00:01:31,850 --> 00:01:32,930
106
+ 听过这样一句话,
107
+ 34
108
+ 00:01:33,490 --> 00:01:35,190
109
+ 没有一艘非凡的船舰,
110
+ 35
111
+ 00:01:35,330 --> 00:01:36,430
112
+ 能像一册书籍,
113
+ 36
114
+ 00:01:36,690 --> 00:01:38,595
115
+ 把我们带到浩瀚的天地,
116
+ 37
117
+ 00:01:39,830 --> 00:01:42,685
118
+ 你无法到达的地方文字在你过去,
119
+ 38
120
+ 00:01:43,530 --> 00:01:45,750
121
+ 你无法经历的人生舒淇,
122
+ 39
123
+ 00:01:45,770 --> 00:01:46,595
124
+ 带你相遇。
125
+ 40
126
+ 00:01:47,640 --> 00:01:50,340
127
+ 那些读过的书会一本本充实,
128
+ 41
129
+ 00:01:50,340 --> 00:01:50,940
130
+ 你的内心,
131
+ 42
132
+ 00:01:51,640 --> 00:01:54,855
133
+ 让虚无单调的世界变得五彩斑斓。
134
+ 43
135
+ 00:01:55,930 --> 00:01:59,690
136
+ 那些书中的人物会在你深陷生活泥潭之时,
137
+ 44
138
+ 00:02:00,170 --> 00:02:01,190
139
+ 轻声的呼唤,
140
+ 45
141
+ 00:02:01,950 --> 00:02:03,270
142
+ 用他们心怀梦想、
143
+ 46
144
+ 00:02:03,630 --> 00:02:04,950
145
+ 不卑不亢的故事,
146
+ 47
147
+ 00:02:05,310 --> 00:02:07,90
148
+ 激励你抵御苦难,
149
+ 48
150
+ 00:02:07,430 --> 00:02:08,525
151
+ 勇往直前。
152
+ 49
153
+ 00:02:11,290 --> 00:02:11,695
154
+ 二、
155
+ 50
156
+ 00:02:12,440 --> 00:02:16,900
157
+ 读书的意义是使人虚心叫通达不固执、
158
+ 51
159
+ 00:02:17,200 --> 00:02:18,35
160
+ 不偏执。
161
+ 52
162
+ 00:02:20,290 --> 00:02:22,935
163
+ 读书越少的人越容易过得痛苦。
164
+ 53
165
+ 00:02:23,600 --> 00:02:24,400
166
+ 读书越多,
167
+ 54
168
+ 00:02:24,800 --> 00:02:26,185
169
+ 人才会越通透,
170
+ 55
171
+ 00:02:27,890 --> 00:02:30,30
172
+ 知乎上有位网友讲过自己的故事。
173
+ 56
174
+ 00:02:30,750 --> 00:02:31,310
175
+ 有一次,
176
+ 57
177
+ 00:02:31,530 --> 00:02:32,650
178
+ 他跟伴侣吵架,
179
+ 58
180
+ 00:02:33,190 --> 00:02:35,505
181
+ 气得连续好几个晚上没睡好,
182
+ 59
183
+ 00:02:36,360 --> 00:02:38,880
184
+ 直到他读到一本关于亲密关系的书。
185
+ 60
186
+ 00:02:39,500 --> 00:02:41,920
187
+ 书中有段关于夫妻关系的解读,
188
+ 61
189
+ 00:02:42,80 --> 00:02:43,100
190
+ 让他豁然开朗,
191
+ 62
192
+ 00:02:43,460 --> 00:02:47,170
193
+ 突然想明白了很多事气消了,
194
+ 63
195
+ 00:02:47,430 --> 00:02:48,410
196
+ 心情好了,
197
+ 64
198
+ 00:02:48,790 --> 00:02:50,194
199
+ 整个人也舒爽了。
200
+ 65
201
+ 00:02:51,780 --> 00:02:54,340
202
+ 一个人书读的不多见识,
203
+ 66
204
+ 00:02:54,380 --> 00:02:55,180
205
+ 难免受限,
206
+ 67
207
+ 00:02:55,720 --> 00:02:58,495
208
+ 结果就必须受着眼前世界的禁锢,
209
+ 68
210
+ 00:02:59,540 --> 00:03:00,740
211
+ 稍微遇到一点不顺,
212
+ 69
213
+ 00:03:00,940 --> 00:03:02,460
214
+ 就极易消极悲观,
215
+ 70
216
+ 00:03:02,900 --> 00:03:03,720
217
+ 郁郁寡欢,
218
+ 71
219
+ 00:03:04,140 --> 00:03:05,765
220
+ 让自己困在情绪里,
221
+ 72
222
+ 00:03:06,900 --> 00:03:09,760
223
+ 只有通过阅读才能看透人生真相,
224
+ 73
225
+ 00:03:10,300 --> 00:03:12,140
226
+ 收获为人处事的智慧,
227
+ 74
228
+ 00:03:12,480 --> 00:03:14,95
229
+ 把日子越过越好。
230
+ 75
231
+ 00:03:16,730 --> 00:03:17,890
232
+ 生活的艺术里说,
233
+ 76
234
+ 00:03:18,410 --> 00:03:20,30
235
+ 人一定要时时读书,
236
+ 77
237
+ 00:03:20,430 --> 00:03:22,915
238
+ 不然便会鄙令晚腐。
239
+ 78
240
+ 00:03:23,690 --> 00:03:28,730
241
+ 完剑俗剑生满身上一个人的落伍迂腐,
242
+ 79
243
+ 00:03:29,210 --> 00:03:31,205
244
+ 就是不肯实施读书所致。
245
+ 80
246
+ 00:03:33,10 --> 00:03:34,790
247
+ 只有在不断阅读的过程中,
248
+ 81
249
+ 00:03:34,990 --> 00:03:35,970
250
+ 修心养性,
251
+ 82
252
+ 00:03:36,430 --> 00:03:38,735
253
+ 才能摆脱我们的鄙俗和顽固。
254
+ 83
255
+ 00:03:39,920 --> 00:03:41,720
256
+ 这世间没有谁的生活,
257
+ 84
258
+ 00:03:41,800 --> 00:03:42,540
259
+ 没有烦恼,
260
+ 85
261
+ 00:03:43,140 --> 00:03:45,455
262
+ 唯读书是最好的解药。
263
+ 86
264
+ 00:03:47,730 --> 00:03:48,185
265
+ 三、
266
+ 87
267
+ 00:03:49,40 --> 00:03:50,720
268
+ 书中未必有黄金屋,
269
+ 88
270
+ 00:03:51,0 --> 00:03:52,595
271
+ 但一定有更好的自己。
272
+ """
llm/g4f_openai_api.py ADDED
@@ -0,0 +1,30 @@
1
+ from g4f.client import Client
2
+
3
+ if __name__ == '__main__':
4
+ from llm.demo_prompt import demo_prompt
5
+ client = Client()
6
+ response = client.chat.completions.create(
7
+ model="gpt-3.5-turbo",
8
+ messages=[{"role": "user", "content": "你好你的名字是什么"}],
9
+ )
10
+ print(response.choices[0].message.content)
11
+
12
+
13
+ def g4f_openai_call(model="gpt-3.5-turbo",
14
+ user_content="如何做西红柿炖牛腩?",
15
+ system_content=None):
16
+ client = Client()
17
+ if system_content is not None and len(system_content.strip()):
18
+ messages = [
19
+ {'role': 'system', 'content': system_content},
20
+ {'role': 'user', 'content': user_content}
21
+ ]
22
+ else:
23
+ messages = [
24
+ {'role': 'user', 'content': user_content}
25
+ ]
26
+ response = client.chat.completions.create(
27
+ model=model,
28
+ messages=messages,
29
+ )
30
+ return(response.choices[0].message.content)
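A short usage sketch of the wrapper above. g4f needs no API key but routes through free public backends, so responses can be slow or fail entirely; the prompt is illustrative.

from llm.g4f_openai_api import g4f_openai_call

answer = g4f_openai_call(model="gpt-3.5-turbo",
                         user_content="用一句话总结这段字幕的主题。",
                         system_content=None)  # system prompt is optional
print(answer)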
llm/openai_api.py ADDED
@@ -0,0 +1,48 @@
1
+ import os
2
+ import logging
3
+ from openai import OpenAI
4
+
5
+
6
+ if __name__ == '__main__':
7
+ from llm.demo_prompt import demo_prompt
8
+ client = OpenAI(
9
+ # This is the default and can be omitted
10
+ api_key=os.environ.get("OPENAI_API_KEY"),
11
+ )
12
+
13
+ chat_completion = client.chat.completions.create(
14
+ messages=[
15
+ {
16
+ "role": "user",
17
+ "content": demo_prompt,
18
+ }
19
+ ],
20
+ model="gpt-3.5-turbo-0125",
21
+ )
22
+ print(chat_completion.choices[0].message.content)
23
+
24
+
25
+ def openai_call(apikey,
26
+ model="gpt-3.5-turbo",
27
+ user_content="如何做西红柿炖牛腩?",
28
+ system_content=None):
29
+ client = OpenAI(
30
+ # This is the default and can be omitted
31
+ api_key=apikey,
32
+ )
33
+ if system_content is not None and len(system_content.strip()):
34
+ messages = [
35
+ {'role': 'system', 'content': system_content},
36
+ {'role': 'user', 'content': user_content}
37
+ ]
38
+ else:
39
+ messages = [
40
+ {'role': 'user', 'content': user_content}
41
+ ]
42
+
43
+ chat_completion = client.chat.completions.create(
44
+ messages=messages,
45
+ model=model,
46
+ )
47
+ logging.info("Openai model inference done.")
48
+ return chat_completion.choices[0].message.content
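A hedged sketch of calling the wrapper above with the same system/user split that app.py's llm_inference uses; the API key and prompts are placeholders.

from llm.openai_api import openai_call

result = openai_call(apikey="sk-...",  # placeholder key supplied by the user
                     model="gpt-3.5-turbo",
                     user_content="这是待裁剪的视频srt字幕:\n0\n00:00:00,50 --> 00:00:02,10\n读万卷书行万里路,",
                     system_content="你是一个视频srt字幕分析剪辑器,输出格式:1. [开始时间-结束时间] 文本")
print(result)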
llm/qwen_api.py ADDED
@@ -0,0 +1,30 @@
1
+ import dashscope
2
+ from dashscope import Generation
3
+
4
+
5
+ def call_qwen_model(key=None,
6
+ model="qwen_plus",
7
+ user_content="如何做西红柿炖牛腩?",
8
+ system_content=None):
9
+ dashscope.api_key = key
10
+ if system_content is not None and len(system_content.strip()):
11
+ messages = [
12
+ {'role': 'system', 'content': system_content},
13
+ {'role': 'user', 'content': user_content}
14
+ ]
15
+ else:
16
+ messages = [
17
+ {'role': 'user', 'content': user_content}
18
+ ]
19
+ responses = Generation.call(model,
20
+ messages=messages,
21
+ result_format='message', # return the result in 'message' format
22
+ stream=False, # non-streaming output
23
+ incremental_output=False # no incremental streaming output
24
+ )
25
+ print(responses)
26
+ return responses['output']['choices'][0]['message']['content']
27
+
28
+
29
+ if __name__ == '__main__':
30
+ call_qwen_model('YOUR_BAILIAN_APIKEY')
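A hedged usage sketch of call_qwen_model. Note that the default model name above is 'qwen_plus' (underscore) while the app's dropdown offers 'qwen-plus'; app.py always passes the model explicitly, so the default is never exercised, but callers should probably use the hyphenated name. The key below is a placeholder for a Bailian/DashScope API key.

from llm.qwen_api import call_qwen_model

reply = call_qwen_model(key="YOUR_BAILIAN_APIKEY",
                        model="qwen-plus",
                        user_content="这是待裁剪的视频srt字幕:\n...",
                        system_content="你是一个视频srt字幕分析剪辑器。")
print(reply)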
requirements.txt CHANGED
@@ -1,8 +1,13 @@
1
  librosa
2
  soundfile
3
- funasr>=0.5.5
 
4
  moviepy
5
  numpy
 
6
  modelscope
7
- torch
8
- torchaudio
 
 
 
 
1
  librosa
2
  soundfile
3
+ scikit-learn>=1.3.2
4
+ funasr>=1.0.25
5
  moviepy
6
  numpy
7
+ gradio
8
  modelscope
9
+ torch>=1.13
10
+ torchaudio
11
+ openai
12
+ g4f
13
+ dashscope
test/imagemagick_test.py ADDED
@@ -0,0 +1,16 @@
1
+ from moviepy.editor import *
2
+ from moviepy.video.tools.subtitles import SubtitlesClip
3
+
4
+ generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=48, color='white')
5
+ subs = [((0, 2), 'sub1中文字幕'),
6
+ ((2, 4), 'subs2'),
7
+ ((4, 6), 'subs3'),
8
+ ((6, 8), 'subs4')]
9
+
10
+ subtitles = SubtitlesClip(subs, generator)
11
+
12
+ video = VideoFileClip("examples/2022云栖大会_片段.mp4")
13
+ video = video.subclip(0, 8)
14
+ video = CompositeVideoClip([video, subtitles.set_pos(('center','bottom'))])
15
+
16
+ video.write_videofile("test_output.mp4")
test/test.sh ADDED
@@ -0,0 +1,15 @@
1
+ # step1: Recognize
2
+ python videoclipper.py --stage 1 \
3
+ --file ../examples/2022云栖大会_片段.mp4 \
4
+ --sd_switch yes \
5
+ --output_dir ./output
6
+ # now you can find recognition results and entire SRT file in ./output/
7
+ # step2: Clip
8
+ python videoclipper.py --stage 2 \
9
+ --file ../examples/2022云栖大会_片段.mp4 \
10
+ --output_dir ./output \
11
+ --dest_text '所以这个是我们办这个奖的初心啊,我们也会一届一届的办下去' \
+ --start_ost 0 \
+ --end_ost 100 \
+ --output_file './output/res.mp4'
+ # add --dest_spk spk0 to the command above to clip by speaker instead of text
utils/__init__.py ADDED
File without changes
utils/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (160 Bytes).
 
utils/__pycache__/argparse_tools.cpython-311.pyc ADDED
Binary file (4.07 kB).
 
utils/__pycache__/argparse_tools.cpython-39.pyc ADDED
Binary file (2.41 kB).
 
utils/__pycache__/subtitle_utils.cpython-311.pyc ADDED
Binary file (7.6 kB).
 
utils/__pycache__/subtitle_utils.cpython-39.pyc ADDED
Binary file (3.49 kB).
 
utils/__pycache__/trans_utils.cpython-311.pyc ADDED
Binary file (8.45 kB).
 
utils/__pycache__/trans_utils.cpython-39.pyc ADDED
Binary file (3 kB).
 
utils/argparse_tools.py ADDED
@@ -0,0 +1,88 @@
1
+ #!/usr/bin/env python3
2
+ # -*- encoding: utf-8 -*-
3
+ # Copyright FunASR (https://github.com/alibaba-damo-academy/FunClip). All Rights Reserved.
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ import argparse
7
+ from pathlib import Path
8
+
9
+ import yaml
10
+ import sys
11
+
12
+
13
+ class ArgumentParser(argparse.ArgumentParser):
14
+ """Simple implementation of ArgumentParser supporting config file
15
+
16
+ This class is originated from https://github.com/bw2/ConfigArgParse,
17
+ but this class is lack of some features that it has.
18
+
19
+ - Not supporting multiple config files
20
+ - Automatically adding "--config" as an option.
21
+ - Not supporting any formats other than yaml
22
+ - Not checking argument type
23
+
24
+ """
25
+
26
+ def __init__(self, *args, **kwargs):
27
+ super().__init__(*args, **kwargs)
28
+ self.add_argument("--config", help="Give config file in yaml format")
29
+
30
+ def parse_known_args(self, args=None, namespace=None):
31
+ # Once parsing for setting from "--config"
32
+ _args, _ = super().parse_known_args(args, namespace)
33
+ if _args.config is not None:
34
+ if not Path(_args.config).exists():
35
+ self.error(f"No such file: {_args.config}")
36
+
37
+ with open(_args.config, "r", encoding="utf-8") as f:
38
+ d = yaml.safe_load(f)
39
+ if not isinstance(d, dict):
40
+ self.error("Config file has non dict value: {_args.config}")
41
+
42
+ for key in d:
43
+ for action in self._actions:
44
+ if key == action.dest:
45
+ break
46
+ else:
47
+ self.error(f"unrecognized arguments: {key} (from {_args.config})")
48
+
49
+ # NOTE(kamo): Ignore "--config" from a config file
50
+ # NOTE(kamo): Unlike "configargparse", this module doesn't check type.
51
+ # i.e. We can set any type value regardless of argument type.
52
+ self.set_defaults(**d)
53
+ return super().parse_known_args(args, namespace)
54
+
55
+
56
+ def get_commandline_args():
57
+ extra_chars = [
58
+ " ",
59
+ ";",
60
+ "&",
61
+ "(",
62
+ ")",
63
+ "|",
64
+ "^",
65
+ "<",
66
+ ">",
67
+ "?",
68
+ "*",
69
+ "[",
70
+ "]",
71
+ "$",
72
+ "`",
73
+ '"',
74
+ "\\",
75
+ "!",
76
+ "{",
77
+ "}",
78
+ ]
79
+
80
+ # Escape the extra characters for shell
81
+ argv = [
82
+ arg.replace("'", "'\\''")
83
+ if all(char not in arg for char in extra_chars)
84
+ else "'" + arg.replace("'", "'\\''") + "'"
85
+ for arg in sys.argv
86
+ ]
87
+
88
+ return sys.executable + " " + " ".join(argv)
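A small sketch of how the parser above merges a YAML --config file into argparse defaults; the file name and options are illustrative.

from utils.argparse_tools import ArgumentParser

# clip.yaml (illustrative):
#   stage: 2
#   start_ost: 0
#   end_ost: 100
parser = ArgumentParser()
parser.add_argument("--stage", type=int, default=1)
parser.add_argument("--start_ost", type=int, default=0)
parser.add_argument("--end_ost", type=int, default=0)
args = parser.parse_args(["--config", "clip.yaml", "--end_ost", "50"])
# YAML values become defaults, while explicit CLI flags still override them:
print(args.stage, args.start_ost, args.end_ost)  # -> 2 0 50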
utils/subtitle_utils.py ADDED
1
+ #!/usr/bin/env python3
2
+ # -*- encoding: utf-8 -*-
3
+ # Copyright FunASR (https://github.com/alibaba-damo-academy/FunClip). All Rights Reserved.
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ def time_convert(ms):
7
+ ms = int(ms)
8
+ tail = ms % 1000
9
+ s = ms // 1000
10
+ mi = s // 60
11
+ s = s % 60
12
+ h = mi // 60
13
+ mi = mi % 60
14
+ h = "00" if h == 0 else str(h)
15
+ mi = "00" if mi == 0 else str(mi)
16
+ s = "00" if s == 0 else str(s)
17
+ tail = str(tail)
18
+ if len(h) == 1: h = '0' + h
19
+ if len(mi) == 1: mi = '0' + mi
20
+ if len(s) == 1: s = '0' + s
21
+ return "{}:{}:{},{}".format(h, mi, s, tail)
22
+
23
+
24
+ class Text2SRT():
25
+ def __init__(self, text, timestamp, offset=0):
26
+ self.token_list = [i for i in text.split() if len(i)]
27
+ self.timestamp = timestamp
28
+ start, end = timestamp[0][0] - offset, timestamp[-1][1] - offset
29
+ self.start_sec, self.end_sec = start, end
30
+ self.start_time = time_convert(start)
31
+ self.end_time = time_convert(end)
32
+ def text(self):
33
+ res = ""
34
+ for word in self.token_list:
35
+ if '\u4e00' <= word <= '\u9fff':
36
+ res += word
37
+ else:
38
+ res += " " + word
39
+ return res
40
+ def len(self):
41
+ return len(self.token_list)
42
+ def srt(self, acc_ost=0.0):
43
+ return "{} --> {}\n{}\n".format(
44
+ time_convert(self.start_sec+acc_ost*1000),
45
+ time_convert(self.end_sec+acc_ost*1000),
46
+ self.text())
47
+ def time(self, acc_ost=0.0):
48
+ return (self.start_sec/1000+acc_ost, self.end_sec/1000+acc_ost)
49
+
50
+
51
+ def generate_srt(sentence_list):
52
+ srt_total = ''
53
+ for i, sent in enumerate(sentence_list):
54
+ t2s = Text2SRT(sent['text'], sent['timestamp'])
55
+ if 'spk' in sent:
56
+ srt_total += "{} spk{}\n{}".format(i, sent['spk'], t2s.srt())
57
+ else:
58
+ srt_total += "{}\n{}".format(i, t2s.srt())
59
+ return srt_total
60
+
61
+ def generate_srt_clip(sentence_list, start, end, begin_index=0, time_acc_ost=0.0):
62
+ start, end = int(start * 1000), int(end * 1000)
63
+ srt_total = ''
64
+ cc = 1 + begin_index
65
+ subs = []
66
+ for _, sent in enumerate(sentence_list):
67
+ if sent['timestamp'][-1][1] <= start:
68
+ # print("CASE0")
69
+ continue
70
+ if sent['timestamp'][0][0] >= end:
71
+ # print("CASE4")
72
+ break
73
+ # parts in between
74
+ if (sent['timestamp'][-1][1] <= end and sent['timestamp'][0][0] > start) or (sent['timestamp'][-1][1] == end and sent['timestamp'][0][0] == start):
75
+ # print("CASE1")
76
+ t2s = Text2SRT(sent['text'], sent['timestamp'], offset=start)
77
+ srt_total += "{}\n{}".format(cc, t2s.srt(time_acc_ost))
78
+ subs.append((t2s.time(time_acc_ost), t2s.text()))
79
+ cc += 1
80
+ continue
81
+ if sent['timestamp'][0][0] <= start:
82
+ # print("CASE2")
83
+ if not sent['timestamp'][-1][1] > end:
84
+ for j, ts in enumerate(sent['timestamp']):
85
+ if ts[1] > start:
86
+ break
87
+ _text = " ".join(sent['text'][j:])
88
+ _ts = sent['timestamp'][j:]
89
+ else:
90
+ for j, ts in enumerate(sent['timestamp']):
91
+ if ts[1] > start:
92
+ _start = j
93
+ break
94
+ for j, ts in enumerate(sent['timestamp']):
95
+ if ts[1] > end:
96
+ _end = j
97
+ break
98
+ _text = " ".join(sent['text'][_start:_end])
99
+ _ts = sent['timestamp'][_start:_end]
100
+ if len(_ts):
101
+ t2s = Text2SRT(_text, _ts, offset=start)
102
+ srt_total += "{}\n{}".format(cc, t2s.srt(time_acc_ost))
103
+ subs.append((t2s.time(time_acc_ost), t2s.text()))
104
+ cc += 1
105
+ continue
106
+ if sent['timestamp'][-1][1] > end:
107
+ # print("CASE3")
108
+ for j, ts in enumerate(sent['timestamp']):
109
+ if ts[1] > end:
110
+ break
111
+ _text = " ".join(sent['text'][:j])
112
+ _ts = sent['timestamp'][:j]
113
+ if len(_ts):
114
+ t2s = Text2SRT(_text, _ts, offset=start)
115
+ srt_total += "{}\n{}".format(cc, t2s.srt(time_acc_ost))
116
+ subs.append(
117
+ (t2s.time(time_acc_ost), t2s.text())
118
+ )
119
+ cc += 1
120
+ continue
121
+ return srt_total, subs, cc
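For reference, a small example of the input shape generate_srt expects: each sentence dict carries space-separated tokens plus one [start_ms, end_ms] pair per token (values below are made up); the optional 'spk' field is what the speaker-diarization path adds.

from utils.subtitle_utils import generate_srt

sentences = [
    {"text": "读 万 卷 书", "timestamp": [[50, 300], [300, 600], [600, 900], [900, 1200]]},
    {"text": "行 万 里 路", "timestamp": [[1300, 1600], [1600, 1900], [1900, 2200], [2200, 2500]], "spk": 0},
]
print(generate_srt(sentences))
# 0
# 00:00:00,50 --> 00:00:01,200
# 读万卷书
# 1 spk0
# 00:00:01,300 --> 00:00:02,500
# 行万里路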
utils/theme.json ADDED
@@ -0,0 +1,333 @@
1
+ {
2
+ "theme": {
3
+ "_font": [
4
+ {
5
+ "__gradio_font__": true,
6
+ "name": "Montserrat",
7
+ "class": "google"
8
+ },
9
+ {
10
+ "__gradio_font__": true,
11
+ "name": "ui-sans-serif",
12
+ "class": "font"
13
+ },
14
+ {
15
+ "__gradio_font__": true,
16
+ "name": "system-ui",
17
+ "class": "font"
18
+ },
19
+ {
20
+ "__gradio_font__": true,
21
+ "name": "sans-serif",
22
+ "class": "font"
23
+ }
24
+ ],
25
+ "_font_mono": [
26
+ {
27
+ "__gradio_font__": true,
28
+ "name": "IBM Plex Mono",
29
+ "class": "google"
30
+ },
31
+ {
32
+ "__gradio_font__": true,
33
+ "name": "ui-monospace",
34
+ "class": "font"
35
+ },
36
+ {
37
+ "__gradio_font__": true,
38
+ "name": "Consolas",
39
+ "class": "font"
40
+ },
41
+ {
42
+ "__gradio_font__": true,
43
+ "name": "monospace",
44
+ "class": "font"
45
+ }
46
+ ],
47
+ "background_fill_primary": "*neutral_50",
48
+ "background_fill_primary_dark": "*neutral_950",
49
+ "background_fill_secondary": "*neutral_50",
50
+ "background_fill_secondary_dark": "*neutral_900",
51
+ "block_background_fill": "white",
52
+ "block_background_fill_dark": "*neutral_800",
53
+ "block_border_color": "*border_color_primary",
54
+ "block_border_color_dark": "*border_color_primary",
55
+ "block_border_width": "0px",
56
+ "block_border_width_dark": "0px",
57
+ "block_info_text_color": "*body_text_color_subdued",
58
+ "block_info_text_color_dark": "*body_text_color_subdued",
59
+ "block_info_text_size": "*text_sm",
60
+ "block_info_text_weight": "400",
61
+ "block_label_background_fill": "*primary_100",
62
+ "block_label_background_fill_dark": "*primary_600",
63
+ "block_label_border_color": "*border_color_primary",
64
+ "block_label_border_color_dark": "*border_color_primary",
65
+ "block_label_border_width": "1px",
66
+ "block_label_border_width_dark": "1px",
67
+ "block_label_margin": "*spacing_md",
68
+ "block_label_padding": "*spacing_sm *spacing_md",
69
+ "block_label_radius": "*radius_md",
70
+ "block_label_right_radius": "0 calc(*radius_lg - 1px) 0 calc(*radius_lg - 1px)",
71
+ "block_label_text_color": "*primary_500",
72
+ "block_label_text_color_dark": "*white",
73
+ "block_label_text_size": "*text_md",
74
+ "block_label_text_weight": "600",
75
+ "block_padding": "*spacing_xl calc(*spacing_xl + 2px)",
76
+ "block_radius": "*radius_lg",
77
+ "block_shadow": "none",
78
+ "block_shadow_dark": "none",
79
+ "block_title_background_fill": "*block_label_background_fill",
80
+ "block_title_background_fill_dark": "*block_label_background_fill",
81
+ "block_title_border_color": "none",
82
+ "block_title_border_color_dark": "none",
83
+ "block_title_border_width": "0px",
84
+ "block_title_border_width_dark": "0px",
85
+ "block_title_padding": "*block_label_padding",
86
+ "block_title_radius": "*block_label_radius",
87
+ "block_title_text_color": "*primary_500",
88
+ "block_title_text_color_dark": "*white",
89
+ "block_title_text_size": "*text_md",
90
+ "block_title_text_weight": "600",
91
+ "body_background_fill": "*background_fill_primary",
92
+ "body_background_fill_dark": "*background_fill_primary",
93
+ "body_text_color": "*neutral_800",
94
+ "body_text_color_dark": "*neutral_100",
95
+ "body_text_color_subdued": "*neutral_400",
96
+ "body_text_color_subdued_dark": "*neutral_400",
97
+ "body_text_size": "*text_md",
98
+ "body_text_weight": "400",
99
+ "border_color_accent": "*primary_300",
100
+ "border_color_accent_dark": "*neutral_600",
101
+ "border_color_primary": "*neutral_200",
102
+ "border_color_primary_dark": "*neutral_700",
103
+ "button_border_width": "*input_border_width",
104
+ "button_border_width_dark": "*input_border_width",
105
+ "button_cancel_background_fill": "*button_secondary_background_fill",
106
+ "button_cancel_background_fill_dark": "*button_secondary_background_fill",
107
+ "button_cancel_background_fill_hover": "*button_secondary_background_fill_hover",
108
+ "button_cancel_background_fill_hover_dark": "*button_secondary_background_fill_hover",
109
+ "button_cancel_border_color": "*button_secondary_border_color",
110
+ "button_cancel_border_color_dark": "*button_secondary_border_color",
111
+ "button_cancel_border_color_hover": "*button_cancel_border_color",
112
+ "button_cancel_border_color_hover_dark": "*button_cancel_border_color",
113
+ "button_cancel_text_color": "*button_secondary_text_color",
114
+ "button_cancel_text_color_dark": "*button_secondary_text_color",
115
+ "button_cancel_text_color_hover": "*button_cancel_text_color",
116
+ "button_cancel_text_color_hover_dark": "*button_cancel_text_color",
117
+ "button_large_padding": "*spacing_lg calc(2 * *spacing_lg)",
118
+ "button_large_radius": "*radius_lg",
119
+ "button_large_text_size": "*text_lg",
120
+ "button_large_text_weight": "600",
121
+ "button_primary_background_fill": "*primary_500",
122
+ "button_primary_background_fill_dark": "*primary_700",
123
+ "button_primary_background_fill_hover": "*primary_400",
124
+ "button_primary_background_fill_hover_dark": "*primary_500",
125
+ "button_primary_border_color": "*primary_200",
126
+ "button_primary_border_color_dark": "*primary_600",
127
+ "button_primary_border_color_hover": "*button_primary_border_color",
128
+ "button_primary_border_color_hover_dark": "*button_primary_border_color",
129
+ "button_primary_text_color": "white",
130
+ "button_primary_text_color_dark": "white",
131
+ "button_primary_text_color_hover": "*button_primary_text_color",
132
+ "button_primary_text_color_hover_dark": "*button_primary_text_color",
133
+ "button_secondary_background_fill": "white",
134
+ "button_secondary_background_fill_dark": "*neutral_600",
135
+ "button_secondary_background_fill_hover": "*neutral_100",
136
+ "button_secondary_background_fill_hover_dark": "*primary_500",
137
+ "button_secondary_border_color": "*neutral_200",
138
+ "button_secondary_border_color_dark": "*neutral_600",
139
+ "button_secondary_border_color_hover": "*button_secondary_border_color",
140
+ "button_secondary_border_color_hover_dark": "*button_secondary_border_color",
141
+ "button_secondary_text_color": "*neutral_800",
142
+ "button_secondary_text_color_dark": "white",
143
+ "button_secondary_text_color_hover": "*button_secondary_text_color",
144
+ "button_secondary_text_color_hover_dark": "*button_secondary_text_color",
145
+ "button_shadow": "*shadow_drop_lg",
146
+ "button_shadow_active": "*shadow_inset",
147
+ "button_shadow_hover": "*shadow_drop_lg",
148
+ "button_small_padding": "*spacing_sm calc(2 * *spacing_sm)",
149
+ "button_small_radius": "*radius_lg",
150
+ "button_small_text_size": "*text_md",
151
+ "button_small_text_weight": "400",
152
+ "button_transition": "background-color 0.2s ease",
153
+ "checkbox_background_color": "*background_fill_primary",
154
+ "checkbox_background_color_dark": "*neutral_800",
155
+ "checkbox_background_color_focus": "*checkbox_background_color",
156
+ "checkbox_background_color_focus_dark": "*checkbox_background_color",
157
+ "checkbox_background_color_hover": "*checkbox_background_color",
158
+ "checkbox_background_color_hover_dark": "*checkbox_background_color",
159
+ "checkbox_background_color_selected": "*primary_600",
160
+ "checkbox_background_color_selected_dark": "*primary_700",
161
+ "checkbox_border_color": "*neutral_100",
162
+ "checkbox_border_color_dark": "*neutral_600",
163
+ "checkbox_border_color_focus": "*primary_500",
164
+ "checkbox_border_color_focus_dark": "*primary_600",
165
+ "checkbox_border_color_hover": "*neutral_300",
166
+ "checkbox_border_color_hover_dark": "*neutral_600",
167
+ "checkbox_border_color_selected": "*primary_600",
168
+ "checkbox_border_color_selected_dark": "*primary_700",
169
+ "checkbox_border_radius": "*radius_sm",
170
+ "checkbox_border_width": "1px",
171
+ "checkbox_border_width_dark": "*input_border_width",
172
+ "checkbox_check": "url(\"data:image/svg+xml,%3csvg viewBox='0 0 16 16' fill='white' xmlns='http://www.w3.org/2000/svg'%3e%3cpath d='M12.207 4.793a1 1 0 010 1.414l-5 5a1 1 0 01-1.414 0l-2-2a1 1 0 011.414-1.414L6.5 9.086l4.293-4.293a1 1 0 011.414 0z'/%3e%3c/svg%3e\")",
173
+ "checkbox_label_background_fill": "*button_secondary_background_fill",
174
+ "checkbox_label_background_fill_dark": "*button_secondary_background_fill",
175
+ "checkbox_label_background_fill_hover": "*button_secondary_background_fill_hover",
176
+ "checkbox_label_background_fill_hover_dark": "*button_secondary_background_fill_hover",
177
+ "checkbox_label_background_fill_selected": "*primary_500",
178
+ "checkbox_label_background_fill_selected_dark": "*primary_600",
179
+ "checkbox_label_border_color": "*border_color_primary",
180
+ "checkbox_label_border_color_dark": "*border_color_primary",
181
+ "checkbox_label_border_color_hover": "*checkbox_label_border_color",
182
+ "checkbox_label_border_color_hover_dark": "*checkbox_label_border_color",
183
+ "checkbox_label_border_width": "*input_border_width",
184
+ "checkbox_label_border_width_dark": "*input_border_width",
185
+ "checkbox_label_gap": "*spacing_lg",
186
+ "checkbox_label_padding": "*spacing_md calc(2 * *spacing_md)",
187
+ "checkbox_label_shadow": "*shadow_drop_lg",
188
+ "checkbox_label_text_color": "*body_text_color",
189
+ "checkbox_label_text_color_dark": "*body_text_color",
190
+ "checkbox_label_text_color_selected": "white",
191
+ "checkbox_label_text_color_selected_dark": "*checkbox_label_text_color",
192
+ "checkbox_label_text_size": "*text_md",
193
+ "checkbox_label_text_weight": "400",
194
+ "checkbox_shadow": "none",
195
+ "color_accent": "*primary_500",
196
+ "color_accent_soft": "*primary_50",
197
+ "color_accent_soft_dark": "*neutral_700",
198
+ "container_radius": "*radius_lg",
199
+ "embed_radius": "*radius_lg",
200
+ "error_background_fill": "#fee2e2",
201
+ "error_background_fill_dark": "*background_fill_primary",
202
+ "error_border_color": "#fecaca",
203
+ "error_border_color_dark": "*border_color_primary",
204
+ "error_border_width": "1px",
205
+ "error_border_width_dark": "1px",
206
+ "error_text_color": "#ef4444",
207
+ "error_text_color_dark": "#ef4444",
208
+ "font": "'Montserrat', 'ui-sans-serif', 'system-ui', sans-serif",
209
+ "font_mono": "'IBM Plex Mono', 'ui-monospace', 'Consolas', monospace",
210
+ "form_gap_width": "0px",
211
+ "input_background_fill": "white",
212
+ "input_background_fill_dark": "*neutral_700",
213
+ "input_background_fill_focus": "*secondary_500",
214
+ "input_background_fill_focus_dark": "*secondary_600",
215
+ "input_background_fill_hover": "*input_background_fill",
216
+ "input_background_fill_hover_dark": "*input_background_fill",
217
+ "input_border_color": "*neutral_50",
218
+ "input_border_color_dark": "*border_color_primary",
219
+ "input_border_color_focus": "*secondary_300",
220
+ "input_border_color_focus_dark": "*neutral_700",
221
+ "input_border_color_hover": "*input_border_color",
222
+ "input_border_color_hover_dark": "*input_border_color",
223
+ "input_border_width": "0px",
224
+ "input_border_width_dark": "0px",
225
+ "input_padding": "*spacing_xl",
226
+ "input_placeholder_color": "*neutral_400",
227
+ "input_placeholder_color_dark": "*neutral_500",
228
+ "input_radius": "*radius_lg",
229
+ "input_shadow": "*shadow_drop",
230
+ "input_shadow_dark": "*shadow_drop",
231
+ "input_shadow_focus": "*shadow_drop_lg",
232
+ "input_shadow_focus_dark": "*shadow_drop_lg",
233
+ "input_text_size": "*text_md",
234
+ "input_text_weight": "400",
235
+ "layout_gap": "*spacing_xxl",
236
+ "link_text_color": "*secondary_600",
237
+ "link_text_color_active": "*secondary_600",
238
+ "link_text_color_active_dark": "*secondary_500",
239
+ "link_text_color_dark": "*secondary_500",
240
+ "link_text_color_hover": "*secondary_700",
241
+ "link_text_color_hover_dark": "*secondary_400",
242
+ "link_text_color_visited": "*secondary_500",
243
+ "link_text_color_visited_dark": "*secondary_600",
244
+ "loader_color": "*color_accent",
245
+ "loader_color_dark": "*color_accent",
246
+ "name": "base",
247
+ "neutral_100": "#f3f4f6",
248
+ "neutral_200": "#e5e7eb",
249
+ "neutral_300": "#d1d5db",
250
+ "neutral_400": "#9ca3af",
251
+ "neutral_50": "#f9fafb",
252
+ "neutral_500": "#6b7280",
253
+ "neutral_600": "#4b5563",
254
+ "neutral_700": "#374151",
255
+ "neutral_800": "#1f2937",
256
+ "neutral_900": "#111827",
257
+ "neutral_950": "#0b0f19",
258
+ "panel_background_fill": "*background_fill_secondary",
259
+ "panel_background_fill_dark": "*background_fill_secondary",
260
+ "panel_border_color": "*border_color_primary",
261
+ "panel_border_color_dark": "*border_color_primary",
262
+ "panel_border_width": "1px",
263
+ "panel_border_width_dark": "1px",
264
+ "primary_100": "#e0e7ff",
265
+ "primary_200": "#c7d2fe",
266
+ "primary_300": "#a5b4fc",
267
+ "primary_400": "#818cf8",
268
+ "primary_50": "#eef2ff",
269
+ "primary_500": "#6366f1",
270
+ "primary_600": "#4f46e5",
271
+ "primary_700": "#4338ca",
272
+ "primary_800": "#3730a3",
273
+ "primary_900": "#312e81",
274
+ "primary_950": "#2b2c5e",
275
+ "prose_header_text_weight": "600",
276
+ "prose_text_size": "*text_md",
277
+ "prose_text_weight": "400",
278
+ "radio_circle": "url(\"data:image/svg+xml,%3csvg viewBox='0 0 16 16' fill='white' xmlns='http://www.w3.org/2000/svg'%3e%3ccircle cx='8' cy='8' r='3'/%3e%3c/svg%3e\")",
279
+ "radius_lg": "6px",
280
+ "radius_md": "4px",
281
+ "radius_sm": "2px",
282
+ "radius_xl": "8px",
283
+ "radius_xs": "1px",
284
+ "radius_xxl": "12px",
285
+ "radius_xxs": "1px",
286
+ "secondary_100": "#ecfccb",
287
+ "secondary_200": "#d9f99d",
288
+ "secondary_300": "#bef264",
289
+ "secondary_400": "#a3e635",
290
+ "secondary_50": "#f7fee7",
291
+ "secondary_500": "#84cc16",
292
+ "secondary_600": "#65a30d",
293
+ "secondary_700": "#4d7c0f",
294
+ "secondary_800": "#3f6212",
295
+ "secondary_900": "#365314",
296
+ "secondary_950": "#2f4e14",
297
+ "section_header_text_size": "*text_md",
298
+ "section_header_text_weight": "400",
299
+ "shadow_drop": "0 1px 4px 0 rgb(0 0 0 / 0.1)",
300
+ "shadow_drop_lg": "0 2px 5px 0 rgb(0 0 0 / 0.1)",
301
+ "shadow_inset": "rgba(0,0,0,0.05) 0px 2px 4px 0px inset",
302
+ "shadow_spread": "6px",
303
+ "shadow_spread_dark": "1px",
304
+ "slider_color": "*primary_500",
305
+ "slider_color_dark": "*primary_600",
306
+ "spacing_lg": "6px",
307
+ "spacing_md": "4px",
308
+ "spacing_sm": "2px",
309
+ "spacing_xl": "9px",
310
+ "spacing_xs": "1px",
311
+ "spacing_xxl": "12px",
312
+ "spacing_xxs": "1px",
313
+ "stat_background_fill": "*primary_300",
314
+ "stat_background_fill_dark": "*primary_500",
315
+ "table_border_color": "*neutral_300",
316
+ "table_border_color_dark": "*neutral_700",
317
+ "table_even_background_fill": "white",
318
+ "table_even_background_fill_dark": "*neutral_950",
319
+ "table_odd_background_fill": "*neutral_50",
320
+ "table_odd_background_fill_dark": "*neutral_900",
321
+ "table_radius": "*radius_lg",
322
+ "table_row_focus": "*color_accent_soft",
323
+ "table_row_focus_dark": "*color_accent_soft",
324
+ "text_lg": "16px",
325
+ "text_md": "14px",
326
+ "text_sm": "12px",
327
+ "text_xl": "22px",
328
+ "text_xs": "10px",
329
+ "text_xxl": "26px",
330
+ "text_xxs": "9px"
331
+ },
332
+ "version": "0.0.1"
333
+ }
utils/trans_utils.py ADDED
@@ -0,0 +1,131 @@
1
+ #!/usr/bin/env python3
2
+ # -*- encoding: utf-8 -*-
3
+ # Copyright FunASR (https://github.com/alibaba-damo-academy/FunClip). All Rights Reserved.
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ import os
7
+ import re
8
+ import numpy as np
9
+
10
+ PUNC_LIST = [',', '。', '!', '?', '、']
11
+
12
+ def pre_proc(text):
13
+ res = ''
14
+ for i in range(len(text)):
15
+ if text[i] in PUNC_LIST:
16
+ continue
17
+ if '\u4e00' <= text[i] <= '\u9fff':
18
+ if len(res) and res[-1] != " ":
19
+ res += ' ' + text[i]+' '
20
+ else:
21
+ res += text[i]+' '
22
+ else:
23
+ res += text[i]
24
+ if res[-1] == ' ':
25
+ res = res[:-1]
26
+ return res
27
+
28
+ def proc(raw_text, timestamp, dest_text):
29
+ # simple matching
30
+ ld = len(dest_text.split())
31
+ mi, ts = [], []
32
+ offset = 0
33
+ while True:
34
+ fi = raw_text.find(dest_text, offset, len(raw_text))
35
+ ti = raw_text[:fi].count(' ')
36
+ if fi == -1:
37
+ break
38
+ offset = fi + ld
39
+ mi.append(fi)
40
+ ts.append([timestamp[ti][0]*16, timestamp[ti+ld-1][1]*16])
41
+ return ts
42
+
43
+ def proc_spk(dest_spk, sd_sentences):
44
+ ts = []
45
+ for d in sd_sentences:
46
+ d_start = d['timestamp'][0][0]
47
+ d_end = d['timestamp'][-1][1]
48
+ spkid=dest_spk[3:]
49
+ if str(d['spk']) == spkid and d_end-d_start>999:
50
+ ts.append([d['start']*16, d['end']*16])
51
+ return ts
52
+
53
+ def generate_vad_data(data, sd_sentences, sr=16000):
54
+ assert len(data.shape) == 1
55
+ vad_data = []
56
+ for d in sd_sentences:
57
+ d_start = round(d['ts_list'][0][0]/1000, 2)
58
+ d_end = round(d['ts_list'][-1][1]/1000, 2)
59
+ vad_data.append([d_start, d_end, data[int(d_start * sr):int(d_end * sr)]])
60
+ return vad_data
61
+
62
+ def write_state(output_dir, state):
63
+ for key in ['/recog_res_raw', '/timestamp', '/sentences']:#, '/sd_sentences']:
64
+ with open(output_dir+key, 'w') as fout:
65
+ fout.write(str(state[key[1:]]))
66
+ if 'sd_sentences' in state:
67
+ with open(output_dir+'/sd_sentences', 'w') as fout:
68
+ fout.write(str(state['sd_sentences']))
69
+
70
+ def load_state(output_dir):
71
+ state = {}
72
+ with open(output_dir+'/recog_res_raw') as fin:
73
+ line = fin.read()
74
+ state['recog_res_raw'] = line
75
+ with open(output_dir+'/timestamp') as fin:
76
+ line = fin.read()
77
+ state['timestamp'] = eval(line)
78
+ with open(output_dir+'/sentences') as fin:
79
+ line = fin.read()
80
+ state['sentences'] = eval(line)
81
+ if os.path.exists(output_dir+'/sd_sentences'):
82
+ with open(output_dir+'/sd_sentences') as fin:
83
+ line = fin.read()
84
+ state['sd_sentences'] = eval(line)
85
+ return state
86
+
87
+ def convert_pcm_to_float(data):
88
+ if data.dtype == np.float64:
89
+ return data
90
+ elif data.dtype == np.float32:
91
+ return data.astype(np.float64)
92
+ elif data.dtype == np.int16:
93
+ bit_depth = 16
94
+ elif data.dtype == np.int32:
95
+ bit_depth = 32
96
+ elif data.dtype == np.int8:
97
+ bit_depth = 8
98
+ else:
99
+ raise ValueError("Unsupported audio data type")
100
+
101
+ # Now handle the integer types
102
+ max_int_value = float(2 ** (bit_depth - 1))
103
+ if bit_depth == 8:
104
+ data = data - 128
105
+ return (data.astype(np.float64) / max_int_value)
106
+
107
+ def convert_time_to_millis(time_str):
108
+ # Format: [hours:minutes:seconds,milliseconds]
109
+ hours, minutes, seconds, milliseconds = map(int, re.split('[:,]', time_str))
110
+ return (hours * 3600 + minutes * 60 + seconds) * 1000 + milliseconds
111
+
112
+ def extract_timestamps(input_text):
113
+ # Use a regular expression to find all timestamps
114
+ timestamps = re.findall(r'\[(\d{2}:\d{2}:\d{2},\d{2,3})\s*-\s*(\d{2}:\d{2}:\d{2},\d{2,3})\]', input_text)
115
+ times_list = []
116
+ print(timestamps)
117
+ # Iterate over all matched timestamps and convert them to milliseconds
118
+ for start_time, end_time in timestamps:
119
+ start_millis = convert_time_to_millis(start_time)
120
+ end_millis = convert_time_to_millis(end_time)
121
+ times_list.append([start_millis, end_millis])
122
+
123
+ return times_list
124
+
125
+
126
+ if __name__ == '__main__':
127
+ text = ("1. [00:00:00,500-00:00:05,850] 在我们的设计普惠当中,有一个我经常津津乐道的项目叫寻找远方的美好。"
128
+ "2. [00:00:07,120-00:00:12,940] 啊,在这样一个我们叫寻美在这样的一个项目当中,我们把它跟乡村振兴去结合起来,利用我们的设计的能力。"
129
+ "3. [00:00:13,240-00:00:25,620] 问我们自身员工的设设计能力,我们设计生态伙伴的能力,帮助乡村振兴当中,要希望把他的产品推向市场,把他的农产品把他加工产品推向市场的这样的伙伴做一件事情,")
130
+
131
+ print(extract_timestamps(text))
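A minimal usage sketch (not part of the commit) of how pre_proc and proc above fit together: pre_proc splits Chinese text into space-separated tokens, timestamp is assumed to hold one [start_ms, end_ms] pair per token of the recognition result, and proc returns [start, end] offsets in 16 kHz samples (milliseconds * 16). The timestamp values below are made up.

    from utils.trans_utils import pre_proc, proc

    raw_text = pre_proc("我们今天开会")    # -> "我 们 今 天 开 会"
    # one [start_ms, end_ms] pair per token (illustrative values only)
    timestamp = [[0, 180], [180, 400], [400, 630], [630, 900], [900, 1150], [1150, 1400]]
    dest_text = pre_proc("今天")           # -> "今 天"
    print(proc(raw_text, timestamp, dest_text))   # [[6400, 14400]], i.e. 0.4 s to 0.9 s at 16 kHz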
videoclipper.py CHANGED
@@ -1,3 +1,10 @@
 
 
 
 
 
 
 
1
  import sys
2
  import copy
3
  import librosa
@@ -5,36 +12,56 @@ import logging
5
  import argparse
6
  import numpy as np
7
  import soundfile as sf
8
- import moviepy.editor as mpy
9
- from modelscope.pipelines import pipeline
10
- from modelscope.utils.constant import Tasks
11
- from subtitle_utils import generate_srt, generate_srt_clip
12
- from trans_utils import pre_proc, proc, write_state, load_state
13
-
14
  from moviepy.editor import *
 
15
  from moviepy.video.tools.subtitles import SubtitlesClip
 
 
 
16
 
17
 
18
  class VideoClipper():
19
- def __init__(self, asr_pipeline):
20
  logging.warning("Initializing VideoClipper.")
21
- self.asr_pipeline = asr_pipeline
 
22
 
23
- def recog(self, audio_input, state=None):
24
  if state is None:
25
  state = {}
26
- state['audio_input'] = audio_input
27
- _, data = audio_input
28
- data = data.astype(np.float64)
29
- rec_result = self.asr_pipeline(audio_in=data)
30
- state['recog_res_raw'] = rec_result['text_postprocessed']
31
- state['timestamp'] = rec_result['time_stamp']
32
- state['sentences'] = rec_result['sentences']
33
- res_text = rec_result['text']
34
- res_srt = generate_srt(rec_result['sentences'])
35
  return res_text, res_srt, state
36
 
37
- def clip(self, dest_text, start_ost, end_ost, state):
38
  # get from state
39
  audio_input = state['audio_input']
40
  recog_res_raw = state['recog_res_raw']
@@ -43,12 +70,37 @@ class VideoClipper():
43
  sr, data = audio_input
44
  data = data.astype(np.float64)
45
 
46
- all_ts = []
47
- for _dest_text in dest_text.split('#'):
48
- _dest_text = pre_proc(_dest_text)
49
- ts = proc(recog_res_raw, timestamp, _dest_text)
50
- for _ts in ts: all_ts.append(_ts)
51
  ts = all_ts
 
52
  srt_index = 0
53
  clip_srt = ""
54
  if len(ts):
@@ -68,79 +120,292 @@ class VideoClipper():
68
  srt_clip, _, srt_index = generate_srt_clip(sentences, start/16000.0, end/16000.0, begin_index=srt_index-1)
69
  clip_srt += srt_clip
70
  if len(ts):
71
- message = "{} periods found in the speech: ".format(len(ts)) + start_end_info
72
  else:
73
  message = "No period found in the speech, return raw speech. You may check the recognition result and try other destination text."
 
74
  return (sr, res_audio), message, clip_srt
75
 
76
- def video_recog(self, vedio_filename):
77
- vedio_filename = vedio_filename
78
- clip_video_file = vedio_filename[:-4] + '_clip.mp4'
79
- video = mpy.VideoFileClip(vedio_filename)
80
- audio_file = vedio_filename[:-3] + 'wav'
 
 
 
 
 
 
 
 
 
81
  video.audio.write_audiofile(audio_file)
82
  wav = librosa.load(audio_file, sr=16000)[0]
 
 
 
83
  state = {
84
- 'vedio_filename': vedio_filename,
85
  'clip_video_file': clip_video_file,
86
  'video': video,
87
  }
88
  # res_text, res_srt = self.recog((16000, wav), state)
89
- return self.recog((16000, wav), state)
90
 
91
- def video_clip(self, dest_text, start_ost, end_ost, state, font_size=32, font_color='white', add_sub=False):
 
 
 
 
 
 
 
 
 
 
92
  # get from state
93
  recog_res_raw = state['recog_res_raw']
94
  timestamp = state['timestamp']
95
  sentences = state['sentences']
96
  video = state['video']
97
  clip_video_file = state['clip_video_file']
98
- vedio_filename = state['vedio_filename']
99
 
100
- all_ts = []
101
  srt_index = 0
102
- for _dest_text in dest_text.split('#'):
103
- _dest_text = pre_proc(_dest_text)
104
- ts = proc(recog_res_raw, timestamp, _dest_text)
105
- for _ts in ts: all_ts.append(_ts)
106
  ts = all_ts
 
107
  clip_srt = ""
108
  if len(ts):
109
  start, end = ts[0][0] / 16000, ts[0][1] / 16000
 
110
  start, end = start+start_ost/1000.0, end+end_ost/1000.0
111
  video_clip = video.subclip(start, end)
112
- clip_video_file = clip_video_file
113
  start_end_info = "from {} to {}".format(start, end)
114
- # message = "{} periods found in the audio: from {} to {}.".format(len(ts), start, end)
115
- srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index)
116
  clip_srt += srt_clip
117
  if add_sub:
118
  generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
119
  subtitles = SubtitlesClip(subs, generator)
120
  video_clip = CompositeVideoClip([video_clip, subtitles.set_pos(('center','bottom'))])
121
  concate_clip = [video_clip]
 
122
  for _ts in ts[1:]:
123
  start, end = _ts[0] / 16000, _ts[1] / 16000
 
 
 
 
 
124
  start, end = start+start_ost/1000.0, end+end_ost/1000.0
125
  _video_clip = video.subclip(start, end)
126
- clip_video_file = clip_video_file
127
  start_end_info += ", from {} to {}".format(start, end)
128
- srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index-1)
129
  clip_srt += srt_clip
130
  if add_sub:
131
  generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
132
- subtitles = SubtitlesClip(subs, generator)
133
  _video_clip = CompositeVideoClip([_video_clip, subtitles.set_pos(('center','bottom'))])
 
134
  concate_clip.append(copy.copy(_video_clip))
 
135
  message = "{} periods found in the audio: ".format(len(ts)) + start_end_info
136
  logging.warning("Concating...")
137
  if len(concate_clip) > 1:
138
  video_clip = concatenate_videoclips(concate_clip)
139
- video_clip.write_videofile(clip_video_file)
 
 
 
 
 
 
 
 
 
 
 
 
140
  else:
141
- clip_video_file = vedio_filename
142
  message = "No period found in the audio, return raw speech. You may check the recognition result and try other destination text."
143
  srt_clip = ''
144
  return clip_video_file, message, clip_srt
145
 
146
 
1
+ #!/usr/bin/env python3
2
+ # -*- encoding: utf-8 -*-
3
+ # Copyright FunASR (https://github.com/alibaba-damo-academy/FunClip). All Rights Reserved.
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ import re
7
+ import os
8
  import sys
9
  import copy
10
  import librosa
 
12
  import argparse
13
  import numpy as np
14
  import soundfile as sf
 
 
 
 
 
 
15
  from moviepy.editor import *
16
+ import moviepy.editor as mpy
17
  from moviepy.video.tools.subtitles import SubtitlesClip
18
+ from utils.subtitle_utils import generate_srt, generate_srt_clip
19
+ from utils.argparse_tools import ArgumentParser, get_commandline_args
20
+ from utils.trans_utils import pre_proc, proc, write_state, load_state, proc_spk, convert_pcm_to_float
21
 
22
 
23
  class VideoClipper():
24
+ def __init__(self, funasr_model):
25
  logging.warning("Initializing VideoClipper.")
26
+ self.funasr_model = funasr_model
27
+ self.GLOBAL_COUNT = 0
28
 
29
+ def recog(self, audio_input, sd_switch='no', state=None, hotwords="", output_dir=None):
30
  if state is None:
31
  state = {}
32
+ sr, data = audio_input
33
+
34
+ # Convert to float64 consistently (includes data type checking)
35
+ data = convert_pcm_to_float(data)
36
+
37
+ # assert sr == 16000, "16kHz sample rate required, {} given.".format(sr)
38
+ if sr != 16000: # resample with librosa
39
+ data = librosa.resample(data, orig_sr=sr, target_sr=16000)
40
+ if len(data.shape) == 2: # multi-channel wav input
41
+ logging.warning("Input wav shape: {}, only the first channel is kept.".format(data.shape))
42
+ data = data[:,0]
43
+ state['audio_input'] = (16000, data)  # data is 16 kHz mono at this point
44
+ if sd_switch.lower() == 'yes':
45
+ rec_result = self.funasr_model.generate(data, return_raw_text=True, is_final=True, hotword=hotwords, cache={})
46
+ res_srt = generate_srt(rec_result[0]['sentence_info'])
47
+ state['sd_sentences'] = rec_result[0]['sentence_info']
48
+ else:
49
+ rec_result = self.funasr_model.generate(data,
50
+ return_spk_res=False,
51
+ sentence_timestamp=True,
52
+ return_raw_text=True,
53
+ is_final=True,
54
+ hotword=hotwords,
55
+ output_dir=output_dir,
56
+ cache={})
57
+ res_srt = generate_srt(rec_result[0]['sentence_info'])
58
+ state['recog_res_raw'] = rec_result[0]['raw_text']
59
+ state['timestamp'] = rec_result[0]['timestamp']
60
+ state['sentences'] = rec_result[0]['sentence_info']
61
+ res_text = rec_result[0]['text']
62
  return res_text, res_srt, state
63
 
64
+ def clip(self, dest_text, start_ost, end_ost, state, dest_spk=None, output_dir=None, timestamp_list=None):
65
  # get from state
66
  audio_input = state['audio_input']
67
  recog_res_raw = state['recog_res_raw']
 
70
  sr, data = audio_input
71
  data = data.astype(np.float64)
72
 
73
+ if timestamp_list is None:
74
+ all_ts = []
75
+ if dest_spk is None or dest_spk == '' or 'sd_sentences' not in state:
76
+ for _dest_text in dest_text.split('#'):
77
+ if '[' in _dest_text:
78
+ match = re.search(r'\[(\d+),\s*(\d+)\]', _dest_text)
79
+ if match:
80
+ offset_b, offset_e = map(int, match.groups())
81
+ log_append = ""
82
+ else:
83
+ offset_b, offset_e = 0, 0
84
+ log_append = "(Bracket detected in dest_text but offset time matching failed)"
85
+ _dest_text = _dest_text[:_dest_text.find('[')]
86
+ else:
87
+ log_append = ""
88
+ offset_b, offset_e, match = 0, 0, None
89
+ _dest_text = pre_proc(_dest_text)
90
+ ts = proc(recog_res_raw, timestamp, _dest_text)
91
+ for _ts in ts: all_ts.append([_ts[0]+offset_b*16, _ts[1]+offset_e*16])
92
+ if len(ts) > 1 and match:
93
+ log_append += '(offsets detected but a sub-sentence matched {} periods in the audio, \
94
+ offsets are applied to all periods)'.format(len(ts))
95
+ else:
96
+ for _dest_spk in dest_spk.split('#'):
97
+ ts = proc_spk(_dest_spk, state['sd_sentences'])
98
+ for _ts in ts: all_ts.append(_ts)
99
+ log_append = ""
100
+ else:
101
+ all_ts = timestamp_list
102
  ts = all_ts
103
+ # ts.sort()
104
  srt_index = 0
105
  clip_srt = ""
106
  if len(ts):
 
120
  srt_clip, _, srt_index = generate_srt_clip(sentences, start/16000.0, end/16000.0, begin_index=srt_index-1)
121
  clip_srt += srt_clip
122
  if len(ts):
123
+ message = "{} periods found in the speech: ".format(len(ts)) + start_end_info + log_append
124
  else:
125
  message = "No period found in the speech, return raw speech. You may check the recognition result and try other destination text."
126
+ res_audio = data
127
  return (sr, res_audio), message, clip_srt
128
 
129
+ def video_recog(self, video_filename, sd_switch='no', hotwords="", output_dir=None):
130
+ video = mpy.VideoFileClip(video_filename)
131
+ # Derive the '_clip.mp4' and '.wav' file names from the video's base name
132
+ if output_dir is not None:
133
+ os.makedirs(output_dir, exist_ok=True)
134
+ _, base_name = os.path.split(video_filename)
135
+ base_name, _ = os.path.splitext(base_name)
136
+ clip_video_file = base_name + '_clip.mp4'
137
+ audio_file = base_name + '.wav'
138
+ audio_file = os.path.join(output_dir, audio_file)
139
+ else:
140
+ base_name, _ = os.path.splitext(video_filename)
141
+ clip_video_file = base_name + '_clip.mp4'
142
+ audio_file = base_name + '.wav'
143
  video.audio.write_audiofile(audio_file)
144
  wav = librosa.load(audio_file, sr=16000)[0]
145
+ # delete the audio file after processing
146
+ if os.path.exists(audio_file):
147
+ os.remove(audio_file)
148
  state = {
149
+ 'video_filename': video_filename,
150
  'clip_video_file': clip_video_file,
151
  'video': video,
152
  }
153
  # res_text, res_srt = self.recog((16000, wav), state)
154
+ return self.recog((16000, wav), sd_switch, state, hotwords, output_dir)
155
 
156
+ def video_clip(self,
157
+ dest_text,
158
+ start_ost,
159
+ end_ost,
160
+ state,
161
+ font_size=32,
162
+ font_color='white',
163
+ add_sub=False,
164
+ dest_spk=None,
165
+ output_dir=None,
166
+ timestamp_list=None):
167
  # get from state
168
  recog_res_raw = state['recog_res_raw']
169
  timestamp = state['timestamp']
170
  sentences = state['sentences']
171
  video = state['video']
172
  clip_video_file = state['clip_video_file']
173
+ video_filename = state['video_filename']
174
+
175
+ if timestamp_list is None:
176
+ all_ts = []
177
+ if dest_spk is None or dest_spk == '' or 'sd_sentences' not in state:
178
+ for _dest_text in dest_text.split('#'):
179
+ if '[' in _dest_text:
180
+ match = re.search(r'\[(\d+),\s*(\d+)\]', _dest_text)
181
+ if match:
182
+ offset_b, offset_e = map(int, match.groups())
183
+ log_append = ""
184
+ else:
185
+ offset_b, offset_e = 0, 0
186
+ log_append = "(Bracket detected in dest_text but offset time matching failed)"
187
+ _dest_text = _dest_text[:_dest_text.find('[')]
188
+ else:
189
+ offset_b, offset_e = 0, 0
190
+ log_append = ""
191
+ _dest_text = pre_proc(_dest_text)
192
+ ts = proc(recog_res_raw, timestamp, _dest_text)
193
+ for _ts in ts: all_ts.append([_ts[0]+offset_b*16, _ts[1]+offset_e*16])
194
+ if len(ts) > 1 and match:
195
+ log_append += '(offsets detected but a sub-sentence matched {} periods in the audio, \
196
+ offsets are applied to all periods)'.format(len(ts))
197
+ else:
198
+ for _dest_spk in dest_spk.split('#'):
199
+ ts = proc_spk(_dest_spk, state['sd_sentences'])
200
+ for _ts in ts: all_ts.append(_ts)
201
+ else: # AI clip passes timestamps as input directly
202
+ all_ts = [[i[0]*16.0, i[1]*16.0] for i in timestamp_list]
203
 
 
204
  srt_index = 0
205
+ time_acc_ost = 0.0
 
 
 
206
  ts = all_ts
207
+ # ts.sort()
208
  clip_srt = ""
209
  if len(ts):
210
  start, end = ts[0][0] / 16000, ts[0][1] / 16000
211
+ srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index, time_acc_ost=time_acc_ost)
212
  start, end = start+start_ost/1000.0, end+end_ost/1000.0
213
  video_clip = video.subclip(start, end)
 
214
  start_end_info = "from {} to {}".format(start, end)
 
 
215
  clip_srt += srt_clip
216
  if add_sub:
217
  generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
218
  subtitles = SubtitlesClip(subs, generator)
219
  video_clip = CompositeVideoClip([video_clip, subtitles.set_pos(('center','bottom'))])
220
  concate_clip = [video_clip]
221
+ time_acc_ost += end+end_ost/1000.0 - (start+start_ost/1000.0)
222
  for _ts in ts[1:]:
223
  start, end = _ts[0] / 16000, _ts[1] / 16000
224
+ srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index-1, time_acc_ost=time_acc_ost)
225
+ chi_subs = []
226
+ sub_starts = subs[0][0][0]
227
+ for sub in subs:
228
+ chi_subs.append(((sub[0][0]-sub_starts, sub[0][1]-sub_starts), sub[1]))
229
  start, end = start+start_ost/1000.0, end+end_ost/1000.0
230
  _video_clip = video.subclip(start, end)
 
231
  start_end_info += ", from {} to {}".format(start, end)
 
232
  clip_srt += srt_clip
233
  if add_sub:
234
  generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
235
+ subtitles = SubtitlesClip(chi_subs, generator)
236
  _video_clip = CompositeVideoClip([_video_clip, subtitles.set_pos(('center','bottom'))])
237
+ # _video_clip.write_videofile("debug.mp4", audio_codec="aac")
238
  concate_clip.append(copy.copy(_video_clip))
239
+ time_acc_ost += end+end_ost/1000.0 - (start+start_ost/1000.0)
240
  message = "{} periods found in the audio: ".format(len(ts)) + start_end_info
241
  logging.warning("Concating...")
242
  if len(concate_clip) > 1:
243
  video_clip = concatenate_videoclips(concate_clip)
244
+ # clip_video_file = clip_video_file[:-4] + '_no{}.mp4'.format(self.GLOBAL_COUNT)
245
+ if output_dir is not None:
246
+ os.makedirs(output_dir, exist_ok=True)
247
+ _, file_with_extension = os.path.split(clip_video_file)
248
+ clip_video_file_name, _ = os.path.splitext(file_with_extension)
249
+ print(output_dir, clip_video_file)
250
+ clip_video_file = os.path.join(output_dir, "{}_no{}.mp4".format(clip_video_file_name, self.GLOBAL_COUNT))
251
+ temp_audio_file = os.path.join(output_dir, "{}_tempaudio_no{}.mp4".format(clip_video_file_name, self.GLOBAL_COUNT))
252
+ else:
253
+ clip_video_file = clip_video_file[:-4] + '_no{}.mp4'.format(self.GLOBAL_COUNT)
254
+ temp_audio_file = clip_video_file[:-4] + '_tempaudio_no{}.mp4'.format(self.GLOBAL_COUNT)
255
+ video_clip.write_videofile(clip_video_file, audio_codec="aac", temp_audiofile=temp_audio_file)
256
+ self.GLOBAL_COUNT += 1
257
  else:
258
+ clip_video_file = video_filename
259
  message = "No period found in the audio, return raw speech. You may check the recognition result and try other destination text."
260
  srt_clip = ''
261
  return clip_video_file, message, clip_srt
262
 
263
 
264
+ def get_parser():
265
+ parser = ArgumentParser(
266
+ description="ClipVideo Argument",
267
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
268
+ )
269
+ parser.add_argument(
270
+ "--stage",
271
+ type=int,
272
+ choices=(1, 2),
273
+ help="Stage, 1 for recognizing and 2 for clipping",
274
+ required=True
275
+ )
276
+ parser.add_argument(
277
+ "--file",
278
+ type=str,
279
+ default=None,
280
+ help="Input file path",
281
+ required=True
282
+ )
283
+ parser.add_argument(
284
+ "--sd_switch",
285
+ type=str,
286
+ choices=("no", "yes"),
287
+ default="no",
288
+ help="Whether to turn on speaker diarization",
289
+ )
290
+ parser.add_argument(
291
+ "--output_dir",
292
+ type=str,
293
+ default='./output',
294
+ help="Output files path",
295
+ )
296
+ parser.add_argument(
297
+ "--dest_text",
298
+ type=str,
299
+ default=None,
300
+ help="Destination text string for clipping",
301
+ )
302
+ parser.add_argument(
303
+ "--dest_spk",
304
+ type=str,
305
+ default=None,
306
+ help="Destination spk id for clipping",
307
+ )
308
+ parser.add_argument(
309
+ "--start_ost",
310
+ type=int,
311
+ default=0,
312
+ help="Offset time in ms at beginning for clipping"
313
+ )
314
+ parser.add_argument(
315
+ "--end_ost",
316
+ type=int,
317
+ default=0,
318
+ help="Offset time in ms at ending for clipping"
319
+ )
320
+ parser.add_argument(
321
+ "--output_file",
322
+ type=str,
323
+ default=None,
324
+ help="Output file path"
325
+ )
326
+ return parser
327
+
328
+
329
+ def runner(stage, file, sd_switch, output_dir, dest_text, dest_spk, start_ost, end_ost, output_file, config=None):
330
+ audio_suffixs = ['.wav','.mp3','.aac','.m4a','.flac']
331
+ video_suffixs = ['.mp4','.avi','.mkv','.flv','.mov','.webm','.ts','.mpeg']
332
+ _,ext = os.path.splitext(file)
333
+ if ext.lower() in audio_suffixs:
334
+ mode = 'audio'
335
+ elif ext.lower() in video_suffixs:
336
+ mode = 'video'
337
+ else:
338
+ logging.error("Unsupported file format: {}\n\nPlease choose one of the following: {}".format(file, audio_suffixs+video_suffixs))
339
+ sys.exit(1) # exit if the file is not supported
340
+ while output_dir.endswith('/'):
341
+ output_dir = output_dir[:-1]
342
+ if not os.path.exists(output_dir):
343
+ os.mkdir(output_dir)
344
+ if stage == 1:
345
+ from funasr import AutoModel
346
+ # initialize funasr automodel
347
+ logging.warning("Initializing FunASR AutoModel.")
348
+ funasr_model = AutoModel(model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
349
+ vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
350
+ punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
351
+ spk_model="damo/speech_campplus_sv_zh-cn_16k-common",
352
+ )
353
+ audio_clipper = VideoClipper(funasr_model)
354
+ if mode == 'audio':
355
+ logging.warning("Recognizing audio file: {}".format(file))
356
+ wav, sr = librosa.load(file, sr=16000)
357
+ res_text, res_srt, state = audio_clipper.recog((sr, wav), sd_switch)
358
+ if mode == 'video':
359
+ logging.warning("Recognizing video file: {}".format(file))
360
+ res_text, res_srt, state = audio_clipper.video_recog(file, sd_switch)
361
+ total_srt_file = output_dir + '/total.srt'
362
+ with open(total_srt_file, 'w') as fout:
363
+ fout.write(res_srt)
364
+ logging.warning("Write total subtitle to {}".format(total_srt_file))
365
+ write_state(output_dir, state)
366
+ logging.warning("Recognition succeeded. You can copy the text segments below and use stage 2.")
367
+ print(res_text)
368
+ if stage == 2:
369
+ audio_clipper = VideoClipper(None)
370
+ if mode == 'audio':
371
+ state = load_state(output_dir)
372
+ wav, sr = librosa.load(file, sr=16000)
373
+ state['audio_input'] = (sr, wav)
374
+ (sr, audio), message, srt_clip = audio_clipper.clip(dest_text, start_ost, end_ost, state, dest_spk=dest_spk)
375
+ if output_file is None:
376
+ output_file = output_dir + '/result.wav'
377
+ clip_srt_file = output_file[:-3] + 'srt'
378
+ logging.warning(message)
379
+ sf.write(output_file, audio, 16000)
380
+ assert output_file.endswith('.wav'), "output_file must end with '.wav'"
381
+ logging.warning("Save clipped wav file to {}".format(output_file))
382
+ with open(clip_srt_file, 'w') as fout:
383
+ fout.write(srt_clip)
384
+ logging.warning("Write clipped subtitle to {}".format(clip_srt_file))
385
+ if mode == 'video':
386
+ state = load_state(output_dir)
387
+ state['video_filename'] = file
388
+ if output_file is None:
389
+ state['clip_video_file'] = file[:-4] + '_clip.mp4'
390
+ else:
391
+ state['clip_video_file'] = output_file
392
+ clip_srt_file = state['clip_video_file'][:-3] + 'srt'
393
+ state['video'] = mpy.VideoFileClip(file)
394
+ clip_video_file, message, srt_clip = audio_clipper.video_clip(dest_text, start_ost, end_ost, state, dest_spk=dest_spk)
395
+ logging.warning("Clipping Log: {}".format(message))
396
+ logging.warning("Save clipped mp4 file to {}".format(clip_video_file))
397
+ with open(clip_srt_file, 'w') as fout:
398
+ fout.write(srt_clip)
399
+ logging.warning("Write clipped subtitle to {}".format(clip_srt_file))
400
+
401
+
402
+ def main(cmd=None):
403
+ print(get_commandline_args(), file=sys.stderr)
404
+ parser = get_parser()
405
+ args = parser.parse_args(cmd)
406
+ kwargs = vars(args)
407
+ runner(**kwargs)
408
+
409
+
410
+ if __name__ == '__main__':
411
+ main()
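For reference, a possible two-stage command-line invocation of videoclipper.py based on the argument parser above; the file names and the destination text are placeholders:

    # stage 1: recognize, writes ./output/total.srt and the recognition state files
    python videoclipper.py --stage 1 --file examples/demo.mp4 --output_dir ./output

    # stage 2: clip by destination text (offsets in milliseconds); writes the clipped
    # video (final path is reported in the log) and ./output/demo_clip.srt
    python videoclipper.py --stage 2 --file examples/demo.mp4 --output_dir ./output \
        --dest_text "我们今天开会" --start_ost 0 --end_ost 100 \
        --output_file ./output/demo_clip.mp4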