R1ckShi committed on
Commit
1427ef7
1 Parent(s): 2aacd40

update to v2.0.0

__init__.py ADDED
File without changes
app.py CHANGED
@@ -1,138 +1,243 @@
1
  import gradio as gr
2
- from modelscope.pipelines import pipeline
3
- from modelscope.utils.constant import Tasks
4
  from videoclipper import VideoClipper
5
 
6
 
7
  if __name__ == "__main__":
8
- inference_pipeline = pipeline(
9
- task=Tasks.auto_speech_recognition,
10
- model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
11
- vad_model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
12
- punc_model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
13
- )
14
- audio_clipper = VideoClipper(inference_pipeline)
15
-
16
- def audio_recog(audio_input):
17
- return audio_clipper.recog(audio_input)
18
-
19
- def audio_clip(dest_text, start_ost, end_ost, state):
20
- return audio_clipper.clip(dest_text, start_ost, end_ost, state)
21
 
22
- def video_recog(video_input):
23
- return audio_clipper.video_recog(video_input)
24
 
25
- def video_clip(dest_text, start_ost, end_ost, state):
26
- return audio_clipper.video_clip(dest_text, start_ost, end_ost, state)
27
 
28
- def video_clip_addsub(dest_text, start_ost, end_ost, state, font_size, font_color):
29
- return audio_clipper.video_clip(dest_text, start_ost, end_ost, state, font_size, font_color, add_sub=True)
 
 
30
31
 
32
- top_md_1 = ("""
33
- A video clip tool based on Paraformer-long's VAD, ASR, timestamp prediction, punctuation restoration abilities.
34
-
35
- Get the video clip simply following steps:
36
-
37
- * Step1: Upload video file (or try examples below), click **<font color="#f7802b">Recognize</font>** button
38
- * Step2: Copy text segments you need to 'Text to Clip', set the subtitle settings (if you need)
39
- * Step3: Click **<font color="#f7802b">Clip</font>** button or **<font color="#f7802b">Clip and Generate Subtitles</font>** button
40
- """)
41
 
42
-
43
- top_md_2 = ("""
44
- The video had better to have size under 40Mb,
45
- For video in large size, you can split the audio from it and use 'Audio Clip',
46
- or **<font color="#1785c4">establish your own gradio service with the source code (recommended)</font>** :
47
- <div align="center">
48
- <div style="display:flex; gap: 0.25rem;" align="center">
49
- FunASR_APP: <a href='https://github.com/alibaba/funasr-app'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
50
- 🌟Support Us: <a href='https://github.com/alibaba/funasr-app/stargazers'><img src='https://img.shields.io/github/stars/alibaba/funasr-app.svg?style=social'></a>
51
- </div>
52
- </div>
53
- """)
54
-
55
- top_md_3 = ("""You may understand FunASR further with source code and paper:
56
- <div align="center">
57
- <div style="display:flex; gap: 0.25rem;" align="center">
58
- FunASR: <a href='https://github.com/alibaba-damo-academy/FunASR'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
59
- FunASR Paper: <a href="https://arxiv.org/abs/2305.11013"><img src="https://img.shields.io/badge/Arxiv-2305.11013-orange"></a>
60
- 🌟Star FunASR: <a href='https://github.com/alibaba-damo-academy/FunASR/stargazers'><img src='https://img.shields.io/github/stars/alibaba-damo-academy/FunASR.svg?style=social'></a>
61
- </div>
62
- </div>
63
- """)
64
-
65
  # gradio interface
66
- with gr.Blocks() as demo:
67
- #gr.Image("./examples/guide.png", show_label=False)
68
  gr.Markdown(top_md_1)
69
- gr.Markdown(top_md_2)
70
  gr.Markdown(top_md_3)
71
- video_state = gr.State()
72
- audio_state = gr.State()
73
- with gr.Tab("🎥✂️视频裁剪 Video Clipping"):
74
- with gr.Row():
75
- with gr.Column():
76
- video_input = gr.Video(label="🎥视频输入 Video Input")
77
- gr.Examples(['examples/2022云栖大会_片段2.mp4',
78
- 'examples/2022云栖大会_片段.mp4',
79
- 'examples/为什么要多读书?这是我听过最好的答案-片段.mp4',
80
- 'examples/使用chatgpt_片段.mp4'],
81
- [video_input])
82
- recog_button2 = gr.Button("👂识别 Recognize")
83
- video_text_output = gr.Textbox(label="✏️识别结果 Recognition Result")
84
- video_srt_output = gr.Textbox(label="📖SRT字幕内容 RST Subtitles")
85
  with gr.Column():
86
- video_text_input = gr.Textbox(label="✏️待裁剪文本 Text to Clip (多段文本使用'#'连接)")
87
  with gr.Row():
88
- video_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50, label="⏪开始位置偏移 Start Offset (ms)")
89
- video_end_ost = gr.Slider(minimum=-500, maximum=1000, value=100, step=50, label="⏩结束位置偏移 End Offset (ms)")
90
  with gr.Row():
91
- font_size = gr.Slider(minimum=10, maximum=100, value=32, step=2, label="🔠字幕字体大小 Subtitle Font Size")
92
- font_color = gr.Radio(["black", "white", "green", "red"], label="🌈字幕颜色 Subtitle Color", value='white')
93
- # font = gr.Radio(["黑体", "Alibaba Sans"], label="字体 Font")
94
- with gr.Row():
95
- clip_button2 = gr.Button("✂️裁剪\nClip")
96
- clip_button3 = gr.Button("✂️裁剪并添加字幕\nClip and Generate Subtitles")
97
- video_output = gr.Video(label="🎥裁剪结果 Audio Clipped")
98
- video_mess_output = gr.Textbox(label="ℹ️裁剪信息 Clipping Log")
99
- video_srt_clip_output = gr.Textbox(label="📖裁剪部分SRT字幕内容 Clipped RST Subtitles")
100
-
101
- with gr.Tab("🔊✂️音频裁剪 Audio Clipping"):
102
- with gr.Row():
103
- with gr.Column():
104
- audio_input = gr.Audio(label="🔊音频输入 Audio Input")
105
- gr.Examples(['examples/鲁肃采访片段1.wav'], [audio_input])
106
- recog_button1 = gr.Button("👂识别 Recognize")
107
- audio_text_output = gr.Textbox(label="✏️识别结果 Recognition Result")
108
- audio_srt_output = gr.Textbox(label="📖SRT字幕内容 RST Subtitles")
109
- with gr.Column():
110
- audio_text_input = gr.Textbox(label="✏️待裁剪文本 Text to Clip (多段文本使用'#'连接)")
111
- with gr.Row():
112
- audio_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50, label="⏪开始位置偏移 Start Offset (ms)")
113
- audio_end_ost = gr.Slider(minimum=-500, maximum=1000, value=100, step=50, label="⏩结束位置偏移 End Offset (ms)")
114
- with gr.Row():
115
- clip_button1 = gr.Button("✂️裁剪 Clip")
116
- audio_output = gr.Audio(label="🔊裁剪结果 Audio Clipped")
117
- audio_mess_output = gr.Textbox(label="ℹ️裁剪信息 Clipping Log")
118
- audio_srt_clip_output = gr.Textbox(label="📖裁剪部分SRT字幕内容 Clipped RST Subtitles")
119
-
120
- recog_button1.click(audio_recog,
121
- inputs=audio_input,
122
- outputs=[audio_text_output, audio_srt_output, audio_state])
123
- clip_button1.click(audio_clip,
124
- inputs=[audio_text_input, audio_start_ost, audio_end_ost, audio_state],
125
- outputs=[audio_output, audio_mess_output, audio_srt_clip_output])
126
-
127
- recog_button2.click(video_recog,
128
- inputs=video_input,
129
- outputs=[video_text_output, video_srt_output, video_state])
130
- clip_button2.click(video_clip,
131
- inputs=[video_text_input, video_start_ost, video_end_ost, video_state],
132
- outputs=[video_output, video_mess_output, video_srt_clip_output])
133
- clip_button3.click(video_clip_addsub,
134
- inputs=[video_text_input, video_start_ost, video_end_ost, video_state, font_size, font_color],
135
- outputs=[video_output, video_mess_output, video_srt_clip_output])
136
 
137
  # start gradio service in local
138
- demo.queue(concurrency_count=3).launch()
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- encoding: utf-8 -*-
3
+ # Copyright FunASR (https://github.com/alibaba-damo-academy/FunClip). All Rights Reserved.
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ import os
7
+ import logging
8
  import gradio as gr
9
+ from funasr import AutoModel
 
10
  from videoclipper import VideoClipper
11
+ from introduction import top_md_1, top_md_3, top_md_4
12
+ from llm.openai_api import openai_call
13
+ from llm.g4f_openai_api import g4f_openai_call
14
+ from llm.qwen_api import call_qwen_model
15
+ from utils.trans_utils import extract_timestamps
16
 
17
 
18
  if __name__ == "__main__":
19
+
20
+ funasr_model = AutoModel(model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
21
+ vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
22
+ punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
23
+ spk_model="damo/speech_campplus_sv_zh-cn_16k-common",
24
+ )
25
+ audio_clipper = VideoClipper(funasr_model)
26
 
27
+ def audio_recog(audio_input, sd_switch, hotwords, output_dir):
28
+ return audio_clipper.recog(audio_input, sd_switch, None, hotwords, output_dir=output_dir)
29
 
30
+ def video_recog(video_input, sd_switch, hotwords, output_dir):
31
+ return audio_clipper.video_recog(video_input, sd_switch, hotwords, output_dir=output_dir)
32
 
33
+ def video_clip(dest_text, video_spk_input, start_ost, end_ost, state, output_dir):
34
+ return audio_clipper.video_clip(
35
+ dest_text, start_ost, end_ost, state, dest_spk=video_spk_input, output_dir=output_dir
36
+ )
37
 
38
+ def mix_recog(video_input, audio_input, hotwords, output_dir):
39
+ output_dir = output_dir.strip()
40
+ if not len(output_dir):
41
+ output_dir = None
42
+ else:
43
+ output_dir = os.path.abspath(output_dir)
44
+ audio_state, video_state = None, None
45
+ if video_input is not None:
46
+ res_text, res_srt, video_state = video_recog(
47
+ video_input, 'No', hotwords, output_dir=output_dir)
48
+ return res_text, res_srt, video_state, None
49
+ if audio_input is not None:
50
+ res_text, res_srt, audio_state = audio_recog(
51
+ audio_input, 'No', hotwords, output_dir=output_dir)
52
+ return res_text, res_srt, None, audio_state
53
 
54
+ def mix_recog_speaker(video_input, audio_input, hotwords, output_dir):
55
+ output_dir = output_dir.strip()
56
+ if not len(output_dir):
57
+ output_dir = None
58
+ else:
59
+ output_dir = os.path.abspath(output_dir)
60
+ audio_state, video_state = None, None
61
+ if video_input is not None:
62
+ res_text, res_srt, video_state = video_recog(
63
+ video_input, 'Yes', hotwords, output_dir=output_dir)
64
+ return res_text, res_srt, video_state, None
65
+ if audio_input is not None:
66
+ res_text, res_srt, audio_state = audio_recog(
67
+ audio_input, 'Yes', hotwords, output_dir=output_dir)
68
+ return res_text, res_srt, None, audio_state
69
+
70
+ def mix_clip(dest_text, video_spk_input, start_ost, end_ost, video_state, audio_state, output_dir):
71
+ output_dir = output_dir.strip()
72
+ if not len(output_dir):
73
+ output_dir = None
74
+ else:
75
+ output_dir = os.path.abspath(output_dir)
76
+ if video_state is not None:
77
+ clip_video_file, message, clip_srt = audio_clipper.video_clip(
78
+ dest_text, start_ost, end_ost, video_state, dest_spk=video_spk_input, output_dir=output_dir)
79
+ return clip_video_file, None, message, clip_srt
80
+ if audio_state is not None:
81
+ (sr, res_audio), message, clip_srt = audio_clipper.clip(
82
+ dest_text, start_ost, end_ost, audio_state, dest_spk=video_spk_input, output_dir=output_dir)
83
+ return None, (sr, res_audio), message, clip_srt
84
+
85
+ def video_clip_addsub(dest_text, video_spk_input, start_ost, end_ost, state, output_dir, font_size, font_color):
86
+ return audio_clipper.video_clip(
87
+ dest_text, start_ost, end_ost, state,
88
+ font_size=font_size, font_color=font_color,
89
+ add_sub=True, dest_spk=video_spk_input, output_dir=output_dir
90
+ )
91
+
92
+ def llm_inference(system_content, user_content, srt_text, model, apikey):
93
+ SUPPORT_LLM_PREFIX = ['qwen', 'gpt', 'g4f', 'moonshot']
94
+ if model.startswith('qwen'):
95
+ return call_qwen_model(apikey, model, user_content=user_content+'\n'+srt_text, system_content=system_content)
96
+ if model.startswith('gpt') or model.startswith('moonshot'):
97
+ return openai_call(apikey, model, user_content=user_content+'\n'+srt_text, system_content=system_content)
98
+ elif model.startswith('g4f'):
99
+ model = "-".join(model.split('-')[1:])
100
+ return g4f_openai_call(model, user_content=user_content+'\n'+srt_text, system_content=system_content)
101
+ else:
102
+ logging.error("LLM name error, only {} are supported as LLM name prefix."
103
+ .format(SUPPORT_LLM_PREFIX))
104
+
105
+ def AI_clip(LLM_res, dest_text, video_spk_input, start_ost, end_ost, video_state, audio_state, output_dir):
106
+ timestamp_list = extract_timestamps(LLM_res)
107
+ output_dir = output_dir.strip()
108
+ if not len(output_dir):
109
+ output_dir = None
110
+ else:
111
+ output_dir = os.path.abspath(output_dir)
112
+ if video_state is not None:
113
+ clip_video_file, message, clip_srt = audio_clipper.video_clip(
114
+ dest_text, start_ost, end_ost, video_state,
115
+ dest_spk=video_spk_input, output_dir=output_dir, timestamp_list=timestamp_list)
116
+ return clip_video_file, None, message, clip_srt
117
+ if audio_state is not None:
118
+ (sr, res_audio), message, clip_srt = audio_clipper.clip(
119
+ dest_text, start_ost, end_ost, audio_state,
120
+ dest_spk=video_spk_input, output_dir=output_dir, timestamp_list=timestamp_list)
121
+ return None, (sr, res_audio), message, clip_srt
122
123
  # gradio interface
124
+ theme = gr.Theme.load("utils/theme.json")
125
+ with gr.Blocks(theme=theme) as funclip_service:
126
  gr.Markdown(top_md_1)
127
+ # gr.Markdown(top_md_2)
128
  gr.Markdown(top_md_3)
129
+ gr.Markdown(top_md_4)
130
+ video_state, audio_state = gr.State(), gr.State()
131
+ with gr.Row():
132
+ with gr.Column():
133
+ with gr.Row():
134
+ video_input = gr.Video(label="视频输入 | Video Input")
135
+ audio_input = gr.Audio(label="音频输入 | Audio Input")
136
  with gr.Column():
137
+ gr.Examples(['https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/%E4%B8%BA%E4%BB%80%E4%B9%88%E8%A6%81%E5%A4%9A%E8%AF%BB%E4%B9%A6%EF%BC%9F%E8%BF%99%E6%98%AF%E6%88%91%E5%90%AC%E8%BF%87%E6%9C%80%E5%A5%BD%E7%9A%84%E7%AD%94%E6%A1%88-%E7%89%87%E6%AE%B5.mp4',
138
+ 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/2022%E4%BA%91%E6%A0%96%E5%A4%A7%E4%BC%9A_%E7%89%87%E6%AE%B52.mp4',
139
+ 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/%E4%BD%BF%E7%94%A8chatgpt_%E7%89%87%E6%AE%B5.mp4'],
140
+ [video_input],
141
+ label='示例视频 | Demo Video')
142
+ gr.Examples(['https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/%E8%AE%BF%E8%B0%88.mp4'],
143
+ [video_input],
144
+ label='多说话人示例视频 | Multi-speaker Demo Video')
145
+ gr.Examples(['https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/%E9%B2%81%E8%82%83%E9%87%87%E8%AE%BF%E7%89%87%E6%AE%B51.wav'],
146
+ [audio_input],
147
+ label="示例音频 | Demo Audio")
148
+ with gr.Column():
149
+ # with gr.Row():
150
+ # video_sd_switch = gr.Radio(["No", "Yes"], label="👥区分说话人 Get Speakers", value='No')
151
+ hotwords_input = gr.Textbox(label="🚒 热词 | Hotwords(可以为空,多个热词使用空格分隔,仅支持中文热词)")
152
+ output_dir = gr.Textbox(label="📁 文件输出路径 | File Output Dir (可以为空,Linux, mac系统可以稳定使用)", value=" ")
153
+ with gr.Row():
154
+ recog_button = gr.Button("👂 识别 | ASR", variant="primary")
155
+ recog_button2 = gr.Button("👂👫 识别+区分说话人 | ASR+SD")
156
+ video_text_output = gr.Textbox(label="✏️ 识别结果 | Recognition Result")
157
+ video_srt_output = gr.Textbox(label="📖 SRT字幕内容 | SRT Subtitles")
158
+ with gr.Column():
159
+ with gr.Tab("🧠 LLM智能裁剪 | LLM Clipping"):
160
+ with gr.Column():
161
+ prompt_head = gr.Textbox(label="Prompt System", value=("你是一个视频srt字幕分析剪辑器,输入视频的srt字幕,"
162
+ "分析其中的精彩且尽可能连续的片段并裁剪出来,输出四条以内的片段,将片段中在时间上连续的多个句子及它们的时间戳合并为一条,"
163
+ "注意确保文字与时间戳的正确匹配。输出需严格按照如下格式:1. [开始时间-结束时间] 文本,注意其中的连接符是“-”"))
164
+ prompt_head2 = gr.Textbox(label="Prompt User", value=("这是待裁剪的视频srt字幕:"))
165
+ with gr.Column():
166
+ with gr.Row():
167
+ llm_model = gr.Dropdown(
168
+ choices=["qwen-plus",
169
+ "gpt-3.5-turbo",
170
+ "gpt-3.5-turbo-0125",
171
+ "gpt-4-turbo",
172
+ "g4f-gpt-3.5-turbo"],
173
+ value="qwen-plus",
174
+ label="LLM Model Name",
175
+ allow_custom_value=True)
176
+ apikey_input = gr.Textbox(label="APIKEY")
177
+ llm_button = gr.Button("LLM推理 | LLM Inference(首先进行识别,非g4f需配置对应apikey)", variant="primary")
178
+ llm_result = gr.Textbox(label="LLM Clipper Result")
179
+ with gr.Row():
180
+ llm_clip_button = gr.Button("🧠 LLM智能裁剪 | AI Clip", variant="primary")
181
+ # llm_clip_subti_button = gr.Button("🧠 LLM智能裁剪+字幕 | AI Clip+Subtitles")
182
+ with gr.Tab("✂️ 根据文本\说话人裁剪 | Text\Speaker Clipping"):
183
+ video_text_input = gr.Textbox(label="✏️ 待裁剪文本 | Text to Clip (多段文本使用'#'连接)")
184
+ video_spk_input = gr.Textbox(label="✏️ 待裁剪说话人 | Speaker to Clip (多个说话人使用'#'连接)")
185
  with gr.Row():
186
+ clip_button = gr.Button("✂️ 裁剪 | Clip", variant="primary")
187
+ # clip_subti_button = gr.Button("✂️ 裁剪+字幕 | Clip+Subtitles")
188
  with gr.Row():
189
+ video_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50, label=" 开始位置偏移 | Start Offset (ms)")
190
+ video_end_ost = gr.Slider(minimum=-500, maximum=1000, value=100, step=50, label=" 结束位置偏移 | End Offset (ms)")
191
+ with gr.Row():
192
+ font_size = gr.Slider(minimum=10, maximum=100, value=32, step=2, label="🔠 字幕字体大小 | Subtitle Font Size")
193
+ font_color = gr.Radio(["black", "white", "green", "red"], label="🌈 字幕颜色 | Subtitle Color", value='white')
194
+ # font = gr.Radio(["黑体", "Alibaba Sans"], label="字体 Font")
195
+ video_output = gr.Video(label="裁剪结果 | Video Clipped")
196
+ audio_output = gr.Audio(label="裁剪结果 | Audio Clipped")
197
+ clip_message = gr.Textbox(label="⚠️ 裁剪信息 | Clipping Log")
198
+ srt_clipped = gr.Textbox(label="📖 裁剪部分SRT字幕内容 | Clipped SRT Subtitles")
199
+
200
+ recog_button.click(mix_recog,
201
+ inputs=[video_input,
202
+ audio_input,
203
+ hotwords_input,
204
+ output_dir,
205
+ ],
206
+ outputs=[video_text_output, video_srt_output, video_state, audio_state],
207
+ concurrency_limit=3)
208
+ recog_button2.click(mix_recog_speaker,
209
+ inputs=[video_input,
210
+ audio_input,
211
+ hotwords_input,
212
+ output_dir,
213
+ ],
214
+ outputs=[video_text_output, video_srt_output, video_state, audio_state],
215
+ concurrency_limit=3)
216
+ clip_button.click(mix_clip,
217
+ inputs=[video_text_input,
218
+ video_spk_input,
219
+ video_start_ost,
220
+ video_end_ost,
221
+ video_state,
222
+ audio_state,
223
+ output_dir
224
+ ],
225
+ outputs=[video_output, audio_output, clip_message, srt_clipped])
226
+ llm_button.click(llm_inference,
227
+ inputs=[prompt_head, prompt_head2, video_srt_output, llm_model, apikey_input],
228
+ outputs=[llm_result])
229
+ llm_clip_button.click(AI_clip,
230
+ inputs=[llm_result,
231
+ video_text_input,
232
+ video_spk_input,
233
+ video_start_ost,
234
+ video_end_ost,
235
+ video_state,
236
+ audio_state,
237
+ output_dir
238
+ ],
239
+ outputs=[video_output, audio_output, clip_message, srt_clipped])
240
 
241
  # start gradio service in local
242
+ # funclip_service.queue(concurrency_count=5)
243
+ funclip_service.launch(max_threads=8)
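For reference, a minimal sketch of driving the v2.0.0 pipeline without the Gradio UI, based only on the call signatures used above; the file path, hotwords and clip text are placeholders.

from funasr import AutoModel
from videoclipper import VideoClipper

# Same model stack that app.py builds above.
funasr_model = AutoModel(model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
                         vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
                         punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
                         spk_model="damo/speech_campplus_sv_zh-cn_16k-common")
clipper = VideoClipper(funasr_model)

# Recognize: the second argument toggles speaker diarization ('Yes'/'No'), the third is hotwords.
text, srt, state = clipper.video_recog("my_video.mp4", "No", "", output_dir=None)

# Clip by text; multiple segments are joined with '#', offsets are in milliseconds.
clip_file, log, clip_srt = clipper.video_clip("想要保留的片段文本", 0, 100, state,
                                              dest_spk=None, output_dir=None)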
introduction.py ADDED
@@ -0,0 +1,37 @@
1
+ top_md_1 = ("""
2
+ <div align="center">
3
+ <div style="display:flex; gap: 0.25rem;" align="center">
4
+ FunClip: <a href='https://github.com/alibaba-damo-academy/FunClip'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
5
+ 🌟支持我们: <a href='https://github.com/alibaba-damo-academy/FunClip/stargazers'><img src='https://img.shields.io/github/stars/alibaba-damo-academy/FunClip.svg?style=social'></a>
6
+ </div>
7
+ </div>
8
+
9
+ 基于阿里巴巴通义实验室自研并开源的[FunASR](https://github.com/alibaba-damo-academy/FunASR)工具包及Paraformer系列模型及语音识别、端点检测、标点预测、时间戳预测、说话人区分、热词定制化开源链路
10
+
11
+ 准确识别,自由复制所需段落,或者设置说话人标识,一键裁剪、添加字幕
12
+
13
+ * Step1: 上传视频或音频文件(或使用下方的用例体验),点击 **<font color="#f7802b">识别</font>** 按钮
14
+ * Step2: 复制识别结果中所需的文字至右上方,或者在右侧设置说话人标识,设置偏移与字幕配置(可选)
15
+ * Step3: 点击 **<font color="#f7802b">裁剪</font>** 按钮或 **<font color="#f7802b">裁剪并添加字幕</font>** 按钮获得结果
16
+
17
+ 🔥 FunClip现在集成了大语言模型智能剪辑功能,选择LLM模型进行体验吧~
18
+ """)
19
+
20
+ top_md_3 = ("""访问FunASR项目与论文能够帮助您深入了解FunClip中所使用的语音处理相关模型:
21
+ <div align="center">
22
+ <div style="display:flex; gap: 0.25rem;" align="center">
23
+ FunASR: <a href='https://github.com/alibaba-damo-academy/FunASR'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
24
+ FunASR Paper: <a href="https://arxiv.org/abs/2305.11013"><img src="https://img.shields.io/badge/Arxiv-2305.11013-orange"></a>
25
+ 🌟Star FunASR: <a href='https://github.com/alibaba-damo-academy/FunASR/stargazers'><img src='https://img.shields.io/github/stars/alibaba-damo-academy/FunASR.svg?style=social'></a>
26
+ </div>
27
+ </div>
28
+ """)
29
+
30
+ top_md_4 = ("""我们在「LLM智能裁剪」模块中提供三种LLM调用方式,
31
+ 1. 选择阿里云百炼平台通过api调用qwen系列模型,此时需要您准备百炼平台的apikey,请访问[阿里云百炼](https://bailian.console.aliyun.com/#/home);
32
+ 2. 选择GPT开头的模型即为调用openai官方api,此时需要您自备sk与网络环境;
33
+ 3. [gpt4free](https://github.com/xtekky/gpt4free?tab=readme-ov-file)项目也被集成进FunClip,可以通过它免费调用gpt模型;
34
+
35
+ 其中方式1与方式2需要在界面中传入相应的apikey
36
+ 方式3则可能非常不稳定,返回时间可能很长或者结果获取失败,可以多多尝试,或者自己准备sk使用方式2
37
+ """)
llm/__pycache__/demo_prompt.cpython-311.pyc ADDED
Binary file (6.17 kB).
 
llm/__pycache__/g4f_openai_api.cpython-311.pyc ADDED
Binary file (1.46 kB).
 
llm/__pycache__/openai_api.cpython-311.pyc ADDED
Binary file (1.76 kB).
 
llm/__pycache__/qwen_api.cpython-311.pyc ADDED
Binary file (1.24 kB).
 
llm/demo_prompt.py ADDED
@@ -0,0 +1,272 @@
1
+ demo_prompt="""
2
+ 你是一个视频srt字幕剪辑工具,输入视频的srt字幕之后根据如下要求剪辑对应的片段并输出每个段落的开始与结束时间,
3
+ 剪辑出以下片段中最有意义的、尽可能连续的部分,按如下格式输出:1. [开始时间-结束时间] 文本,
4
+ 原始srt字幕如下:
5
+ 0
6
+ 00:00:00,50 --> 00:00:02,10
7
+ 读万卷书行万里路,
8
+ 1
9
+ 00:00:02,310 --> 00:00:03,990
10
+ 这里是读书三六九,
11
+ 2
12
+ 00:00:04,670 --> 00:00:07,990
13
+ 今天要和您分享的这篇文章是人民日报,
14
+ 3
15
+ 00:00:08,510 --> 00:00:09,730
16
+ 为什么要多读书?
17
+ 4
18
+ 00:00:10,90 --> 00:00:11,930
19
+ 这是我听过最好的答案,
20
+ 5
21
+ 00:00:12,310 --> 00:00:13,190
22
+ 经常有人问,
23
+ 6
24
+ 00:00:13,730 --> 00:00:14,690
25
+ 读了那么多书,
26
+ 7
27
+ 00:00:14,990 --> 00:00:17,250
28
+ 最终还不是要回到一座平凡的城,
29
+ 8
30
+ 00:00:17,610 --> 00:00:19,410
31
+ 打一份平凡的工组,
32
+ 9
33
+ 00:00:19,410 --> 00:00:20,670
34
+ 建一个平凡的家庭,
35
+ 10
36
+ 00:00:21,330 --> 00:00:25,960
37
+ 何苦折腾一个人读书的意义究竟是什么?
38
+ 11
39
+ 00:00:26,680 --> 00:00:30,80
40
+ 今天给大家分享人民日报推荐的八条理由,
41
+ 12
42
+ 00:00:30,540 --> 00:00:32,875
43
+ 告诉你人为什么要多读书?
44
+ 13
45
+ 00:00:34,690 --> 00:00:38,725
46
+ 一脚步丈量不到的地方文字可以。
47
+ 14
48
+ 00:00:40,300 --> 00:00:41,540
49
+ 钱钟书先生说过,
50
+ 15
51
+ 00:00:42,260 --> 00:00:43,140
52
+ 如果不读书,
53
+ 16
54
+ 00:00:43,520 --> 00:00:44,400
55
+ 行万里路,
56
+ 17
57
+ 00:00:44,540 --> 00:00:45,695
58
+ 也只是个邮差。
59
+ 18
60
+ 00:00:46,900 --> 00:00:47,320
61
+ 北京、
62
+ 19
63
+ 00:00:47,500 --> 00:00:47,980
64
+ 西安、
65
+ 20
66
+ 00:00:48,320 --> 00:00:51,200
67
+ 南京和洛阳少了学识的浸润,
68
+ 21
69
+ 00:00:51,600 --> 00:00:55,565
70
+ 他们只是一个个耳中熟悉又眼里陌生的地名。
71
+ 22
72
+ 00:00:56,560 --> 00:00:59,360
73
+ 故宫避暑山庄岱庙、
74
+ 23
75
+ 00:00:59,840 --> 00:01:02,920
76
+ 曲阜三孔有了文化照耀,
77
+ 24
78
+ 00:01:03,120 --> 00:01:05,340
79
+ 他们才不是被时间风化的标本。
80
+ 25
81
+ 00:01:05,820 --> 00:01:08,105
82
+ 而是活了成百上千年的生命,
83
+ 26
84
+ 00:01:09,650 --> 00:01:10,370
85
+ 不去读书,
86
+ 27
87
+ 00:01:10,670 --> 00:01:12,920
88
+ 就是一个邮差风景,
89
+ 28
90
+ 00:01:13,0 --> 00:01:13,835
91
+ 过眼就忘,
92
+ 29
93
+ 00:01:14,750 --> 00:01:17,365
94
+ 就算踏破铁鞋又有什么用处呢?
95
+ 30
96
+ 00:01:19,240 --> 00:01:22,380
97
+ 阅读不仅仅会让现实的旅行更加丰富,
98
+ 31
99
+ 00:01:23,120 --> 00:01:27,260
100
+ 更重要的是能让精神突破现实和身体的桎梏,
101
+ 32
102
+ 00:01:27,640 --> 00:01:29,985
103
+ 来一场灵魂长足的旅行。
104
+ 33
105
+ 00:01:31,850 --> 00:01:32,930
106
+ 听过这样一句话,
107
+ 34
108
+ 00:01:33,490 --> 00:01:35,190
109
+ 没有一艘非凡的船舰,
110
+ 35
111
+ 00:01:35,330 --> 00:01:36,430
112
+ 能像一册书籍,
113
+ 36
114
+ 00:01:36,690 --> 00:01:38,595
115
+ 把我们带到浩瀚的天地,
116
+ 37
117
+ 00:01:39,830 --> 00:01:42,685
118
+ 你无法到达的地方文字在你过去,
119
+ 38
120
+ 00:01:43,530 --> 00:01:45,750
121
+ 你无法经历的人生舒淇,
122
+ 39
123
+ 00:01:45,770 --> 00:01:46,595
124
+ 带你相遇。
125
+ 40
126
+ 00:01:47,640 --> 00:01:50,340
127
+ 那些读过的书会一本本充实,
128
+ 41
129
+ 00:01:50,340 --> 00:01:50,940
130
+ 你的内心,
131
+ 42
132
+ 00:01:51,640 --> 00:01:54,855
133
+ 让虚无单调的世界变得五彩斑斓。
134
+ 43
135
+ 00:01:55,930 --> 00:01:59,690
136
+ 那些书中的人物会在你深陷生活泥潭之时,
137
+ 44
138
+ 00:02:00,170 --> 00:02:01,190
139
+ 轻声的呼唤,
140
+ 45
141
+ 00:02:01,950 --> 00:02:03,270
142
+ 用他们心怀梦想、
143
+ 46
144
+ 00:02:03,630 --> 00:02:04,950
145
+ 不卑不亢的故事,
146
+ 47
147
+ 00:02:05,310 --> 00:02:07,90
148
+ 激励你抵御苦难,
149
+ 48
150
+ 00:02:07,430 --> 00:02:08,525
151
+ 勇往直前。
152
+ 49
153
+ 00:02:11,290 --> 00:02:11,695
154
+ 二、
155
+ 50
156
+ 00:02:12,440 --> 00:02:16,900
157
+ 读书的意义是使人虚心叫通达不固执、
158
+ 51
159
+ 00:02:17,200 --> 00:02:18,35
160
+ 不偏执。
161
+ 52
162
+ 00:02:20,290 --> 00:02:22,935
163
+ 读书越少的人越容易过得痛苦。
164
+ 53
165
+ 00:02:23,600 --> 00:02:24,400
166
+ 读书越多,
167
+ 54
168
+ 00:02:24,800 --> 00:02:26,185
169
+ 人才会越通透,
170
+ 55
171
+ 00:02:27,890 --> 00:02:30,30
172
+ 知乎上有位网友讲过自己的故事。
173
+ 56
174
+ 00:02:30,750 --> 00:02:31,310
175
+ 有一次,
176
+ 57
177
+ 00:02:31,530 --> 00:02:32,650
178
+ 他跟伴侣吵架,
179
+ 58
180
+ 00:02:33,190 --> 00:02:35,505
181
+ 气得连续好几个晚上没睡好,
182
+ 59
183
+ 00:02:36,360 --> 00:02:38,880
184
+ 直到他读到一本关于亲密关系的书。
185
+ 60
186
+ 00:02:39,500 --> 00:02:41,920
187
+ 书中有段关于夫妻关系的解读,
188
+ 61
189
+ 00:02:42,80 --> 00:02:43,100
190
+ 让他豁然开朗,
191
+ 62
192
+ 00:02:43,460 --> 00:02:47,170
193
+ 突然想明白了很多事气消了,
194
+ 63
195
+ 00:02:47,430 --> 00:02:48,410
196
+ 心情好了,
197
+ 64
198
+ 00:02:48,790 --> 00:02:50,194
199
+ 整个人也舒爽了。
200
+ 65
201
+ 00:02:51,780 --> 00:02:54,340
202
+ 一个人书读的不多见识,
203
+ 66
204
+ 00:02:54,380 --> 00:02:55,180
205
+ 难免受限,
206
+ 67
207
+ 00:02:55,720 --> 00:02:58,495
208
+ 结果就必须受着眼前世界的禁锢,
209
+ 68
210
+ 00:02:59,540 --> 00:03:00,740
211
+ 稍微遇到一点不顺,
212
+ 69
213
+ 00:03:00,940 --> 00:03:02,460
214
+ 就极易消极悲观,
215
+ 70
216
+ 00:03:02,900 --> 00:03:03,720
217
+ 郁郁寡欢,
218
+ 71
219
+ 00:03:04,140 --> 00:03:05,765
220
+ 让自己困在情绪里,
221
+ 72
222
+ 00:03:06,900 --> 00:03:09,760
223
+ 只有通过阅读才能看透人生真相,
224
+ 73
225
+ 00:03:10,300 --> 00:03:12,140
226
+ 收获为人处事的智慧,
227
+ 74
228
+ 00:03:12,480 --> 00:03:14,95
229
+ 把日子越过越好。
230
+ 75
231
+ 00:03:16,730 --> 00:03:17,890
232
+ 生活的艺术里说,
233
+ 76
234
+ 00:03:18,410 --> 00:03:20,30
235
+ 人一定要时时读书,
236
+ 77
237
+ 00:03:20,430 --> 00:03:22,915
238
+ 不然便会鄙令晚腐。
239
+ 78
240
+ 00:03:23,690 --> 00:03:28,730
241
+ 完剑俗剑生满身上一个人的落伍迂腐,
242
+ 79
243
+ 00:03:29,210 --> 00:03:31,205
244
+ 就是不肯实施读书所致。
245
+ 80
246
+ 00:03:33,10 --> 00:03:34,790
247
+ 只有在不断阅读的过程中,
248
+ 81
249
+ 00:03:34,990 --> 00:03:35,970
250
+ 修心养性,
251
+ 82
252
+ 00:03:36,430 --> 00:03:38,735
253
+ 才能摆脱我们的鄙俗和顽固。
254
+ 83
255
+ 00:03:39,920 --> 00:03:41,720
256
+ 这世间没有谁的生活,
257
+ 84
258
+ 00:03:41,800 --> 00:03:42,540
259
+ 没有烦恼,
260
+ 85
261
+ 00:03:43,140 --> 00:03:45,455
262
+ 唯读书是最好的解药。
263
+ 86
264
+ 00:03:47,730 --> 00:03:48,185
265
+ 三、
266
+ 87
267
+ 00:03:49,40 --> 00:03:50,720
268
+ 书中未必有黄金屋,
269
+ 88
270
+ 00:03:51,0 --> 00:03:52,595
271
+ 但一定有更好的自己。
272
+ """
llm/g4f_openai_api.py ADDED
@@ -0,0 +1,30 @@
1
+ from g4f.client import Client
2
+
3
+ if __name__ == '__main__':
4
+ from llm.demo_prompt import demo_prompt
5
+ client = Client()
6
+ response = client.chat.completions.create(
7
+ model="gpt-3.5-turbo",
8
+ messages=[{"role": "user", "content": "你好你的名字是什么"}],
9
+ )
10
+ print(response.choices[0].message.content)
11
+
12
+
13
+ def g4f_openai_call(model="gpt-3.5-turbo",
14
+ user_content="如何做西红柿炖牛腩?",
15
+ system_content=None):
16
+ client = Client()
17
+ if system_content is not None and len(system_content.strip()):
18
+ messages = [
19
+ {'role': 'system', 'content': system_content},
20
+ {'role': 'user', 'content': user_content}
21
+ ]
22
+ else:
23
+ messages = [
24
+ {'role': 'user', 'content': user_content}
25
+ ]
26
+ response = client.chat.completions.create(
27
+ model=model,
28
+ messages=messages,
29
+ )
30
+ return(response.choices[0].message.content)
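A short usage sketch of the wrapper above. g4f needs no API key but routes through free public backends, so responses can be slow or fail entirely; the prompt is illustrative.

from llm.g4f_openai_api import g4f_openai_call

answer = g4f_openai_call(model="gpt-3.5-turbo",
                         user_content="用一句话总结这段字幕的主题。",
                         system_content=None)  # system prompt is optional
print(answer)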
llm/openai_api.py ADDED
@@ -0,0 +1,48 @@
1
+ import os
2
+ import logging
3
+ from openai import OpenAI
4
+
5
+
6
+ if __name__ == '__main__':
7
+ from llm.demo_prompt import demo_prompt
8
+ client = OpenAI(
9
+ # This is the default and can be omitted
10
+ api_key=os.environ.get("OPENAI_API_KEY"),
11
+ )
12
+
13
+ chat_completion = client.chat.completions.create(
14
+ messages=[
15
+ {
16
+ "role": "user",
17
+ "content": demo_prompt,
18
+ }
19
+ ],
20
+ model="gpt-3.5-turbo-0125",
21
+ )
22
+ print(chat_completion.choices[0].message.content)
23
+
24
+
25
+ def openai_call(apikey,
26
+ model="gpt-3.5-turbo",
27
+ user_content="如何做西红柿炖牛腩?",
28
+ system_content=None):
29
+ client = OpenAI(
30
+ # This is the default and can be omitted
31
+ api_key=apikey,
32
+ )
33
+ if system_content is not None and len(system_content.strip()):
34
+ messages = [
35
+ {'role': 'system', 'content': system_content},
36
+ {'role': 'user', 'content': user_content}
37
+ ]
38
+ else:
39
+ messages = [
40
+ {'role': 'user', 'content': user_content}
41
+ ]
42
+
43
+ chat_completion = client.chat.completions.create(
44
+ messages=messages,
45
+ model=model,
46
+ )
47
+ logging.info("Openai model inference done.")
48
+ return chat_completion.choices[0].message.content
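A hedged sketch of calling the wrapper above with the same system/user split that app.py's llm_inference uses; the API key and prompts are placeholders.

from llm.openai_api import openai_call

result = openai_call(apikey="sk-...",  # placeholder key supplied by the user
                     model="gpt-3.5-turbo",
                     user_content="这是待裁剪的视频srt字幕:\n0\n00:00:00,50 --> 00:00:02,10\n读万卷书行万里路,",
                     system_content="你是一个视频srt字幕分析剪辑器,输出格式:1. [开始时间-结束时间] 文本")
print(result)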
llm/qwen_api.py ADDED
@@ -0,0 +1,30 @@
1
+ import dashscope
2
+ from dashscope import Generation
3
+
4
+
5
+ def call_qwen_model(key=None,
6
+ model="qwen_plus",
7
+ user_content="如何做西红柿炖牛腩?",
8
+ system_content=None):
9
+ dashscope.api_key = key
10
+ if system_content is not None and len(system_content.strip()):
11
+ messages = [
12
+ {'role': 'system', 'content': system_content},
13
+ {'role': 'user', 'content': user_content}
14
+ ]
15
+ else:
16
+ messages = [
17
+ {'role': 'user', 'content': user_content}
18
+ ]
19
+ responses = Generation.call(model,
20
+ messages=messages,
21
+ result_format='message', # return the result in 'message' format
22
+ stream=False, # non-streaming output
23
+ incremental_output=False # no incremental streaming output
24
+ )
25
+ print(responses)
26
+ return responses['output']['choices'][0]['message']['content']
27
+
28
+
29
+ if __name__ == '__main__':
30
+ call_qwen_model('YOUR_BAILIAN_APIKEY')
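A hedged usage sketch of call_qwen_model. Note that the default model name above is 'qwen_plus' (underscore) while the app's dropdown offers 'qwen-plus'; app.py always passes the model explicitly, so the default is never exercised, but callers should probably use the hyphenated name. The key below is a placeholder for a Bailian/DashScope API key.

from llm.qwen_api import call_qwen_model

reply = call_qwen_model(key="YOUR_BAILIAN_APIKEY",
                        model="qwen-plus",
                        user_content="这是待裁剪的视频srt字幕:\n...",
                        system_content="你是一个视频srt字幕分析剪辑器。")
print(reply)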
requirements.txt CHANGED
@@ -1,8 +1,13 @@
1
  librosa
2
  soundfile
3
- funasr>=0.5.5
 
4
  moviepy
5
  numpy
 
6
  modelscope
7
- torch
8
- torchaudio
 
 
 
 
1
  librosa
2
  soundfile
3
+ scikit-learn>=1.3.2
4
+ funasr>=1.0.25
5
  moviepy
6
  numpy
7
+ gradio
8
  modelscope
9
+ torch>=1.13
10
+ torchaudio
11
+ openai
12
+ g4f
13
+ dashscope
test/imagemagick_test.py ADDED
@@ -0,0 +1,16 @@
1
+ from moviepy.editor import *
2
+ from moviepy.video.tools.subtitles import SubtitlesClip
3
+
4
+ generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=48, color='white')
5
+ subs = [((0, 2), 'sub1中文字幕'),
6
+ ((2, 4), 'subs2'),
7
+ ((4, 6), 'subs3'),
8
+ ((6, 8), 'subs4')]
9
+
10
+ subtitles = SubtitlesClip(subs, generator)
11
+
12
+ video = VideoFileClip("examples/2022云栖大会_片段.mp4")
13
+ video = video.subclip(0, 8)
14
+ video = CompositeVideoClip([video, subtitles.set_pos(('center','bottom'))])
15
+
16
+ video.write_videofile("test_output.mp4")
test/test.sh ADDED
@@ -0,0 +1,15 @@
1
+ # step1: Recognize
2
+ python videoclipper.py --stage 1 \
3
+ --file ../examples/2022云栖大会_片段.mp4 \
4
+ --sd_switch yes \
5
+ --output_dir ./output
6
+ # now you can find recognition results and entire SRT file in ./output/
7
+ # step2: Clip
8
+ python videoclipper.py --stage 2 \
9
+ --file ../examples/2022云栖大会_片段.mp4 \
10
+ --output_dir ./output \
11
+ --dest_text '所以这个是我们办这个奖的初心啊,我们也会一届一届的办下去' \
+ --start_ost 0 \
+ --end_ost 100 \
+ --output_file './output/res.mp4'
+ # add --dest_spk spk0 to the command above to clip by speaker instead of text
utils/__init__.py ADDED
File without changes
utils/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (160 Bytes).
 
utils/__pycache__/argparse_tools.cpython-311.pyc ADDED
Binary file (4.07 kB).
 
utils/__pycache__/argparse_tools.cpython-39.pyc ADDED
Binary file (2.41 kB).
 
utils/__pycache__/subtitle_utils.cpython-311.pyc ADDED
Binary file (7.6 kB).
 
utils/__pycache__/subtitle_utils.cpython-39.pyc ADDED
Binary file (3.49 kB).
 
utils/__pycache__/trans_utils.cpython-311.pyc ADDED
Binary file (8.45 kB).
 
utils/__pycache__/trans_utils.cpython-39.pyc ADDED
Binary file (3 kB).
 
utils/argparse_tools.py ADDED
@@ -0,0 +1,88 @@
1
+ #!/usr/bin/env python3
2
+ # -*- encoding: utf-8 -*-
3
+ # Copyright FunASR (https://github.com/alibaba-damo-academy/FunClip). All Rights Reserved.
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ import argparse
7
+ from pathlib import Path
8
+
9
+ import yaml
10
+ import sys
11
+
12
+
13
+ class ArgumentParser(argparse.ArgumentParser):
14
+ """Simple implementation of ArgumentParser supporting config file
15
+
16
+ This class is originated from https://github.com/bw2/ConfigArgParse,
17
+ but this class is lack of some features that it has.
18
+
19
+ - Not supporting multiple config files
20
+ - Automatically adding "--config" as an option.
21
+ - Not supporting any formats other than yaml
22
+ - Not checking argument type
23
+
24
+ """
25
+
26
+ def __init__(self, *args, **kwargs):
27
+ super().__init__(*args, **kwargs)
28
+ self.add_argument("--config", help="Give config file in yaml format")
29
+
30
+ def parse_known_args(self, args=None, namespace=None):
31
+ # Once parsing for setting from "--config"
32
+ _args, _ = super().parse_known_args(args, namespace)
33
+ if _args.config is not None:
34
+ if not Path(_args.config).exists():
35
+ self.error(f"No such file: {_args.config}")
36
+
37
+ with open(_args.config, "r", encoding="utf-8") as f:
38
+ d = yaml.safe_load(f)
39
+ if not isinstance(d, dict):
40
+ self.error("Config file has non dict value: {_args.config}")
41
+
42
+ for key in d:
43
+ for action in self._actions:
44
+ if key == action.dest:
45
+ break
46
+ else:
47
+ self.error(f"unrecognized arguments: {key} (from {_args.config})")
48
+
49
+ # NOTE(kamo): Ignore "--config" from a config file
50
+ # NOTE(kamo): Unlike "configargparse", this module doesn't check type.
51
+ # i.e. We can set any type value regardless of argument type.
52
+ self.set_defaults(**d)
53
+ return super().parse_known_args(args, namespace)
54
+
55
+
56
+ def get_commandline_args():
57
+ extra_chars = [
58
+ " ",
59
+ ";",
60
+ "&",
61
+ "(",
62
+ ")",
63
+ "|",
64
+ "^",
65
+ "<",
66
+ ">",
67
+ "?",
68
+ "*",
69
+ "[",
70
+ "]",
71
+ "$",
72
+ "`",
73
+ '"',
74
+ "\\",
75
+ "!",
76
+ "{",
77
+ "}",
78
+ ]
79
+
80
+ # Escape the extra characters for shell
81
+ argv = [
82
+ arg.replace("'", "'\\''")
83
+ if all(char not in arg for char in extra_chars)
84
+ else "'" + arg.replace("'", "'\\''") + "'"
85
+ for arg in sys.argv
86
+ ]
87
+
88
+ return sys.executable + " " + " ".join(argv)
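A small sketch of how the parser above merges a YAML --config file into argparse defaults; the file name and options are illustrative.

from utils.argparse_tools import ArgumentParser

# clip.yaml (illustrative):
#   stage: 2
#   start_ost: 0
#   end_ost: 100
parser = ArgumentParser()
parser.add_argument("--stage", type=int, default=1)
parser.add_argument("--start_ost", type=int, default=0)
parser.add_argument("--end_ost", type=int, default=0)
args = parser.parse_args(["--config", "clip.yaml", "--end_ost", "50"])
# YAML values become defaults, while explicit CLI flags still override them:
print(args.stage, args.start_ost, args.end_ost)  # -> 2 0 50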
utils/subtitle_utils.py ADDED
1
+ #!/usr/bin/env python3
2
+ # -*- encoding: utf-8 -*-
3
+ # Copyright FunASR (https://github.com/alibaba-damo-academy/FunClip). All Rights Reserved.
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ def time_convert(ms):
7
+ ms = int(ms)
8
+ tail = ms % 1000
9
+ s = ms // 1000
10
+ mi = s // 60
11
+ s = s % 60
12
+ h = mi // 60
13
+ mi = mi % 60
14
+ h = "00" if h == 0 else str(h)
15
+ mi = "00" if mi == 0 else str(mi)
16
+ s = "00" if s == 0 else str(s)
17
+ tail = str(tail)
18
+ if len(h) == 1: h = '0' + h
19
+ if len(mi) == 1: mi = '0' + mi
20
+ if len(s) == 1: s = '0' + s
21
+ return "{}:{}:{},{}".format(h, mi, s, tail)
22
+
23
+
24
+ class Text2SRT():
25
+ def __init__(self, text, timestamp, offset=0):
26
+ self.token_list = [i for i in text.split() if len(i)]
27
+ self.timestamp = timestamp
28
+ start, end = timestamp[0][0] - offset, timestamp[-1][1] - offset
29
+ self.start_sec, self.end_sec = start, end
30
+ self.start_time = time_convert(start)
31
+ self.end_time = time_convert(end)
32
+ def text(self):
33
+ res = ""
34
+ for word in self.token_list:
35
+ if '\u4e00' <= word <= '\u9fff':
36
+ res += word
37
+ else:
38
+ res += " " + word
39
+ return res
40
+ def len(self):
41
+ return len(self.token_list)
42
+ def srt(self, acc_ost=0.0):
43
+ return "{} --> {}\n{}\n".format(
44
+ time_convert(self.start_sec+acc_ost*1000),
45
+ time_convert(self.end_sec+acc_ost*1000),
46
+ self.text())
47
+ def time(self, acc_ost=0.0):
48
+ return (self.start_sec/1000+acc_ost, self.end_sec/1000+acc_ost)
49
+
50
+
51
+ def generate_srt(sentence_list):
52
+ srt_total = ''
53
+ for i, sent in enumerate(sentence_list):
54
+ t2s = Text2SRT(sent['text'], sent['timestamp'])
55
+ if 'spk' in sent:
56
+ srt_total += "{} spk{}\n{}".format(i, sent['spk'], t2s.srt())
57
+ else:
58
+ srt_total += "{}\n{}".format(i, t2s.srt())
59
+ return srt_total
60
+
61
+ def generate_srt_clip(sentence_list, start, end, begin_index=0, time_acc_ost=0.0):
62
+ start, end = int(start * 1000), int(end * 1000)
63
+ srt_total = ''
64
+ cc = 1 + begin_index
65
+ subs = []
66
+ for _, sent in enumerate(sentence_list):
67
+ if sent['timestamp'][-1][1] <= start:
68
+ # print("CASE0")
69
+ continue
70
+ if sent['timestamp'][0][0] >= end:
71
+ # print("CASE4")
72
+ break
73
+ # parts in between
74
+ if (sent['timestamp'][-1][1] <= end and sent['timestamp'][0][0] > start) or (sent['timestamp'][-1][1] == end and sent['timestamp'][0][0] == start):
75
+ # print("CASE1")
76
+ t2s = Text2SRT(sent['text'], sent['timestamp'], offset=start)
77
+ srt_total += "{}\n{}".format(cc, t2s.srt(time_acc_ost))
78
+ subs.append((t2s.time(time_acc_ost), t2s.text()))
79
+ cc += 1
80
+ continue
81
+ if sent['timestamp'][0][0] <= start:
82
+ # print("CASE2")
83
+ if not sent['timestamp'][-1][1] > end:
84
+ for j, ts in enumerate(sent['timestamp']):
85
+ if ts[1] > start:
86
+ break
87
+ _text = " ".join(sent['text'][j:])
88
+ _ts = sent['timestamp'][j:]
89
+ else:
90
+ for j, ts in enumerate(sent['timestamp']):
91
+ if ts[1] > start:
92
+ _start = j
93
+ break
94
+ for j, ts in enumerate(sent['timestamp']):
95
+ if ts[1] > end:
96
+ _end = j
97
+ break
98
+ _text = " ".join(sent['text'][_start:_end])
99
+ _ts = sent['timestamp'][_start:_end]
100
+ if len(_ts):
101
+ t2s = Text2SRT(_text, _ts, offset=start)
102
+ srt_total += "{}\n{}".format(cc, t2s.srt(time_acc_ost))
103
+ subs.append((t2s.time(time_acc_ost), t2s.text()))
104
+ cc += 1
105
+ continue
106
+ if sent['timestamp'][-1][1] > end:
107
+ # print("CASE3")
108
+ for j, ts in enumerate(sent['timestamp']):
109
+ if ts[1] > end:
110
+ break
111
+ _text = " ".join(sent['text'][:j])
112
+ _ts = sent['timestamp'][:j]
113
+ if len(_ts):
114
+ t2s = Text2SRT(_text, _ts, offset=start)
115
+ srt_total += "{}\n{}".format(cc, t2s.srt(time_acc_ost))
116
+ subs.append(
117
+ (t2s.time(time_acc_ost), t2s.text())
118
+ )
119
+ cc += 1
120
+ continue
121
+ return srt_total, subs, cc
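For reference, a small example of the input shape generate_srt expects: each sentence dict carries space-separated tokens plus one [start_ms, end_ms] pair per token (values below are made up); the optional 'spk' field is what the speaker-diarization path adds.

from utils.subtitle_utils import generate_srt

sentences = [
    {"text": "读 万 卷 书", "timestamp": [[50, 300], [300, 600], [600, 900], [900, 1200]]},
    {"text": "行 万 里 路", "timestamp": [[1300, 1600], [1600, 1900], [1900, 2200], [2200, 2500]], "spk": 0},
]
print(generate_srt(sentences))
# 0
# 00:00:00,50 --> 00:00:01,200
# 读万卷书
# 1 spk0
# 00:00:01,300 --> 00:00:02,500
# 行万里路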
utils/theme.json ADDED
@@ -0,0 +1,333 @@
1
+ {
2
+ "theme": {
3
+ "_font": [
4
+ {
5
+ "__gradio_font__": true,
6
+ "name": "Montserrat",
7
+ "class": "google"
8
+ },
9
+ {
10
+ "__gradio_font__": true,
11
+ "name": "ui-sans-serif",
12
+ "class": "font"
13
+ },
14
+ {
15
+ "__gradio_font__": true,
16
+ "name": "system-ui",
17
+ "class": "font"
18
+ },
19
+ {
20
+ "__gradio_font__": true,
21
+ "name": "sans-serif",
22
+ "class": "font"
23
+ }
24
+ ],
25
+ "_font_mono": [
26
+ {
27
+ "__gradio_font__": true,
28
+ "name": "IBM Plex Mono",
29
+ "class": "google"
30
+ },
31
+ {
32
+ "__gradio_font__": true,
33
+ "name": "ui-monospace",
34
+ "class": "font"
35
+ },
36
+ {
37
+ "__gradio_font__": true,
38
+ "name": "Consolas",
39
+ "class": "font"
40
+ },
41
+ {
42
+ "__gradio_font__": true,
43
+ "name": "monospace",
44
+ "class": "font"
45
+ }
46
+ ],
47
+ "background_fill_primary": "*neutral_50",
48
+ "background_fill_primary_dark": "*neutral_950",
49
+ "background_fill_secondary": "*neutral_50",
50
+ "background_fill_secondary_dark": "*neutral_900",
51
+ "block_background_fill": "white",
52
+ "block_background_fill_dark": "*neutral_800",
53
+ "block_border_color": "*border_color_primary",
54
+ "block_border_color_dark": "*border_color_primary",
55
+ "block_border_width": "0px",
56
+ "block_border_width_dark": "0px",
57
+ "block_info_text_color": "*body_text_color_subdued",
58
+ "block_info_text_color_dark": "*body_text_color_subdued",
59
+ "block_info_text_size": "*text_sm",
60
+ "block_info_text_weight": "400",
61
+ "block_label_background_fill": "*primary_100",
62
+ "block_label_background_fill_dark": "*primary_600",
63
+ "block_label_border_color": "*border_color_primary",
64
+ "block_label_border_color_dark": "*border_color_primary",
65
+ "block_label_border_width": "1px",
66
+ "block_label_border_width_dark": "1px",
67
+ "block_label_margin": "*spacing_md",
68
+ "block_label_padding": "*spacing_sm *spacing_md",
69
+ "block_label_radius": "*radius_md",
70
+ "block_label_right_radius": "0 calc(*radius_lg - 1px) 0 calc(*radius_lg - 1px)",
71
+ "block_label_text_color": "*primary_500",
72
+ "block_label_text_color_dark": "*white",
73
+ "block_label_text_size": "*text_md",
74
+ "block_label_text_weight": "600",
75
+ "block_padding": "*spacing_xl calc(*spacing_xl + 2px)",
76
+ "block_radius": "*radius_lg",
77
+ "block_shadow": "none",
78
+ "block_shadow_dark": "none",
79
+ "block_title_background_fill": "*block_label_background_fill",
80
+ "block_title_background_fill_dark": "*block_label_background_fill",
81
+ "block_title_border_color": "none",
82
+ "block_title_border_color_dark": "none",
83
+ "block_title_border_width": "0px",
84
+ "block_title_border_width_dark": "0px",
85
+ "block_title_padding": "*block_label_padding",
86
+ "block_title_radius": "*block_label_radius",
87
+ "block_title_text_color": "*primary_500",
88
+ "block_title_text_color_dark": "*white",
89
+ "block_title_text_size": "*text_md",
90
+ "block_title_text_weight": "600",
91
+ "body_background_fill": "*background_fill_primary",
92
+ "body_background_fill_dark": "*background_fill_primary",
93
+ "body_text_color": "*neutral_800",
94
+ "body_text_color_dark": "*neutral_100",
95
+ "body_text_color_subdued": "*neutral_400",
96
+ "body_text_color_subdued_dark": "*neutral_400",
97
+ "body_text_size": "*text_md",
98
+ "body_text_weight": "400",
99
+ "border_color_accent": "*primary_300",
100
+ "border_color_accent_dark": "*neutral_600",
101
+ "border_color_primary": "*neutral_200",
102
+ "border_color_primary_dark": "*neutral_700",
103
+ "button_border_width": "*input_border_width",
104
+ "button_border_width_dark": "*input_border_width",
105
+ "button_cancel_background_fill": "*button_secondary_background_fill",
106
+ "button_cancel_background_fill_dark": "*button_secondary_background_fill",
107
+ "button_cancel_background_fill_hover": "*button_secondary_background_fill_hover",
108
+ "button_cancel_background_fill_hover_dark": "*button_secondary_background_fill_hover",
109
+ "button_cancel_border_color": "*button_secondary_border_color",
110
+ "button_cancel_border_color_dark": "*button_secondary_border_color",
111
+ "button_cancel_border_color_hover": "*button_cancel_border_color",
112
+ "button_cancel_border_color_hover_dark": "*button_cancel_border_color",
113
+ "button_cancel_text_color": "*button_secondary_text_color",
114
+ "button_cancel_text_color_dark": "*button_secondary_text_color",
115
+ "button_cancel_text_color_hover": "*button_cancel_text_color",
116
+ "button_cancel_text_color_hover_dark": "*button_cancel_text_color",
117
+ "button_large_padding": "*spacing_lg calc(2 * *spacing_lg)",
118
+ "button_large_radius": "*radius_lg",
119
+ "button_large_text_size": "*text_lg",
120
+ "button_large_text_weight": "600",
121
+ "button_primary_background_fill": "*primary_500",
122
+ "button_primary_background_fill_dark": "*primary_700",
123
+ "button_primary_background_fill_hover": "*primary_400",
124
+ "button_primary_background_fill_hover_dark": "*primary_500",
125
+ "button_primary_border_color": "*primary_200",
126
+ "button_primary_border_color_dark": "*primary_600",
127
+ "button_primary_border_color_hover": "*button_primary_border_color",
128
+ "button_primary_border_color_hover_dark": "*button_primary_border_color",
129
+ "button_primary_text_color": "white",
130
+ "button_primary_text_color_dark": "white",
131
+ "button_primary_text_color_hover": "*button_primary_text_color",
132
+ "button_primary_text_color_hover_dark": "*button_primary_text_color",
133
+ "button_secondary_background_fill": "white",
134
+ "button_secondary_background_fill_dark": "*neutral_600",
135
+ "button_secondary_background_fill_hover": "*neutral_100",
136
+ "button_secondary_background_fill_hover_dark": "*primary_500",
137
+ "button_secondary_border_color": "*neutral_200",
138
+ "button_secondary_border_color_dark": "*neutral_600",
139
+ "button_secondary_border_color_hover": "*button_secondary_border_color",
140
+ "button_secondary_border_color_hover_dark": "*button_secondary_border_color",
141
+ "button_secondary_text_color": "*neutral_800",
142
+ "button_secondary_text_color_dark": "white",
143
+ "button_secondary_text_color_hover": "*button_secondary_text_color",
144
+ "button_secondary_text_color_hover_dark": "*button_secondary_text_color",
145
+ "button_shadow": "*shadow_drop_lg",
146
+ "button_shadow_active": "*shadow_inset",
147
+ "button_shadow_hover": "*shadow_drop_lg",
148
+ "button_small_padding": "*spacing_sm calc(2 * *spacing_sm)",
149
+ "button_small_radius": "*radius_lg",
150
+ "button_small_text_size": "*text_md",
151
+ "button_small_text_weight": "400",
152
+ "button_transition": "background-color 0.2s ease",
153
+ "checkbox_background_color": "*background_fill_primary",
154
+ "checkbox_background_color_dark": "*neutral_800",
155
+ "checkbox_background_color_focus": "*checkbox_background_color",
156
+ "checkbox_background_color_focus_dark": "*checkbox_background_color",
157
+ "checkbox_background_color_hover": "*checkbox_background_color",
158
+ "checkbox_background_color_hover_dark": "*checkbox_background_color",
159
+ "checkbox_background_color_selected": "*primary_600",
160
+ "checkbox_background_color_selected_dark": "*primary_700",
161
+ "checkbox_border_color": "*neutral_100",
162
+ "checkbox_border_color_dark": "*neutral_600",
163
+ "checkbox_border_color_focus": "*primary_500",
164
+ "checkbox_border_color_focus_dark": "*primary_600",
165
+ "checkbox_border_color_hover": "*neutral_300",
166
+ "checkbox_border_color_hover_dark": "*neutral_600",
167
+ "checkbox_border_color_selected": "*primary_600",
168
+ "checkbox_border_color_selected_dark": "*primary_700",
169
+ "checkbox_border_radius": "*radius_sm",
170
+ "checkbox_border_width": "1px",
171
+ "checkbox_border_width_dark": "*input_border_width",
172
+ "checkbox_check": "url(\"data:image/svg+xml,%3csvg viewBox='0 0 16 16' fill='white' xmlns='http://www.w3.org/2000/svg'%3e%3cpath d='M12.207 4.793a1 1 0 010 1.414l-5 5a1 1 0 01-1.414 0l-2-2a1 1 0 011.414-1.414L6.5 9.086l4.293-4.293a1 1 0 011.414 0z'/%3e%3c/svg%3e\")",
173
+ "checkbox_label_background_fill": "*button_secondary_background_fill",
174
+ "checkbox_label_background_fill_dark": "*button_secondary_background_fill",
175
+ "checkbox_label_background_fill_hover": "*button_secondary_background_fill_hover",
176
+ "checkbox_label_background_fill_hover_dark": "*button_secondary_background_fill_hover",
177
+ "checkbox_label_background_fill_selected": "*primary_500",
178
+ "checkbox_label_background_fill_selected_dark": "*primary_600",
179
+ "checkbox_label_border_color": "*border_color_primary",
180
+ "checkbox_label_border_color_dark": "*border_color_primary",
181
+ "checkbox_label_border_color_hover": "*checkbox_label_border_color",
182
+ "checkbox_label_border_color_hover_dark": "*checkbox_label_border_color",
183
+ "checkbox_label_border_width": "*input_border_width",
184
+ "checkbox_label_border_width_dark": "*input_border_width",
185
+ "checkbox_label_gap": "*spacing_lg",
186
+ "checkbox_label_padding": "*spacing_md calc(2 * *spacing_md)",
187
+ "checkbox_label_shadow": "*shadow_drop_lg",
188
+ "checkbox_label_text_color": "*body_text_color",
189
+ "checkbox_label_text_color_dark": "*body_text_color",
190
+ "checkbox_label_text_color_selected": "white",
191
+ "checkbox_label_text_color_selected_dark": "*checkbox_label_text_color",
192
+ "checkbox_label_text_size": "*text_md",
193
+ "checkbox_label_text_weight": "400",
194
+ "checkbox_shadow": "none",
195
+ "color_accent": "*primary_500",
196
+ "color_accent_soft": "*primary_50",
197
+ "color_accent_soft_dark": "*neutral_700",
198
+ "container_radius": "*radius_lg",
199
+ "embed_radius": "*radius_lg",
200
+ "error_background_fill": "#fee2e2",
201
+ "error_background_fill_dark": "*background_fill_primary",
202
+ "error_border_color": "#fecaca",
203
+ "error_border_color_dark": "*border_color_primary",
204
+ "error_border_width": "1px",
205
+ "error_border_width_dark": "1px",
206
+ "error_text_color": "#ef4444",
207
+ "error_text_color_dark": "#ef4444",
208
+ "font": "'Montserrat', 'ui-sans-serif', 'system-ui', sans-serif",
209
+ "font_mono": "'IBM Plex Mono', 'ui-monospace', 'Consolas', monospace",
210
+ "form_gap_width": "0px",
211
+ "input_background_fill": "white",
212
+ "input_background_fill_dark": "*neutral_700",
213
+ "input_background_fill_focus": "*secondary_500",
214
+ "input_background_fill_focus_dark": "*secondary_600",
215
+ "input_background_fill_hover": "*input_background_fill",
216
+ "input_background_fill_hover_dark": "*input_background_fill",
217
+ "input_border_color": "*neutral_50",
218
+ "input_border_color_dark": "*border_color_primary",
219
+ "input_border_color_focus": "*secondary_300",
220
+ "input_border_color_focus_dark": "*neutral_700",
221
+ "input_border_color_hover": "*input_border_color",
222
+ "input_border_color_hover_dark": "*input_border_color",
223
+ "input_border_width": "0px",
224
+ "input_border_width_dark": "0px",
225
+ "input_padding": "*spacing_xl",
226
+ "input_placeholder_color": "*neutral_400",
227
+ "input_placeholder_color_dark": "*neutral_500",
228
+ "input_radius": "*radius_lg",
229
+ "input_shadow": "*shadow_drop",
230
+ "input_shadow_dark": "*shadow_drop",
231
+ "input_shadow_focus": "*shadow_drop_lg",
232
+ "input_shadow_focus_dark": "*shadow_drop_lg",
233
+ "input_text_size": "*text_md",
234
+ "input_text_weight": "400",
235
+ "layout_gap": "*spacing_xxl",
236
+ "link_text_color": "*secondary_600",
237
+ "link_text_color_active": "*secondary_600",
238
+ "link_text_color_active_dark": "*secondary_500",
239
+ "link_text_color_dark": "*secondary_500",
240
+ "link_text_color_hover": "*secondary_700",
241
+ "link_text_color_hover_dark": "*secondary_400",
242
+ "link_text_color_visited": "*secondary_500",
243
+ "link_text_color_visited_dark": "*secondary_600",
244
+ "loader_color": "*color_accent",
245
+ "loader_color_dark": "*color_accent",
246
+ "name": "base",
247
+ "neutral_100": "#f3f4f6",
248
+ "neutral_200": "#e5e7eb",
249
+ "neutral_300": "#d1d5db",
250
+ "neutral_400": "#9ca3af",
251
+ "neutral_50": "#f9fafb",
252
+ "neutral_500": "#6b7280",
253
+ "neutral_600": "#4b5563",
254
+ "neutral_700": "#374151",
255
+ "neutral_800": "#1f2937",
256
+ "neutral_900": "#111827",
257
+ "neutral_950": "#0b0f19",
258
+ "panel_background_fill": "*background_fill_secondary",
259
+ "panel_background_fill_dark": "*background_fill_secondary",
260
+ "panel_border_color": "*border_color_primary",
261
+ "panel_border_color_dark": "*border_color_primary",
262
+ "panel_border_width": "1px",
263
+ "panel_border_width_dark": "1px",
264
+ "primary_100": "#e0e7ff",
265
+ "primary_200": "#c7d2fe",
266
+ "primary_300": "#a5b4fc",
267
+ "primary_400": "#818cf8",
268
+ "primary_50": "#eef2ff",
269
+ "primary_500": "#6366f1",
270
+ "primary_600": "#4f46e5",
271
+ "primary_700": "#4338ca",
272
+ "primary_800": "#3730a3",
273
+ "primary_900": "#312e81",
274
+ "primary_950": "#2b2c5e",
275
+ "prose_header_text_weight": "600",
276
+ "prose_text_size": "*text_md",
277
+ "prose_text_weight": "400",
278
+ "radio_circle": "url(\"data:image/svg+xml,%3csvg viewBox='0 0 16 16' fill='white' xmlns='http://www.w3.org/2000/svg'%3e%3ccircle cx='8' cy='8' r='3'/%3e%3c/svg%3e\")",
279
+ "radius_lg": "6px",
280
+ "radius_md": "4px",
281
+ "radius_sm": "2px",
282
+ "radius_xl": "8px",
283
+ "radius_xs": "1px",
284
+ "radius_xxl": "12px",
285
+ "radius_xxs": "1px",
286
+ "secondary_100": "#ecfccb",
287
+ "secondary_200": "#d9f99d",
288
+ "secondary_300": "#bef264",
289
+ "secondary_400": "#a3e635",
290
+ "secondary_50": "#f7fee7",
291
+ "secondary_500": "#84cc16",
292
+ "secondary_600": "#65a30d",
293
+ "secondary_700": "#4d7c0f",
294
+ "secondary_800": "#3f6212",
295
+ "secondary_900": "#365314",
296
+ "secondary_950": "#2f4e14",
297
+ "section_header_text_size": "*text_md",
298
+ "section_header_text_weight": "400",
299
+ "shadow_drop": "0 1px 4px 0 rgb(0 0 0 / 0.1)",
300
+ "shadow_drop_lg": "0 2px 5px 0 rgb(0 0 0 / 0.1)",
301
+ "shadow_inset": "rgba(0,0,0,0.05) 0px 2px 4px 0px inset",
302
+ "shadow_spread": "6px",
303
+ "shadow_spread_dark": "1px",
304
+ "slider_color": "*primary_500",
305
+ "slider_color_dark": "*primary_600",
306
+ "spacing_lg": "6px",
307
+ "spacing_md": "4px",
308
+ "spacing_sm": "2px",
309
+ "spacing_xl": "9px",
310
+ "spacing_xs": "1px",
311
+ "spacing_xxl": "12px",
312
+ "spacing_xxs": "1px",
313
+ "stat_background_fill": "*primary_300",
314
+ "stat_background_fill_dark": "*primary_500",
315
+ "table_border_color": "*neutral_300",
316
+ "table_border_color_dark": "*neutral_700",
317
+ "table_even_background_fill": "white",
318
+ "table_even_background_fill_dark": "*neutral_950",
319
+ "table_odd_background_fill": "*neutral_50",
320
+ "table_odd_background_fill_dark": "*neutral_900",
321
+ "table_radius": "*radius_lg",
322
+ "table_row_focus": "*color_accent_soft",
323
+ "table_row_focus_dark": "*color_accent_soft",
324
+ "text_lg": "16px",
325
+ "text_md": "14px",
326
+ "text_sm": "12px",
327
+ "text_xl": "22px",
328
+ "text_xs": "10px",
329
+ "text_xxl": "26px",
330
+ "text_xxs": "9px"
331
+ },
332
+ "version": "0.0.1"
333
+ }
utils/trans_utils.py ADDED
@@ -0,0 +1,131 @@
1
+ #!/usr/bin/env python3
2
+ # -*- encoding: utf-8 -*-
3
+ # Copyright FunASR (https://github.com/alibaba-damo-academy/FunClip). All Rights Reserved.
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ import os
7
+ import re
8
+ import numpy as np
9
+
10
+ PUNC_LIST = [',', '。', '!', '?', '、']
11
+
12
+ def pre_proc(text):
13
+ res = ''
14
+ for i in range(len(text)):
15
+ if text[i] in PUNC_LIST:
16
+ continue
17
+ if '\u4e00' <= text[i] <= '\u9fff':
18
+ if len(res) and res[-1] != " ":
19
+ res += ' ' + text[i]+' '
20
+ else:
21
+ res += text[i]+' '
22
+ else:
23
+ res += text[i]
24
+ if res[-1] == ' ':
25
+ res = res[:-1]
26
+ return res
27
+
28
+ def proc(raw_text, timestamp, dest_text):
29
+ # simple matching
30
+ ld = len(dest_text.split())
31
+ mi, ts = [], []
32
+ offset = 0
33
+ while True:
34
+ fi = raw_text.find(dest_text, offset, len(raw_text))
35
+ ti = raw_text[:fi].count(' ')
36
+ if fi == -1:
37
+ break
38
+ offset = fi + ld
39
+ mi.append(fi)
40
+ ts.append([timestamp[ti][0]*16, timestamp[ti+ld-1][1]*16])
41
+ return ts
42
+
43
+ def proc_spk(dest_spk, sd_sentences):
44
+ ts = []
45
+ for d in sd_sentences:
46
+ d_start = d['timestamp'][0][0]
47
+ d_end = d['timestamp'][-1][1]
48
+ spkid=dest_spk[3:]
49
+ if str(d['spk']) == spkid and d_end-d_start>999:
50
+ ts.append([d['start']*16, d['end']*16])
51
+ return ts
52
+
53
+ def generate_vad_data(data, sd_sentences, sr=16000):
54
+ assert len(data.shape) == 1
55
+ vad_data = []
56
+ for d in sd_sentences:
57
+ d_start = round(d['ts_list'][0][0]/1000, 2)
58
+ d_end = round(d['ts_list'][-1][1]/1000, 2)
59
+ vad_data.append([d_start, d_end, data[int(d_start * sr):int(d_end * sr)]])
60
+ return vad_data
61
+
62
+ def write_state(output_dir, state):
63
+ for key in ['/recog_res_raw', '/timestamp', '/sentences']:#, '/sd_sentences']:
64
+ with open(output_dir+key, 'w') as fout:
65
+ fout.write(str(state[key[1:]]))
66
+ if 'sd_sentences' in state:
67
+ with open(output_dir+'/sd_sentences', 'w') as fout:
68
+ fout.write(str(state['sd_sentences']))
69
+
70
+ def load_state(output_dir):
71
+ state = {}
72
+ with open(output_dir+'/recog_res_raw') as fin:
73
+ line = fin.read()
74
+ state['recog_res_raw'] = line
75
+ with open(output_dir+'/timestamp') as fin:
76
+ line = fin.read()
77
+ state['timestamp'] = eval(line)
78
+ with open(output_dir+'/sentences') as fin:
79
+ line = fin.read()
80
+ state['sentences'] = eval(line)
81
+ if os.path.exists(output_dir+'/sd_sentences'):
82
+ with open(output_dir+'/sd_sentences') as fin:
83
+ line = fin.read()
84
+ state['sd_sentences'] = eval(line)
85
+ return state
86
+
87
+ def convert_pcm_to_float(data):
88
+ if data.dtype == np.float64:
89
+ return data
90
+ elif data.dtype == np.float32:
91
+ return data.astype(np.float64)
92
+ elif data.dtype == np.int16:
93
+ bit_depth = 16
94
+ elif data.dtype == np.int32:
95
+ bit_depth = 32
96
+ elif data.dtype == np.int8:
97
+ bit_depth = 8
98
+ else:
99
+ raise ValueError("Unsupported audio data type")
100
+
101
+ # Now handle the integer types
102
+ max_int_value = float(2 ** (bit_depth - 1))
103
+ if bit_depth == 8:
104
+ data = data - 128
105
+ return (data.astype(np.float64) / max_int_value)
106
+
107
+ def convert_time_to_millis(time_str):
108
+ # Format: [hours:minutes:seconds,milliseconds]
109
+ hours, minutes, seconds, milliseconds = map(int, re.split('[:,]', time_str))
110
+ return (hours * 3600 + minutes * 60 + seconds) * 1000 + milliseconds
111
+
112
+ def extract_timestamps(input_text):
113
+ # Use a regular expression to find all timestamps
114
+ timestamps = re.findall(r'\[(\d{2}:\d{2}:\d{2},\d{2,3})\s*-\s*(\d{2}:\d{2}:\d{2},\d{2,3})\]', input_text)
115
+ times_list = []
116
+ print(timestamps)
117
+ # Iterate over all matched timestamps and convert them to milliseconds
118
+ for start_time, end_time in timestamps:
119
+ start_millis = convert_time_to_millis(start_time)
120
+ end_millis = convert_time_to_millis(end_time)
121
+ times_list.append([start_millis, end_millis])
122
+
123
+ return times_list
124
+
125
+
126
+ if __name__ == '__main__':
127
+ text = ("1. [00:00:00,500-00:00:05,850] 在我们的设计普惠当中,有一个我经常津津乐道的项目叫寻找远方的美好。"
128
+ "2. [00:00:07,120-00:00:12,940] 啊,在这样一个我们叫寻美在这样的一个项目当中,我们把它跟乡村振兴去结合起来,利用我们的设计的能力。"
129
+ "3. [00:00:13,240-00:00:25,620] 问我们自身员工的设设计能力,我们设计生态伙伴的能力,帮助乡村振兴当中,要希望把他的产品推向市场,把他的农产品把他加工产品推向市场的这样的伙伴做一件事情,")
130
+
131
+ print(extract_timestamps(text))
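A minimal usage sketch (not part of the commit) of how pre_proc and proc above fit together: pre_proc splits Chinese text into space-separated tokens, timestamp is assumed to hold one [start_ms, end_ms] pair per token of the recognition result, and proc returns [start, end] offsets in 16 kHz samples (milliseconds * 16). The timestamp values below are made up.

    from utils.trans_utils import pre_proc, proc

    raw_text = pre_proc("我们今天开会")    # -> "我 们 今 天 开 会"
    # one [start_ms, end_ms] pair per token (illustrative values only)
    timestamp = [[0, 180], [180, 400], [400, 630], [630, 900], [900, 1150], [1150, 1400]]
    dest_text = pre_proc("今天")           # -> "今 天"
    print(proc(raw_text, timestamp, dest_text))   # [[6400, 14400]], i.e. 0.4 s to 0.9 s at 16 kHz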
videoclipper.py CHANGED
@@ -1,3 +1,10 @@
 
 
 
 
 
 
 
1
  import sys
2
  import copy
3
  import librosa
@@ -5,36 +12,56 @@ import logging
5
  import argparse
6
  import numpy as np
7
  import soundfile as sf
8
- import moviepy.editor as mpy
9
- from modelscope.pipelines import pipeline
10
- from modelscope.utils.constant import Tasks
11
- from subtitle_utils import generate_srt, generate_srt_clip
12
- from trans_utils import pre_proc, proc, write_state, load_state
13
-
14
  from moviepy.editor import *
 
15
  from moviepy.video.tools.subtitles import SubtitlesClip
 
 
 
16
 
17
 
18
  class VideoClipper():
19
- def __init__(self, asr_pipeline):
20
  logging.warning("Initializing VideoClipper.")
21
- self.asr_pipeline = asr_pipeline
 
22
 
23
- def recog(self, audio_input, state=None):
24
  if state is None:
25
  state = {}
26
- state['audio_input'] = audio_input
27
- _, data = audio_input
28
- data = data.astype(np.float64)
29
- rec_result = self.asr_pipeline(audio_in=data)
30
- state['recog_res_raw'] = rec_result['text_postprocessed']
31
- state['timestamp'] = rec_result['time_stamp']
32
- state['sentences'] = rec_result['sentences']
33
- res_text = rec_result['text']
34
- res_srt = generate_srt(rec_result['sentences'])
35
  return res_text, res_srt, state
36
 
37
- def clip(self, dest_text, start_ost, end_ost, state):
38
  # get from state
39
  audio_input = state['audio_input']
40
  recog_res_raw = state['recog_res_raw']
@@ -43,12 +70,37 @@ class VideoClipper():
43
  sr, data = audio_input
44
  data = data.astype(np.float64)
45
 
46
- all_ts = []
47
- for _dest_text in dest_text.split('#'):
48
- _dest_text = pre_proc(_dest_text)
49
- ts = proc(recog_res_raw, timestamp, _dest_text)
50
- for _ts in ts: all_ts.append(_ts)
51
  ts = all_ts
 
52
  srt_index = 0
53
  clip_srt = ""
54
  if len(ts):
@@ -68,79 +120,292 @@ class VideoClipper():
68
  srt_clip, _, srt_index = generate_srt_clip(sentences, start/16000.0, end/16000.0, begin_index=srt_index-1)
69
  clip_srt += srt_clip
70
  if len(ts):
71
- message = "{} periods found in the speech: ".format(len(ts)) + start_end_info
72
  else:
73
  message = "No period found in the speech, return raw speech. You may check the recognition result and try other destination text."
 
74
  return (sr, res_audio), message, clip_srt
75
 
76
- def video_recog(self, vedio_filename):
77
- vedio_filename = vedio_filename
78
- clip_video_file = vedio_filename[:-4] + '_clip.mp4'
79
- video = mpy.VideoFileClip(vedio_filename)
80
- audio_file = vedio_filename[:-3] + 'wav'
 
 
 
 
 
 
 
 
 
81
  video.audio.write_audiofile(audio_file)
82
  wav = librosa.load(audio_file, sr=16000)[0]
 
 
 
83
  state = {
84
- 'vedio_filename': vedio_filename,
85
  'clip_video_file': clip_video_file,
86
  'video': video,
87
  }
88
  # res_text, res_srt = self.recog((16000, wav), state)
89
- return self.recog((16000, wav), state)
90
 
91
- def video_clip(self, dest_text, start_ost, end_ost, state, font_size=32, font_color='white', add_sub=False):
 
 
 
 
 
 
 
 
 
 
92
  # get from state
93
  recog_res_raw = state['recog_res_raw']
94
  timestamp = state['timestamp']
95
  sentences = state['sentences']
96
  video = state['video']
97
  clip_video_file = state['clip_video_file']
98
- vedio_filename = state['vedio_filename']
99
 
100
- all_ts = []
101
  srt_index = 0
102
- for _dest_text in dest_text.split('#'):
103
- _dest_text = pre_proc(_dest_text)
104
- ts = proc(recog_res_raw, timestamp, _dest_text)
105
- for _ts in ts: all_ts.append(_ts)
106
  ts = all_ts
 
107
  clip_srt = ""
108
  if len(ts):
109
  start, end = ts[0][0] / 16000, ts[0][1] / 16000
 
110
  start, end = start+start_ost/1000.0, end+end_ost/1000.0
111
  video_clip = video.subclip(start, end)
112
- clip_video_file = clip_video_file
113
  start_end_info = "from {} to {}".format(start, end)
114
- # message = "{} periods found in the audio: from {} to {}.".format(len(ts), start, end)
115
- srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index)
116
  clip_srt += srt_clip
117
  if add_sub:
118
  generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
119
  subtitles = SubtitlesClip(subs, generator)
120
  video_clip = CompositeVideoClip([video_clip, subtitles.set_pos(('center','bottom'))])
121
  concate_clip = [video_clip]
 
122
  for _ts in ts[1:]:
123
  start, end = _ts[0] / 16000, _ts[1] / 16000
 
 
 
 
 
124
  start, end = start+start_ost/1000.0, end+end_ost/1000.0
125
  _video_clip = video.subclip(start, end)
126
- clip_video_file = clip_video_file
127
  start_end_info += ", from {} to {}".format(start, end)
128
- srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index-1)
129
  clip_srt += srt_clip
130
  if add_sub:
131
  generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
132
- subtitles = SubtitlesClip(subs, generator)
133
  _video_clip = CompositeVideoClip([_video_clip, subtitles.set_pos(('center','bottom'))])
 
134
  concate_clip.append(copy.copy(_video_clip))
 
135
  message = "{} periods found in the audio: ".format(len(ts)) + start_end_info
136
  logging.warning("Concating...")
137
  if len(concate_clip) > 1:
138
  video_clip = concatenate_videoclips(concate_clip)
139
- video_clip.write_videofile(clip_video_file)
 
 
 
 
 
 
 
 
 
 
 
 
140
  else:
141
- clip_video_file = vedio_filename
142
  message = "No period found in the audio, return raw speech. You may check the recognition result and try other destination text."
143
  srt_clip = ''
144
  return clip_video_file, message, clip_srt
145
 
146
 
1
+ #!/usr/bin/env python3
2
+ # -*- encoding: utf-8 -*-
3
+ # Copyright FunASR (https://github.com/alibaba-damo-academy/FunClip). All Rights Reserved.
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ import re
7
+ import os
8
  import sys
9
  import copy
10
  import librosa
 
12
  import argparse
13
  import numpy as np
14
  import soundfile as sf
 
 
 
 
 
 
15
  from moviepy.editor import *
16
+ import moviepy.editor as mpy
17
  from moviepy.video.tools.subtitles import SubtitlesClip
18
+ from utils.subtitle_utils import generate_srt, generate_srt_clip
19
+ from utils.argparse_tools import ArgumentParser, get_commandline_args
20
+ from utils.trans_utils import pre_proc, proc, write_state, load_state, proc_spk, convert_pcm_to_float
21
 
22
 
23
  class VideoClipper():
24
+ def __init__(self, funasr_model):
25
  logging.warning("Initializing VideoClipper.")
26
+ self.funasr_model = funasr_model
27
+ self.GLOBAL_COUNT = 0
28
 
29
+ def recog(self, audio_input, sd_switch='no', state=None, hotwords="", output_dir=None):
30
  if state is None:
31
  state = {}
32
+ sr, data = audio_input
33
+
34
+ # Convert to float64 consistently (includes data type checking)
35
+ data = convert_pcm_to_float(data)
36
+
37
+ # assert sr == 16000, "16kHz sample rate required, {} given.".format(sr)
38
+ if sr != 16000: # resample with librosa
39
+ data = librosa.resample(data, orig_sr=sr, target_sr=16000)
40
+ if len(data.shape) == 2: # multi-channel wav input
41
+ logging.warning("Input wav shape: {}, only the first channel is kept.".format(data.shape))
42
+ data = data[:,0]
43
+ state['audio_input'] = (16000, data)  # data is 16 kHz mono at this point
44
+ if sd_switch.lower() == 'yes':
45
+ rec_result = self.funasr_model.generate(data, return_raw_text=True, is_final=True, hotword=hotwords, cache={})
46
+ res_srt = generate_srt(rec_result[0]['sentence_info'])
47
+ state['sd_sentences'] = rec_result[0]['sentence_info']
48
+ else:
49
+ rec_result = self.funasr_model.generate(data,
50
+ return_spk_res=False,
51
+ sentence_timestamp=True,
52
+ return_raw_text=True,
53
+ is_final=True,
54
+ hotword=hotwords,
55
+ output_dir=output_dir,
56
+ cache={})
57
+ res_srt = generate_srt(rec_result[0]['sentence_info'])
58
+ state['recog_res_raw'] = rec_result[0]['raw_text']
59
+ state['timestamp'] = rec_result[0]['timestamp']
60
+ state['sentences'] = rec_result[0]['sentence_info']
61
+ res_text = rec_result[0]['text']
62
  return res_text, res_srt, state
63
 
64
+ def clip(self, dest_text, start_ost, end_ost, state, dest_spk=None, output_dir=None, timestamp_list=None):
65
  # get from state
66
  audio_input = state['audio_input']
67
  recog_res_raw = state['recog_res_raw']
 
70
  sr, data = audio_input
71
  data = data.astype(np.float64)
72
 
73
+ if timestamp_list is None:
74
+ all_ts = []
75
+ if dest_spk is None or dest_spk == '' or 'sd_sentences' not in state:
76
+ for _dest_text in dest_text.split('#'):
77
+ if '[' in _dest_text:
78
+ match = re.search(r'\[(\d+),\s*(\d+)\]', _dest_text)
79
+ if match:
80
+ offset_b, offset_e = map(int, match.groups())
81
+ log_append = ""
82
+ else:
83
+ offset_b, offset_e = 0, 0
84
+ log_append = "(Bracket detected in dest_text but offset time matching failed)"
85
+ _dest_text = _dest_text[:_dest_text.find('[')]
86
+ else:
87
+ log_append = ""
88
+ offset_b, offset_e, match = 0, 0, None
89
+ _dest_text = pre_proc(_dest_text)
90
+ ts = proc(recog_res_raw, timestamp, _dest_text)
91
+ for _ts in ts: all_ts.append([_ts[0]+offset_b*16, _ts[1]+offset_e*16])
92
+ if len(ts) > 1 and match:
93
+ log_append += '(offsets detected but a sub-sentence matched {} periods in the audio, \
94
+ offsets are applied to all periods)'.format(len(ts))
95
+ else:
96
+ for _dest_spk in dest_spk.split('#'):
97
+ ts = proc_spk(_dest_spk, state['sd_sentences'])
98
+ for _ts in ts: all_ts.append(_ts)
99
+ log_append = ""
100
+ else:
101
+ all_ts = timestamp_list
102
  ts = all_ts
103
+ # ts.sort()
104
  srt_index = 0
105
  clip_srt = ""
106
  if len(ts):
 
120
  srt_clip, _, srt_index = generate_srt_clip(sentences, start/16000.0, end/16000.0, begin_index=srt_index-1)
121
  clip_srt += srt_clip
122
  if len(ts):
123
+ message = "{} periods found in the speech: ".format(len(ts)) + start_end_info + log_append
124
  else:
125
  message = "No period found in the speech, return raw speech. You may check the recognition result and try other destination text."
126
+ res_audio = data
127
  return (sr, res_audio), message, clip_srt
128
 
129
+ def video_recog(self, video_filename, sd_switch='no', hotwords="", output_dir=None):
130
+ video = mpy.VideoFileClip(video_filename)
131
+ # Derive the '_clip.mp4' and '.wav' file names from the video's base name
132
+ if output_dir is not None:
133
+ os.makedirs(output_dir, exist_ok=True)
134
+ _, base_name = os.path.split(video_filename)
135
+ base_name, _ = os.path.splitext(base_name)
136
+ clip_video_file = base_name + '_clip.mp4'
137
+ audio_file = base_name + '.wav'
138
+ audio_file = os.path.join(output_dir, audio_file)
139
+ else:
140
+ base_name, _ = os.path.splitext(video_filename)
141
+ clip_video_file = base_name + '_clip.mp4'
142
+ audio_file = base_name + '.wav'
143
  video.audio.write_audiofile(audio_file)
144
  wav = librosa.load(audio_file, sr=16000)[0]
145
+ # delete the audio file after processing
146
+ if os.path.exists(audio_file):
147
+ os.remove(audio_file)
148
  state = {
149
+ 'video_filename': video_filename,
150
  'clip_video_file': clip_video_file,
151
  'video': video,
152
  }
153
  # res_text, res_srt = self.recog((16000, wav), state)
154
+ return self.recog((16000, wav), sd_switch, state, hotwords, output_dir)
155
 
156
+ def video_clip(self,
157
+ dest_text,
158
+ start_ost,
159
+ end_ost,
160
+ state,
161
+ font_size=32,
162
+ font_color='white',
163
+ add_sub=False,
164
+ dest_spk=None,
165
+ output_dir=None,
166
+ timestamp_list=None):
167
  # get from state
168
  recog_res_raw = state['recog_res_raw']
169
  timestamp = state['timestamp']
170
  sentences = state['sentences']
171
  video = state['video']
172
  clip_video_file = state['clip_video_file']
173
+ video_filename = state['video_filename']
174
+
175
+ if timestamp_list is None:
176
+ all_ts = []
177
+ if dest_spk is None or dest_spk == '' or 'sd_sentences' not in state:
178
+ for _dest_text in dest_text.split('#'):
179
+ if '[' in _dest_text:
180
+ match = re.search(r'\[(\d+),\s*(\d+)\]', _dest_text)
181
+ if match:
182
+ offset_b, offset_e = map(int, match.groups())
183
+ log_append = ""
184
+ else:
185
+ offset_b, offset_e = 0, 0
186
+ log_append = "(Bracket detected in dest_text but offset time matching failed)"
187
+ _dest_text = _dest_text[:_dest_text.find('[')]
188
+ else:
189
+ offset_b, offset_e = 0, 0
190
+ log_append = ""
191
+ _dest_text = pre_proc(_dest_text)
192
+ ts = proc(recog_res_raw, timestamp, _dest_text)
193
+ for _ts in ts: all_ts.append([_ts[0]+offset_b*16, _ts[1]+offset_e*16])
194
+ if len(ts) > 1 and match:
195
+ log_append += '(offsets detected but a sub-sentence matched {} periods in the audio, \
196
+ offsets are applied to all periods)'.format(len(ts))
197
+ else:
198
+ for _dest_spk in dest_spk.split('#'):
199
+ ts = proc_spk(_dest_spk, state['sd_sentences'])
200
+ for _ts in ts: all_ts.append(_ts)
201
+ else: # AI clip passes timestamps as input directly
202
+ all_ts = [[i[0]*16.0, i[1]*16.0] for i in timestamp_list]
203
 
 
204
  srt_index = 0
205
+ time_acc_ost = 0.0
 
 
 
206
  ts = all_ts
207
+ # ts.sort()
208
  clip_srt = ""
209
  if len(ts):
210
  start, end = ts[0][0] / 16000, ts[0][1] / 16000
211
+ srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index, time_acc_ost=time_acc_ost)
212
  start, end = start+start_ost/1000.0, end+end_ost/1000.0
213
  video_clip = video.subclip(start, end)
 
214
  start_end_info = "from {} to {}".format(start, end)
 
 
215
  clip_srt += srt_clip
216
  if add_sub:
217
  generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
218
  subtitles = SubtitlesClip(subs, generator)
219
  video_clip = CompositeVideoClip([video_clip, subtitles.set_pos(('center','bottom'))])
220
  concate_clip = [video_clip]
221
+ time_acc_ost += end+end_ost/1000.0 - (start+start_ost/1000.0)
222
  for _ts in ts[1:]:
223
  start, end = _ts[0] / 16000, _ts[1] / 16000
224
+ srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index-1, time_acc_ost=time_acc_ost)
225
+ chi_subs = []
226
+ sub_starts = subs[0][0][0]
227
+ for sub in subs:
228
+ chi_subs.append(((sub[0][0]-sub_starts, sub[0][1]-sub_starts), sub[1]))
229
  start, end = start+start_ost/1000.0, end+end_ost/1000.0
230
  _video_clip = video.subclip(start, end)
 
231
  start_end_info += ", from {} to {}".format(start, end)
 
232
  clip_srt += srt_clip
233
  if add_sub:
234
  generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
235
+ subtitles = SubtitlesClip(chi_subs, generator)
236
  _video_clip = CompositeVideoClip([_video_clip, subtitles.set_pos(('center','bottom'))])
237
+ # _video_clip.write_videofile("debug.mp4", audio_codec="aac")
238
  concate_clip.append(copy.copy(_video_clip))
239
+ time_acc_ost += end+end_ost/1000.0 - (start+start_ost/1000.0)
240
  message = "{} periods found in the audio: ".format(len(ts)) + start_end_info
241
  logging.warning("Concating...")
242
  if len(concate_clip) > 1:
243
  video_clip = concatenate_videoclips(concate_clip)
244
+ # clip_video_file = clip_video_file[:-4] + '_no{}.mp4'.format(self.GLOBAL_COUNT)
245
+ if output_dir is not None:
246
+ os.makedirs(output_dir, exist_ok=True)
247
+ _, file_with_extension = os.path.split(clip_video_file)
248
+ clip_video_file_name, _ = os.path.splitext(file_with_extension)
249
+ print(output_dir, clip_video_file)
250
+ clip_video_file = os.path.join(output_dir, "{}_no{}.mp4".format(clip_video_file_name, self.GLOBAL_COUNT))
251
+ temp_audio_file = os.path.join(output_dir, "{}_tempaudio_no{}.mp4".format(clip_video_file_name, self.GLOBAL_COUNT))
252
+ else:
253
+ clip_video_file = clip_video_file[:-4] + '_no{}.mp4'.format(self.GLOBAL_COUNT)
254
+ temp_audio_file = clip_video_file[:-4] + '_tempaudio_no{}.mp4'.format(self.GLOBAL_COUNT)
255
+ video_clip.write_videofile(clip_video_file, audio_codec="aac", temp_audiofile=temp_audio_file)
256
+ self.GLOBAL_COUNT += 1
257
  else:
258
+ clip_video_file = video_filename
259
  message = "No period found in the audio, return raw speech. You may check the recognition result and try other destination text."
260
  srt_clip = ''
261
  return clip_video_file, message, clip_srt
262
 
263
 
264
+ def get_parser():
265
+ parser = ArgumentParser(
266
+ description="ClipVideo Argument",
267
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
268
+ )
269
+ parser.add_argument(
270
+ "--stage",
271
+ type=int,
272
+ choices=(1, 2),
273
+ help="Stage, 1 for recognizing and 2 for clipping",
274
+ required=True
275
+ )
276
+ parser.add_argument(
277
+ "--file",
278
+ type=str,
279
+ default=None,
280
+ help="Input file path",
281
+ required=True
282
+ )
283
+ parser.add_argument(
284
+ "--sd_switch",
285
+ type=str,
286
+ choices=("no", "yes"),
287
+ default="no",
288
+ help="Whether to turn on speaker diarization",
289
+ )
290
+ parser.add_argument(
291
+ "--output_dir",
292
+ type=str,
293
+ default='./output',
294
+ help="Output files path",
295
+ )
296
+ parser.add_argument(
297
+ "--dest_text",
298
+ type=str,
299
+ default=None,
300
+ help="Destination text string for clipping",
301
+ )
302
+ parser.add_argument(
303
+ "--dest_spk",
304
+ type=str,
305
+ default=None,
306
+ help="Destination spk id for clipping",
307
+ )
308
+ parser.add_argument(
309
+ "--start_ost",
310
+ type=int,
311
+ default=0,
312
+ help="Offset time in ms at beginning for clipping"
313
+ )
314
+ parser.add_argument(
315
+ "--end_ost",
316
+ type=int,
317
+ default=0,
318
+ help="Offset time in ms at ending for clipping"
319
+ )
320
+ parser.add_argument(
321
+ "--output_file",
322
+ type=str,
323
+ default=None,
324
+ help="Output file path"
325
+ )
326
+ return parser
327
+
328
+
329
+ def runner(stage, file, sd_switch, output_dir, dest_text, dest_spk, start_ost, end_ost, output_file, config=None):
330
+ audio_suffixs = ['.wav','.mp3','.aac','.m4a','.flac']
331
+ video_suffixs = ['.mp4','.avi','.mkv','.flv','.mov','.webm','.ts','.mpeg']
332
+ _,ext = os.path.splitext(file)
333
+ if ext.lower() in audio_suffixs:
334
+ mode = 'audio'
335
+ elif ext.lower() in video_suffixs:
336
+ mode = 'video'
337
+ else:
338
+ logging.error("Unsupported file format: {}\n\nPlease choose one of the following: {}".format(file, audio_suffixs+video_suffixs))
339
+ sys.exit(1) # exit if the file is not supported
340
+ while output_dir.endswith('/'):
341
+ output_dir = output_dir[:-1]
342
+ if not os.path.exists(output_dir):
343
+ os.mkdir(output_dir)
344
+ if stage == 1:
345
+ from funasr import AutoModel
346
+ # initialize funasr automodel
347
+ logging.warning("Initializing FunASR AutoModel.")
348
+ funasr_model = AutoModel(model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
349
+ vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
350
+ punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
351
+ spk_model="damo/speech_campplus_sv_zh-cn_16k-common",
352
+ )
353
+ audio_clipper = VideoClipper(funasr_model)
354
+ if mode == 'audio':
355
+ logging.warning("Recognizing audio file: {}".format(file))
356
+ wav, sr = librosa.load(file, sr=16000)
357
+ res_text, res_srt, state = audio_clipper.recog((sr, wav), sd_switch)
358
+ if mode == 'video':
359
+ logging.warning("Recognizing video file: {}".format(file))
360
+ res_text, res_srt, state = audio_clipper.video_recog(file, sd_switch)
361
+ total_srt_file = output_dir + '/total.srt'
362
+ with open(total_srt_file, 'w') as fout:
363
+ fout.write(res_srt)
364
+ logging.warning("Write total subtitle to {}".format(total_srt_file))
365
+ write_state(output_dir, state)
366
+ logging.warning("Recognition succeeded. You can copy the text segments below and use stage 2.")
367
+ print(res_text)
368
+ if stage == 2:
369
+ audio_clipper = VideoClipper(None)
370
+ if mode == 'audio':
371
+ state = load_state(output_dir)
372
+ wav, sr = librosa.load(file, sr=16000)
373
+ state['audio_input'] = (sr, wav)
374
+ (sr, audio), message, srt_clip = audio_clipper.clip(dest_text, start_ost, end_ost, state, dest_spk=dest_spk)
375
+ if output_file is None:
376
+ output_file = output_dir + '/result.wav'
377
+ clip_srt_file = output_file[:-3] + 'srt'
378
+ logging.warning(message)
379
+ sf.write(output_file, audio, 16000)
380
+ assert output_file.endswith('.wav'), "output_file must end with '.wav'"
381
+ logging.warning("Save clipped wav file to {}".format(output_file))
382
+ with open(clip_srt_file, 'w') as fout:
383
+ fout.write(srt_clip)
384
+ logging.warning("Write clipped subtitle to {}".format(clip_srt_file))
385
+ if mode == 'video':
386
+ state = load_state(output_dir)
387
+ state['video_filename'] = file
388
+ if output_file is None:
389
+ state['clip_video_file'] = file[:-4] + '_clip.mp4'
390
+ else:
391
+ state['clip_video_file'] = output_file
392
+ clip_srt_file = state['clip_video_file'][:-3] + 'srt'
393
+ state['video'] = mpy.VideoFileClip(file)
394
+ clip_video_file, message, srt_clip = audio_clipper.video_clip(dest_text, start_ost, end_ost, state, dest_spk=dest_spk)
395
+ logging.warning("Clipping Log: {}".format(message))
396
+ logging.warning("Save clipped mp4 file to {}".format(clip_video_file))
397
+ with open(clip_srt_file, 'w') as fout:
398
+ fout.write(srt_clip)
399
+ logging.warning("Write clipped subtitle to {}".format(clip_srt_file))
400
+
401
+
402
+ def main(cmd=None):
403
+ print(get_commandline_args(), file=sys.stderr)
404
+ parser = get_parser()
405
+ args = parser.parse_args(cmd)
406
+ kwargs = vars(args)
407
+ runner(**kwargs)
408
+
409
+
410
+ if __name__ == '__main__':
411
+ main()
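For reference, a possible two-stage command-line invocation of videoclipper.py based on the argument parser above; the file names and the destination text are placeholders:

    # stage 1: recognize, writes ./output/total.srt and the recognition state files
    python videoclipper.py --stage 1 --file examples/demo.mp4 --output_dir ./output

    # stage 2: clip by destination text (offsets in milliseconds); writes the clipped
    # video (final path is reported in the log) and ./output/demo_clip.srt
    python videoclipper.py --stage 2 --file examples/demo.mp4 --output_dir ./output \
        --dest_text "我们今天开会" --start_ost 0 --end_ost 100 \
        --output_file ./output/demo_clip.mp4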