File size: 8,346 Bytes
76934e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import gradio as gr
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from videoclipper import VideoClipper


if __name__ == "__main__":
    # Build the ModelScope ASR pipeline: Paraformer-large recognizer combined
    # with an FSMN VAD model (endpoint detection) and a CT-Transformer
    # punctuation model, all Mandarin 16 kHz variants.
    inference_pipeline = pipeline(
        task=Tasks.auto_speech_recognition,
        model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
        vad_model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
        punc_model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
    )
    # One shared clipper instance backs both the audio and the video tab.
    audio_clipper = VideoClipper(inference_pipeline)

    # --- Thin wrappers binding the shared VideoClipper to Gradio callbacks ---

    def audio_recog(audio_input):
        """Recognize speech in an audio file; wired to (text, srt, state) outputs."""
        return audio_clipper.recog(audio_input)

    def audio_clip(dest_text, start_ost, end_ost, state):
        """Clip audio around `dest_text`; wired to (audio, log, srt) outputs."""
        return audio_clipper.clip(dest_text, start_ost, end_ost, state)

    def video_recog(video_input):
        """Recognize speech in a video file; wired to (text, srt, state) outputs."""
        return audio_clipper.video_recog(video_input)

    def video_clip(dest_text, start_ost, end_ost, state):
        """Clip video around `dest_text` without burning in subtitles."""
        return audio_clipper.video_clip(dest_text, start_ost, end_ost, state)

    def video_clip_addsub(dest_text, start_ost, end_ost, state, font_size, font_color):
        """Clip video around `dest_text` and burn in subtitles with the given font settings."""
        return audio_clipper.video_clip(dest_text, start_ost, end_ost, state, font_size, font_color, add_sub=True)

    # Disabled intro banner (kept for reference; not rendered below).
    '''
    top_md_1 = ("""
    基于达摩院自研Paraformer-长音频版的语音识别、端点检测、标点预测、时间戳功能

    准确识别,自由复制所需段落并一键裁剪、添加字幕

    * Step1: 上传视频文件(或使用下方的用例体验),点击 **<font color="#f7802b">识别</font>** 按钮
    * Step2: 复制识别结果中所需的文字至右上方,设置偏移与字幕配置(可选)
    * Step3: 点击 **<font color="#f7802b">裁剪</font>** 按钮或 **<font color="#f7802b">裁剪并添加字幕</font>** 按钮获得结果
    """)
    '''

    top_md_2 = ("""
    受到网络传输与服务资源的限制,用于体验的视频最好大小在40mb以下
    过大的视频可以尝试分离音轨使用音频剪辑,或 **<font color="#1785c4">通过源代码将您的ClipVideo服务部署在本地(推荐)</font>** :
    <div align="center">
    <div style="display:flex; gap: 0.25rem;" align="center">
    FunASR_APP: <a href='https://github.com/alibaba/funasr-app'><img src='https://img.shields.io/badge/Github-Code-blue'></a> 
    🌟支持我们: <a href='https://github.com/alibaba/funasr-app/stargazers'><img src='https://img.shields.io/github/stars/alibaba/funasr-app.svg?style=social'></a>
    </div>
    </div>
    """)

    top_md_3 = ("""访问FunASR项目与论文能够帮助您深入了解ClipVideo中所使用的语音处理相关模型:
    <div align="center">
    <div style="display:flex; gap: 0.25rem;" align="center">
        FunASR: <a href='https://github.com/alibaba-damo-academy/FunASR'><img src='https://img.shields.io/badge/Github-Code-blue'></a> 
        FunASR Paper: <a href="https://arxiv.org/abs/2305.11013"><img src="https://img.shields.io/badge/Arxiv-2305.11013-orange"></a> 
        🌟Star FunASR: <a href='https://github.com/alibaba-damo-academy/FunASR/stargazers'><img src='https://img.shields.io/github/stars/alibaba-damo-academy/FunASR.svg?style=social'></a>
    </div>
    </div>
    """)

    # --- Gradio interface: two tabs (video clipping / audio clipping) ---
    with gr.Blocks() as demo:
        #gr.Image("./examples/guide.png", show_label=False)
        # gr.Markdown(top_md_1)
        #gr.Markdown(top_md_2)
        #gr.Markdown(top_md_3)
        # Per-session recognition state carried from "recognize" to "clip".
        video_state = gr.State()
        audio_state = gr.State()
        with gr.Tab("🎥✂️视频裁剪 Video Clipping"):
            with gr.Row():
                with gr.Column():
                    video_input = gr.Video(label="🎥视频输入 Video Input")
                    gr.Examples(['examples/2022云栖大会_片段2.mp4', 
                                 'examples/2022云栖大会_片段.mp4', 
                                 'examples/为什么要多读书?这是我听过最好的答案-片段.mp4', 
                                 'examples/使用chatgpt_片段.mp4'],
                                [video_input])
                    recog_button2 = gr.Button("👂识别 Recognize")
                    video_text_output = gr.Textbox(label="✏️识别结果 Recognition Result")
                    # Fixed label typo: "RST" -> "SRT" (SubRip subtitle format).
                    video_srt_output = gr.Textbox(label="📖SRT字幕内容 SRT Subtitles")
                with gr.Column():
                    video_text_input = gr.Textbox(label="✏️待裁剪文本 Text to Clip (多段文本使用'#'连接)")
                    with gr.Row():
                        video_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50, label="⏪开始位置偏移 Start Offset (ms)")
                        video_end_ost = gr.Slider(minimum=-500, maximum=1000, value=100, step=50, label="⏩结束位置偏移 End Offset (ms)")
                    with gr.Row():
                        font_size = gr.Slider(minimum=10, maximum=100, value=32, step=2, label="🔠字幕字体大小 Subtitle Font Size")
                        font_color = gr.Radio(["black", "white", "green", "red"], label="🌈字幕颜色 Subtitle Color", value='white')
                        # font = gr.Radio(["黑体", "Alibaba Sans"], label="字体 Font")
                    with gr.Row():
                        clip_button2 = gr.Button("✂️裁剪\nClip")
                        clip_button3 = gr.Button("✂️裁剪并添加字幕\nClip and Generate Subtitles")
                    # Fixed copy-paste label: this component shows the clipped *video*.
                    video_output = gr.Video(label="🎥裁剪结果 Video Clipped")
                    video_mess_output = gr.Textbox(label="ℹ️裁剪信息 Clipping Log")
                    video_srt_clip_output = gr.Textbox(label="📖裁剪部分SRT字幕内容 Clipped SRT Subtitles")

        with gr.Tab("🔊✂️音频裁剪 Audio Clipping"):
            with gr.Row():
                with gr.Column():
                    audio_input = gr.Audio(label="🔊音频输入 Audio Input")
                    gr.Examples(['examples/鲁肃采访片段1.wav'], [audio_input])
                    recog_button1 = gr.Button("👂识别 Recognize")
                    audio_text_output = gr.Textbox(label="✏️识别结果 Recognition Result")
                    audio_srt_output = gr.Textbox(label="📖SRT字幕内容 SRT Subtitles")
                with gr.Column():
                    audio_text_input = gr.Textbox(label="✏️待裁剪文本 Text to Clip (多段文本使用'#'连接)")
                    with gr.Row():
                        audio_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50, label="⏪开始位置偏移 Start Offset (ms)")
                        audio_end_ost = gr.Slider(minimum=-500, maximum=1000, value=100, step=50, label="⏩结束位置偏移 End Offset (ms)")
                    with gr.Row():
                        clip_button1 = gr.Button("✂️裁剪 Clip")
                    audio_output = gr.Audio(label="🔊裁剪结果 Audio Clipped")
                    audio_mess_output = gr.Textbox(label="ℹ️裁剪信息 Clipping Log")
                    audio_srt_clip_output = gr.Textbox(label="📖裁剪部分SRT字幕内容 Clipped SRT Subtitles")

        # Event wiring: "recognize" fills text/SRT outputs and stashes state;
        # "clip" consumes that state plus the user's text selection and offsets.
        recog_button1.click(audio_recog, 
                            inputs=audio_input, 
                            outputs=[audio_text_output, audio_srt_output, audio_state])
        clip_button1.click(audio_clip, 
                           inputs=[audio_text_input, audio_start_ost, audio_end_ost, audio_state], 
                           outputs=[audio_output, audio_mess_output, audio_srt_clip_output])

        recog_button2.click(video_recog, 
                            inputs=video_input, 
                            outputs=[video_text_output, video_srt_output, video_state])
        clip_button2.click(video_clip, 
                           inputs=[video_text_input, video_start_ost, video_end_ost, video_state], 
                           outputs=[video_output, video_mess_output, video_srt_clip_output])
        clip_button3.click(video_clip_addsub, 
                           inputs=[video_text_input, video_start_ost, video_end_ost, video_state, font_size, font_color], 
                           outputs=[video_output, video_mess_output, video_srt_clip_output])

    # Start the Gradio service locally.
    # NOTE(review): `queue(concurrency_count=...)` is Gradio 3.x API; the argument
    # was removed in Gradio 4 (replaced by `default_concurrency_limit`) — confirm
    # the pinned Gradio version before upgrading.
    demo.queue(concurrency_count=3).launch()