File size: 6,298 Bytes
d22ef5d
5ba51ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d22ef5d
5ba51ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d22ef5d
5ba51ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
import gradio as gr
import cv2
#-*- coding:utf-8 -*-
# NOTE(review): a coding cookie only takes effect on line 1 or 2 of a file;
# this one is inert where it sits.
import wave
import time
start1=time.time()  # import-time timestamp; shadowed by the local start1 inside to_black
from pydub import AudioSegment
from pydub.silence import split_on_silence
import os
import shutil
import sys
import torch
import wenetruntime as wenet
file='./data/input.mp4'  # default input video; to_black rebinds its own local copy of this path
print("载入术语库")
shuyu=[]  # term list placeholder; to_black builds its own local list, so this one stays empty
from ppasr.infer_utils.pun_predictor import PunctuationPredictor

def to_black(image):
    """Transcribe the audio track of ./data/input.mp4 with WeNet and burn the
    resulting subtitles into ./result/output.mp4.

    Pipeline: extract 16 kHz mono PCM audio with ffmpeg -> split on silence
    -> run WeNet ASR (with boosted domain terms) per chunk -> restore
    punctuation -> write an .srt + plain transcript -> re-mux with ffmpeg.

    NOTE(review): the Gradio video input `image` is currently ignored; the
    pipeline always reads the hard-coded input path. Kept as-is so the
    public interface is unchanged.

    :param image: uploaded video from the Gradio UI (unused — see note).
    :return: path of the subtitled video so Gradio's "video" output can
        display it (the original returned None, leaving the output empty).
    """
    import json
    import os
    import shutil
    import time
    from pydub import AudioSegment
    from pydub.silence import split_on_silence
    import wenetruntime as wenet
    from ppasr.infer_utils.pun_predictor import PunctuationPredictor

    start1 = time.time()
    file = './data/input.mp4'
    print("载入术语库")

    # Punctuation-restoration model (adds punctuation to raw ASR text).
    pun_predictor = PunctuationPredictor(model_dir='pun_models')

    # Load the domain-term list; each term gets a decoding score boost.
    # Context manager guarantees the handle is closed (original leaked it).
    shuyu = []
    with open('术语_1.txt', 'r', encoding='utf-8') as shuyu_txt:
        for line in shuyu_txt:
            shuyu.append(line.strip())
    print("载入术语库成功")

    # Extract the audio track as 16 kHz mono 16-bit PCM WAV.
    # NOTE(review): shell string built by concatenation — acceptable for the
    # hard-coded path, but switch to subprocess.run([...]) if `file` ever
    # becomes user-controlled.
    a = os.system(
        "ffmpeg -i " + file + " -acodec pcm_s16le -f s16le -ac 1 -ar 16000 -f wav ./temp_wav/1.wav")
    print(a)
    print("开始处理...请等待")

    time.sleep(3)  # give ffmpeg time to finish flushing the WAV file
    sound = AudioSegment.from_mp3("./temp_wav/1.wav")
    loudness = sound.dBFS  # average loudness; handy when tuning silence_thresh

    sound = sound.set_channels(1)        # mono
    sound = sound.set_frame_rate(16000)  # 16 kHz

    # Split on silence. min_silence_len: ms of quiet that counts as a break;
    # silence_thresh: dBFS level below which audio is "silent";
    # keep_silence: ms of padding kept around each chunk.
    # NOTE(review): stock pydub returns only the chunk list — this 3-tuple
    # (chunks, start times, end times) implies a patched pydub; confirm.
    chunks, start, end = split_on_silence(sound,
                                          min_silence_len=400,
                                          silence_thresh=-46,
                                          keep_silence=400)
    print('总分段:', len(chunks))
    print(start, end)
    # Persist each segment as a temporary WAV named "-<index>.wav".
    for i, chunk in enumerate(chunks):
        chunk.export("./temp/-{0}.wav".format(i), format="wav")

    def get_format_time(time_long):
        """Convert milliseconds to an SRT timestamp HH:MM:SS,mmm.

        Fixes the original's unpadded fields (SRT requires two-digit
        hour/minute/second and three-digit milliseconds) and its
        off-by-one at exactly 60 seconds.
        """
        hours, rem = divmod(int(time_long), 3600000)
        minutes, rem = divmod(rem, 60000)
        seconds, millis = divmod(rem, 1000)
        return "%02d:%02d:%02d,%03d" % (hours, minutes, seconds, millis)

    # Load the WeNet decoder; `context` terms are boosted by context_score.
    decoder = wenet.Decoder(model_dir=r'C:\Users\Goya\.wenet\chs_1', lang='chs', context=shuyu, context_score=10.0)

    base_path = r'./temp'
    files = os.listdir(base_path)
    # Chunks are named "-0.wav", "-1.wav", ...; the stem parses as 0, -1,
    # -2, ..., so a descending numeric sort restores chronological order.
    files.sort(key=lambda x: int(x.split('.')[0]), reverse=True)
    count = 1
    word = []
    for path in files:
        # decode_wav returns a JSON string; json.loads replaces the original
        # eval(), which would execute arbitrary code on malformed output.
        ans = json.loads(decoder.decode_wav('./temp/' + path))
        ans = ans["nbest"][0]["sentence"]
        ans = ans.replace('<context>', '')
        ans = ans.replace('</context>', '')
        ans = pun_predictor(ans)  # restore punctuation
        # Wrap long results so a subtitle spans at most two ~37-char rows.
        if len(ans) > 37:
            x = list(ans)
            x.insert(37, '\n')
            ans = ''.join(x)
            if (len(ans)) > 74:
                x = list(ans)
                x.insert(74, '\n')
                ans = ''.join(x)

        print(count, ": ", ans)
        word.append(ans)
        count += 1

    # Write the .srt subtitle file and a plain-text transcript; context
    # managers ensure both are closed even if a write fails.
    count2 = 1
    with open('./result/1.srt', 'w', encoding='utf-8') as x, \
            open('./result/2.txt', 'w', encoding='utf-8') as x2:
        for a, b, c in zip(start, end, word):
            x2.write(c + '\n')
            m = str(count2) + '\n' + get_format_time(a) + ' --> ' + get_format_time(b) + '\n' + c + '\n' + '\n'
            count2 += 1
            x.write(m)

    print("正在加字幕,请稍后\n")
    a = os.system("ffmpeg -i " + file + "  -vf subtitles=./result/1.srt -y ./result/output.mp4")

    # Reset the temp folders for the next run.
    shutil.rmtree('./temp')
    os.mkdir('./temp')
    shutil.rmtree('./temp_wav')
    os.mkdir('./temp_wav')
    print("消耗时间为:" + str(time.time() - start1))
    # Return the subtitled video path so the Gradio video output displays it.
    return './result/output.mp4'

# Build the Gradio UI: one video input, one video output, backed by to_black.
# NOTE(review): Gradio displays whatever to_black returns in the output
# component — verify it returns the path of the subtitled video.
interface = gr.Interface(fn=to_black, inputs="video", outputs="video")
interface.launch()