# -*- coding: utf-8 -*-
# Video-subtitling demo: extract audio from an MP4, run wenet speech
# recognition, restore punctuation, generate an SRT file, and burn the
# subtitles back into the video through a Gradio web UI.
import gradio as gr
import cv2
#-*- coding:utf-8 -*-
# NOTE(review): a coding declaration only takes effect on the first or second
# line of a file; down here it is just an ordinary comment.
import wave
import time
# Wall-clock start time of the whole script. to_black() binds its own local
# start1, so this module-level value is never read afterwards.
start1=time.time()
from pydub import AudioSegment
from pydub.silence import split_on_silence
import os
import shutil
import sys
import torch
import wenetruntime as wenet
# Hard-coded input video path; to_black() re-binds its own local `file`,
# so this module-level value is not used by the pipeline.
file='./data/input.mp4'
print("载入术语库")  # "loading terminology list"
# Terminology / hotword list ("shuyu" = 术语, technical terms); shadowed by a
# fresh local list inside to_black().
shuyu=[]
from ppasr.infer_utils.pun_predictor import PunctuationPredictor
def to_black(image):
    """Run the full video-subtitling pipeline.

    Reads the hard-coded './data/input.mp4' (the *image* argument supplied by
    Gradio is never used), extracts its audio with ffmpeg, splits the audio on
    silence, runs wenet speech recognition on each chunk (boosting terms from
    '术语_1.txt'), restores punctuation, writes './result/1.srt' and
    './result/2.txt', and finally burns the subtitles into
    './result/output.mp4' with ffmpeg.

    Side effects: shells out to ffmpeg twice, creates and deletes ./temp and
    ./temp_wav, and writes into ./result.

    NOTE(review): returns None, yet the Gradio interface declares a "video"
    output — presumably the output path should be returned; confirm.
    """
    # -*- coding:utf-8 -*- (leftover header comment; has no effect here)
    import wave
    import time
    start1 = time.time()  # pipeline start time, reported at the end
    from pydub import AudioSegment
    from pydub.silence import split_on_silence
    import os
    import shutil
    import sys
    import torch
    import wenetruntime as wenet
    file = './data/input.mp4'  # hard-coded input; the uploaded video is ignored
    print("载入术语库")  # "loading terminology list"
    shuyu = []  # context/hotword terms fed to the wenet decoder
    from ppasr.infer_utils.pun_predictor import PunctuationPredictor
    pun_predictor = PunctuationPredictor(model_dir='pun_models')  # punctuation-restoration model
    # Load the domain terms whose recognition score will be boosted.
    shuyu_txt = open('术语_1.txt', 'r', encoding='utf-8')
    for i in shuyu_txt:
        shuyu.append(i.strip())
    print("载入术语库成功")  # "terminology list loaded successfully"
    import os
    # Extract the audio track as mono 16 kHz 16-bit PCM wav.
    a = os.system(
        "ffmpeg -i " + file + " -acodec pcm_s16le -f s16le -ac 1 -ar 16000 -f wav ./temp_wav/1.wav")
    print(a)
    print("开始处理...请等待")  # "processing started... please wait"
    time.sleep(3)
    # Normalize the audio before silence-based segmentation.
    sound = AudioSegment.from_mp3("./temp_wav/1.wav")
    loudness = sound.dBFS
    # print(loudness)
    # Force mono.
    sound = sound.set_channels(1)
    # Downsample to a 16 kHz frame rate.
    sound = sound.set_frame_rate(16000)
    # Split the audio on silence:
    # - min_silence_len: minimum duration (ms) counted as silence (default 1000 ms)
    # - silence_thresh: level (dBFS) below which audio counts as silence
    #   (default -16 dBFS; lowered here so quiet speech is not cut)
    # - keep_silence: ms of silence kept at the edge of each chunk (default 100)
    # - seek_step: step (ms) between two scan positions (default 1)
    # NOTE(review): stock pydub split_on_silence returns only the chunk list;
    # this 3-tuple unpacking assumes a modified version that also returns the
    # start/end timestamps of each chunk — confirm against the local pydub.
    chunks, start, end = split_on_silence(sound,
                                          # must be silent for at least 400 ms
                                          min_silence_len=400,
                                          # consider it silent if quieter than -46 dBFS
                                          silence_thresh=-46,
                                          keep_silence=400
                                          )
    print('总分段:', len(chunks))  # "total segments:"
    print(start, end)
    # Export each chunk as its own temporary wav file, named "-0.wav", "-1.wav", ...
    for i, chunk in enumerate(chunks):
        chunk.export("./temp/-{0}.wav".format(i), format="wav")
        # print(i)
    # -*- coding: utf-8 -*-
    '''
    for x in range(0,int(len(sound)/1000)):
        print(x,sound[x*1000:(x+1)*1000].max_dBFS)
    '''
    def get_format_time(time_long):
        # Convert a duration in milliseconds to the SRT timestamp format
        # HH:MM:SS,mmm.
        # NOTE(review): format_number pads to only 2 digits, so the
        # millisecond field can come out as e.g. "05" rather than "005";
        # confirm downstream players tolerate this.
        def format_number(num):
            # Zero-pad a number to at least two digits.
            if len(str(num)) > 1:
                return str(num)
            else:
                return "0" + str(num)
        myhour = 0
        mysecond = int(time_long / 1000)
        myminute = 0
        mymilsec = 0
        if mysecond < 1:
            # Sub-second duration: the whole value is the millisecond field.
            return "00:00:00,%s" % (time_long)
        else:
            if mysecond > 60:
                myminute = int(mysecond / 60)
                if myminute > 60:
                    # Hours present: peel off hours, then minutes, then seconds.
                    myhour = int(myminute / 60)
                    myminute = myminute - myhour * 60
                    mysecond = mysecond - myhour * 3600 - myminute * 60
                    mymilsec = time_long - 1000 * (mysecond + myhour * 3600 + myminute * 60)
                    return "%s:%s:%s,%s" % (format_number(myhour), format_number(myminute), format_number(mysecond), \
                                            format_number(mymilsec))
                else:
                    # Minutes but no hours.
                    mysecond = int(mysecond - myminute * 60)
                    mymilsec = time_long - 1000 * (mysecond + myminute * 60)
                    return "00:%s:%s,%s" % (format_number(myminute), format_number(mysecond), format_number(mymilsec))
            else:
                # Seconds only.
                mymilsec = time_long - mysecond * 1000
                return "00:00:%s,%s" % (mysecond, mymilsec)
    # Load the wenet decoder, boosting the terminology list via context biasing.
    decoder = wenet.Decoder(model_dir=r'C:\Users\Goya\.wenet\chs_1', lang='chs', context=shuyu, context_score=10.0)
    base_path = r'./temp'
    files = os.listdir(base_path)
    # Chunk files are named "-0.wav", "-1.wav", ...; int("-k") is negative,
    # so reverse=True actually yields ascending chunk order.
    files.sort(key=lambda x: int(x.split('.')[0]), reverse=True)
    count = 1
    word = []
    for path in files:
        # print(full_path)
        # Run speech recognition on one chunk.
        # NOTE(review): decode_wav apparently returns a JSON-like string that
        # is parsed with eval(); json.loads would be safer — confirm.
        ans = eval(decoder.decode_wav('./temp/' + path))
        ans = ans["nbest"][0]["sentence"]
        ans = ans.replace('<context>', '')
        ans = ans.replace('</context>', '')
        ans = pun_predictor(ans)  # restore punctuation
        if len(ans) > 37:  # wrap long subtitles: newline after char 37
            x = list(ans)
            x.insert(37, '\n')
            ans = ''.join(x)
            if (len(ans)) > 74:  # and again after char 74 for very long lines
                x = list(ans)
                x.insert(74, '\n')
                ans = ''.join(x)
        print(count, ": ", ans)
        word.append(ans)
        count += 1
    x = open('./result/1.srt', 'w', encoding='utf-8')  # subtitle file
    x2 = open('./result/2.txt', 'w', encoding='utf-8')  # plain transcript
    count2 = 1
    # Emit one SRT entry per chunk: index, time range, text, blank line.
    for a, b, c in zip(start, end, word):
        x2.write(c + '\n')
        m = str(count2) + '\n' + get_format_time(a) + ' --> ' + get_format_time(b) + '\n' + c + '\n' + '\n'
        count2 += 1
        x.write(m)
    x.close()
    x2.close()
    print("正在加字幕,请稍后\n")  # "burning subtitles, please wait"
    import os
    # Burn the generated subtitles into the output video.
    a = os.system("ffmpeg -i " + file + " -vf subtitles=./result/1.srt -y ./result/output.mp4")
    # Reset the temporary working directories.
    shutil.rmtree('./temp')
    os.mkdir('./temp')
    shutil.rmtree('./temp_wav')
    os.mkdir('./temp_wav')
    print("消耗时间为:" + str(time.time() - start1))  # total elapsed time
    return
# Wire the pipeline into a Gradio web UI: upload a video, get the subtitled
# video back. (Fixed: the original line ended with a stray " |" extraction
# artifact that made the file a SyntaxError.)
interface = gr.Interface(fn=to_black, inputs="video", outputs="video")
interface.launch()