# zimu / app.py — Hugging Face Space by Goya11 (commit 5ba51ff, "Update app.py")
# NOTE(review): the four lines above were HF file-viewer residue (title, author,
# commit message, hash) pasted into the source; converted to comments so the file parses.
# Module-level imports and globals.
# NOTE(review): everything below (imports, start1, file, shuyu) is re-imported
# and re-defined inside to_black(), so this top-level copy is effectively dead
# weight kept for compatibility — confirm before removing.
import gradio as gr
import cv2
#-*- coding:utf-8 -*-
import wave
import time
start1=time.time()  # wall-clock start, used for overall timing
from pydub import AudioSegment
from pydub.silence import split_on_silence
import os
import shutil
import sys
import torch
import wenetruntime as wenet
file='./data/input.mp4'  # hard-coded path of the input video
print("载入术语库")  # "loading terminology library"
shuyu=[]  # terminology phrases used to bias (context-boost) ASR decoding
from ppasr.infer_utils.pun_predictor import PunctuationPredictor
def to_black(image):
    """Gradio handler: run Chinese speech recognition on ./data/input.mp4,
    build an SRT subtitle track, burn it into the video with ffmpeg, and
    leave the result at ./result/output.mp4.

    NOTE(review): the ``image`` argument Gradio passes in is never used — the
    input path is hard-coded below. The function returns None, so the Gradio
    output widget receives nothing; presumably ./result/output.mp4 should be
    returned — confirm intent.
    """
    # -*- coding:utf-8 -*-
    import wave
    import time
    start1 = time.time()  # per-call timer (shadows the module-level start1)
    from pydub import AudioSegment
    from pydub.silence import split_on_silence
    import os
    import shutil
    import sys
    import json
    import torch
    import wenetruntime as wenet
    file = './data/input.mp4'  # hard-coded input video (shadows the module global)
    print("载入术语库")
    shuyu = []
    from ppasr.infer_utils.pun_predictor import PunctuationPredictor
    pun_predictor = PunctuationPredictor(model_dir='pun_models')  # punctuation-restoration model

    # Load the terminology phrases that get extra weight during ASR decoding.
    # (Fixed: the file handle was previously opened and never closed.)
    with open('术语_1.txt', 'r', encoding='utf-8') as shuyu_txt:
        for line in shuyu_txt:
            shuyu.append(line.strip())
    print("载入术语库成功")

    # Extract mono 16 kHz 16-bit PCM audio from the MP4 (ffmpeg must be on PATH).
    a = os.system(
        "ffmpeg -i " + file + " -acodec pcm_s16le -f s16le -ac 1 -ar 16000 -f wav ./temp_wav/1.wav")
    print(a)
    print("开始处理...请等待")
    time.sleep(3)

    # Normalize the audio before segmentation.
    sound = AudioSegment.from_mp3("./temp_wav/1.wav")
    loudness = sound.dBFS  # average loudness; reference point for tuning silence_thresh
    sound = sound.set_channels(1)        # force mono
    sound = sound.set_frame_rate(16000)  # 16 kHz, what the ASR model expects

    # Split the audio on silence.
    #   min_silence_len: quiet stretch (ms) that counts as a break (default 1000)
    #   silence_thresh : level (dBFS) below which audio counts as silence; the
    #                    default -16 sits above this material's average, hence -46
    #   keep_silence   : ms of silence padding kept on each chunk (default 100)
    # NOTE(review): stock pydub's split_on_silence returns only the chunk list;
    # unpacking (chunks, start, end) requires a patched pydub — confirm dependency.
    chunks, start, end = split_on_silence(sound,
                                          min_silence_len=400,
                                          silence_thresh=-46,
                                          keep_silence=400)
    print('总分段:', len(chunks))
    print(start, end)

    # Save every chunk as ./temp/-<index>.wav for per-segment recognition.
    for i, chunk in enumerate(chunks):
        chunk.export("./temp/-{0}.wav".format(i), format="wav")

    def get_format_time(time_long):
        """Format a duration in milliseconds as an SRT timestamp HH:MM:SS,mmm.

        Fixed: the previous version padded milliseconds to only two digits and
        left seconds/milliseconds unpadded in the sub-minute branches (e.g.
        "00:00:5,7"), which is invalid SRT.
        """
        seconds, millis = divmod(int(time_long), 1000)
        minutes, seconds = divmod(seconds, 60)
        hours, minutes = divmod(minutes, 60)
        return "%02d:%02d:%02d,%03d" % (hours, minutes, seconds, millis)

    # Load the ASR decoder, biasing it toward the terminology list.
    # NOTE(review): Windows-only absolute model path — parameterize for deployment.
    decoder = wenet.Decoder(model_dir=r'C:\Users\Goya\.wenet\chs_1', lang='chs',
                            context=shuyu, context_score=10.0)
    base_path = r'./temp'
    files = os.listdir(base_path)
    # Chunk files are named "-<i>.wav", so the numeric sort key is negative and
    # reverse=True actually yields ascending chunk order.
    files.sort(key=lambda x: int(x.split('.')[0]), reverse=True)
    count = 1
    word = []
    for path in files:
        # decode_wav returns a JSON string; parse it safely instead of eval()
        # (fixed: eval on external tool output is an injection hazard).
        ans = json.loads(decoder.decode_wav('./temp/' + path))
        ans = ans["nbest"][0]["sentence"]
        ans = ans.replace('<context>', '')
        ans = ans.replace('</context>', '')
        ans = pun_predictor(ans)  # restore punctuation
        # Wrap long results so a subtitle row stays around 37 characters.
        if len(ans) > 37:
            x = list(ans)
            x.insert(37, '\n')
            ans = ''.join(x)
            if (len(ans)) > 74:
                x = list(ans)
                x.insert(74, '\n')
                ans = ''.join(x)
        print(count, ": ", ans)
        word.append(ans)
        count += 1

    # Write the SRT subtitle file and a plain-text transcript.
    # (Fixed: handles were previously closed only on the happy path.)
    count2 = 1
    with open('./result/1.srt', 'w', encoding='utf-8') as x, \
            open('./result/2.txt', 'w', encoding='utf-8') as x2:
        for a, b, c in zip(start, end, word):
            x2.write(c + '\n')
            m = str(count2) + '\n' + get_format_time(a) + ' --> ' + get_format_time(b) + '\n' + c + '\n' + '\n'
            count2 += 1
            x.write(m)

    print("正在加字幕,请稍后\n")
    # Burn the subtitles into the video.
    a = os.system("ffmpeg -i " + file + " -vf subtitles=./result/1.srt -y ./result/output.mp4")
    # Clear the scratch directories for the next run.
    shutil.rmtree('./temp')
    os.mkdir('./temp')
    shutil.rmtree('./temp_wav')
    os.mkdir('./temp_wav')
    print("消耗时间为:" + str(time.time() - start1))
    return
# Wire the pipeline into a Gradio UI: video in, subtitled video out.
# NOTE(review): to_black() returns None, so the "video" output widget receives
# nothing — it likely should return './result/output.mp4'; confirm.
interface = gr.Interface(fn=to_black, inputs="video", outputs="video")
interface.launch()