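"""Near-continuous speech-to-text: stream microphone audio from a Gradio app
and transcribe it with OpenAI Whisper in rolling chunks of at most ~30 s."""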
import datetime
import os
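# Install Whisper from GitHub at runtime (a workaround sometimes used on
# hosted Spaces in place of pinning the dependency in requirements.txt).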
os.system('pip install git+https://github.com/openai/whisper.git')
from whisper.audio import N_SAMPLES
import gradio as gr
import wave
import whisper
import logging
import torchaudio
import torchaudio.functional as F
LOGGING_FORMAT = '%(asctime)s %(message)s'
logging.basicConfig(format=LOGGING_FORMAT, level=logging.INFO)
# Run recognition once every RECOGNITION_INTERVAL streaming callbacks.
RECOGNITION_INTERVAL = 2
# Number of streaming callbacks concatenated into one audio chunk before a
# new chunk (and a new WAV file) is started.
CNT_PER_CHUNK = 6
# tmp dir to store audio files.
if not os.path.isdir('./tmp/'):
    os.mkdir('./tmp')
class WhisperStreaming():
    def __init__(self, model_name='base', language='en', fp16=False):
        self.model_name = model_name
        self.language = language
        self.fp16 = fp16
        # English-only Whisper checkpoints are named '<size>.<language>', e.g. 'base.en'.
        self.whisper_model = whisper.load_model(f'{model_name}.{language}')
        self.decode_option = whisper.DecodingOptions(language=self.language,
                                                     without_timestamps=True,
                                                     fp16=self.fp16)
        # Whisper models expect 16 kHz mono input.
        self.whisper_sample_rate = 16000
    def transcribe_audio_file(self, wave_file_path):
        # Load the WAV file, resample it to Whisper's 16 kHz, pad/trim it to
        # the model's fixed 30 s input window, and decode its log-Mel spectrogram.
        waveform, sample_rate = torchaudio.load(wave_file_path)
        resampled_waveform = F.resample(waveform, sample_rate, self.whisper_sample_rate, lowpass_filter_width=6)
        audio_tmp = whisper.pad_or_trim(resampled_waveform[0], length=N_SAMPLES)
        mel = whisper.log_mel_spectrogram(audio_tmp)
        results = self.whisper_model.decode(mel, self.decode_option)
        return results
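# A minimal offline usage sketch of the class above (the WAV path is
# illustrative, not part of this app):
#
#   model = WhisperStreaming(model_name='base', language='en', fp16=False)
#   print(model.transcribe_audio_file('./tmp/sample.wav').text)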
def concat_multiple_wav_files(wav_files):
    logging.info(f'Concat {wav_files}')
    concat_audio = []
    for wav_file in wav_files:
        w = wave.open(wav_file, 'rb')
        concat_audio.append([w.getparams(), w.readframes(w.getnframes())])
        w.close()
        logging.info(f'Delete audio file {wav_file}')
        os.remove(wav_file)
    output_file_name = f'{datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")}.wav'
    output_file_path = os.path.join('./tmp', output_file_name)
    output = wave.open(output_file_path, 'wb')
    # All inputs come from the same microphone stream, so the first file's
    # WAV parameters are reused for the concatenated output.
    output.setparams(concat_audio[0][0])
    for _, frames in concat_audio:
        output.writeframes(frames)
    output.close()
    logging.info(f'Concatenated {len(wav_files)} wav files into {output_file_path}')
    return output_file_path
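# Usage sketch (file names are illustrative): merge two recorded callback
# files into one WAV; the inputs are deleted and the merged path is returned.
#
#   merged_path = concat_multiple_wav_files(['./tmp/a.wav', './tmp/b.wav'])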
# fp16 toggles Float16 vs. Float32 inference. PyTorch normally does not
# support fp16 inference on CPU, so fp16=False is used here.
whisper_model = WhisperStreaming(model_name='base', language='en', fp16=False)
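# Gradio invokes `transcribe` once per streaming callback and threads `state`
# through consecutive calls. The state layout used below:
#   state['count']           - number of callbacks handled so far
#   state['chunks'][i]       - dict with 'concated_audio' (path of the chunk's
#                              concatenated WAV) and 'result_text' (its text)
#   state['all_chunk_texts'] - running transcript returned to the UI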
def transcribe(audio, state={}):
    logging.info(f'Transcribe audio file {audio}')
    logging.info(state)
    # Whisper takes at most 30 s of audio as input, and Gradio streaming does
    # not guarantee that each callback delivers exactly 1 s of audio.
    # CNT_PER_CHUNK = 6 is a rough guess that 6 callbacks do not add up to
    # more than 30 s; the chunk splitting could be improved by reading the
    # exact number of samples in the audio files.
    # Once the count reaches a multiple of CNT_PER_CHUNK, a new chunk (and a
    # new audio file) is started. The text of earlier chunks should not change.
    if not state:
        # The very first callback only initializes the state.
        state['all_chunk_texts'] = 'Waiting...'
        state['count'] = 0
        state['chunks'] = {}
        return state['all_chunk_texts'], state
    chunk = state['count'] // CNT_PER_CHUNK
    chunk_offset = state['count'] % CNT_PER_CHUNK
    if chunk_offset == 0:
        # First callback of a new chunk.
        state['chunks'][chunk] = {}
        state['chunks'][chunk]['concated_audio'] = audio
        state['chunks'][chunk]['result_text'] = ''
    else:
        # Append this callback's audio to the current chunk.
        state['chunks'][chunk]['concated_audio'] = concat_multiple_wav_files([state['chunks'][chunk]['concated_audio'], audio])
    state['count'] += 1
    # Determine whether to transcribe the current chunk on this callback.
    if (chunk_offset + 1) % RECOGNITION_INTERVAL == 0 and chunk_offset > 0:
        logging.info(f'start to transcribe chunk: {chunk}, offset: {chunk_offset}')
        result = whisper_model.transcribe_audio_file(state['chunks'][chunk]['concated_audio'])
        logging.info('complete transcribe.......')
        state['chunks'][chunk]['result_text'] = result.text
        logging.info('The text is: ' + state['chunks'][chunk]['result_text'])
    else:
        logging.info(f'The offset of streaming chunk is {chunk_offset}, skipping speech recognition')
    # Concatenate the result_texts of all chunks into the running transcript.
    result_texts = ''
    for tmp_chunk_values in state['chunks'].values():
        result_texts += tmp_chunk_values['result_text'] + ' '
    state['all_chunk_texts'] = result_texts
    return state['all_chunk_texts'], state
# Make sure no audio clip is missed: this guarantees that the last callback
# of every chunk (offset CNT_PER_CHUNK - 1) triggers recognition.
assert CNT_PER_CHUNK % RECOGNITION_INTERVAL == 0
STEP_ONE_DESCRIPTION = '''
Model: base
Language: en
<div>
    <h3>
        Step 1. Click the <i>"Record from microphone"</i> button and allow this site to use your microphone.
    </h3>
    <p>Note: right now the continuous speech-to-text transcription lags and sometimes misses sentences...</p>
</div>
'''
STEP_TWO_DESCRIPTION = '''
<div align=center>
    <h3 style="font-weight: 900; margin-bottom: 7px;">
        Step 2. Play the video and see how Whisper transcribes it!
    </h3>
    <p>
        Note: make sure to play it through a speaker your computer's microphone can hear, e.g. the computer's default speaker.
    </p>
    <video id="video" width=50% controls="" preload="none">
        <source id="mp4" src="https://nomorewzx.github.io/near-continuous-whispering/demo_video/whisper_demo.mp4" type="video/mp4">
    </video>
</div>
'''
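# live=True streams microphone audio into `transcribe` as it is recorded;
# queue(concurrency_count=5) lets up to five sessions be processed concurrently.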
gr.Interface(fn=transcribe,
             inputs=[gr.Audio(source="microphone", type='filepath', streaming=True), 'state'],
             outputs=['text', 'state'],
             description=STEP_ONE_DESCRIPTION,
             article=STEP_TWO_DESCRIPTION,
             live=True).queue(concurrency_count=5).launch()