"""Streamlit app: upload a wav/ogg/mp3 file and transcribe it (Portuguese) with
OpenAI Whisper, rendering the result as SRT-style numbered caption blocks."""

import streamlit as st
import os

# NOTE(review): installing a dependency at import time needs network access and
# runs on every cold start; kept because the original deployment (e.g. a hosted
# Space) relies on it to provide whisper. Confirm before changing.
os.system("pip install git+https://github.com/openai/whisper.git")

import whisper
from whisper import utils
import ffmpeg
from transformers import pipeline
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import numpy as np

SAMPLE_RATE = 16000


def load_audio(file: str, sr: int = SAMPLE_RATE) -> np.ndarray:
    """Decode an audio file to mono float32 PCM at ``sr`` Hz.

    Launches an ffmpeg subprocess to down-mix and resample as necessary.
    Requires the ffmpeg CLI and the `ffmpeg-python` package to be installed.

    Args:
        file: Path to the audio file to decode.
        sr: Target sample rate in Hz (defaults to ``SAMPLE_RATE``).

    Returns:
        A 1-D float32 array with samples normalized to [-1.0, 1.0).
    """
    out, _ = (
        ffmpeg.input(file, threads=0)
        .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
        .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
    )
    # Reinterpret the raw s16le byte stream and scale to float in [-1, 1).
    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0


@st.cache_resource
def _load_whisper_model(name: str = "tiny"):
    """Load the Whisper model once and cache it across Streamlit reruns.

    BUG FIX: the original guarded loading with ``if 'processor' not in
    locals():``, but ``processor`` was only assigned in commented-out code, so
    the guard was always true and the model reloaded on every rerun.
    ``st.cache_resource`` provides the intended load-once behavior.
    """
    return whisper.load_model(name)


with st.spinner('Wait for it...'):
    model = _load_whisper_model("tiny")

wav_up = st.file_uploader("Upload", type=['wav', 'ogg', 'mp3'])

audio = None
if wav_up is not None:
    file_details = {"FileName": wav_up.name, "FileType": wav_up.type}
    st.write(file_details)
    # Persist the upload so both whisper and st.audio can read it from disk.
    with open(wav_up.name, "wb") as f:
        f.write(wav_up.getbuffer())
    st.success("Saved File")
    audio = whisper.load_audio(wav_up.name)
    audio = whisper.pad_or_trim(audio)
    st.audio(wav_up.name, format="audio/wav", start_time=0)

if st.button('Processa'):
    if wav_up is not None:
        with st.spinner('Wait for it...'):
            transcription = model.transcribe(audio, language='pt')
            # BUG FIX: ``model.transcribe`` returns a dict; iterating it
            # directly yields its KEYS ("text", "segments", "language"), so the
            # original ``segment['start']`` raised TypeError. Iterate the
            # "segments" list instead. Also build the SRT text with a list +
            # join instead of quadratic string concatenation.
            srt_parts = []
            for i, segment in enumerate(transcription["segments"], start=1):
                start = utils.format_timestamp(
                    segment['start'], always_include_hours=True, decimal_marker=','
                )
                end = utils.format_timestamp(
                    segment['end'], always_include_hours=True, decimal_marker=','
                )
                # "-->" inside caption text would break the SRT cue separator.
                text = segment['text'].strip().replace('-->', '->')
                srt_parts.append(f"\n{i}\n{start} --> {end}\n{text}\n")
            st.write("".join(srt_parts))
            st.success("Texto Gerado")