Spaces:
Runtime error
Runtime error
File size: 3,007 Bytes
ff2e3f8 f70dae2 a94cdbb 488bdb6 ff2e3f8 afd9eb4 420dbaa dcd9326 48f4751 dcd9326 5132feb dcd9326 5132feb dcd9326 5132feb dcd9326 1dccde5 1647498 48f4751 1647498 48f4751 afd9eb4 0701407 71d01f7 0701407 928ff71 ca52925 75acbad 0701407 48f4751 0cc7e88 75acbad 9ed6c43 0701407 75acbad 0701407 fedbdeb 64cb4e5 0701407 48f4751 0701407 48f4751 64cb4e5 2cd93d3 64cb4e5 2cd93d3 0701407 afd9eb4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
import os

# Install openai-whisper at startup (HF Spaces has no requirements pin for it).
os.system("pip install git+https://github.com/openai/whisper.git")

import ffmpeg
import numpy as np
import streamlit as st
import whisper  # was pip-installed above but never imported -> NameError at load_model
from transformers import pipeline
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
SAMPLE_RATE = 16000
def load_audio(file: str, sr: int = SAMPLE_RATE) -> np.ndarray:
    """Decode an audio file to a mono float32 waveform.

    Launches an ffmpeg subprocess to decode the audio, down-mixing to one
    channel and resampling to ``sr`` Hz as necessary.  Requires the ffmpeg
    CLI and the ``ffmpeg-python`` package to be installed.

    Parameters
    ----------
    file : str
        Path of the audio file to decode.
    sr : int
        Target sample rate in Hz (default 16 kHz, the rate Whisper expects).

    Returns
    -------
    np.ndarray
        1-D float32 array of samples normalized to [-1.0, 1.0].

    Raises
    ------
    RuntimeError
        If ffmpeg fails to decode the file (missing, unreadable, or corrupt).
    """
    try:
        # -nostdin keeps ffmpeg from reading the terminal; threads=0 lets
        # ffmpeg choose its own thread count.
        out, _ = (
            ffmpeg.input(file, threads=0)
            .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
        )
    except ffmpeg.Error as e:
        # Surface ffmpeg's stderr so the caller can see why decoding failed,
        # instead of leaking a bare ffmpeg.Error with no context.
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e

    # Raw s16le PCM -> float32 in [-1, 1].
    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
import whisper  # pip-installed at the top of the file; was never imported before

# Streamlit reruns the whole script on every user interaction, so `locals()`
# is always fresh and the original `'processor' not in locals()` guard never
# prevented a reload.  Cache the heavyweight objects in st.session_state so
# the models are only loaded once per browser session.
if "processor" not in st.session_state:
    with st.spinner('Wait for it...'):
        # HF processor (feature extractor + tokenizer) for Whisper-tiny.
        st.session_state.processor = AutoProcessor.from_pretrained("openai/whisper-tiny")
        # openai-whisper model used for the actual transcription below.
        st.session_state.model = whisper.load_model("tiny")

# Keep the module-level names that the rest of the script references.
processor = st.session_state.processor
model = st.session_state.model
#pipe = pipeline('sentiment-analysis')
#pipe2 = pipeline(task="image-bestify", model="beihai/GFPGAN-V1.3-whole-image")
#text = st.text_area('entre com algum texto')
#st.title("Wav a ser transcrito ")
# --- File upload: persist the audio locally and preview it ------------------
wav_up = st.file_uploader("Upload", type=['wav', 'ogg', 'mp3'])
if wav_up is not None:
    file_details = {"FileName": wav_up.name, "FileType": wav_up.type}
    st.write(file_details)

    # Save the upload to disk so whisper/ffmpeg can read it by path.
    with open(wav_up.name, "wb") as f:
        f.write(wav_up.getbuffer())
    st.success("Saved File")

    # Decode to 16 kHz mono float32 and pad/trim to Whisper's fixed window
    # (audio longer than the window is truncated by pad_or_trim).
    audio = whisper.load_audio(wav_up.name)
    audio = whisper.pad_or_trim(audio)

    # Use the upload's actual MIME type instead of hard-coding "audio/wav",
    # so OGG and MP3 uploads render with the correct player format too.
    st.audio(wav_up.name, format=wav_up.type or "audio/wav", start_time=0)
# --- Transcription on button press ------------------------------------------
if st.button('Processa'):
    if wav_up is not None:
        with st.spinner('Wait for it...'):
            # Transcribe with openai-whisper.  The previous version also
            # pushed the audio through the HF processor and called
            # `model.generate(input_features, forced_decoder_ids=...)`, but
            # `model` is an openai-whisper model and has no HF-style
            # `generate` API — that dead path crashed the app at runtime and
            # has been removed.
            transcription = model.transcribe(
                audio,
                language='pt',  # Portuguese; pass None to auto-detect
            )
            # transcribe() returns a dict with "text", "segments" and
            # "language" keys; render it verbatim for inspection.
            st.json(transcription)
            st.success("Texto Gerado")
|