import streamlit as st
import os

# Install openai-whisper at startup (workaround for environments such as
# Hugging Face Spaces where it is not preinstalled).
os.system("pip install git+https://github.com/openai/whisper.git")
import whisper

import ffmpeg
import numpy as np
from transformers import AutoProcessor

SAMPLE_RATE = 16000



def load_audio(file: str, sr: int = SAMPLE_RATE):
    # Launch an ffmpeg subprocess to decode the audio, down-mixing to mono
    # and resampling to `sr` as necessary.
    # Requires the ffmpeg CLI and the `ffmpeg-python` package.
    out, _ = (
        ffmpeg.input(file, threads=0)
        .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
        .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
    )
    # Convert 16-bit PCM bytes to float32 samples in [-1.0, 1.0].
    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
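
# Usage sketch for load_audio (the file name is hypothetical; any format the
# ffmpeg CLI can decode should work):
#   samples = load_audio("example.ogg")  # float32 mono at 16 kHz
#   st.write(samples.shape, samples.dtype)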
# Cache the models in st.session_state so they are not reloaded on every
# rerun (a plain `'processor' not in locals()` check resets each time,
# because Streamlit re-executes the whole script).
if "processor" not in st.session_state:
    with st.spinner("Wait for it..."):
        st.session_state.processor = AutoProcessor.from_pretrained("openai/whisper-tiny")
        st.session_state.model = whisper.load_model("tiny")
processor = st.session_state.processor
model = st.session_state.model

# Earlier experiments, kept for reference:
# pipe = pipeline("sentiment-analysis")
# pipe2 = pipeline(task="image-bestify", model="beihai/GFPGAN-V1.3-whole-image")
# text = st.text_area("enter some text")
# st.title("Wav to be transcribed")
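
# Alternative caching sketch (an assumption, not part of the original app;
# requires Streamlit >= 1.18 for st.cache_resource):
#   @st.cache_resource
#   def get_model():
#       return whisper.load_model("tiny")
#   model = get_model()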

wav_up = st.file_uploader("Upload", type=["wav", "ogg", "mp3"])
if wav_up is not None:
    file_details = {"FileName": wav_up.name, "FileType": wav_up.type}
    st.write(file_details)

    # Save the upload to disk so whisper/ffmpeg can read it by path; close
    # the file before reading it back, so the bytes are flushed.
    with open(wav_up.name, "wb") as f:
        f.write(wav_up.getbuffer())
    st.success("Saved File")

    # Decode, then pad or trim to the 30-second window Whisper expects.
    audio = whisper.load_audio(wav_up.name)
    audio = whisper.pad_or_trim(audio)

    st.audio(wav_up.name, format="audio/wav", start_time=0)
if st.button("Process"):
    if wav_up is not None:
        with st.spinner("Wait for it..."):
            # Transcribe with openai-whisper. Language is fixed to
            # Portuguese here; pass language=None to auto-detect.
            transcription = model.transcribe(audio, language="pt")

            # Alternative transformers path, kept for reference. It needs a
            # Hugging Face model (e.g. AutoModelForSpeechSeq2Seq) instead of
            # the openai-whisper model loaded above:
            # input_features = processor(
            #     audio, sampling_rate=SAMPLE_RATE, return_tensors="pt"
            # ).input_features
            # forced_decoder_ids = processor.get_decoder_prompt_ids(
            #     language=None, task="transcribe"
            # )
            # predicted_ids = hf_model.generate(
            #     input_features, forced_decoder_ids=forced_decoder_ids
            # )
            # transcription = processor.batch_decode(
            #     predicted_ids, skip_special_tokens=True
            # )

            # Write SRT lines from the result (format_timestamp comes from
            # whisper.utils):
            # srt = ""
            # for i, segment in enumerate(transcription["segments"], start=1):
            #     srt += (
            #         f"{i}\n"
            #         f"{format_timestamp(segment['start'], always_include_hours=True, decimal_marker=',')} --> "
            #         f"{format_timestamp(segment['end'], always_include_hours=True, decimal_marker=',')}\n"
            #         f"{segment['text'].strip().replace('-->', '->')}\n\n"
            #     )

            st.json(transcription)
        st.success("Text generated")
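
# Note: model.transcribe returns a dict with "text", "segments", and
# "language" keys, so the plain transcript alone could be shown with:
#   st.write(transcription["text"])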