import os

import streamlit as st

# Install Whisper at startup (e.g. on a host where it is not preinstalled),
# then import it.
os.system("pip install git+https://github.com/openai/whisper.git")
import whisper
from whisper import utils

import ffmpeg
import numpy as np

SAMPLE_RATE = 16000
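# Whisper models expect 16 kHz mono input, which is what load_audio below
# produces by default.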



def load_audio(file: str, sr: int = SAMPLE_RATE):
    # Launch an ffmpeg subprocess to decode the audio, down-mixing to mono and
    # resampling as necessary. Requires the ffmpeg CLI and the `ffmpeg-python`
    # package to be installed.
    out, _ = (
        ffmpeg.input(file, threads=0)
        .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
        .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
    )
    # Convert the raw 16-bit PCM bytes to float32 samples in [-1.0, 1.0].
    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
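# Example usage (a sketch; "exemplo.wav" is a placeholder file name):
#
#     samples = load_audio("exemplo.wav")     # float32 mono at 16 kHz
#     samples = whisper.pad_or_trim(samples)  # fixed 30-second window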
# Load the model once and keep it in session_state so it survives
# Streamlit reruns.
if "model" not in st.session_state:
    with st.spinner("Wait for it..."):
        st.session_state.model = whisper.load_model("tiny")
model = st.session_state.model
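# Other checkpoint names accepted by whisper.load_model include "base",
# "small", "medium", and "large"; larger checkpoints are more accurate but
# slower to download and run.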



# st.title("Wav to be transcribed")

wav_up = st.file_uploader("Upload", type=["wav", "ogg", "mp3"])
if wav_up is not None:
    file_details = {"FileName": wav_up.name, "FileType": wav_up.type}
    st.write(file_details)

    # Save the upload to disk first: whisper.load_audio decodes via the
    # ffmpeg CLI, which reads from a file path, so the file must be closed
    # before decoding.
    with open(wav_up.name, "wb") as f:
        f.write(wav_up.getbuffer())
    st.success("Saved File")

    audio = whisper.load_audio(wav_up.name)
    audio = whisper.pad_or_trim(audio)

    st.audio(wav_up.name, format="audio/wav", start_time=0)
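    # A possible variant (a sketch, not the original behavior): write to a
    # temporary file to avoid name collisions in the working directory.
    #
    #     import tempfile
    #     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
    #         tmp.write(wav_up.getbuffer())
    #     audio = whisper.load_audio(tmp.name)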
if st.button("Process"):
    if wav_up is not None:
        with st.spinner("Wait for it..."):
            transcription = model.transcribe(
                audio,
                language="pt",
            )

            # Write SRT-style cues: a counter, the timestamp range, then
            # the segment text.
            string1 = ""
            for i, segment in enumerate(transcription["segments"], start=1):
                start = utils.format_timestamp(
                    segment["start"], always_include_hours=True, decimal_marker=","
                )
                end = utils.format_timestamp(
                    segment["end"], always_include_hours=True, decimal_marker=","
                )
                text = segment["text"].strip().replace("-->", "->")
                string1 += f"\n{i}\n{start} --> {end}\n{text}\n"
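        # With made-up timestamps, the generated string resembles standard
        # SRT output:
        #
        #     1
        #     00:00:00,000 --> 00:00:03,500
        #     Olá, bom dia.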
        st.write(string1)
        st.success("Text generated")