"""Streamlit app: upload a wav/ogg/mp3 file and transcribe it (Portuguese) with
OpenAI Whisper, rendering the result as SRT-style numbered caption blocks."""

import streamlit as st
import os

# NOTE(review): installing a dependency at import time needs network access and
# runs on every cold start; kept because the original deployment (e.g. a hosted
# Space) relies on it to provide whisper. Confirm before changing.
os.system("pip install git+https://github.com/openai/whisper.git")

import whisper
from whisper import utils
import ffmpeg
from transformers import pipeline
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import numpy as np

SAMPLE_RATE = 16000


def load_audio(file: str, sr: int = SAMPLE_RATE) -> np.ndarray:
    """Decode an audio file to mono float32 PCM at ``sr`` Hz.

    Launches an ffmpeg subprocess to down-mix and resample as necessary.
    Requires the ffmpeg CLI and the `ffmpeg-python` package to be installed.

    Args:
        file: Path to the audio file to decode.
        sr: Target sample rate in Hz (defaults to ``SAMPLE_RATE``).

    Returns:
        A 1-D float32 array with samples normalized to [-1.0, 1.0).
    """
    out, _ = (
        ffmpeg.input(file, threads=0)
        .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
        .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
    )
    # Reinterpret the raw s16le byte stream and scale to float in [-1, 1).
    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0


@st.cache_resource
def _load_whisper_model(name: str = "tiny"):
    """Load the Whisper model once and cache it across Streamlit reruns.

    BUG FIX: the original guarded loading with ``if 'processor' not in
    locals():``, but ``processor`` was only assigned in commented-out code, so
    the guard was always true and the model reloaded on every rerun.
    ``st.cache_resource`` provides the intended load-once behavior.
    """
    return whisper.load_model(name)


with st.spinner('Wait for it...'):
    model = _load_whisper_model("tiny")

wav_up = st.file_uploader("Upload", type=['wav', 'ogg', 'mp3'])

audio = None
if wav_up is not None:
    file_details = {"FileName": wav_up.name, "FileType": wav_up.type}
    st.write(file_details)
    # Persist the upload so both whisper and st.audio can read it from disk.
    with open(wav_up.name, "wb") as f:
        f.write(wav_up.getbuffer())
    st.success("Saved File")
    audio = whisper.load_audio(wav_up.name)
    audio = whisper.pad_or_trim(audio)
    st.audio(wav_up.name, format="audio/wav", start_time=0)

if st.button('Processa'):
    if wav_up is not None:
        with st.spinner('Wait for it...'):
            transcription = model.transcribe(audio, language='pt')
            # BUG FIX: ``model.transcribe`` returns a dict; iterating it
            # directly yields its KEYS ("text", "segments", "language"), so the
            # original ``segment['start']`` raised TypeError. Iterate the
            # "segments" list instead. Also build the SRT text with a list +
            # join instead of quadratic string concatenation.
            srt_parts = []
            for i, segment in enumerate(transcription["segments"], start=1):
                start = utils.format_timestamp(
                    segment['start'], always_include_hours=True, decimal_marker=','
                )
                end = utils.format_timestamp(
                    segment['end'], always_include_hours=True, decimal_marker=','
                )
                # "-->" inside caption text would break the SRT cue separator.
                text = segment['text'].strip().replace('-->', '->')
                srt_parts.append(f"\n{i}\n{start} --> {end}\n{text}\n")
            st.write("".join(srt_parts))
            st.success("Texto Gerado")