Spaces:

Photon08
/

speech_to_text

Runtime error

File size: 2,748 Bytes

d4b6fc6
 
 
 
 
 
 
fb8c0b6
667630b
d4b6fc6
 
 
0ca3a16
d4b6fc6
0ca3a16
d4b6fc6
8407d4b
 
 
 
 
6f66c35
d4b6fc6
 
 
 
 
 
 
33186cb
 
 
 
 
 
 
 
d4b6fc6
 
 
 
 
 
 
 
 
8508782
33186cb
d4b6fc6
 
37e5baf
a0e4077
 
 
d4b6fc6
667630b
d4b6fc6
 
 
13aaabd
d4b6fc6
 
 
 
c147b83
d4b6fc6
667630b
c147b83
667630b
 
 
5f57e08
 
 
 
 
c147b83
667630b
5f57e08
667630b
 
c147b83

import torch
import whisper
import pytube
import librosa
import streamlit as st
import numpy as np
from fpdf import FPDF
from reportlab.pdfgen.canvas import Canvas
import time



def predict(url=None, translation="No", tran_lang="en"):
    """Transcribe the audio track of a YouTube video with Whisper.

    The audio is downloaded once and used both for language detection and
    for transcription, so the reported language always refers to the video
    the caller asked for.

    Args:
        url: Address of the YouTube video to process. Required despite the
            ``None`` default, which is kept for signature compatibility.
        translation: ``"Yes"`` to additionally decode the audio with the
            language forced to ``tran_lang``; any other value skips it.
        tran_lang: Whisper language code used for the second decoding pass.

    Returns:
        A 3-tuple ``(detected_lang, transcription, trans)``.
        ``detected_lang`` is the language code with the highest probability,
        ``transcription`` is the transcript text, and ``trans`` is the text
        of the second pass, or ``None`` when ``translation`` is not ``"Yes"``.

    Raises:
        ValueError: If ``url`` is empty or the video exposes no audio stream.
    """
    if not url:
        raise ValueError("A YouTube URL is required.")

    # Imported locally so the block does not depend on a new file-level import.
    import tempfile

    model = whisper.load_model("tiny")

    stream = pytube.YouTube(url).streams.get_audio_only()
    if stream is None:
        raise ValueError(f"No audio stream found for {url!r}.")

    # Download into a throw-away directory so repeated requests do not leave
    # audio files behind in the working directory.
    with tempfile.TemporaryDirectory() as tmp_dir:
        audio_path = stream.download(output_path=tmp_dir)

        # Language detection works on a single fixed-length window, hence
        # the pad/trim before computing the log-Mel spectrogram.
        samples = whisper.pad_or_trim(whisper.load_audio(audio_path))
        mel = whisper.log_mel_spectrogram(samples).to(model.device)
        _, probs = model.detect_language(mel)
        detected_lang = max(probs, key=probs.get)

        transcription = model.transcribe(audio_path, fp16=False)["text"]

        trans = None
        if translation == "Yes":
            # NOTE(review): this forces the decoding language rather than
            # using Whisper's task="translate" (which targets English only);
            # kept as in the original - confirm the output quality is
            # acceptable for the offered languages.
            trans = model.transcribe(
                audio_path, language=tran_lang, fp16=False
            )["text"]

    return detected_lang, transcription, trans
st.image(image="https://www.respeecher.com/hubfs/What-is-Text-to-Speech-TTS%29-Initial-Speech-Synthesis-Explained-Respeecher-voice-cloning-software.jpeg",output_format="JPEG")
st.title("Speech to Text generator")

st.write("This app uses an open source neural net called Whisper(developed by OpenAI)")

# --- User input -----------------------------------------------------------
url = st.text_input(label="Please enter the YouTube url: ")
tran_req = st.selectbox(label="Do you want to translate the transcript?", options=("Yes", "No"))

# The target language is only asked for when a translation was requested.
if tran_req == "Yes":
    lang = st.selectbox(label="Please select the required language: ", options=("en", "fr", "ja"))
else:
    lang = "en"

# --- Run the pipeline and render the results -------------------------------
if st.button("Generate"):
    if not url.strip():
        st.warning("Please enter a YouTube url first.")
    else:
        # st.spinner only shows something when used as a context manager
        # around the slow work; a bare call is a no-op.
        with st.spinner("Speech to Text engine running..."):
            result = predict(url, translation=tran_req, tran_lang=lang)

        # Accept both a (language, transcript) pair and a
        # (language, transcript, translation) triple, so a run without
        # translation does not fail while unpacking.
        lang_d, transcription, *extra = result
        trans = extra[0] if extra else None

        st.write("Detected language:", lang_d)
        # TODO: offer the transcript as a downloadable PDF (an earlier attempt
        # used reportlab's Canvas together with st.download_button).
        st.write(transcription)

        # Only show the translation section when one was actually produced.
        if tran_req == "Yes" and trans is not None:
            st.write("Translation: ")
            st.write(trans)

        st.success("Speech to text converted successfully!")