# speech_to_text / app.py
# Photon08's picture
# Update app.py
# 37e5baf
import torch
import whisper
import pytube
import librosa
import streamlit as st
import numpy as np
from fpdf import FPDF
from reportlab.pdfgen.canvas import Canvas
import time
def predict(url=None, translation="No", tran_lang="en"):
    """Transcribe the audio track of a YouTube video with Whisper.

    The audio is downloaded once and reused for language detection,
    transcription and (optionally) the second, language-forced pass.

    Args:
        url: Address of the YouTube video to process.
        translation: ``"Yes"`` to run the extra pass with ``tran_lang``;
            any other value skips it.
        tran_lang: Language code handed to Whisper for the extra pass.

    Returns:
        A ``(detected_lang, transcription, trans)`` tuple. ``detected_lang``
        is the language code with the highest detection probability,
        ``transcription`` is the transcribed text and ``trans`` is the text
        of the extra pass, or ``None`` when ``translation`` is not ``"Yes"``.

    Raises:
        ValueError: If ``url`` is empty or the video exposes no audio stream.
    """
    # Validate before the (expensive) model load so bad input fails fast.
    if not url or not str(url).strip():
        raise ValueError("A YouTube url is required.")

    model_m = whisper.load_model("tiny")

    # Download the audio of the *requested* video; language detection must
    # look at the same audio that gets transcribed.
    v_data = pytube.YouTube(url)
    speech = v_data.streams.get_audio_only()
    if speech is None:
        raise ValueError("No audio stream is available for this video.")
    test_audio_file = speech.download()

    # Language detection works on a single fixed-size spectrogram window.
    audio = whisper.pad_or_trim(whisper.load_audio(test_audio_file))
    mel = whisper.log_mel_spectrogram(audio).to(model_m.device)
    _, probs = model_m.detect_language(mel)
    detected_lang = max(probs, key=probs.get)

    transcription = model_m.transcribe(test_audio_file, fp16=False)["text"]

    trans = None
    if translation == "Yes":
        # NOTE(review): this forces the decoding language rather than using
        # Whisper's task="translate" mode -- confirm this is the intended
        # way to obtain a translation for non-English targets.
        trans = model_m.transcribe(
            test_audio_file, language=tran_lang, fp16=False
        )["text"]

    # Always the same shape so callers can unpack three values safely.
    return detected_lang, transcription, trans
# --- Streamlit page: collect the inputs, run the model, show the results ---
st.image(image="https://www.respeecher.com/hubfs/What-is-Text-to-Speech-TTS%29-Initial-Speech-Synthesis-Explained-Respeecher-voice-cloning-software.jpeg",output_format="JPEG")
st.title("Speech to Text generator")
st.write("This app uses an open source neural net called Whisper(developed by OpenAI)")

url = st.text_input(label="Please enter the YouTube url: ")
tran_req = st.selectbox(label="Do you want to translate the transcript?", options=("Yes", "No"))
if tran_req == "Yes":
    lang = st.selectbox(label="Please select the required language: ", options=("en", "fr", "ja"))
else:
    lang = "en"

if st.button("Generate"):
    if not url.strip():
        st.error("Please enter a YouTube url first.")
    else:
        # st.spinner is a context manager: it is only displayed while the
        # body of the ``with`` block is running.
        with st.spinner("Fetching the video and running the Speech to Text engine..."):
            result = predict(url, translation=tran_req, tran_lang=lang)

        # Accept both a 2-tuple and a 3-tuple result so that a run without
        # translation cannot fail while unpacking.
        lang_d, transcription, *extra = result
        trans = extra[0] if extra else None

        st.write("Detected language:", lang_d)
        # TODO: offer the transcript as a downloadable PDF.
        st.write(transcription)

        # Only show the translation section when a translation was produced.
        if trans is not None:
            st.write("Translation: ")
            st.write(trans)

        st.success("Speech to text converted successfully!")