import time
from typing import Optional, Tuple

import librosa
import numpy as np
import pytube
import streamlit as st
import torch
import whisper
from fpdf import FPDF
from reportlab.pdfgen.canvas import Canvas


def predict(
    url: Optional[str] = None,
    translation: str = "No",
    tran_lang: str = "en",
) -> Tuple[str, str, Optional[str]]:
    """Download a YouTube video's audio and transcribe it with Whisper.

    The audio is downloaded once and reused for language detection,
    transcription and (optionally) the second, language-forced pass.

    Args:
        url: YouTube video URL to process.
        translation: ``"Yes"`` to also produce the second pass decoded with
            ``tran_lang``; any other value skips it.
        tran_lang: Language code passed to Whisper for the second pass.

    Returns:
        A ``(detected_lang, transcription, trans)`` tuple. ``trans`` is
        ``None`` when no translation was requested, so callers can always
        unpack three values.

    Raises:
        ValueError: If ``url`` is empty or ``None``.
        RuntimeError: If the video exposes no audio-only stream.
    """
    if not url:
        raise ValueError("A YouTube URL is required.")

    model_m = whisper.load_model("tiny")

    # Fetch the audio of the video the user asked for (once).
    stream = pytube.YouTube(url).streams.get_audio_only()
    if stream is None:
        raise RuntimeError("No audio-only stream is available for this video.")
    audio_file = stream.download()

    # Language detection works on a single padded/trimmed window of the audio.
    audio = whisper.pad_or_trim(whisper.load_audio(audio_file))
    mel = whisper.log_mel_spectrogram(audio).to(model_m.device)
    _, probs = model_m.detect_language(mel)
    # probs maps language code -> probability; pick the most likely code.
    detected_lang = max(probs, key=probs.get)

    transcription = model_m.transcribe(audio_file, fp16=False)["text"]

    trans = None
    if translation == "Yes":
        # NOTE(review): Whisper's documented translation task
        # (task="translate") targets English only; forcing `language` for
        # other targets is best-effort -- confirm the output is as intended.
        trans = model_m.transcribe(
            audio_file, language=tran_lang, fp16=False
        )["text"]

    return detected_lang, transcription, trans


st.image(
    image="https://www.respeecher.com/hubfs/What-is-Text-to-Speech-TTS%29-Initial-Speech-Synthesis-Explained-Respeecher-voice-cloning-software.jpeg",
    output_format="JPEG",
)
st.title("Speech to Text generator")
st.write("This app uses an open source neural net called Whisper(developed by OpenAI)")

url = st.text_input(label="Please enter the YouTube url: ")
tran_req = st.selectbox(
    label="Do you want to translate the transcript?",
    options=("Yes", "No"),
)
if tran_req == "Yes":
    lang = st.selectbox(
        label="Please select the required language: ",
        options=("en", "fr", "ja"),
    )
else:
    lang = "en"

if st.button("Generate"):
    video_url = url.strip()
    if not video_url:
        st.warning("Please enter a YouTube url before generating.")
    else:
        # st.spinner is a context manager: it is only shown while the
        # `with` body is running, so wrap the actual long-running work.
        with st.spinner("Speech to Text engine running..."):
            lang_d, transcription, trans = predict(
                video_url, translation=tran_req, tran_lang=lang
            )

        st.write("Detected language:", lang_d)
        # TODO: offer the transcript as a downloadable PDF.
        st.write(transcription)

        # Only show the translation section when one was requested.
        if trans is not None:
            st.write("Translation: ")
            st.write(trans)

        st.success("Speech to text converted successfully!")