# import dependencies # Audio Manipulation import audioread import librosa from pydub import AudioSegment, silence import youtube_dl from youtube_dl import DownloadError # Models import torch from transformers import pipeline, HubertForCTC, T5Tokenizer, T5ForConditionalGeneration, Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2Tokenizer from pyannote.audio import Pipeline # Others from datetime import timedelta import os import pandas as pd import pickle import re import streamlit as st import time import whisper from whisper import load_model import whisperx import os os.environ["PYTORCH_CUDA_ALLOC_CONF"] ="128mb" import gc torch.cuda.empty_cache() gc.collect() def config(): """ App Configuration This functions sets the page title, its favicon, initialize some global variables (session_state values), displays a title, a smaller one, and apply CSS Code to the app. """ # Set config st.set_page_config(page_title="Speech to Text", page_icon="📝") # Create a Data Directory # Will not be executed with AI Deploy because it is indicated in the DockerFile of the app if not os.path.exists("../data"): os.makedirs("../data") # Initialize session state variables if 'page_index' not in st.session_state: st.session_state['page_index'] = -1 # Handle which page should be displayed (token page, home page, results page, rename page) st.session_state['txt_transcript'] = "" # Save the transcript as .txt so we can display it again on the results page st.session_state["process"] = [] # Save the results obtained so we can display them again on the results page st.session_state['srt_txt'] = "" # Save the transcript in a subtitles case to display it on the results page st.session_state['srt_token'] = 0 # Is subtitles parameter enabled or not st.session_state['audio_file'] = None # Save the audio file provided by the user so we can display it again on the results page st.session_state["start_time"] = 0 # Default audio player starting point (0s) st.session_state["summary"] = "" # Save the summary of the transcript so we can display it on the results page st.session_state["number_of_speakers"] = 0 # Save the number of speakers detected in the conversation (diarization) st.session_state["chosen_mode"] = 0 # Save the mode chosen by the user (Diarization or not, timestamps or not) st.session_state["btn_token_list"] = [] # List of tokens that indicates what options are activated to adapt the display on results page st.session_state["my_HF_token"] = "ACCESS_TOKEN_GOES_HERE" # User's Token that allows the use of the diarization model st.session_state["disable"] = True # Default appearance of the button to change your token # Display Text and CSS st.title("Speech to Text App 📝") st.markdown("""