# Audio / src /streamlit_app.py
# 7H4M3R's picture
# Update src/streamlit_app.py
# b25a75c verified
import streamlit as st
import os
from transformers import pipeline
from transformers.utils import logging
import numpy as np
import pandas as pd
import yt_dlp
import torchaudio
import ffmpeg
# Switch the transformers logger (imported above from transformers.utils) to INFO verbosity.
logging.set_verbosity_info()
# Sample rate (Hz) that audio is resampled to before classification.
RATE_HZ = 16_000
# Duration of one audio segment, in seconds.
MAX_SECONDS = 1
# Number of samples in one segment at RATE_HZ.
MAX_LENGTH = MAX_SECONDS * RATE_HZ
# Upper bound on how many segments of a clip are kept.
MAX_SEGMENTS = 250
def download_video(url, output_path="video.mp4"):
    """Fetch the media behind ``url`` with yt-dlp and save it to ``output_path``.

    The smallest MP4 video stream is requested together with the best M4A
    audio stream, falling back to the best audio-only stream; merged output
    is written as MP4.

    Args:
        url: Address of the video to download.
        output_path: File name handed to yt-dlp as its output template.

    Returns:
        ``output_path``, unchanged.
    """
    options = dict(
        format='worstvideo[ext=mp4]+bestaudio[ext=m4a]/bestaudio',
        outtmpl=output_path,
        merge_output_format='mp4',
        quiet=True,
        noplaylist=True,
        # NOTE(review): this turns off TLS certificate verification for
        # user-supplied URLs -- confirm that is intentional.
        nocheckcertificate=True,
        retries=3,
    )
    downloader = yt_dlp.YoutubeDL(options)
    with downloader:
        downloader.download([url])
    return output_path
def extract_audio(input_path, output_path="audio.mp3"):
    """Transcode the audio of ``input_path`` into an MP3 file via ffmpeg.

    Args:
        input_path: Media file to read.
        output_path: Destination MP3 file; an existing file is overwritten.

    Returns:
        ``output_path``, unchanged.
    """
    source = ffmpeg.input(input_path)
    encoded = source.output(
        output_path,
        format='mp3',
        acodec='libmp3lame',
        audio_bitrate='192k',
    )
    # quiet=True keeps ffmpeg's console output out of the app's logs.
    encoded.overwrite_output().run(quiet=True)
    return output_path
def split_audio(file):
    """Load an audio file and return its leading whole segments at RATE_HZ.

    Only the first channel is used. The waveform is resampled to ``RATE_HZ``
    and trimmed to a whole number of ``MAX_LENGTH``-sample segments, capped
    at ``MAX_SEGMENTS`` segments; a trailing partial segment is dropped.

    Args:
        file: Path (or path-like object) of the audio file to load.

    Returns:
        A 1-D ``numpy`` array of samples at ``RATE_HZ``. The array is empty
        when the file cannot be loaded or holds less than one full segment.
    """
    try:
        audio, rate = torchaudio.load(str(file))
        # Trim in source-rate samples *before* resampling so that the cap is a
        # fixed duration regardless of the file's sample rate, and so that
        # long recordings are not resampled in full. Ceil division keeps
        # enough input to fill the last allowed segment.
        source_cap = -(-MAX_SEGMENTS * MAX_LENGTH * rate // RATE_HZ)
        channel = audio[0][:source_cap]
        # Resample once over the contiguous signal rather than per segment,
        # which avoids filter edge artifacts at every segment boundary.
        resampler = torchaudio.transforms.Resample(rate, RATE_HZ)
        samples = resampler(channel).numpy().reshape(-1)
    except Exception as e:
        print(f"Error processing file: {e}")
        # Same type as the success path so callers handle one return shape.
        return np.empty(0, dtype=np.float32)

    # Keep whole segments only; zero segments yields an empty array instead
    # of the ValueError that concatenating an empty list would raise.
    num_segments = min(len(samples) // MAX_LENGTH, MAX_SEGMENTS)
    return samples[:num_segments * MAX_LENGTH]
# Display names for accents, keyed by the label strings looked up from the
# classifier's output.
accent_mapping = dict(
    us='American',
    canada='Canadian',
    england='British',
    indian='Indian',
    australia='Australian',
)
# ---------------------------------------------------------------------------
# Streamlit page: take a video URL, download it, extract and resample its
# audio, classify the accent and show the top label with its confidence.
# ---------------------------------------------------------------------------
st.set_page_config(page_title="Accent Classifier", layout="centered")
st.title("🎙️ English Accent Classifier")
st.markdown("Upload a video link and get the English accent with confidence.")
video_url = st.text_input("Paste a public video URL (Loom, or MP4):")

if st.button("Analyze"):
    if not video_url.strip():
        st.warning("Please enter a valid URL.")
    else:
        # Fixed working-file names (the helpers' defaults), spelled out here
        # so cleanup knows what to remove even if a step fails part-way.
        video_path = "video.mp4"
        audio_path = "audio.mp3"
        try:
            with st.spinner("Downloading video..."):
                download_video(video_url, video_path)
            with st.spinner("Extracting audio..."):
                extract_audio(video_path, audio_path)
            with st.spinner("Extracting Waves..."):
                waves = split_audio(audio_path)

            if len(waves) == 0:
                # Nothing usable came back from split_audio; do not feed an
                # empty input to the model.
                st.error("Could not extract usable audio from this video.")
            else:
                with st.spinner("Classifying accent..."):
                    # Local import: torch is not imported at file level.
                    import torch

                    model_name = "dima806/english_accents_classification"
                    # Use the first GPU when present, otherwise the CPU (-1),
                    # so the app also runs on CPU-only hosts.
                    device = 0 if torch.cuda.is_available() else -1
                    pipe = pipeline('audio-classification', model=model_name, device=device)
                    # The pipeline returns candidates sorted by score; take the best.
                    accent_data = pipe(waves)[0]

                label = accent_data.get("label")
                # Fall back to the raw label (or "Unknown") rather than
                # printing "None" or silently claiming a default accent.
                accent = accent_mapping.get(label, label or "Unknown")
                confidence = accent_data.get("score", 0)

                st.success("Analysis Complete!")
                st.markdown(f"**Accent:** {accent}")
                # The score is a probability in [0, 1]; scale it to a percentage.
                st.markdown(f"**Confidence Score:** {confidence * 100:.2f}%")
        finally:
            # Cleanup runs on success and on failure; errors still propagate
            # so Streamlit can display them.
            for path in (video_path, audio_path):
                if os.path.exists(path):
                    os.remove(path)