SoundOfWater / app.py
bpiyush's picture
Improves the YouTube download reliability
cdac373
import os
import sys
sys.path.append("../")
import gradio as gr
import torch
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams["font.family"] = "serif"
import decord
import PIL, PIL.Image
import librosa
from IPython.display import Markdown, display
import pandas as pd
from util import *
# Inline stylesheet (wrapped in <style> tags) that is prepended to the
# header and footer HTML fragments defined in this file.
css = """
<style>
body {
font-family: 'Arial', serif;
margin: 0;
padding: 0;
color: black;
}
.header {
display: flex;
align-items: center;
justify-content: center;
margin-top: 5px;
color: black;
}
.footer {
display: flex;
align-items: center;
justify-content: center;
margin-top: 5px;
}
.image {
margin-right: 20px;
}
.content {
text-align: center;
color: black;
}
.title {
font-size: 2.5em;
font-weight: bold;
margin-bottom: 10px;
}
.authors {
color: #4a90e2;
font-size: 1.05em;
margin: 10px 0;
}
.affiliations {
font-size: 1em;
margin-bottom: 20px;
}
.buttons {
display: flex;
justify-content: center;
gap: 10px;
}
.button {
background-color: #545758;
text-decoration: none;
padding: 8px 16px;
border-radius: 5px;
font-size: 1.05em;
}
.button:hover {
background-color: #333;
}
</style>
"""
# Page header HTML: logo, title, author links, affiliations and the row of
# resource buttons, with the shared stylesheet prepended.
# Each author <a> closes its `style` attribute before `href`, otherwise the
# browser treats the URL as part of the style value and the link is dead.
header = css + """
<div class="header">
<!-- <div class="image">
<img src="./media_assets/pouring-water-logo5.png" alt="logo" width="100">
</div> -->
<div class="content">
<img src="https://bpiyush.github.io/pouring-water-website/assets/pouring-water-logo5.png" alt="logo" width="80" style="margin-bottom: -50px; margin-right: 30px;">
<div class="title" style="font-size: 44px; margin-left: -30px;">The Sound of Water</div>
<div style="font-size: 30px; margin-left: -30px;"><b>Inferring Physical Properties from Pouring Liquids</b></div>
<div class="authors">
<a style="color: #92eaff;" href="https://bpiyush.github.io/">Piyush Bagad</a><sup>1</sup>,
<a style="color: #92eaff;" href="https://makarandtapaswi.github.io/">Makarand Tapaswi</a><sup>2</sup>,
<a style="color: #92eaff;" href="https://www.ceessnoek.info/">Cees G. M. Snoek</a><sup>3</sup>,
<a style="color: #92eaff;" href="https://www.robots.ox.ac.uk/~az/">Andrew Zisserman</a><sup>1</sup>,
</div>
<div class="affiliations">
<sup>1</sup>University of Oxford, <sup>2</sup>IIIT Hyderabad, <sup>3</sup>University of Amsterdam
</div>
<div class="buttons">
<a href="#" style="color: #92eaff;" class="button">arXiv</a>
<a href="https://bpiyush.github.io/pouring-water-website/" style="color: #92eaff;" class="button">🌐 Project</a>
<a href="https://github.com/bpiyush/SoundOfWater" style="color: #92eaff;" class="button"> <img src="https://bpiyush.github.io/pouring-water-website/assets/github-logo.png" alt="logo" style="height:16px; float: left;"> &nbsp;Code</a>
<a href="https://huggingface.co/datasets/bpiyush/sound-of-water" style="color: #92eaff;" class="button">🤗 Data</a>
<a href="https://huggingface.co/bpiyush/sound-of-water-models" style="color: #92eaff;" class="button">🤗 Models</a>
<a href="#" style="color: #92eaff;" class="button">🎯 Demo</a>
</div>
</div>
</div>
"""
# Page footer HTML: a call to star the repository followed by a numbered list
# of usage tips, with the shared stylesheet placed in front of the markup.
footer = "".join((css, """
<div class="header" style="justify-content: left;">
<div class="content" style="font-size: 16px;">
Please give us a 🌟 on <a href='https://github.com/bpiyush/SoundOfWater'>Github</a> if you like our work!
Tips to get better results:
<br><br>
<ol style="text-align: left; font-size: 14px; margin-left: 30px">
<li>The first example may take up to 30-60s for processing since the model is also loaded.</li>
<li>
If you are providing a link, it may take a few seconds to download video from YouTube.
Note that the entire video shall be used.
If the sound of pouring is not clear, the results will be random.
</li>
<li>Although the model is somewhat robust to noise, make sure there is not too much noise such that the pouring is audible.</li>
<li>Note that the video is not used during the inference. The displayed frame is only for reference.</li>
</ol>
</div>
</div>
"""))
from download_youtube import download_youtube_video_ytdlp
def download_from_youtube(
    video_id,
    save_dir="/tmp/",
    convert_to_mp4=False,
):
    """
    Download a complete YouTube video with the ``yt-dlp`` command-line tool.

    Args:
        video_id (str): YouTube video ID.
        save_dir (str): Directory to save the video.
        convert_to_mp4 (bool): Whether to convert the video to mp4 format
            (via ``ffmpeg``) when the downloaded file is not already mp4.

    Returns:
        str: Path of the saved file, ``{save_dir}/{video_id}.{ext}``; the
        extension is ``mp4`` when ``convert_to_mp4`` is set.

    Raises:
        IOError: If ``yt-dlp`` or ``ffmpeg`` cannot be run or exits with an
            error, or if no downloaded file can be found afterwards.
    """
    import subprocess
    from glob import escape, glob

    print("Downloading video from YouTube...")
    print("Video ID:", video_id)

    # Arguments are passed as a list (no shell), so a crafted video ID or
    # directory name cannot inject extra shell commands.
    command = [
        "yt-dlp",
        "-o", os.path.join(save_dir, "%(id)s.%(ext)s"),
        "--cookies", "./youtube_cookies.txt",
        "--verbose",
        "--force-overwrites",
        f"https://www.youtube.com/watch?v={video_id}",
    ]
    try:
        # check=True turns a non-zero exit status into an exception instead
        # of silently continuing with a missing or stale file.
        subprocess.run(command, check=True)
    except (OSError, subprocess.CalledProcessError) as e:
        print(e)
        raise IOError("Failed to download YouTube video.") from e

    # The extension is chosen by yt-dlp, so look the file up by its ID.
    candidates = glob(os.path.join(save_dir, f"{escape(video_id)}.*"))
    if not candidates:
        raise IOError(
            f"No downloaded file found for video ID {video_id} in {save_dir}."
        )
    saved_filepath = candidates[0]
    print("Saved file:", saved_filepath)

    if not convert_to_mp4:
        return saved_filepath

    # Swap only the final extension; a plain str.replace would also rewrite
    # matching text elsewhere in the path.
    stem, ext = os.path.splitext(saved_filepath)
    to_save = stem + ".mp4"
    if ext != ".mp4":
        # convert to mp4 using ffmpeg
        try:
            subprocess.run(
                ["ffmpeg", "-y", "-i", saved_filepath, to_save], check=True,
            )
        except (OSError, subprocess.CalledProcessError) as e:
            print(e)
            raise IOError("Failed to convert the downloaded video to mp4.") from e
    return to_save
def configure_input():
    """
    Create the input widgets of the demo.

    Renders an instruction line, a video upload component, a text box for a
    YouTube link and a note about download failures.

    Returns:
        list: ``[video_input, youtube_link]`` - the video and text box
        components, in the order expected by ``process_input``.
    """
    gr.Markdown(
        "#### Either upload a video file or provide a YouTube link to a video. Note that the entire video shall be used.",
    )
    video_input = gr.Video(label="Upload Video", height=520)
    youtube_link = gr.Textbox(label="YouTube Link", value=None)
    # Adjacent literals are concatenated verbatim, so each sentence carries
    # its own trailing space.
    gr.Markdown(
        "Note: Often, YouTube download can fail because the video may not be public or YouTube asks for Sign in. "
        "We recommend downloading the video in other ways on your machine and uploading it here. "
        "Alternatively, you can clone the repository and run the demo locally which can allow for Sign-in.",
    )
    return [video_input, youtube_link]
# Backend handed to load_frame when reading the reference frame.
# "decord" is the alternative value that is currently disabled.
video_backend = "torchvision"


def get_predictions(video_path):
    """
    Run the audio model on a video file and build the display outputs.

    Args:
        video_path (str): Path of the video file to analyse.

    Returns:
        tuple: ``(image, df_show, tsne_image)`` as produced by ``show_output``.
    """
    net = load_model()

    # Inputs derived from the file: a frame, a spectrogram and the audio.
    reference_frame = load_frame(video_path, video_backend=video_backend)
    spectrogram = load_spectrogram(video_path)
    waveform = load_audio_tensor(video_path)

    # Only the audio is passed to the model.
    z_audio, y_audio = get_model_output(waveform, net)

    image, df_show, tsne_image = show_output(
        reference_frame, spectrogram, y_audio, z_audio,
    )
    return image, df_show, tsne_image
def get_video_id_from_url(url):
    """
    Extract the video ID from a YouTube URL.

    Supported forms are ``...watch?v=ID`` (the ``v`` parameter may appear
    anywhere in the query), ``youtu.be/ID``, ``.../shorts/ID``,
    ``.../embed/ID`` and ``.../live/ID``.

    Args:
        url (str): URL of the video.

    Returns:
        str: The video ID (letters, digits, ``_`` and ``-``).

    Raises:
        ValueError: If ``url`` is not a string or no ID can be found in it.
    """
    import re

    if not isinstance(url, str):
        raise ValueError("Invalid YouTube URL")
    url = url.strip()
    print("Video URL:", url)

    id_group = r"([a-zA-Z0-9_-]+)"
    patterns = (
        # The look-behind keeps parameters that merely end in "v"
        # (e.g. "rev=") from being mistaken for the "v" parameter.
        r"(?<![a-zA-Z0-9_])v=" + id_group,
        r"youtu\.be/" + id_group,
        r"(?:shorts|embed|live)/" + id_group,
    )
    # Patterns are tried in order; the first one that matches wins.
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            video_id = match.group(1)
            print("Video ID:", video_id)
            return video_id
    raise ValueError("Invalid YouTube URL")
# Markdown note shown next to the predictions; it contains a LaTeX formula.
# A raw string keeps every LaTeX backslash literal, so no sequence such as
# "\l" or "\ " is parsed as a (invalid) Python escape.
note = r"""
**Note**: Radius (as well as height) estimation depends on accurate wavelength estimation towards the end.
Thus, it may not be accurate if the wavelength is not estimated correctly at the end.
$$
H = l(0) = \frac{\lambda(0) - \lambda(T)}{4} \ \ \text{and} \ \ R = \frac{\lambda(T)}{4\beta}
$$
"""
# Example usage in a Gradio interface
def process_input(video, youtube_link):
    """
    Run the demo on either an uploaded video or a YouTube link.

    Exactly one of the two inputs must be provided. A link is downloaded
    first; the resulting file is then passed to ``get_predictions``.

    Args:
        video (str | None): Path of the uploaded video file, or ``None``.
        youtube_link (str | None): YouTube URL; ``None``, an empty string or
            a whitespace-only string count as "not provided".

    Returns:
        tuple: ``(image, df_show, markdown, tsne_image)`` where ``markdown``
        is a ``gr.Markdown`` holding the note (plus a link back to the
        original video when a YouTube link was used).

    Raises:
        ValueError: If the link is not a string, if both inputs are given,
            if neither is given, or if the link is not a valid YouTube URL.
    """
    provided_video = video is not None

    if youtube_link is None:
        link = ""
    elif isinstance(youtube_link, str):
        # Surrounding whitespace is ignored, so a link of only spaces
        # counts as not provided.
        link = youtube_link.strip()
    else:
        raise ValueError(f"Invalid type of link {youtube_link}.")
    provided_link = len(link) > 0

    if provided_video and provided_link:
        raise ValueError("Please provide either a video file or a YouTube link, not both.")
    # Explicit raise rather than assert: asserts are removed under `-O`.
    if not provided_video and not provided_link:
        raise ValueError("YouTube Link cannot be empty if no video is provided.")

    if provided_video:
        print(video)
        # The input is a video file path
        video_path = video
        shown_note = note
    else:
        video_id = get_video_id_from_url(link)
        print("Video ID:", video_id)
        video_path = download_youtube_video_ytdlp(
            video_id, save_dir="/tmp/",
        )
        # Add youtube link to the note
        shown_note = f"{note}\n\nYou can watch the original video here: "\
            f"[YouTube Link](https://www.youtube.com/watch?v={video_id})"

    # Get predictions
    image, df_show, tsne_image = get_predictions(video_path)
    return image, df_show, gr.Markdown(shown_note), tsne_image
def configure_outputs():
    """
    Create the output widgets of the demo.

    Returns:
        list: ``[image, dataframe, markdown, tsne image]`` components, in the
        order of the values returned by ``process_input``.
    """
    # Creation order is kept as-is; only the returned order differs.
    pitch_plot = gr.Image(label="Estimated pitch")
    properties_table = gr.DataFrame(label="Estimated physical properties")
    tsne_plot = gr.Image(label="TSNE of features", width=300)
    note_box = gr.Markdown(label="Note")

    outputs = [pitch_plot, properties_table]
    outputs.append(note_box)
    outputs.append(tsne_plot)
    return outputs
# Pre-defined examples offered in the interface. Every row is
# [video file path, YouTube link]; all of them use a local file, so the
# link slot is always None.
examples = [
    [clip_path, None]
    for clip_path in (
        "./media_assets/example_video.mp4",
        "./media_assets/ayNzH0uygFw_9.0_21.0.mp4",
        "./media_assets/biDn0Gi6V8U_7.0_15.0.mp4",
        "./media_assets/goWgiQQMugA_2.5_9.0.mp4",
        "./media_assets/K87g4RvO-9k_254.0_259.0.mp4",
        # Shows that it works with background noise
        "./media_assets/l74zJHCZ9uA.webm",
        # Shows that it works with a slightly differently shaped container
        "./media_assets/LpRPV0hIymU.webm",
        "./media_assets/k-HnMsS36J8.webm",
    )
]
# Link-based examples, currently disabled:
# [None, "https://www.youtube.com/shorts/6eUQTdkTooo"],
# [None, "https://www.youtube.com/shorts/VxZT15cG6tw"],
# [None, "https://www.youtube.com/shorts/GSXQnNhliDY"],
# Assemble the demo page: header, the inference interface, then the footer.
# NOTE(review): `custom_css` is not defined in the visible part of this file;
# presumably it comes from `from util import *` - confirm.
with gr.Blocks(
    css=custom_css,
    theme=gr.themes.Default(),
) as demo:

    # Header banner
    gr.HTML(header)

    # Input widgets are created first, then the output widgets.
    input_components = configure_input()
    output_components = configure_outputs()
    gr.Interface(
        fn=process_input,
        inputs=input_components,
        outputs=output_components,
        examples=examples,
    )

    # Footer with usage tips
    gr.HTML(footer)


# Start the app; the current directory is whitelisted for file serving and a
# public share link is requested.
demo.launch(share=True, allowed_paths=["."])