Spaces:
Build error
Build error
Update download.py
Browse files- download.py +341 -61
download.py
CHANGED
@@ -1,96 +1,376 @@
|
|
1 |
from __future__ import unicode_literals
|
2 |
-
import yt_dlp
|
3 |
import os
|
4 |
import time
|
5 |
-
import os
|
6 |
import shutil
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
-
MAX_FILE_SIZE = 100 * 1024 * 1024 # 100 MB
|
9 |
-
FILE_TOO_LARGE_MESSAGE = "The audio file is too large for the current size and rate limits using Whisper. If you used a YouTube link, please try a shorter video clip. If you uploaded an audio file, try trimming or compressing the audio to under 100 MB."
|
10 |
-
max_retries = 3
|
11 |
-
delay = 2
|
12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
-
class MyLogger(object):
|
15 |
-
def __init__(self, external_logger=lambda x: None):
|
16 |
-
self.external_logger = external_logger
|
17 |
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
|
22 |
-
def warning(self, msg):
|
23 |
-
print("[warning]: ", msg)
|
24 |
|
25 |
-
|
26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
|
29 |
-
def
|
30 |
-
|
31 |
-
|
32 |
-
|
|
|
33 |
|
34 |
|
35 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
return {
|
37 |
-
|
38 |
-
|
39 |
-
{
|
40 |
"key": "FFmpegExtractAudio",
|
41 |
-
"preferredcodec":
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
"
|
46 |
-
|
47 |
-
|
48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
|
50 |
|
51 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
retries = 0
|
53 |
-
while retries <
|
54 |
try:
|
55 |
-
|
56 |
-
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
58 |
info = ydl.extract_info(url, download=False)
|
59 |
-
|
60 |
-
|
61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
filename = ydl.prepare_filename(info)
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
except Exception as e:
|
69 |
retries += 1
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
|
|
|
|
|
|
77 |
|
78 |
|
79 |
-
|
80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
try:
|
|
|
|
|
|
|
|
|
82 |
if os.path.isfile(path):
|
83 |
os.remove(path)
|
84 |
-
|
85 |
elif os.path.isdir(path):
|
86 |
shutil.rmtree(path)
|
87 |
-
|
88 |
else:
|
89 |
-
|
|
|
|
|
90 |
except PermissionError:
|
91 |
-
|
92 |
except FileNotFoundError:
|
93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
except Exception as e:
|
95 |
-
|
96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from __future__ import unicode_literals
|
2 |
+
import yt_dlp
|
3 |
import os
|
4 |
import time
|
|
|
5 |
import shutil
|
6 |
+
import logging
|
7 |
+
import re
|
8 |
+
import tempfile
|
9 |
+
from pathlib import Path
|
10 |
+
from typing import Optional, Callable, Dict, Any, Union
|
11 |
+
|
12 |
+
# Configuration
# Largest estimated audio size accepted for download: 40 MB
# (lowered from the previous 100 MB limit).
MAX_FILE_SIZE = 40 * 1024 ** 2
FILE_TOO_LARGE_MESSAGE = (
    "The audio file exceeds the 40MB size limit. "
    "Please try a shorter video clip or select a lower quality option."
)
MAX_RETRIES = 3  # download attempts before giving up
RETRY_DELAY = 2  # seconds
DEFAULT_AUDIO_FORMAT = "mp3"
DEFAULT_AUDIO_QUALITY = "192"  # kbps
SUPPORTED_FORMATS = [
    "mp3",
    "m4a",
    "wav",
    "aac",
    "flac",
    "opus",
]
|
20 |
+
|
21 |
+
# Setup logging: root handler emits timestamped records at INFO and above;
# everything in this module logs through the named logger below.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("youtube_downloader")
|
27 |
|
|
|
|
|
|
|
|
|
28 |
|
29 |
+
class DownloadLogger:
    """Logger object handed to yt-dlp that also relays progress lines to a callback"""

    def __init__(self, progress_callback: Optional[Callable[[str], None]] = None):
        # Substitute a no-op so debug() never has to check for a missing callback.
        if not progress_callback:
            progress_callback = lambda x: None
        self.progress_callback = progress_callback

    def debug(self, msg: str) -> None:
        """Log msg at debug level; forward '[download]' lines containing a percentage."""
        is_progress_line = msg.startswith('[download]') and '%' in msg
        if is_progress_line:
            self.progress_callback(msg)
        logger.debug(msg)

    def warning(self, msg: str) -> None:
        """Log msg at warning level."""
        logger.warning(msg)

    def error(self, msg: str) -> None:
        """Log msg at error level."""
        logger.error(msg)
|
47 |
|
|
|
|
|
|
|
48 |
|
49 |
+
class DownloadError(Exception):
    """Raised by this module when a URL is rejected or a download cannot be completed"""
|
52 |
|
|
|
|
|
53 |
|
54 |
+
def validate_url(url: str) -> bool:
    """
    Validate if the URL points at a supported video platform

    The check is made against the URL's host name rather than the whole
    string, so a platform name that merely appears in the path, query string
    or user-info part (e.g. "https://evil.example/?next=youtube.com") is not
    accepted. Sub-domains of a supported platform (www., m., music., ...) are
    accepted. URLs written without a scheme ("youtube.com/watch?v=...") are
    still accepted.

    Args:
        url: URL to check

    Returns:
        True if the host is a supported platform (or a sub-domain of one),
        False otherwise, including for empty or non-string input
    """
    from urllib.parse import urlsplit

    supported_domains = (
        "youtube.com",
        "youtu.be",
        "vimeo.com",
        "dailymotion.com",
        "twitch.tv",
        "soundcloud.com",
        "instagram.com",
    )

    if not url or not isinstance(url, str):
        return False

    candidate = url.strip()
    has_scheme = re.match(r'^[a-z][a-z0-9+.\-]*://', candidate, re.IGNORECASE)
    if not has_scheme and not candidate.startswith("//"):
        # Without a "//" prefix urlsplit treats the host as part of the path.
        candidate = f"//{candidate}"

    try:
        host = urlsplit(candidate).hostname
    except ValueError:
        # Raised for malformed authorities such as an unterminated IPv6 literal.
        return False

    if not host:
        return False

    host = host.lower().rstrip(".")
    return any(
        host == domain or host.endswith(f".{domain}")
        for domain in supported_domains
    )
|
68 |
|
69 |
|
70 |
+
def ensure_download_directory(directory: str) -> str:
    """Create the directory (with any missing parents) if needed and return its absolute path"""
    download_dir = Path(directory)
    # exist_ok makes repeated calls for the same directory harmless.
    download_dir.mkdir(parents=True, exist_ok=True)
    absolute_dir = download_dir.absolute()
    return str(absolute_dir)
|
75 |
|
76 |
|
77 |
+
def get_download_options(
    output_dir: str = "./downloads/audio",
    audio_format: str = DEFAULT_AUDIO_FORMAT,
    audio_quality: str = DEFAULT_AUDIO_QUALITY,
    progress_callback: Optional[Callable[[str], None]] = None
) -> Dict[str, Any]:
    """
    Build the option dictionary passed to yt_dlp.YoutubeDL

    Args:
        output_dir: Directory to save downloaded files (created if missing)
        audio_format: Audio format; values outside SUPPORTED_FORMATS fall back
            to DEFAULT_AUDIO_FORMAT with a logged warning
        audio_quality: Audio quality in kbps
        progress_callback: Function to call with progress updates

    Returns:
        Dictionary of yt-dlp options
    """
    if audio_format not in SUPPORTED_FORMATS:
        logger.warning(f"Unsupported format '{audio_format}', falling back to {DEFAULT_AUDIO_FORMAT}")
        audio_format = DEFAULT_AUDIO_FORMAT

    # Make sure the target directory exists and work with its absolute path.
    output_dir = ensure_download_directory(output_dir)

    def report_progress(status):
        # Bind the caller's callback into the hook that yt-dlp invokes.
        return download_progress_hook(status, progress_callback)

    extract_audio = {
        "key": "FFmpegExtractAudio",
        "preferredcodec": audio_format,
        "preferredquality": audio_quality,
    }

    options = {
        "format": "bestaudio/best",
        "postprocessors": [extract_audio],
        "logger": DownloadLogger(progress_callback),
        "outtmpl": f"{output_dir}/%(title)s.%(ext)s",
        "noplaylist": True,
        "quiet": False,
        "no_warnings": False,
        "progress_hooks": [report_progress],
        "overwrites": True,
    }
    return options
|
117 |
+
|
118 |
+
|
119 |
+
def download_progress_hook(d: Dict[str, Any], callback: Optional[Callable[[str], None]] = None) -> None:
    """
    Translate a progress dictionary into a human-readable callback message

    Args:
        d: Download information dictionary; must contain a 'status' key
        callback: Function to call with progress updates (no-op when omitted)
    """
    notify = callback if callback is not None else (lambda x: None)
    status = d['status']

    if status == 'downloading':
        percent = d.get('_percent_str', 'unknown progress')
        rate = d.get('_speed_str', 'unknown speed')
        remaining = d.get('_eta_str', 'unknown ETA')
        notify(f"Downloading: {percent} at {rate}, ETA: {remaining}")
        return

    if status == 'finished':
        # Show only the file name to the user; the log keeps the full path.
        short_name = os.path.basename(d['filename'])
        notify(f"Download complete: {short_name}")
        logger.info(f"Download finished: {d['filename']}")
|
140 |
|
141 |
|
142 |
+
def estimate_file_size(info: Dict[str, Any]) -> int:
    """
    Estimate the download size from a video information dictionary

    Args:
        info: Video information dictionary

    Returns:
        The "filesize" value if present, else "filesize_approx", else an
        estimate derived from duration and bitrate, else MAX_FILE_SIZE
    """
    # Prefer a size reported in the metadata, exact before approximate.
    for size_key in ("filesize", "filesize_approx"):
        reported = info.get(size_key)
        if reported is not None:
            return reported

    duration = info.get("duration")
    bitrate = info.get("abr") or info.get("tbr")

    if not (duration and bitrate):
        # Nothing to estimate from: report exactly the limit, which the
        # caller's "greater than" check lets through.
        return MAX_FILE_SIZE

    # Treats bitrate as kilobits per second and duration as seconds:
    # bitrate * 1024 / 8 bytes per second, i.e. a factor of 128.
    return int(bitrate * duration * 128)
|
171 |
+
|
172 |
+
|
173 |
+
def download_video_audio(
    url: str,
    output_dir: str = "./downloads/audio",
    audio_format: str = DEFAULT_AUDIO_FORMAT,
    audio_quality: str = DEFAULT_AUDIO_QUALITY,
    progress_callback: Optional[Callable[[str], None]] = None
) -> Optional[str]:
    """
    Download audio from a video URL

    Args:
        url: URL of the video
        output_dir: Directory to save downloaded files
        audio_format: Audio format (mp3, m4a, wav, etc.)
        audio_quality: Audio quality in kbps
        progress_callback: Function to call with progress updates

    Returns:
        Path to the downloaded audio file. None is returned only when
        MAX_RETRIES is not positive, so that no attempt is made at all.

    Raises:
        DownloadError: If the URL is rejected by validate_url, if the
            estimated size exceeds MAX_FILE_SIZE (raised immediately, since
            retrying cannot change the outcome), or if every one of the
            MAX_RETRIES attempts fails
    """
    if not validate_url(url):
        error_msg = f"Invalid or unsupported URL: {url}"
        logger.error(error_msg)
        raise DownloadError(error_msg)

    retries = 0
    while retries < MAX_RETRIES:
        try:
            if progress_callback:
                progress_callback(f"Starting download (attempt {retries + 1}/{MAX_RETRIES})...")

            ydl_opts = get_download_options(output_dir, audio_format, audio_quality, progress_callback)
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                logger.info(f"Downloading audio from: {url}")

                # Extract info first without downloading
                info = ydl.extract_info(url, download=False)

                # Refuse oversized downloads before transferring anything
                estimated_size = estimate_file_size(info)
                if estimated_size > MAX_FILE_SIZE:
                    error_msg = f"{FILE_TOO_LARGE_MESSAGE} (Estimated: {estimated_size / 1024 / 1024:.1f}MB)"
                    logger.error(error_msg)
                    raise DownloadError(error_msg)

                # Now download
                ydl.download([url])

                # The audio post-processor changes the extension, so the name
                # from prepare_filename only supplies the base path.
                filename = ydl.prepare_filename(info)
                base_filename = os.path.splitext(filename)[0]

                # Try the requested format first, then every supported one
                # (the options builder may have fallen back to the default).
                for ext in [audio_format, *SUPPORTED_FORMATS]:
                    potential_file = f"{base_filename}.{ext}"
                    if os.path.exists(potential_file):
                        return potential_file

                # If we get here, something went wrong
                raise FileNotFoundError(f"Could not locate downloaded file for {url}")

        except DownloadError:
            # Raised above for the size limit: a deterministic failure that
            # must reach the caller unchanged instead of being retried.
            raise
        except yt_dlp.utils.DownloadError as e:
            retries += 1
            error_msg = f"Download error (Attempt {retries}/{MAX_RETRIES}): {str(e)}"
            logger.error(error_msg)
            if progress_callback:
                progress_callback(error_msg)

            # Check exhaustion first so a rate-limited final attempt raises
            # instead of sleeping and falling out of the loop.
            if retries >= MAX_RETRIES:
                raise DownloadError(f"Failed to download after {MAX_RETRIES} attempts: {str(e)}") from e

            if "HTTP Error 429" in str(e):
                # Rate limiting - wait longer
                time.sleep(RETRY_DELAY * 5)
            else:
                time.sleep(RETRY_DELAY)
        except Exception as e:
            retries += 1
            error_msg = f"Unexpected error (Attempt {retries}/{MAX_RETRIES}): {str(e)}"
            logger.error(error_msg)
            if progress_callback:
                progress_callback(error_msg)

            if retries >= MAX_RETRIES:
                raise DownloadError(f"Failed to download after {MAX_RETRIES} attempts: {str(e)}") from e
            time.sleep(RETRY_DELAY)

    return None
|
268 |
|
269 |
|
270 |
+
def delete_download(path: str) -> bool:
    """
    Remove a downloaded file, or a whole directory tree

    Args:
        path: Path to file or directory to delete

    Returns:
        True if something was deleted, False if the path is empty, missing,
        of an unsupported kind, or the deletion raised an error (errors are
        logged, not propagated)
    """
    try:
        if not path or not os.path.exists(path):
            logger.warning(f"Path does not exist: {path}")
            return False

        if os.path.isfile(path):
            os.remove(path)
            logger.info(f"File deleted: {path}")
            return True

        if os.path.isdir(path):
            shutil.rmtree(path)
            logger.info(f"Directory deleted: {path}")
            return True

        logger.warning(f"Path is neither a file nor a directory: {path}")
        return False
    except PermissionError:
        logger.error(f"Permission denied: Unable to delete {path}")
    except FileNotFoundError:
        # The path can vanish between the existence check and the removal.
        logger.error(f"File or directory not found: {path}")
    except Exception as exc:
        logger.error(f"Error deleting {path}: {str(exc)}")

    # Reached only after one of the handlers above has logged a failure.
    return False
|
302 |
+
|
303 |
+
|
304 |
+
def trim_audio_file(input_file: str, max_duration_seconds: int = 600) -> str:
    """
    Cut an audio file down to a maximum duration to reduce file size

    The result is written next to the input as "<name>_trimmed<ext>".

    Args:
        input_file: Path to input audio file
        max_duration_seconds: Maximum duration in seconds

    Returns:
        Path to the trimmed file, or input_file unchanged if anything fails
        (including the ffmpeg module not being importable)
    """
    try:
        # Imported lazily so the rest of the module works without this package.
        import ffmpeg

        # Build the sibling output path
        parent_dir, base_name = os.path.split(input_file)
        stem, extension = os.path.splitext(base_name)
        output_file = os.path.join(parent_dir, f"{stem}_trimmed{extension}")

        # acodec='copy' asks ffmpeg to copy the audio stream without re-encoding
        source = ffmpeg.input(input_file)
        trimmed = source.output(output_file, t=str(max_duration_seconds), acodec='copy')
        trimmed.run(quiet=True, overwrite_output=True)

        logger.info(f"Trimmed {input_file} to {max_duration_seconds} seconds")
        return output_file
    except Exception as exc:
        logger.error(f"Error trimming audio: {str(exc)}")
        return input_file  # Return original if trimming fails
|
333 |
+
|
334 |
+
|
335 |
+
def get_video_info(url: str) -> Dict[str, Any]:
    """
    Fetch the metadata for a video without downloading it

    Args:
        url: URL of the video

    Returns:
        Dictionary of video information as returned by yt-dlp

    Raises:
        DownloadError: If the information cannot be retrieved for any reason
    """
    quiet_options = {"quiet": True}
    try:
        with yt_dlp.YoutubeDL(quiet_options) as ydl:
            return ydl.extract_info(url, download=False)
    except Exception as exc:
        logger.error(f"Error getting video info: {str(exc)}")
        raise DownloadError(f"Could not retrieve video information: {str(exc)}")
|
352 |
+
|
353 |
+
|
354 |
+
# Example usage
|
355 |
+
if __name__ == "__main__":
    # Progress callback for the demo: echo every update to stdout
    def print_progress(msg):
        print(msg)

    # Demo: fetch the audio track of one YouTube video, then remove it again
    demo_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"

    try:
        audio_path = download_video_audio(
            demo_url,
            output_dir="./downloads",
            audio_format="mp3",
            progress_callback=print_progress,
        )

        print(f"Downloaded to: {audio_path}")

        # Clean up
        if audio_path:
            delete_download(audio_path)
    except DownloadError as e:
        print(f"Download failed: {e}")
|