llinahosna committed on
Commit
9fb8c08
1 Parent(s): d2b22fb

Create download_from_youtube.py

Browse files
Files changed (1) hide show
  1. download_from_youtube.py +53 -0
download_from_youtube.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import urllib.parse
4
+
5
+ from youtube_transcript_api import YouTubeTranscriptApi
6
+ import youtube_dl
7
+
8
+
9
def get_video_id(url):
    """Extract the YouTube video id from *url*.

    The id is taken from the ``v`` query parameter when present (the
    standard ``https://www.youtube.com/watch?v=<id>`` form). Otherwise the
    URL path is inspected so that short links (``https://youtu.be/<id>``)
    and ``/embed/<id>``, ``/shorts/<id>``, ``/live/<id>`` and ``/v/<id>``
    links are understood as well.

    Args:
        url: The video URL as a string.

    Returns:
        The video id string.

    Raises:
        KeyError: If no video id can be located in *url*. This is the same
            exception type a missing ``v`` parameter has always produced.
    """
    url_data = urllib.parse.urlparse(url)
    query = urllib.parse.parse_qs(url_data.query)
    if "v" in query:
        return query["v"][0]

    # No ``v`` parameter: fall back to the path-based URL shapes.
    segments = [segment for segment in url_data.path.split("/") if segment]
    host = (url_data.hostname or "").lower()
    if host in ("youtu.be", "www.youtu.be") and segments:
        return segments[0]
    if len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
        return segments[1]

    raise KeyError("v")
14
+
15
+
16
def get_video_name(url):
    """Return the title of the YouTube video at *url*.

    The video id is extracted with ``get_video_id`` and a canonical watch
    URL is sent to YouTube's oEmbed endpoint
    (``https://www.youtube.com/oembed``) with ``format=json``. The
    ``title`` field of the decoded JSON response is returned.

    Args:
        url: The video URL as a string.

    Returns:
        The value of the ``title`` key in the oEmbed JSON response.

    Raises:
        KeyError: If the response JSON has no ``title`` key (or whatever
            ``get_video_id`` raises for a URL it cannot parse).
        Any exception raised by ``urllib.request.urlopen`` for network or
        HTTP failures is propagated unchanged.
    """
    # The module only imports ``urllib.parse``; the ``urllib.request``
    # submodule is not guaranteed to be loaded as an attribute of the
    # package, so import what we need explicitly. ``from ... import`` is used
    # so the module-level ``urllib`` name is not shadowed inside this function.
    from urllib.request import urlopen

    video_id = get_video_id(url)
    params = {
        "format": "json",
        "url": "https://www.youtube.com/watch?v=%s" % video_id,
    }
    oembed_url = "https://www.youtube.com/oembed" + "?" + urllib.parse.urlencode(params)

    with urlopen(oembed_url) as response:
        response_text = response.read()

    data = json.loads(response_text.decode())
    return data['title']
27
+
28
+
29
def download_transcription(url, output_path):
    """Download the English transcript of a YouTube video and save it as JSON.

    If *output_path* already exists the function returns immediately without
    contacting YouTube, so an earlier download is reused.

    Args:
        url: The video URL; its id is extracted with ``get_video_id``.
        output_path: Path of the JSON file to write.

    Returns:
        None.

    Raises:
        Exceptions from ``get_video_id``,
        ``YouTubeTranscriptApi.get_transcript`` or from opening/writing
        *output_path* are propagated unchanged.
    """
    if os.path.exists(output_path):
        return

    video_id = get_video_id(url)

    # Download transcript with 'YouTubeTranscriptApi'
    transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])

    # Use a context manager so the file is flushed and closed deterministically
    # instead of leaking the handle until garbage collection.
    with open(output_path, 'w', encoding='utf-8') as output_file:
        json.dump(transcript, output_file)
38
+
39
+
40
def download_mp3(url, output_path):
    """Download the audio track of a YouTube video and convert it to MP3.

    If *output_path* already exists the function returns immediately.
    Otherwise the best available audio stream is downloaded with
    ``youtube_dl`` and converted to a 192 kbit/s MP3 by the
    ``FFmpegExtractAudio`` post-processor.

    Args:
        url: The video URL to download.
        output_path: Destination path. It is expected to end in ``.mp3``;
            the final file is written to the path's stem plus ``.mp3``.

    Returns:
        None.

    Raises:
        Exceptions raised by ``youtube_dl`` are propagated unchanged.
    """
    if os.path.exists(output_path):
        return

    # Let youtube-dl name the downloaded stream after its real container
    # extension rather than the literal target path. When the output template
    # is exactly "<name>.mp3", the raw stream is saved under that name and the
    # audio-extraction post-processor sees identical source and target paths
    # and skips the conversion, leaving a non-MP3 file with an .mp3 name.
    # NOTE(review): the stem is not escaped, so any '%' fields in output_path
    # keep being interpreted as a youtube-dl output template, as before.
    stem, _ = os.path.splitext(output_path)
    ydl_opts = {
        'outtmpl': stem + '.%(ext)s',
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])