Joshua Lochner committed on
Commit
a45bd3f
1 Parent(s): 6e9c369

Add youtube transcript api

Browse files
youtube_transcript_api2/__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ._api import YouTubeTranscriptApi
2
+ from ._transcripts import TranscriptList, Transcript
3
+ from ._errors import (
4
+ TranscriptsDisabled,
5
+ NoTranscriptFound,
6
+ CouldNotRetrieveTranscript,
7
+ VideoUnavailable,
8
+ TooManyRequests,
9
+ NotTranslatable,
10
+ TranslationLanguageNotAvailable,
11
+ NoTranscriptAvailable,
12
+ CookiePathInvalid,
13
+ CookiesInvalid,
14
+ FailedToCreateConsentCookie,
15
+ YouTubeRequestFailed,
16
+ )
youtube_transcript_api2/__main__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+
3
+ import logging
4
+
5
+ from ._cli import YouTubeTranscriptCli
6
+
7
+
8
def main():
    """Run the command line interface with the process arguments and print its result."""
    # Set up the root logger with the default configuration before running the CLI.
    logging.basicConfig()

    cli = YouTubeTranscriptCli(sys.argv[1:])
    print(cli.run())
12
+
13
+
14
+ if __name__ == '__main__':
15
+ main()
youtube_transcript_api2/_api.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ try: # pragma: no cover
3
+ import http.cookiejar as cookiejar
4
+ CookieLoadError = (FileNotFoundError, cookiejar.LoadError)
5
+ except ImportError: # pragma: no cover
6
+ import cookielib as cookiejar
7
+ CookieLoadError = IOError
8
+
9
+ from ._transcripts import TranscriptListFetcher
10
+
11
+ from ._errors import (
12
+ CookiePathInvalid,
13
+ CookiesInvalid
14
+ )
15
+
16
+
17
class YouTubeTranscriptApi(object):
    """Entry point for listing and fetching the transcripts of YouTube videos."""

    @classmethod
    def list_transcripts(cls, video_id, proxies=None, cookies=None):
        """
        Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object
        which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating
        over the `TranscriptList` the individual transcripts are represented by `Transcript` objects, which provide
        metadata and can either be fetched by calling `transcript.fetch()` or translated by calling
        `transcript.translate('en')`. Example::

            # retrieve the available transcripts
            transcript_list = YouTubeTranscriptApi.list_transcripts('video_id')

            # iterate over all available transcripts
            for transcript in transcript_list:
                # the Transcript object provides metadata properties
                print(
                    transcript.video_id,
                    transcript.language,
                    transcript.language_code,
                    # whether it has been manually created or generated by YouTube
                    transcript.is_generated,
                    # a list of languages the transcript can be translated to
                    transcript.translation_languages,
                )

                # fetch the actual transcript data
                print(transcript.fetch())

                # translating the transcript will return another transcript object
                print(transcript.translate('en').fetch())

            # you can also directly filter for the language you are looking for, using the transcript list
            transcript = transcript_list.find_transcript(['de', 'en'])

            # or just filter for manually created transcripts
            transcript = transcript_list.find_manually_created_transcript(['de', 'en'])

            # or automatically generated ones
            transcript = transcript_list.find_generated_transcript(['de', 'en'])

        :param video_id: the youtube video id
        :type video_id: str
        :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
        :param cookies: a string of the path to a text file containing youtube authorization cookies
        :type cookies: str
        :return: the list of available transcripts
        :rtype TranscriptList:
        """
        with requests.Session() as http_client:
            if cookies:
                http_client.cookies = cls._load_cookies(cookies, video_id)
            http_client.proxies = proxies if proxies else {}
            return TranscriptListFetcher(http_client).fetch(video_id)

    @classmethod
    def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, cookies=None):
        """
        Retrieves the transcripts for a list of videos.

        :param video_ids: a list of youtube video ids
        :type video_ids: list[str]
        :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
        it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
        do so.
        :type languages: list[str]
        :param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving
        one of the video transcripts
        :type continue_after_error: bool
        :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
        :param cookies: a string of the path to a text file containing youtube authorization cookies
        :type cookies: str
        :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
        video ids, which could not be retrieved
        :rtype ({str: [{'text': str, 'start': float, 'duration': float}]}, [str]):
        :raises: whatever `get_transcript` raised for the first failing video, unless `continue_after_error` is set
        """
        data = {}
        unretrievable_videos = []

        for video_id in video_ids:
            try:
                data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies)
            except Exception:
                if not continue_after_error:
                    # A bare raise re-raises the active exception with its original traceback.
                    raise

                unretrievable_videos.append(video_id)

        return data, unretrievable_videos

    @classmethod
    def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None):
        """
        Retrieves the transcript for a single video. This is just a shortcut for calling::

            YouTubeTranscriptApi.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch()

        :param video_id: the youtube video id
        :type video_id: str
        :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
        it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
        do so.
        :type languages: list[str]
        :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
        :param cookies: a string of the path to a text file containing youtube authorization cookies
        :type cookies: str
        :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
        :rtype [{'text': str, 'start': float, 'duration': float}]:
        """
        return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch()

    @classmethod
    def _load_cookies(cls, cookies, video_id):
        """
        Loads a Mozilla/Netscape format cookie file into a cookie jar.

        :param cookies: path to the cookie file
        :type cookies: str
        :param video_id: the video id reported in the raised errors
        :type video_id: str
        :return: the populated cookie jar
        :raises: CookiePathInvalid if the file could not be loaded, CookiesInvalid if the loaded jar is empty
        """
        cookie_jar = cookiejar.MozillaCookieJar()
        # Only the load itself can raise the errors collected in CookieLoadError.
        try:
            cookie_jar.load(cookies)
        except CookieLoadError:
            raise CookiePathInvalid(video_id)

        # A jar without any cookies is falsy.
        if not cookie_jar:
            raise CookiesInvalid(video_id)
        return cookie_jar
youtube_transcript_api2/_cli.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+ from ._api import YouTubeTranscriptApi
4
+
5
+ from .formatters import FormatterLoader
6
+
7
+
8
class YouTubeTranscriptCli(object):
    """Command line front end which fetches transcripts for the video ids given as arguments."""

    def __init__(self, args):
        """
        :param args: the raw command line arguments (without the program name)
        :type args: list[str]
        """
        self._args = args

    def run(self):
        """
        Parses the arguments, fetches one transcript per video id and renders the result.

        Errors raised while fetching are collected and their messages are placed before the formatted transcripts.

        :return: the text to print
        :rtype str:
        """
        options = self._parse_args()

        # Excluding both kinds of transcripts leaves nothing to retrieve.
        if options.exclude_manually_created and options.exclude_generated:
            return ''

        proxy_requested = options.http_proxy != '' or options.https_proxy != ''
        proxies = {"http": options.http_proxy, "https": options.https_proxy} if proxy_requested else None
        cookies = options.cookies

        transcripts = []
        exceptions = []
        for video_id in options.video_ids:
            try:
                transcripts.append(self._fetch_transcript(options, proxies, cookies, video_id))
            except Exception as exception:
                exceptions.append(exception)

        output_parts = [str(exception) for exception in exceptions]
        if transcripts:
            formatter = FormatterLoader().load(options.format)
            output_parts.append(formatter.format_transcripts(transcripts))
        return '\n\n'.join(output_parts)

    def _fetch_transcript(self, parsed_args, proxies, cookies, video_id):
        """
        Retrieves the transcript (or the transcript listing) of a single video according to the parsed arguments.
        """
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id, proxies=proxies, cookies=cookies)

        if parsed_args.list_transcripts:
            return str(transcript_list)

        languages = parsed_args.languages
        if parsed_args.exclude_manually_created:
            finder = transcript_list.find_generated_transcript
        elif parsed_args.exclude_generated:
            finder = transcript_list.find_manually_created_transcript
        else:
            finder = transcript_list.find_transcript
        transcript = finder(languages)

        if parsed_args.translate:
            transcript = transcript.translate(parsed_args.translate)

        return transcript.fetch()

    def _parse_args(self):
        """
        Builds the argument parser and parses the arguments passed to the constructor.

        :return: the parsed arguments with sanitized video ids
        """
        parser = argparse.ArgumentParser(
            description=(
                'This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. '
                'It also works for automatically generated subtitles and it does not require a headless browser, like '
                'other selenium based solutions do!'
            )
        )
        add_argument = parser.add_argument

        add_argument(
            '--list-transcripts',
            action='store_const',
            const=True,
            default=False,
            help='This will list the languages in which the given videos are available in.',
        )
        add_argument('video_ids', nargs='+', type=str, help='List of YouTube video IDs.')
        add_argument(
            '--languages',
            nargs='*',
            default=['en'],
            type=str,
            help=(
                'A list of language codes in a descending priority. For example, if this is set to "de en" it will '
                'first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails '
                'to do so. As I can\'t provide a complete list of all working language codes with full certainty, you '
                'may have to play around with the language codes a bit, to find the one which is working for you!'
            ),
        )
        add_argument(
            '--exclude-generated',
            action='store_const',
            const=True,
            default=False,
            help='If this flag is set transcripts which have been generated by YouTube will not be retrieved.',
        )
        add_argument(
            '--exclude-manually-created',
            action='store_const',
            const=True,
            default=False,
            help='If this flag is set transcripts which have been manually created will not be retrieved.',
        )
        add_argument(
            '--format',
            type=str,
            default='pretty',
            choices=tuple(FormatterLoader.TYPES.keys()),
        )
        add_argument(
            '--translate',
            default='',
            help=(
                'The language code for the language you want this transcript to be translated to. Use the '
                '--list-transcripts feature to find out which languages are translatable and which translation '
                'languages are available.'
            ),
        )
        add_argument('--http-proxy', default='', metavar='URL', help='Use the specified HTTP proxy.')
        add_argument('--https-proxy', default='', metavar='URL', help='Use the specified HTTPS proxy.')
        add_argument(
            '--cookies',
            default=None,
            help='The cookie file that will be used for authorization with youtube.',
        )

        parsed = parser.parse_args(self._args)
        return self._sanitize_video_ids(parsed)

    def _sanitize_video_ids(self, args):
        """Removes every backslash from the video ids stored on `args` and returns `args`."""
        cleaned_ids = []
        for video_id in args.video_ids:
            cleaned_ids.append(video_id.replace('\\', ''))
        args.video_ids = cleaned_ids
        return args
youtube_transcript_api2/_errors.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ._settings import WATCH_URL
2
+
3
+
4
class CouldNotRetrieveTranscript(Exception):
    """
    Raised if a transcript could not be retrieved.

    Subclasses describe the reason by overriding `CAUSE_MESSAGE` or the `cause` property.
    """
    ERROR_MESSAGE = '\nCould not retrieve a transcript for the video {video_url}!'
    CAUSE_MESSAGE_INTRO = ' This is most likely caused by:\n\n{cause}'
    CAUSE_MESSAGE = ''
    GITHUB_REFERRAL = (
        '\n\nIf you are sure that the described cause is not responsible for this error '
        'and that a transcript should be retrievable, please create an issue at '
        'https://github.com/jdepoix/youtube-transcript-api/issues. '
        'Please add which version of youtube_transcript_api you are using '
        'and provide the information needed to replicate the error. '
        'Also make sure that there are no open issues which already describe your problem!'
    )

    def __init__(self, video_id):
        """
        :param video_id: the id of the video the transcript was requested for
        :type video_id: str
        """
        self.video_id = video_id
        message = self._build_error_message()
        super(CouldNotRetrieveTranscript, self).__init__(message)

    def _build_error_message(self):
        """Composes the exception text from the video url and, if there is one, the cause."""
        cause = self.cause
        video_url = WATCH_URL.format(video_id=self.video_id)
        message = self.ERROR_MESSAGE.format(video_url=video_url)

        # Without a cause only the plain error line is reported.
        if not cause:
            return message

        return message + self.CAUSE_MESSAGE_INTRO.format(cause=cause) + self.GITHUB_REFERRAL

    @property
    def cause(self):
        """The text describing the cause; empty if no cause is known."""
        return self.CAUSE_MESSAGE
36
+
37
+
38
class YouTubeRequestFailed(CouldNotRetrieveTranscript):
    """Reports a failed request to YouTube together with the text of the underlying HTTP error."""
    CAUSE_MESSAGE = 'Request to YouTube failed: {reason}'

    def __init__(self, video_id, http_error):
        """
        :param video_id: the id of the video the request was made for
        :param http_error: the error which caused the failure; its string form is used as the reason
        """
        # The reason has to be set before the base constructor builds the message from `cause`.
        self.reason = str(http_error)
        super(YouTubeRequestFailed, self).__init__(video_id)

    @property
    def cause(self):
        """The cause message with the reason filled in."""
        template = self.CAUSE_MESSAGE
        return template.format(reason=self.reason)
50
+
51
+
52
class VideoUnavailable(CouldNotRetrieveTranscript):
    """Error whose cause states that the video is no longer available."""

    CAUSE_MESSAGE = 'The video is no longer available'
54
+
55
+
56
class TooManyRequests(CouldNotRetrieveTranscript):
    """Error whose cause explains that YouTube requires solving a captcha and lists possible workarounds."""

    # Every line break is an explicit '\n' inside its own literal. Backslash line continuations inside a
    # string literal would make the leading whitespace of the following source line part of the message,
    # so the rendered text would silently change whenever the code is re-indented.
    CAUSE_MESSAGE = (
        'YouTube is receiving too many requests from this IP and now requires solving a captcha to continue. '
        'One of the following things can be done to work around this:\n'
        ' - Manually solve the captcha in a browser and export the cookie. '
        'Read here how to use that cookie with '
        'youtube-transcript-api: https://github.com/jdepoix/youtube-transcript-api#cookies\n'
        ' - Use a different IP address\n'
        ' - Wait until the ban on your IP has been lifted'
    )
66
+
67
+
68
class TranscriptsDisabled(CouldNotRetrieveTranscript):
    """Error whose cause states that subtitles are disabled for the video."""

    CAUSE_MESSAGE = 'Subtitles are disabled for this video'
70
+
71
+
72
class NoTranscriptAvailable(CouldNotRetrieveTranscript):
    """Error whose cause states that the video has no transcripts at all."""

    CAUSE_MESSAGE = 'No transcripts are available for this video'
74
+
75
+
76
class NotTranslatable(CouldNotRetrieveTranscript):
    """Error whose cause states that the requested language cannot be translated."""

    CAUSE_MESSAGE = 'The requested language is not translatable'
78
+
79
+
80
class TranslationLanguageNotAvailable(CouldNotRetrieveTranscript):
    """Error whose cause states that the requested translation language is not offered."""

    CAUSE_MESSAGE = 'The requested translation language is not available'
82
+
83
+
84
class CookiePathInvalid(CouldNotRetrieveTranscript):
    """Error whose cause states that the given cookie file could not be loaded."""

    CAUSE_MESSAGE = 'The provided cookie file was unable to be loaded'
86
+
87
+
88
class CookiesInvalid(CouldNotRetrieveTranscript):
    """Error whose cause states that the provided cookies are not valid."""

    CAUSE_MESSAGE = 'The cookies provided are not valid (may have expired)'
90
+
91
+
92
class FailedToCreateConsentCookie(CouldNotRetrieveTranscript):
    """Error whose cause states that consent to saving cookies could not be given automatically."""

    CAUSE_MESSAGE = 'Failed to automatically give consent to saving cookies'
94
+
95
+
96
class NoTranscriptFound(CouldNotRetrieveTranscript):
    """Reports that none of the requested language codes matched and includes the available transcript data."""
    CAUSE_MESSAGE = (
        'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n'
        '{transcript_data}'
    )

    def __init__(self, video_id, requested_language_codes, transcript_data):
        """
        :param video_id: the id of the video the transcript was requested for
        :param requested_language_codes: the language codes which have been searched for
        :param transcript_data: object whose string form is appended to the cause message
        """
        # Both values have to be set before the base constructor builds the message from `cause`.
        self._requested_language_codes = requested_language_codes
        self._transcript_data = transcript_data
        super(NoTranscriptFound, self).__init__(video_id)

    @property
    def cause(self):
        """The cause message with the requested language codes and the transcript data filled in."""
        template = self.CAUSE_MESSAGE
        return template.format(
            requested_language_codes=self._requested_language_codes,
            transcript_data=str(self._transcript_data),
        )
youtube_transcript_api2/_html_unescaping.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+
3
+
4
+ # This can only be tested by using different python versions, therefore it is not covered by coverage.py
5
# Comparing against a version tuple also selects the stdlib function for any version after 3.x,
# instead of falling back to the legacy HTMLParser based implementation there.
if sys.version_info >= (3, 4):  # pragma: no cover
    # Python 3.4+
    from html import unescape
else:  # pragma: no cover
    if sys.version_info.major <= 2:
        # Python 2
        import HTMLParser

        html_parser = HTMLParser.HTMLParser()
    else:
        # Python 3.0 - 3.3
        import html.parser

        html_parser = html.parser.HTMLParser()

    def unescape(string):
        """Replaces the HTML character references in `string` using the legacy parser instance."""
        return html_parser.unescape(string)
youtube_transcript_api2/_settings.py ADDED
@@ -0,0 +1 @@
 
 
1
+ WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
youtube_transcript_api2/_transcripts.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+
3
+ # This can only be tested by using different python versions, therefore it is not covered by coverage.py
4
+ if sys.version_info.major == 2: # pragma: no cover
5
+ reload(sys)
6
+ sys.setdefaultencoding('utf-8')
7
+
8
+ import json
9
+
10
+ from xml.etree import ElementTree
11
+
12
+ import re
13
+
14
+ from requests import HTTPError
15
+
16
+ from ._html_unescaping import unescape
17
+ from ._errors import (
18
+ VideoUnavailable,
19
+ TooManyRequests,
20
+ YouTubeRequestFailed,
21
+ NoTranscriptFound,
22
+ TranscriptsDisabled,
23
+ NotTranslatable,
24
+ TranslationLanguageNotAvailable,
25
+ NoTranscriptAvailable,
26
+ FailedToCreateConsentCookie,
27
+ )
28
+ from ._settings import WATCH_URL
29
+
30
+
31
+ def _raise_http_errors(response, video_id):
32
+ try:
33
+ response.raise_for_status()
34
+ return response
35
+ except HTTPError as error:
36
+ raise YouTubeRequestFailed(error, video_id)
37
+
38
+
39
class TranscriptListFetcher(object):
    """Fetches the watch page of a video and turns the captions data found in it into a `TranscriptList`."""

    def __init__(self, http_client):
        """
        :param http_client: the client used for all requests; needs `get()` and a `cookies` jar with `set()`
        """
        self._http_client = http_client

    def fetch(self, video_id):
        """
        Builds the `TranscriptList` for the given video.

        :param video_id: the youtube video id
        :type video_id: str
        :return: the list of available transcripts
        :rtype TranscriptList:
        """
        video_html = self._fetch_video_html(video_id)
        captions_json = self._extract_captions_json(video_html, video_id)
        return TranscriptList.build(self._http_client, video_id, captions_json)

    def _extract_captions_json(self, html, video_id):
        """
        Cuts the value following '"captions":' out of the page and returns its 'playerCaptionsTracklistRenderer'.

        :raises: TooManyRequests, VideoUnavailable, TranscriptsDisabled, NoTranscriptAvailable
        """
        html_parts = html.split('"captions":')

        if len(html_parts) < 2:
            # No captions section at all; the remaining page content decides which error is reported.
            if 'class="g-recaptcha"' in html:
                raise TooManyRequests(video_id)
            if '"playabilityStatus":' not in html:
                raise VideoUnavailable(video_id)

            raise TranscriptsDisabled(video_id)

        # The captions object ends where the ',"videoDetails' key starts.
        raw_captions = html_parts[1].split(',"videoDetails')[0].replace('\n', '')
        captions_json = json.loads(raw_captions).get('playerCaptionsTracklistRenderer')

        if captions_json is None:
            raise TranscriptsDisabled(video_id)

        if 'captionTracks' not in captions_json:
            raise NoTranscriptAvailable(video_id)

        return captions_json

    def _create_consent_cookie(self, html, video_id):
        """
        Sets the CONSENT cookie on the client, using the value of the form field named "v" found in the page.

        :raises: FailedToCreateConsentCookie if the page does not contain that field
        """
        match = re.search('name="v" value="(.*?)"', html)
        if match is None:
            raise FailedToCreateConsentCookie(video_id)
        consent_value = 'YES+' + match.group(1)
        self._http_client.cookies.set('CONSENT', consent_value, domain='.youtube.com')

    def _fetch_video_html(self, video_id):
        """
        Fetches the watch page; if the consent form is served instead, creates the consent cookie and retries once.

        :raises: FailedToCreateConsentCookie if the consent form is still served after the retry
        """
        consent_form_marker = 'action="https://consent.youtube.com/s"'

        html = self._fetch_html(video_id)
        if consent_form_marker not in html:
            return html

        self._create_consent_cookie(html, video_id)
        html = self._fetch_html(video_id)
        if consent_form_marker in html:
            raise FailedToCreateConsentCookie(video_id)
        return html

    def _fetch_html(self, video_id):
        """Requests the watch page and returns its HTML-unescaped text."""
        watch_url = WATCH_URL.format(video_id=video_id)
        response = self._http_client.get(watch_url)
        return unescape(_raise_http_errors(response, video_id).text)
90
+
91
+
92
class TranscriptList(object):
    """
    This object represents a list of transcripts. It can be iterated over to list all transcripts which are available
    for a given YouTube video. Also it provides functionality to search for a transcript in a given language.
    """
    def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
        """
        The constructor is only for internal use. Use the static build method instead.

        :param video_id: the id of the video this TranscriptList is for
        :type video_id: str
        :param manually_created_transcripts: dict mapping language codes to the manually created transcripts
        :type manually_created_transcripts: dict[str, Transcript]
        :param generated_transcripts: dict mapping language codes to the generated transcripts
        :type generated_transcripts: dict[str, Transcript]
        :param translation_languages: list of languages which can be used for translatable languages
        :type translation_languages: list[dict[str, str]]
        """
        self.video_id = video_id
        self._manually_created_transcripts = manually_created_transcripts
        self._generated_transcripts = generated_transcripts
        self._translation_languages = translation_languages

    @staticmethod
    def build(http_client, video_id, captions_json):
        """
        Factory method for TranscriptList.

        :param http_client: http client which is used to make the transcript retrieving http calls
        :type http_client: requests.Session
        :param video_id: the id of the video this TranscriptList is for
        :type video_id: str
        :param captions_json: the JSON parsed from the YouTube pages static HTML
        :type captions_json: dict
        :return: the created TranscriptList
        :rtype TranscriptList:
        """
        # A missing 'translationLanguages' key is treated as "no translation languages" instead of
        # failing with a KeyError.
        translation_languages = [
            {
                'language': translation_language['languageName']['simpleText'],
                'language_code': translation_language['languageCode'],
            } for translation_language in captions_json.get('translationLanguages', [])
        ]

        manually_created_transcripts = {}
        generated_transcripts = {}

        for caption in captions_json['captionTracks']:
            # Tracks of kind 'asr' are the generated ones; every other track counts as manually created.
            is_generated = caption.get('kind', '') == 'asr'
            transcript_dict = generated_transcripts if is_generated else manually_created_transcripts

            transcript_dict[caption['languageCode']] = Transcript(
                http_client,
                video_id,
                caption['baseUrl'],
                caption['name']['simpleText'],
                caption['languageCode'],
                is_generated,
                translation_languages if caption.get('isTranslatable', False) else []
            )

        return TranscriptList(
            video_id,
            manually_created_transcripts,
            generated_transcripts,
            translation_languages,
        )

    def __iter__(self):
        """Iterates over the manually created transcripts first, followed by the generated ones."""
        return iter(list(self._manually_created_transcripts.values()) + list(self._generated_transcripts.values()))

    def find_transcript(self, language_codes):
        """
        Finds a transcript for a given language code. Manually created transcripts are returned first and only if none
        are found, generated transcripts are used. If you only want generated transcripts use
        `find_generated_transcript` instead.

        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
        ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
        it fails to do so.
        :type language_codes: list[str]
        :return: the found Transcript
        :rtype Transcript:
        :raises: NoTranscriptFound
        """
        return self._find_transcript(language_codes, [self._manually_created_transcripts, self._generated_transcripts])

    def find_generated_transcript(self, language_codes):
        """
        Finds an automatically generated transcript for a given language code.

        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
        ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
        it fails to do so.
        :type language_codes: list[str]
        :return: the found Transcript
        :rtype Transcript:
        :raises: NoTranscriptFound
        """
        return self._find_transcript(language_codes, [self._generated_transcripts])

    def find_manually_created_transcript(self, language_codes):
        """
        Finds a manually created transcript for a given language code.

        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
        ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
        it fails to do so.
        :type language_codes: list[str]
        :return: the found Transcript
        :rtype Transcript:
        :raises: NoTranscriptFound
        """
        return self._find_transcript(language_codes, [self._manually_created_transcripts])

    def _find_transcript(self, language_codes, transcript_dicts):
        """
        Returns the first match, trying every dict for one language code before moving on to the next code.

        :raises: NoTranscriptFound if no language code is contained in any of the dicts
        """
        for language_code in language_codes:
            for transcript_dict in transcript_dicts:
                if language_code in transcript_dict:
                    return transcript_dict[language_code]

        raise NoTranscriptFound(
            self.video_id,
            language_codes,
            self
        )

    def __str__(self):
        """Describes the available manually created and generated transcripts and the translation languages."""
        return (
            'For this video ({video_id}) transcripts are available in the following languages:\n\n'
            '(MANUALLY CREATED)\n'
            '{available_manually_created_transcript_languages}\n\n'
            '(GENERATED)\n'
            '{available_generated_transcripts}\n\n'
            '(TRANSLATION LANGUAGES)\n'
            '{available_translation_languages}'
        ).format(
            video_id=self.video_id,
            available_manually_created_transcript_languages=self._get_language_description(
                str(transcript) for transcript in self._manually_created_transcripts.values()
            ),
            available_generated_transcripts=self._get_language_description(
                str(transcript) for transcript in self._generated_transcripts.values()
            ),
            available_translation_languages=self._get_language_description(
                '{language_code} ("{language}")'.format(
                    language=translation_language['language'],
                    language_code=translation_language['language_code'],
                ) for translation_language in self._translation_languages
            )
        )

    def _get_language_description(self, transcript_strings):
        """Renders the strings as a bullet list, or 'None' if there is nothing to list."""
        description = '\n'.join(' - {transcript}'.format(transcript=transcript) for transcript in transcript_strings)
        return description if description else 'None'
249
+
250
+
251
class Transcript(object):
    """A single transcript of a video which can be fetched or translated into another language."""

    def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
        """
        You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
        TranscriptList.

        :param http_client: http client which is used to make the transcript retrieving http calls
        :type http_client: requests.Session
        :param video_id: the id of the video this Transcript is for
        :type video_id: str
        :param url: the url which needs to be called to fetch the transcript
        :param language: the name of the language this transcript uses
        :param language_code: the code of the language this transcript uses
        :param is_generated: flag stored as `is_generated`
        :param translation_languages: list of dicts with the 'language' and 'language_code' keys this transcript
        can be translated to; an empty list marks the transcript as not translatable
        """
        self._http_client = http_client
        self.video_id = video_id
        self._url = url
        self.language = language
        self.language_code = language_code
        self.is_generated = is_generated
        self.translation_languages = translation_languages
        # Lookup table from language code to language name, used by translate().
        code_to_language = {}
        for entry in translation_languages:
            code_to_language[entry['language_code']] = entry['language']
        self._translation_languages_dict = code_to_language

    def fetch(self):
        """
        Loads the actual transcript data.

        :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
        :rtype [{'text': str, 'start': float, 'duration': float}]:
        """
        response = self._http_client.get(self._url)
        parser = _TranscriptParser()
        checked_response = _raise_http_errors(response, self.video_id)
        return parser.parse(checked_response.text)

    def __str__(self):
        if self.is_translatable:
            suffix = '[TRANSLATABLE]'
        else:
            suffix = ''
        return '{language_code} ("{language}"){translation_description}'.format(
            language=self.language,
            language_code=self.language_code,
            translation_description=suffix,
        )

    @property
    def is_translatable(self):
        """True if at least one translation language is available."""
        return len(self.translation_languages) > 0

    def translate(self, language_code):
        """
        Creates the Transcript for the translation into the given language.

        :param language_code: the code of the language to translate to
        :type language_code: str
        :return: a new Transcript, marked as generated and not translatable itself
        :rtype Transcript:
        :raises: NotTranslatable, TranslationLanguageNotAvailable
        """
        if not self.is_translatable:
            raise NotTranslatable(self.video_id)

        known_languages = self._translation_languages_dict
        if language_code not in known_languages:
            raise TranslationLanguageNotAvailable(self.video_id)

        translated_url = '{url}&tlang={language_code}'.format(url=self._url, language_code=language_code)
        return Transcript(
            self._http_client,
            self.video_id,
            translated_url,
            known_languages[language_code],
            language_code,
            True,
            [],
        )
318
+
319
+
320
+ class _TranscriptParser(object):
321
+ HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE)
322
+
323
+ def parse(self, plain_data):
324
+ return [
325
+ {
326
+ 'text': re.sub(self.HTML_TAG_REGEX, '', unescape(xml_element.text)),
327
+ 'start': float(xml_element.attrib['start']),
328
+ 'duration': float(xml_element.attrib.get('dur', '0.0')),
329
+ }
330
+ for xml_element in ElementTree.fromstring(plain_data)
331
+ if xml_element.text is not None
332
+ ]
youtube_transcript_api2/formatters.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ import pprint
4
+
5
+
6
class Formatter(object):
    """Abstract base class for all transcript formatters.

    Subclasses are expected to override both .format_transcript() and
    .format_transcripts(), each of which should return a string. A
    transcript is represented by a list of dictionary items.
    """

    def format_transcript(self, transcript, **kwargs):
        """Formats a single transcript; must be overridden by subclasses.

        :param transcript:
        :raises NotImplementedError: always, in this base class.
        """
        message = ('A subclass of Formatter must implement '
                   'their own .format_transcript() method.')
        raise NotImplementedError(message)

    def format_transcripts(self, transcripts, **kwargs):
        """Formats a list of transcripts; must be overridden by subclasses.

        :param transcripts:
        :raises NotImplementedError: always, in this base class.
        """
        message = ('A subclass of Formatter must implement '
                   'their own .format_transcripts() method.')
        raise NotImplementedError(message)
21
+
22
+
23
class PrettyPrintFormatter(Formatter):
    """Formatter that renders transcripts with `pprint.pformat`."""

    def format_transcript(self, transcript, **kwargs):
        """Pretty prints a transcript.

        :param transcript:
        :param kwargs: forwarded unchanged to `pprint.pformat`.
        :return: A pretty printed string representation of the transcript.
        :rtype str
        """
        pretty_text = pprint.pformat(transcript, **kwargs)
        return pretty_text

    def format_transcripts(self, transcripts, **kwargs):
        """Pretty prints a list of transcripts.

        The whole list is handed to .format_transcript() as a single object.

        :param transcripts:
        :param kwargs: forwarded unchanged to .format_transcript().
        :return: A pretty printed string representation of the transcripts.
        :rtype str
        """
        pretty_text = self.format_transcript(transcripts, **kwargs)
        return pretty_text
41
+
42
+
43
class JSONFormatter(Formatter):
    """Formatter that serializes transcripts with `json.dumps`."""

    def format_transcript(self, transcript, **kwargs):
        """Converts a transcript into a JSON string.

        :param transcript:
        :param kwargs: forwarded unchanged to `json.dumps`.
        :return: A JSON string representation of the transcript.
        :rtype str
        """
        serialized = json.dumps(transcript, **kwargs)
        return serialized

    def format_transcripts(self, transcripts, **kwargs):
        """Converts a list of transcripts into a JSON string.

        The whole list is handed to .format_transcript() as a single object.

        :param transcripts:
        :param kwargs: forwarded unchanged to .format_transcript().
        :return: A JSON string representation of the transcripts.
        :rtype str
        """
        serialized = self.format_transcript(transcripts, **kwargs)
        return serialized
61
+
62
+
63
class TextFormatter(Formatter):
    """Formatter that keeps only the text of each transcript line."""

    def format_transcript(self, transcript, **kwargs):
        """Converts a transcript into plain text with no timestamps.

        :param transcript:
        :return: all transcript text lines separated by newline breaks.
        :rtype str
        """
        texts = [snippet['text'] for snippet in transcript]
        return '\n'.join(texts)

    def format_transcripts(self, transcripts, **kwargs):
        """Converts a list of transcripts into plain text with no timestamps.

        :param transcripts:
        :param kwargs: forwarded unchanged to .format_transcript().
        :return: the plain text of every transcript, with three newline
            characters between consecutive transcripts.
        :rtype str
        """
        rendered = (
            self.format_transcript(single_transcript, **kwargs)
            for single_transcript in transcripts
        )
        return '\n\n\n'.join(rendered)
81
+
82
+
83
class WebVTTFormatter(Formatter):
    """Formatter that renders transcripts as WebVTT caption documents."""

    def _seconds_to_timestamp(self, time):
        """Helper that converts `time` into a transcript cue timestamp.

        The value is rounded once to whole milliseconds and then split with
        integer arithmetic. Formatting the float components directly (e.g.
        with '{:02.0f}') would round 6.93 seconds up to '07' and could emit
        a four digit millisecond field for values just below a full second.

        :reference: https://www.w3.org/TR/webvtt1/#webvtt-timestamp

        :param time: a float representing time in seconds.
        :type time: float
        :return: a string formatted as a cue timestamp, 'HH:MM:SS.mmm'
        :rtype str
        :example:
        >>> self._seconds_to_timestamp(6.93)
        '00:00:06.930'
        """
        total_ms = int(round(float(time) * 1000))
        hours, remainder_ms = divmod(total_ms, 3600 * 1000)
        mins, remainder_ms = divmod(remainder_ms, 60 * 1000)
        secs, ms = divmod(remainder_ms, 1000)
        return "{:02d}:{:02d}:{:02d}.{:03d}".format(hours, mins, secs, ms)

    def format_transcript(self, transcript, **kwargs):
        """A basic implementation of WEBVTT formatting.

        :param transcript:
        :return: a WEBVTT document containing one cue per transcript line.
        :rtype str
        :reference: https://www.w3.org/TR/webvtt1/#introduction-caption
        """
        last_index = len(transcript) - 1
        lines = []
        for i, line in enumerate(transcript):
            if i < last_index:
                # Looks ahead, use next start time since duration value
                # would create an overlap between start times.
                end = transcript[i + 1]['start']
            else:
                # Reached the end, cannot look ahead, use duration now.
                end = line['start'] + line['duration']
            time_text = "{} --> {}".format(
                self._seconds_to_timestamp(line['start']),
                self._seconds_to_timestamp(end)
            )
            lines.append("{}\n{}".format(time_text, line['text']))

        return "WEBVTT\n\n" + "\n\n".join(lines) + "\n"

    def format_transcripts(self, transcripts, **kwargs):
        """A basic implementation of WEBVTT formatting for a list of transcripts.

        Each transcript is rendered as its own WEBVTT document and the
        documents are joined with three newline characters.

        :param transcripts:
        :param kwargs: forwarded unchanged to .format_transcript().
        :rtype str
        :reference: https://www.w3.org/TR/webvtt1/#introduction-caption
        """
        return '\n\n\n'.join([self.format_transcript(transcript, **kwargs) for transcript in transcripts])
136
+
137
+
138
class FormatterLoader(object):
    """Maps formatter type names to formatter classes and instantiates them."""

    # Supported formatter type names and the class that implements each of them.
    TYPES = {
        'json': JSONFormatter,
        'pretty': PrettyPrintFormatter,
        'text': TextFormatter,
        'webvtt': WebVTTFormatter,
    }

    class UnknownFormatterType(Exception):
        """Raised when a formatter type is requested that is not listed in TYPES."""

        def __init__(self, formatter_type):
            template = (
                'The format \'{formatter_type}\' is not supported. '
                'Choose one of the following formats: {supported_formatter_types}'
            )
            message = template.format(
                formatter_type=formatter_type,
                supported_formatter_types=', '.join(FormatterLoader.TYPES),
            )
            super(FormatterLoader.UnknownFormatterType, self).__init__(message)

    def load(self, formatter_type='pretty'):
        """
        Loads the Formatter for the given formatter type.

        :param formatter_type: one of the keys of `FormatterLoader.TYPES`
        :return: Formatter object
        :raises FormatterLoader.UnknownFormatterType: if the type is not supported
        """
        formatter_class = FormatterLoader.TYPES.get(formatter_type)
        if formatter_class is None:
            raise FormatterLoader.UnknownFormatterType(formatter_type)
        return formatter_class()