Add youtube transcript api

Browse files

Files changed (9) hide show

youtube_transcript_api2/__init__.py +16 -0
youtube_transcript_api2/__main__.py +15 -0
youtube_transcript_api2/_api.py +140 -0
youtube_transcript_api2/_cli.py +135 -0
youtube_transcript_api2/_errors.py +112 -0
youtube_transcript_api2/_html_unescaping.py +21 -0
youtube_transcript_api2/_settings.py +1 -0
youtube_transcript_api2/_transcripts.py +332 -0
youtube_transcript_api2/formatters.py +165 -0

youtube_transcript_api2/__init__.py ADDED Viewed

	@@ -0,0 +1,16 @@

+from ._api import YouTubeTranscriptApi
+from ._transcripts import TranscriptList, Transcript
+from ._errors import (
+    TranscriptsDisabled,
+    NoTranscriptFound,
+    CouldNotRetrieveTranscript,
+    VideoUnavailable,
+    TooManyRequests,
+    NotTranslatable,
+    TranslationLanguageNotAvailable,
+    NoTranscriptAvailable,
+    CookiePathInvalid,
+    CookiesInvalid,
+    FailedToCreateConsentCookie,
+    YouTubeRequestFailed,
+)

youtube_transcript_api2/__main__.py ADDED Viewed

	@@ -0,0 +1,15 @@

+import sys
+import logging
+from ._cli import YouTubeTranscriptCli
+def main():
+    """Run the command line interface with the process arguments and print whatever it returns."""
+    logging.basicConfig()
+    cli = YouTubeTranscriptCli(sys.argv[1:])
+    output = cli.run()
+    print(output)
+# only start the CLI when this module is executed directly
+if __name__ == '__main__':
+    main()

youtube_transcript_api2/_api.py ADDED Viewed

	@@ -0,0 +1,140 @@

+import requests
+try: # pragma: no cover
+    import http.cookiejar as cookiejar
+    CookieLoadError = (FileNotFoundError, cookiejar.LoadError)
+except ImportError: # pragma: no cover
+    import cookielib as cookiejar
+    CookieLoadError = IOError
+from ._transcripts import TranscriptListFetcher
+from ._errors import (
+    CookiePathInvalid,
+    CookiesInvalid
+)
+class YouTubeTranscriptApi(object):
+    """
+    Entry point of the library. All methods are classmethods, so no instance has to be created.
+    """
+    @classmethod
+    def list_transcripts(cls, video_id, proxies=None, cookies=None):
+        """
+        Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object
+        which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating
+        over the `TranscriptList` the individual transcripts are represented by `Transcript` objects, which provide
+        metadata and can either be fetched by calling `transcript.fetch()` or translated by calling
+        `transcript.translate('en')`. Example::
+            # retrieve the available transcripts
+            transcript_list = YouTubeTranscriptApi.list_transcripts('video_id')
+            # iterate over all available transcripts
+            for transcript in transcript_list:
+                # the Transcript object provides metadata properties
+                print(
+                    transcript.video_id,
+                    transcript.language,
+                    transcript.language_code,
+                    # whether it has been manually created or generated by YouTube
+                    transcript.is_generated,
+                    # a list of languages the transcript can be translated to
+                    transcript.translation_languages,
+                )
+                # fetch the actual transcript data
+                print(transcript.fetch())
+                # translating the transcript will return another transcript object
+                print(transcript.translate('en').fetch())
+            # you can also directly filter for the language you are looking for, using the transcript list
+            transcript = transcript_list.find_transcript(['de', 'en'])
+            # or just filter for manually created transcripts
+            transcript = transcript_list.find_manually_created_transcript(['de', 'en'])
+            # or automatically generated ones
+            transcript = transcript_list.find_generated_transcript(['de', 'en'])
+        :param video_id: the youtube video id
+        :type video_id: str
+        :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
+        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
+        :param cookies: a string of the path to a text file containing youtube authorization cookies
+        :type cookies: str
+        :return: the list of available transcripts
+        :rtype TranscriptList:
+        :raises: CookiePathInvalid if the cookie file cannot be loaded, CookiesInvalid if it contains no cookies
+        """
+        with requests.Session() as http_client:
+            if cookies:
+                http_client.cookies = cls._load_cookies(cookies, video_id)
+            http_client.proxies = proxies or {}
+            return TranscriptListFetcher(http_client).fetch(video_id)
+    @classmethod
+    def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, cookies=None):
+        """
+        Retrieves the transcripts for a list of videos.
+        :param video_ids: a list of youtube video ids
+        :type video_ids: list[str]
+        :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
+        it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
+        do so.
+        :type languages: list[str]
+        :param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving
+        one of the video transcripts
+        :type continue_after_error: bool
+        :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
+        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
+        :param cookies: a string of the path to a text file containing youtube authorization cookies
+        :type cookies: str
+        :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
+        video ids, which could not be retrieved
+        :rtype ({str: [{'text': str, 'start': float, 'duration': float}]}, [str]):
+        """
+        data = {}
+        unretrievable_videos = []
+        for video_id in video_ids:
+            try:
+                data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies)
+            except Exception:
+                if not continue_after_error:
+                    # bare raise re-raises the active exception unchanged
+                    raise
+                unretrievable_videos.append(video_id)
+        return data, unretrievable_videos
+    @classmethod
+    def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None):
+        """
+        Retrieves the transcript for a single video. This is just a shortcut for calling::
+            YouTubeTranscriptApi.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch()
+        :param video_id: the youtube video id
+        :type video_id: str
+        :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
+        it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
+        do so.
+        :type languages: list[str]
+        :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
+        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
+        :param cookies: a string of the path to a text file containing youtube authorization cookies
+        :type cookies: str
+        :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
+        :rtype [{'text': str, 'start': float, 'duration': float}]:
+        """
+        return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch()
+    @classmethod
+    def _load_cookies(cls, cookies, video_id):
+        """
+        Loads a Mozilla/Netscape formatted cookie file.
+        :param cookies: path to the cookie file
+        :type cookies: str
+        :param video_id: the video id which is reported in the raised errors
+        :type video_id: str
+        :return: the loaded cookie jar
+        :raises: CookiePathInvalid if the file cannot be loaded, CookiesInvalid if the loaded jar is empty
+        """
+        cookie_jar = cookiejar.MozillaCookieJar()
+        # only the load call can fail with a CookieLoadError, so nothing else is guarded
+        try:
+            cookie_jar.load(cookies)
+        except CookieLoadError:
+            raise CookiePathInvalid(video_id)
+        # an empty cookie jar is falsy
+        if not cookie_jar:
+            raise CookiesInvalid(video_id)
+        return cookie_jar

youtube_transcript_api2/_cli.py ADDED Viewed

	@@ -0,0 +1,135 @@

+import argparse
+from ._api import YouTubeTranscriptApi
+from .formatters import FormatterLoader
+class YouTubeTranscriptCli(object):
+    """
+    Command line front end. It parses the given argument list, fetches the requested transcripts and renders them,
+    together with the messages of any errors, into a single string.
+    """
+    def __init__(self, args):
+        """
+        :param args: the raw command line arguments, without the program name
+        :type args: list[str]
+        """
+        self._args = args
+    def run(self):
+        """
+        Executes the command described by the arguments.
+        :return: the error messages followed by the formatted transcripts, separated by blank lines. An empty string
+        is returned if both manually created and generated transcripts are excluded.
+        :rtype str:
+        """
+        options = self._parse_args()
+        # excluding both kinds of transcripts leaves nothing which could be fetched
+        if options.exclude_manually_created and options.exclude_generated:
+            return ''
+        if options.http_proxy == '' and options.https_proxy == '':
+            proxies = None
+        else:
+            proxies = {"http": options.http_proxy, "https": options.https_proxy}
+        cookies = options.cookies
+        fetched = []
+        errors = []
+        for video_id in options.video_ids:
+            try:
+                result = self._fetch_transcript(options, proxies, cookies, video_id)
+            except Exception as error:
+                # a failing video must not prevent the remaining videos from being processed
+                errors.append(error)
+            else:
+                fetched.append(result)
+        sections = [str(error) for error in errors]
+        if fetched:
+            formatter = FormatterLoader().load(options.format)
+            sections.append(formatter.format_transcripts(fetched))
+        return '\n\n'.join(sections)
+    def _fetch_transcript(self, parsed_args, proxies, cookies, video_id):
+        """
+        Retrieves the output for a single video.
+        :return: the string representation of the transcript list if `--list-transcripts` is set, otherwise the
+        fetched (and optionally translated) transcript
+        """
+        listing = YouTubeTranscriptApi.list_transcripts(video_id, proxies=proxies, cookies=cookies)
+        if parsed_args.list_transcripts:
+            return str(listing)
+        # pick the lookup which matches the requested exclusion
+        if parsed_args.exclude_manually_created:
+            find = listing.find_generated_transcript
+        elif parsed_args.exclude_generated:
+            find = listing.find_manually_created_transcript
+        else:
+            find = listing.find_transcript
+        transcript = find(parsed_args.languages)
+        if parsed_args.translate:
+            transcript = transcript.translate(parsed_args.translate)
+        return transcript.fetch()
+    def _parse_args(self):
+        """
+        Builds the argument parser and applies it to the arguments passed to the constructor.
+        :return: the parsed arguments with sanitized video ids
+        :rtype argparse.Namespace:
+        """
+        parser = argparse.ArgumentParser(
+            description=(
+                'This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. '
+                'It also works for automatically generated subtitles and it does not require a headless browser, like '
+                'other selenium based solutions do!'
+            )
+        )
+        add = parser.add_argument
+        add(
+            '--list-transcripts',
+            action='store_true',
+            help='This will list the languages in which the given videos are available in.',
+        )
+        add('video_ids', nargs='+', type=str, help='List of YouTube video IDs.')
+        add(
+            '--languages',
+            nargs='*',
+            default=['en'],
+            type=str,
+            help=(
+                'A list of language codes in a descending priority. For example, if this is set to "de en" it will '
+                'first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails '
+                'to do so. As I can\'t provide a complete list of all working language codes with full certainty, you '
+                'may have to play around with the language codes a bit, to find the one which is working for you!'
+            ),
+        )
+        add(
+            '--exclude-generated',
+            action='store_true',
+            help='If this flag is set transcripts which have been generated by YouTube will not be retrieved.',
+        )
+        add(
+            '--exclude-manually-created',
+            action='store_true',
+            help='If this flag is set transcripts which have been manually created will not be retrieved.',
+        )
+        add(
+            '--format',
+            type=str,
+            default='pretty',
+            choices=tuple(FormatterLoader.TYPES.keys()),
+        )
+        add(
+            '--translate',
+            default='',
+            help=(
+                'The language code for the language you want this transcript to be translated to. Use the '
+                '--list-transcripts feature to find out which languages are translatable and which translation '
+                'languages are available.'
+            )
+        )
+        add('--http-proxy', default='', metavar='URL', help='Use the specified HTTP proxy.')
+        add('--https-proxy', default='', metavar='URL', help='Use the specified HTTPS proxy.')
+        add('--cookies', default=None, help='The cookie file that will be used for authorization with youtube.')
+        parsed = parser.parse_args(self._args)
+        return self._sanitize_video_ids(parsed)
+    def _sanitize_video_ids(self, args):
+        """
+        Removes all backslashes from the video ids.
+        :param args: the parsed arguments, which are modified in place
+        :return: the same arguments object
+        """
+        cleaned = []
+        for video_id in args.video_ids:
+            cleaned.append(video_id.replace('\\', ''))
+        args.video_ids = cleaned
+        return args

youtube_transcript_api2/_errors.py ADDED Viewed

	@@ -0,0 +1,112 @@

+from ._settings import WATCH_URL
+class CouldNotRetrieveTranscript(Exception):
+    """
+    Raised if a transcript could not be retrieved. The exception message is assembled from the class level
+    templates below and the value of the `cause` property.
+    """
+    ERROR_MESSAGE = '\nCould not retrieve a transcript for the video {video_url}!'
+    CAUSE_MESSAGE_INTRO = ' This is most likely caused by:\n\n{cause}'
+    CAUSE_MESSAGE = ''
+    GITHUB_REFERRAL = (
+        '\n\nIf you are sure that the described cause is not responsible for this error '
+        'and that a transcript should be retrievable, please create an issue at '
+        'https://github.com/jdepoix/youtube-transcript-api/issues. '
+        'Please add which version of youtube_transcript_api you are using '
+        'and provide the information needed to replicate the error. '
+        'Also make sure that there are no open issues which already describe your problem!'
+    )
+    def __init__(self, video_id):
+        """
+        :param video_id: the id of the video the transcript could not be retrieved for
+        """
+        self.video_id = video_id
+        message = self._build_error_message()
+        super(CouldNotRetrieveTranscript, self).__init__(message)
+    def _build_error_message(self):
+        """
+        :return: the error message for this video; the cause and the GitHub referral are only appended if `cause`
+        is not empty
+        """
+        cause = self.cause
+        video_url = WATCH_URL.format(video_id=self.video_id)
+        message = self.ERROR_MESSAGE.format(video_url=video_url)
+        if not cause:
+            return message
+        return message + self.CAUSE_MESSAGE_INTRO.format(cause=cause) + self.GITHUB_REFERRAL
+    @property
+    def cause(self):
+        """The description of the cause, which defaults to `CAUSE_MESSAGE`."""
+        return self.CAUSE_MESSAGE
+class YouTubeRequestFailed(CouldNotRetrieveTranscript):
+    """Error which carries the string representation of a failed HTTP request as its cause."""
+    CAUSE_MESSAGE = 'Request to YouTube failed: {reason}'
+    def __init__(self, video_id, http_error):
+        """
+        :param video_id: the id of the video the request was made for
+        :param http_error: the error describing the failed request, stored as a string in `reason`
+        """
+        # `reason` has to be set before the base constructor is called
+        reason = str(http_error)
+        self.reason = reason
+        super(YouTubeRequestFailed, self).__init__(video_id)
+    @property
+    def cause(self):
+        """The cause template filled with the stored reason."""
+        template = self.CAUSE_MESSAGE
+        return template.format(reason=self.reason)
+class VideoUnavailable(CouldNotRetrieveTranscript):
+    """Error whose cause message reports that the video is no longer available."""
+    CAUSE_MESSAGE = 'The video is no longer available'
+class TooManyRequests(CouldNotRetrieveTranscript):
+    """Error whose cause message reports that YouTube requires a captcha and lists possible workarounds."""
+    # NOTE: a backslash at the end of a line inside a string literal continues the literal on the next line, so the
+    # leading whitespace of the lines following such a backslash is part of the message.
+    CAUSE_MESSAGE = (
+        'YouTube is receiving too many requests from this IP and now requires solving a captcha to continue. '
+        'One of the following things can be done to work around this:\n\
+        - Manually solve the captcha in a browser and export the cookie. '
+        'Read here how to use that cookie with '
+        'youtube-transcript-api: https://github.com/jdepoix/youtube-transcript-api#cookies\n\
+        - Use a different IP address\n\
+        - Wait until the ban on your IP has been lifted'
+    )
+class TranscriptsDisabled(CouldNotRetrieveTranscript):
+    """Error whose cause message reports that subtitles are disabled for the video."""
+    CAUSE_MESSAGE = 'Subtitles are disabled for this video'
+class NoTranscriptAvailable(CouldNotRetrieveTranscript):
+    """Error whose cause message reports that no transcripts are available for the video."""
+    CAUSE_MESSAGE = 'No transcripts are available for this video'
+class NotTranslatable(CouldNotRetrieveTranscript):
+    """Error whose cause message reports that the requested language is not translatable."""
+    CAUSE_MESSAGE = 'The requested language is not translatable'
+class TranslationLanguageNotAvailable(CouldNotRetrieveTranscript):
+    """Error whose cause message reports that the requested translation language is not available."""
+    CAUSE_MESSAGE = 'The requested translation language is not available'
+class CookiePathInvalid(CouldNotRetrieveTranscript):
+    """Error whose cause message reports that the provided cookie file could not be loaded."""
+    CAUSE_MESSAGE = 'The provided cookie file was unable to be loaded'
+class CookiesInvalid(CouldNotRetrieveTranscript):
+    """Error whose cause message reports that the provided cookies are not valid."""
+    CAUSE_MESSAGE = 'The cookies provided are not valid (may have expired)'
+class FailedToCreateConsentCookie(CouldNotRetrieveTranscript):
+    """Error whose cause message reports that consent to saving cookies could not be given automatically."""
+    CAUSE_MESSAGE = 'Failed to automatically give consent to saving cookies'
+class NoTranscriptFound(CouldNotRetrieveTranscript):
+    """
+    Error whose cause lists the requested language codes, followed by the string representation of the given
+    transcript data.
+    """
+    CAUSE_MESSAGE = (
+        'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n'
+        '{transcript_data}'
+    )
+    def __init__(self, video_id, requested_language_codes, transcript_data):
+        """
+        :param video_id: the id of the video no transcript was found for
+        :param requested_language_codes: the language codes which have been requested
+        :param transcript_data: object whose string representation is appended to the cause
+        """
+        # both attributes are read by `cause`, so they are assigned before the base constructor is called
+        self._transcript_data = transcript_data
+        self._requested_language_codes = requested_language_codes
+        super(NoTranscriptFound, self).__init__(video_id)
+    @property
+    def cause(self):
+        """The cause template filled with the requested language codes and the transcript data."""
+        details = {
+            'requested_language_codes': self._requested_language_codes,
+            'transcript_data': str(self._transcript_data),
+        }
+        return self.CAUSE_MESSAGE.format(**details)

youtube_transcript_api2/_html_unescaping.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import sys
+# This can only be tested by using different python versions, therefore it is not covered by coverage.py
+# The version tuple is compared as a whole, so that every version from 3.4 onwards (including any major version
+# after 3) uses the standard library function instead of falling through to the legacy branch.
+if sys.version_info >= (3, 4): # pragma: no cover
+    # Python 3.4+
+    from html import unescape
+else: # pragma: no cover
+    if sys.version_info.major <= 2:
+        # Python 2
+        import HTMLParser
+        html_parser = HTMLParser.HTMLParser()
+    else:
+        # Python 3.0 - 3.3
+        import html.parser
+        html_parser = html.parser.HTMLParser()
+    def unescape(string):
+        """Unescapes `string` by delegating to the `unescape` method of the parser instance created above."""
+        return html_parser.unescape(string)

youtube_transcript_api2/_settings.py ADDED Viewed

	@@ -0,0 +1 @@


+# Template of the watch page URL; `{video_id}` is filled in using str.format()
+WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'

youtube_transcript_api2/_transcripts.py ADDED Viewed

	@@ -0,0 +1,332 @@

+import sys
+# This can only be tested by using different python versions, therefore it is not covered by coverage.py
+# Python 2 only: set the default encoding of the interpreter to UTF-8. `reload` is only a builtin on Python 2.
+# NOTE(review): `sys` is presumably reloaded because `setdefaultencoding` is not available on the module after
+# interpreter startup - confirm before touching this.
+if sys.version_info.major == 2: # pragma: no cover
+    reload(sys)
+    sys.setdefaultencoding('utf-8')
+import json
+from xml.etree import ElementTree
+import re
+from requests import HTTPError
+from ._html_unescaping import unescape
+from ._errors import (
+    VideoUnavailable,
+    TooManyRequests,
+    YouTubeRequestFailed,
+    NoTranscriptFound,
+    TranscriptsDisabled,
+    NotTranslatable,
+    TranslationLanguageNotAvailable,
+    NoTranscriptAvailable,
+    FailedToCreateConsentCookie,
+)
+from ._settings import WATCH_URL
+def _raise_http_errors(response, video_id):
+    """
+    Converts HTTP error responses into the error type of this library.
+    :param response: the response to check
+    :type response: requests.Response
+    :param video_id: the id of the video the request was made for
+    :type video_id: str
+    :return: the unchanged response, if its status does not indicate an error
+    :raises: YouTubeRequestFailed if `response.raise_for_status()` raises an HTTPError
+    """
+    try:
+        response.raise_for_status()
+    except HTTPError as error:
+        # YouTubeRequestFailed takes (video_id, http_error); passing them the other way round puts the error text
+        # into the video URL and the video id into the reason
+        raise YouTubeRequestFailed(video_id, error)
+    return response
+class TranscriptListFetcher(object):
+    """
+    Downloads the watch page of a video and turns the captions JSON embedded in it into a `TranscriptList`.
+    """
+    def __init__(self, http_client):
+        """
+        :param http_client: http client which is used for all requests
+        """
+        self._http_client = http_client
+    def fetch(self, video_id):
+        """
+        :param video_id: the id of the video to list the transcripts for
+        :return: the `TranscriptList` built from the captions JSON of the watch page
+        """
+        html = self._fetch_video_html(video_id)
+        captions_json = self._extract_captions_json(html, video_id)
+        return TranscriptList.build(self._http_client, video_id, captions_json)
+    def _extract_captions_json(self, html, video_id):
+        """
+        Cuts the value following '"captions":' out of the page and parses it.
+        :return: the content of the 'playerCaptionsTracklistRenderer' key
+        :raises: TooManyRequests, VideoUnavailable, TranscriptsDisabled, NoTranscriptAvailable
+        """
+        parts = html.split('"captions":')
+        if len(parts) > 1:
+            raw_json = parts[1].split(',"videoDetails')[0].replace('\n', '')
+            renderer = json.loads(raw_json).get('playerCaptionsTracklistRenderer')
+            if renderer is None:
+                raise TranscriptsDisabled(video_id)
+            if 'captionTracks' not in renderer:
+                raise NoTranscriptAvailable(video_id)
+            return renderer
+        # the page does not contain any captions data, find out why
+        if 'class="g-recaptcha"' in html:
+            raise TooManyRequests(video_id)
+        if '"playabilityStatus":' not in html:
+            raise VideoUnavailable(video_id)
+        raise TranscriptsDisabled(video_id)
+    def _create_consent_cookie(self, html, video_id):
+        """
+        Sets the CONSENT cookie on the http client, using the value of the "v" input found in the page.
+        :raises: FailedToCreateConsentCookie if the page does not contain that input
+        """
+        found = re.search('name="v" value="(.*?)"', html)
+        if found is None:
+            raise FailedToCreateConsentCookie(video_id)
+        consent_value = 'YES+' + found.group(1)
+        self._http_client.cookies.set('CONSENT', consent_value, domain='.youtube.com')
+    def _fetch_video_html(self, video_id):
+        """
+        Fetches the watch page. If the consent form is returned, the consent cookie is created and the page is
+        requested a second time.
+        :raises: FailedToCreateConsentCookie if the consent form is still returned by the second request
+        """
+        html = self._fetch_html(video_id)
+        if 'action="https://consent.youtube.com/s"' not in html:
+            return html
+        self._create_consent_cookie(html, video_id)
+        html = self._fetch_html(video_id)
+        if 'action="https://consent.youtube.com/s"' in html:
+            raise FailedToCreateConsentCookie(video_id)
+        return html
+    def _fetch_html(self, video_id):
+        """
+        :return: the unescaped text of the watch page response
+        """
+        watch_url = WATCH_URL.format(video_id=video_id)
+        response = self._http_client.get(watch_url)
+        checked_response = _raise_http_errors(response, video_id)
+        return unescape(checked_response.text)
+class TranscriptList(object):
+    """
+    This object represents a list of transcripts. It can be iterated over to list all transcripts which are available
+    for a given YouTube video. Also it provides functionality to search for a transcript in a given language.
+    """
+    def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
+        """
+        The constructor is only for internal use. Use the static build method instead.
+        :param video_id: the id of the video this TranscriptList is for
+        :type video_id: str
+        :param manually_created_transcripts: dict mapping language codes to the manually created transcripts
+        :type manually_created_transcripts: dict[str, Transcript]
+        :param generated_transcripts: dict mapping language codes to the generated transcripts
+        :type generated_transcripts: dict[str, Transcript]
+        :param translation_languages: list of languages which can be used for translatable languages
+        :type translation_languages: list[dict[str, str]]
+        """
+        self.video_id = video_id
+        self._manually_created_transcripts = manually_created_transcripts
+        self._generated_transcripts = generated_transcripts
+        self._translation_languages = translation_languages
+    @staticmethod
+    def build(http_client, video_id, captions_json):
+        """
+        Factory method for TranscriptList.
+        :param http_client: http client which is used to make the transcript retrieving http calls
+        :type http_client: requests.Session
+        :param video_id: the id of the video this TranscriptList is for
+        :type video_id: str
+        :param captions_json: the JSON parsed from the YouTube pages static HTML
+        :type captions_json: dict
+        :return: the created TranscriptList
+        :rtype TranscriptList:
+        """
+        # a missing 'translationLanguages' key is treated like an empty list instead of failing with a KeyError
+        translation_languages = [
+            {
+                'language': translation_language['languageName']['simpleText'],
+                'language_code': translation_language['languageCode'],
+            } for translation_language in captions_json.get('translationLanguages', [])
+        ]
+        manually_created_transcripts = {}
+        generated_transcripts = {}
+        for caption in captions_json['captionTracks']:
+            # tracks of the kind 'asr' are the ones generated by YouTube
+            is_generated = caption.get('kind', '') == 'asr'
+            transcript_dict = generated_transcripts if is_generated else manually_created_transcripts
+            transcript_dict[caption['languageCode']] = Transcript(
+                http_client,
+                video_id,
+                caption['baseUrl'],
+                caption['name']['simpleText'],
+                caption['languageCode'],
+                is_generated,
+                translation_languages if caption.get('isTranslatable', False) else []
+            )
+        return TranscriptList(
+            video_id,
+            manually_created_transcripts,
+            generated_transcripts,
+            translation_languages,
+        )
+    def __iter__(self):
+        """Iterates over the manually created transcripts first, followed by the generated ones."""
+        return iter(list(self._manually_created_transcripts.values()) + list(self._generated_transcripts.values()))
+    def find_transcript(self, language_codes):
+        """
+        Finds a transcript for a given language code. Manually created transcripts are returned first and only if none
+        are found, generated transcripts are used. If you only want generated transcripts use
+        `find_generated_transcript` instead.
+        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
+        ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
+        it fails to do so.
+        :type language_codes: list[str]
+        :return: the found Transcript
+        :rtype Transcript:
+        :raises: NoTranscriptFound
+        """
+        return self._find_transcript(language_codes, [self._manually_created_transcripts, self._generated_transcripts])
+    def find_generated_transcript(self, language_codes):
+        """
+        Finds a automatically generated transcript for a given language code.
+        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
+        ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
+        it fails to do so.
+        :type language_codes: list[str]
+        :return: the found Transcript
+        :rtype Transcript:
+        :raises: NoTranscriptFound
+        """
+        return self._find_transcript(language_codes, [self._generated_transcripts])
+    def find_manually_created_transcript(self, language_codes):
+        """
+        Finds a manually created transcript for a given language code.
+        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
+        ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
+        it fails to do so.
+        :type language_codes: list[str]
+        :return: the found Transcript
+        :rtype Transcript:
+        :raises: NoTranscriptFound
+        """
+        return self._find_transcript(language_codes, [self._manually_created_transcripts])
+    def _find_transcript(self, language_codes, transcript_dicts):
+        """
+        Returns the first match, where the priority of the language code outranks the order of the dicts.
+        :param language_codes: language codes in a descending priority
+        :param transcript_dicts: the dicts to search in, in a descending priority
+        :raises: NoTranscriptFound if none of the dicts contains any of the language codes
+        """
+        for language_code in language_codes:
+            for transcript_dict in transcript_dicts:
+                if language_code in transcript_dict:
+                    return transcript_dict[language_code]
+        raise NoTranscriptFound(
+            self.video_id,
+            language_codes,
+            self
+        )
+    def __str__(self):
+        return (
+            'For this video ({video_id}) transcripts are available in the following languages:\n\n'
+            '(MANUALLY CREATED)\n'
+            '{available_manually_created_transcript_languages}\n\n'
+            '(GENERATED)\n'
+            '{available_generated_transcripts}\n\n'
+            '(TRANSLATION LANGUAGES)\n'
+            '{available_translation_languages}'
+        ).format(
+            video_id=self.video_id,
+            available_manually_created_transcript_languages=self._get_language_description(
+                str(transcript) for transcript in self._manually_created_transcripts.values()
+            ),
+            available_generated_transcripts=self._get_language_description(
+                str(transcript) for transcript in self._generated_transcripts.values()
+            ),
+            available_translation_languages=self._get_language_description(
+                '{language_code} ("{language}")'.format(
+                    language=translation_language['language'],
+                    language_code=translation_language['language_code'],
+                ) for translation_language in self._translation_languages
+            )
+        )
+    def _get_language_description(self, transcript_strings):
+        """
+        :param transcript_strings: iterable of the strings to list
+        :return: one ' - ' prefixed line per string, or 'None' if there are no strings
+        """
+        description = '\n'.join(' - {transcript}'.format(transcript=transcript) for transcript in transcript_strings)
+        return description if description else 'None'
+class Transcript(object):
+    """
+    A single transcript of a video. It holds the metadata of the transcript and can fetch the transcript data or
+    create a translated variant of itself.
+    """
+    def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
+        """
+        You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
+        TranscriptList.
+        :param http_client: http client which is used to make the transcript retrieving http calls
+        :type http_client: requests.Session
+        :param video_id: the id of the video this Transcript is for
+        :type video_id: str
+        :param url: the url which needs to be called to fetch the transcript
+        :param language: the name of the language this transcript uses
+        :param language_code: the code of the language this transcript uses
+        :param is_generated: stored as is in the `is_generated` attribute
+        :param translation_languages: the languages this transcript can be translated to, as a list of dicts with the
+        keys 'language' and 'language_code'
+        """
+        self._http_client = http_client
+        self._url = url
+        self.video_id = video_id
+        self.language = language
+        self.language_code = language_code
+        self.is_generated = is_generated
+        self.translation_languages = translation_languages
+        # lookup of the language name by the language code, which is used by translate()
+        self._translation_languages_dict = dict(
+            (entry['language_code'], entry['language']) for entry in translation_languages
+        )
+    def fetch(self):
+        """
+        Loads the actual transcript data.
+        :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
+        :rtype [{'text': str, 'start': float, 'duration': float}]:
+        """
+        response = self._http_client.get(self._url)
+        checked_response = _raise_http_errors(response, self.video_id)
+        return _TranscriptParser().parse(checked_response.text)
+    def __str__(self):
+        if self.is_translatable:
+            translation_description = '[TRANSLATABLE]'
+        else:
+            translation_description = ''
+        return '{language_code} ("{language}"){translation_description}'.format(
+            language_code=self.language_code,
+            language=self.language,
+            translation_description=translation_description,
+        )
+    @property
+    def is_translatable(self):
+        """Whether at least one translation language is available for this transcript."""
+        return len(self.translation_languages) > 0
+    def translate(self, language_code):
+        """
+        Creates a Transcript for the translation of this transcript.
+        :param language_code: the code of the language to translate to
+        :return: a new Transcript which is marked as generated and has no translation languages
+        :raises: NotTranslatable if this transcript has no translation languages, TranslationLanguageNotAvailable
+        if the language code is not one of them
+        """
+        if not self.is_translatable:
+            raise NotTranslatable(self.video_id)
+        if language_code not in self._translation_languages_dict:
+            raise TranslationLanguageNotAvailable(self.video_id)
+        translated_url = '{url}&tlang={language_code}'.format(url=self._url, language_code=language_code)
+        translated_language = self._translation_languages_dict[language_code]
+        return Transcript(
+            self._http_client,
+            self.video_id,
+            translated_url,
+            translated_language,
+            language_code,
+            True,
+            [],
+        )
+class _TranscriptParser(object):
+    """Parses the XML of a transcript into a list of dictionaries."""
+    HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE)
+    def parse(self, plain_data):
+        """
+        :param plain_data: the XML document containing the transcript
+        :return: one dictionary with the keys 'text', 'start' and 'duration' for every child of the root element
+        which has text. The text is unescaped and stripped of tags, a missing 'dur' attribute results in a duration
+        of 0.0.
+        """
+        entries = []
+        for element in ElementTree.fromstring(plain_data):
+            # elements without any text are skipped
+            if element.text is None:
+                continue
+            entries.append({
+                'text': self.HTML_TAG_REGEX.sub('', unescape(element.text)),
+                'start': float(element.attrib['start']),
+                'duration': float(element.attrib.get('dur', '0.0')),
+            })
+        return entries

youtube_transcript_api2/formatters.py ADDED Viewed

	@@ -0,0 +1,165 @@

+import json
+import pprint
+class Formatter(object):
+    """Base class of all formatters, which is meant to be subclassed.
+    Subclasses have to implement both .format_transcript() and
+    .format_transcripts(), the implementations of this class only
+    raise a NotImplementedError. A transcript is represented by a
+    List of Dictionary items.
+    """
+    def format_transcript(self, transcript, **kwargs):
+        message = (
+            'A subclass of Formatter must implement '
+            'their own .format_transcript() method.'
+        )
+        raise NotImplementedError(message)
+    def format_transcripts(self, transcripts, **kwargs):
+        message = (
+            'A subclass of Formatter must implement '
+            'their own .format_transcripts() method.'
+        )
+        raise NotImplementedError(message)
+class PrettyPrintFormatter(Formatter):
+    def format_transcript(self, transcript, **kwargs):
+        """Pretty prints a transcript.
+        :param transcript:
+        :return: A pretty printed string representation of the transcript.'
+        :rtype str
+        """
+        return pprint.pformat(transcript, **kwargs)
+    def format_transcripts(self, transcripts, **kwargs):
+        """Pretty prints a list of transcripts.
+        :param transcripts:
+        :return: A pretty printed string representation of the transcripts.'
+        :rtype str
+        """
+        return self.format_transcript(transcripts, **kwargs)
+class JSONFormatter(Formatter):
+    def format_transcript(self, transcript, **kwargs):
+        """Converts a transcript into a JSON string.
+        :param transcript:
+        :return: A JSON string representation of the transcript.'
+        :rtype str
+        """
+        return json.dumps(transcript, **kwargs)
+    def format_transcripts(self, transcripts, **kwargs):
+        """Converts a list of transcripts into a JSON string.
+        :param transcripts:
+        :return: A JSON string representation of the transcript.'
+        :rtype str
+        """
+        return self.format_transcript(transcripts, **kwargs)
+class TextFormatter(Formatter):
+    def format_transcript(self, transcript, **kwargs):
+        """Converts a transcript into plain text with no timestamps.
+        :param transcript:
+        :return: all transcript text lines separated by newline breaks.'
+        :rtype str
+        """
+        return '\n'.join(line['text'] for line in transcript)
+    def format_transcripts(self, transcripts, **kwargs):
+        """Converts a list of transcripts into plain text with no timestamps.
+        :param transcripts:
+        :return: all transcript text lines separated by newline breaks.'
+        :rtype str
+        """
+        return '\n\n\n'.join([self.format_transcript(transcript, **kwargs) for transcript in transcripts])
+class WebVTTFormatter(Formatter):
+    def _seconds_to_timestamp(self, time):
+        """Helper that converts `time` into a transcript cue timestamp.
+        :reference: https://www.w3.org/TR/webvtt1/#webvtt-timestamp
+        :param time: a float representing time in seconds.
+        :type time: float
+        :return: a string formatted as a cue timestamp, 'HH:MM:SS.MS'
+        :rtype str
+        :example:
+        >>> self._seconds_to_timestamp(6.93)
+        '00:00:06.930'
+        """
+        time = float(time)
+        hours, remainder = divmod(time, 3600)
+        mins, secs = divmod(remainder, 60)
+        ms = int(round((time - int(time))*1000, 2))
+        return "{:02.0f}:{:02.0f}:{:02.0f}.{:03d}".format(hours, mins, secs, ms)
+    def format_transcript(self, transcript, **kwargs):
+        """A basic implementation of WEBVTT formatting.
+        :param transcript:
+        :reference: https://www.w3.org/TR/webvtt1/#introduction-caption
+        """
+        lines = []
+        for i, line in enumerate(transcript):
+            if i < len(transcript) - 1:
+                # Looks ahead, use next start time since duration value
+                # would create an overlap between start times.
+                time_text = "{} --> {}".format(
+                    self._seconds_to_timestamp(line['start']),
+                    self._seconds_to_timestamp(transcript[i + 1]['start'])
+                )
+            else:
+                # Reached the end, cannot look ahead, use duration now.
+                duration = line['start'] + line['duration']
+                time_text = "{} --> {}".format(
+                    self._seconds_to_timestamp(line['start']),
+                    self._seconds_to_timestamp(duration)
+                )
+            lines.append("{}\n{}".format(time_text, line['text']))
+        return "WEBVTT\n\n" + "\n\n".join(lines) + "\n"
+    def format_transcripts(self, transcripts, **kwargs):
+        """A basic implementation of WEBVTT formatting for a list of transcripts.
+        :param transcripts:
+        :reference: https://www.w3.org/TR/webvtt1/#introduction-caption
+        """
+        return '\n\n\n'.join([self.format_transcript(transcript, **kwargs) for transcript in transcripts])
+class FormatterLoader(object):
+    TYPES = {
+        'json': JSONFormatter,
+        'pretty': PrettyPrintFormatter,
+        'text': TextFormatter,
+        'webvtt': WebVTTFormatter,
+    }
+    class UnknownFormatterType(Exception):
+        def __init__(self, formatter_type):
+            super(FormatterLoader.UnknownFormatterType, self).__init__(
+                'The format \'{formatter_type}\' is not supported. '
+                'Choose one of the following formats: {supported_formatter_types}'.format(
+                    formatter_type=formatter_type,
+                    supported_formatter_types=', '.join(FormatterLoader.TYPES.keys()),
+                )
+            )
+    def load(self, formatter_type='pretty'):
+        """
+        Loads the Formatter for the given formatter type.
+        :param formatter_type:
+        :return: Formatter object
+        """
+        if formatter_type not in FormatterLoader.TYPES.keys():
+            raise FormatterLoader.UnknownFormatterType(formatter_type)
+        return FormatterLoader.TYPES[formatter_type]()