# File size: 10,927 Bytes
#
# dc4dce6

import json
from functools import lru_cache
from youtube_transcript_api import (
    YouTubeTranscriptApi,
    TooManyRequests,
    YouTubeRequestFailed,
    CouldNotRetrieveTranscript
)
import json
import re
import requests
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TextClassificationPipeline,
)
from typing import Any, Dict, List
import os
import numpy as np

# Category labels. The leading None is presumably the "no category" class --
# TODO confirm against the model's label mapping.
CATEGORIES = [
    None,
    'SPONSOR',
    'SELFPROMO',
    'INTERACTION',
]

# Placeholder YouTube puts in transcripts where profanity was spoken ...
PROFANITY_RAW = '[ __ ]'
# ... and the whitespace-free replacement that is safer to tokenize.
PROFANITY_CONVERTED = '*****'

# Decimal places kept when timestamps (in seconds) are rounded.
NUM_DECIMALS = 3

# English locales in order of preference, finishing with the region-less 'en'.
# https://www.fincher.org/Utilities/CountryLanguageList.shtml
# https://lingohub.com/developers/supported-locales/language-designators-with-regions
LANGUAGE_PREFERENCE_LIST = [
    'en-' + region
    for region in (
        'GB', 'US', 'CA', 'AU', 'NZ', 'ZA', 'IE',
        'IN', 'JM', 'BZ', 'TT', 'PH', 'ZW',
    )
] + ['en']


# Characters stripped from caption text: zero-width space / non-joiner /
# joiner and the byte-order mark.
_ZERO_WIDTH_CHARS = str.maketrans('', '', '\u200b\u200c\u200d\ufeff')

# Whitespace that directly follows one of . ! ? , ; -
# The hyphen is placed last so it is a literal character; written as `,-;`
# it formed a range that also matched digits, '/', and ':'.
_CHUNK_BOUNDARY_RE = re.compile(r'(?<=[.!?,;-])\s+')


def parse_transcript_json(json_data, granularity):
    """Convert a YouTube ``json3`` transcript into a flat list of timed pieces.

    Args:
        json_data: Decoded transcript. Must have ``wireMagic == 'pb3'`` and an
            ``events`` list. Events are read for ``segs``, ``tStartMs`` and
            (optionally) ``dDurationMs``; each seg for ``utf8`` and
            (optionally) ``tOffsetMs``.
        granularity: ``'word'`` to emit one entry per whitespace-separated
            word, or ``'chunk'`` to emit one entry per punctuation-delimited
            chunk.

    Returns:
        A list of ``{'text', 'start', 'end'}`` dicts. Times are in seconds,
        rounded to ``NUM_DECIMALS`` places. Only ``start`` of each caption
        segment comes from the data; every ``end`` (and the start of each
        piece inside a segment) is an estimate proportional to character
        counts.

    Raises:
        AssertionError: if ``wireMagic`` or ``granularity`` is not recognised.
        KeyError: if a required key is missing from ``json_data``.
    """
    assert json_data['wireMagic'] == 'pb3'

    assert granularity in ('word', 'chunk')

    # TODO remove bracketed words?
    # (kiss smacks)
    # (upbeat music)
    # [text goes here]

    # Some manual transcripts aren't that well formatted... but do have punctuation
    # https://www.youtube.com/watch?v=LR9FtWVjk2c

    # ---- Pass 1: clean each caption segment and estimate where it ends ----
    parsed_transcript = []

    events = json_data['events']
    last_event_index = len(events) - 1

    for event_index, event in enumerate(events):
        segments = event.get('segs')
        if not segments:
            continue

        # This value is known (when phrase appears on screen)
        start_ms = event['tStartMs']
        total_characters = 0

        new_segments = []
        for seg in segments:
            # Replace \n, \t, etc. with space
            text = ' '.join(seg['utf8'].split())

            # Remove zero-width characters, then strip the whitespace that
            # removing them may have exposed at either end
            text = text.translate(_ZERO_WIDTH_CHARS).strip()

            # Needed for auto-generated transcripts
            text = text.replace(PROFANITY_RAW, PROFANITY_CONVERTED)

            if not text:
                continue

            offset_ms = seg.get('tOffsetMs', 0)

            new_segments.append({
                'text': text,
                'start': round((start_ms + offset_ms)/1000, NUM_DECIMALS)
            })

            total_characters += len(text)

        if not new_segments:
            continue

        # An event lasts until its stated duration elapses or the next event
        # begins, whichever comes first.
        if event_index < last_event_index:
            next_start_ms = events[event_index + 1]['tStartMs']
            total_event_duration_ms = min(
                event.get('dDurationMs', float('inf')), next_start_ms - start_ms)
        else:
            total_event_duration_ms = event.get('dDurationMs', 0)

        # Ensure duration is non-negative
        total_event_duration_ms = max(total_event_duration_ms, 0)

        # total_characters > 0 here: every kept segment has non-empty text
        avg_seconds_per_character = (
            total_event_duration_ms/total_characters)/1000

        num_char_count = 0
        for seg_index, seg in enumerate(new_segments):
            num_char_count += len(seg['text'])

            # Estimate segment end from the running character count
            seg_end = seg['start'] + \
                (num_char_count * avg_seconds_per_character)

            if seg_index < len(new_segments) - 1:
                # Do not allow longer than next
                seg_end = min(seg_end, new_segments[seg_index+1]['start'])

            seg['end'] = round(seg_end, NUM_DECIMALS)
            parsed_transcript.append(seg)

    # ---- Pass 2: split each segment into words / chunks and spread its ----
    # ---- duration over the pieces in proportion to their length         ----
    final_parsed_transcript = []
    last_segment_index = len(parsed_transcript) - 1

    for i, segment in enumerate(parsed_transcript):
        text = segment['text']

        word_level = granularity == 'word'
        if word_level:
            split_text = text.split()
        elif granularity == 'chunk':
            # Split on space after punctuation
            split_text = _CHUNK_BOUNDARY_RE.split(text)
            if len(split_text) == 1:
                split_on_whitespace = text.split()

                if len(split_on_whitespace) >= 8:  # Too many words
                    # Rather split on whitespace instead of punctuation
                    split_text = split_on_whitespace
                else:
                    # Short unpunctuated segment: keep it whole, but apply
                    # the per-piece duration cap below
                    word_level = True
        else:
            # Only reachable when asserts are disabled (python -O)
            raise ValueError('Unknown granularity')

        segment_start = segment['start']
        segment_end = segment['end']
        if i < last_segment_index:
            segment_end = min(segment_end, parsed_transcript[i+1]['start'])

        segment_duration = segment_end - segment_start

        num_chars_in_text = sum(map(len, split_text))

        num_char_count = 0
        current_offset = 0
        for s in split_text:
            num_char_count += len(s)

            next_offset = (num_char_count/num_chars_in_text) * segment_duration

            piece_start = round(segment_start + current_offset, NUM_DECIMALS)
            piece_end = round(segment_start + next_offset, NUM_DECIMALS)

            # Word-level pieces are capped at 1.5 seconds each
            final_parsed_transcript.append({
                'text': s,
                'start': piece_start,
                'end': min(piece_end, piece_start + 1.5) if word_level else piece_end
            })
            current_offset = next_offset

    return final_parsed_transcript


def list_transcripts(video_id):
    """Return the transcript listing for ``video_id``.

    Delegates to ``YouTubeTranscriptApi.list_transcripts``. A response that
    fails JSON decoding yields ``None``; any other exception propagates.
    """
    try:
        available = YouTubeTranscriptApi.list_transcripts(video_id)
    except json.decoder.JSONDecodeError:
        available = None
    return available


# Caption markers that get_words() drops when filter_words_to_remove is set.
# The commas are essential: adjacent string literals without them are
# concatenated by Python into the single entry '[Music][Applause][Laughter]',
# which never matches a transcript word.
WORDS_TO_REMOVE = [
    '[Music]',
    '[Applause]',
    '[Laughter]',
]


@lru_cache(maxsize=16)
def get_words(video_id, transcript_type='auto', fallback='manual', filter_words_to_remove=True, granularity='word'):
    """Fetch and parse the English transcript of a video (results are cached).

    Args:
        video_id: YouTube video id.
        transcript_type: ``'manual'`` selects a manually created transcript;
            any other value selects the auto-generated one.
        fallback: Transcript type to try once if the first choice produced
            nothing, or ``None`` to disable the fallback.
        filter_words_to_remove: When true, entries whose text is listed in
            ``WORDS_TO_REMOVE`` are dropped from the result.
        granularity: Forwarded to ``parse_transcript_json``
            (``'word'`` or ``'chunk'``).

    Returns:
        The list produced by ``parse_transcript_json`` (optionally filtered),
        or the falsy raw value (``None`` when nothing could be fetched) if no
        transcript is available. Because of ``lru_cache`` the same list object
        is handed to every caller using the same arguments, so callers should
        not mutate it.

    Raises:
        TooManyRequests, YouTubeRequestFailed: re-raised unchanged; these are
            not treated as "no transcript".
    """
    # Every re-entry must carry *all* options by keyword. Passing them
    # positionally put `granularity` into the `filter_words_to_remove` slot.
    options = dict(
        filter_words_to_remove=filter_words_to_remove,
        granularity=granularity,
    )

    raw_transcript_json = None
    try:
        transcript_list = list_transcripts(video_id)

        if transcript_list is not None:
            if transcript_type == 'manual':
                ts = transcript_list.find_manually_created_transcript(
                    LANGUAGE_PREFERENCE_LIST)
            else:
                ts = transcript_list.find_generated_transcript(
                    LANGUAGE_PREFERENCE_LIST)
            raw_transcript = ts._http_client.get(
                f'{ts._url}&fmt=json3').content
            if raw_transcript:
                raw_transcript_json = json.loads(raw_transcript)
    except (TooManyRequests, YouTubeRequestFailed):
        raise  # Cannot recover from these errors and do not mark as empty transcript

    except (requests.exceptions.RequestException, json.decoder.JSONDecodeError):
        # Can recover: try the same request again.
        # NOTE(review): the retry is unbounded; a persistent failure ends in
        # RecursionError. Consider a retry limit with backoff.
        return get_words(
            video_id, transcript_type=transcript_type, fallback=fallback, **options)

    except CouldNotRetrieveTranscript:  # Retrying won't solve
        pass  # Mark as empty transcript

    if not raw_transcript_json and fallback is not None:
        return get_words(
            video_id, transcript_type=fallback, fallback=None, **options)

    if not raw_transcript_json:
        return raw_transcript_json  # Nothing usable (None or an empty value)

    processed_transcript = parse_transcript_json(
        raw_transcript_json, granularity)
    if filter_words_to_remove:
        processed_transcript = [
            word for word in processed_transcript
            if word['text'] not in WORDS_TO_REMOVE
        ]

    return processed_transcript


def word_start(word):
    """Return the ``'start'`` value of a transcript entry."""
    start = word['start']
    return start


def word_end(word):
    """Return the ``'end'`` value of a transcript entry, or its start if absent."""
    # Looked up unconditionally, so an entry without 'start' raises KeyError
    # even when it has an 'end'.
    fallback = word['start']
    return word.get('end', fallback)


def extract_segment(words, start, end, map_function=None):
    """Return the run of ``words`` located between ``start`` and ``end``.

    The bounds are found with ``binary_search_below`` / ``binary_search_above``,
    which compare against the midpoint of each word's start and end time.
    If ``map_function`` is callable it is applied to every selected word;
    otherwise the words are returned unchanged.
    """
    total = len(words)

    first = max(binary_search_below(words, 0, total, start), 0)
    stop = min(binary_search_above(words, -1, total - 1, end) + 1, total)

    selected = (words[index] for index in range(first, stop))

    if map_function is not None and callable(map_function):
        return [map_function(word) for word in selected]
    return list(selected)


def avg(*items):
    """Return the arithmetic mean of the positional arguments.

    Raises ZeroDivisionError when called without arguments.
    """
    total = sum(items)
    count = len(items)
    return total / count


def binary_search_below(transcript, start_index, end_index, time):
    """Binary-search ``transcript[start_index:end_index]`` for ``time``.

    Each entry is represented by the midpoint of its start and end time.
    The window keeps its lower half whenever ``time`` is at or before the
    probed midpoint, and the returned value is the upper bound once the
    window is empty (``end_index`` itself if it was empty to begin with).
    """
    low, high = start_index, end_index

    while low < high:
        probe = (low + high) // 2
        entry = transcript[probe]
        probe_time = avg(word_start(entry), word_end(entry))

        if time <= probe_time:
            high = probe
        else:
            low = probe + 1

    return high


def binary_search_above(transcript, start_index, end_index, time):
    """Binary-search ``transcript`` between ``start_index`` and ``end_index``.

    Each entry is represented by the midpoint of its start and end time.
    The probe is rounded up, the window keeps its upper half whenever
    ``time`` is at or after the probed midpoint, and the returned value is
    the upper bound once the bounds meet (``end_index`` itself if
    ``start_index >= end_index`` on entry).
    """
    low, high = start_index, end_index

    while low < high:
        probe = (low + high + 1) // 2
        entry = transcript[probe]
        probe_time = avg(word_start(entry), word_end(entry))

        if time >= probe_time:
            low = probe
        else:
            high = probe - 1

    return high


class PreTrainedPipeline:
    """Callable wrapper that loads a model and runs the classification pipeline."""

    def __init__(self, path: str):
        """Load model and tokenizer from ``path`` and build the pipeline."""
        self.model2 = AutoModelForSequenceClassification.from_pretrained(path)
        self.tokenizer2 = AutoTokenizer.from_pretrained(path)
        self.pipeline2 = SponsorBlockClassificationPipeline(
            model=self.model2, tokenizer=self.tokenizer2)

    def __call__(self, inputs: str) -> List[Dict[str, Any]]:
        """Classify ``inputs``.

        A string without spaces that contains at least two commas is treated
        as a compressed request ``video_id,start,end[,start,end...]`` and is
        expanded into one ``{'video_id', 'start', 'end'}`` dict per pair.
        Any other string is passed to the pipeline unchanged.
        """
        looks_compressed = ' ' not in inputs and inputs.count(',') >= 2
        if not looks_compressed:
            return self.pipeline2(inputs)

        # Automated call (compressed string)
        video_id, packed_times = inputs.split(',', 1)

        # Group the flat time list into (start, end) rows; reshape raises
        # ValueError when the number of values is odd.
        time_pairs = np.array(packed_times.split(',')).reshape(-1, 2)

        segments = [
            {
                'video_id': video_id,
                'start': float(begin),
                'end': float(finish),
            }
            for begin, finish in time_pairs
        ]
        return self.pipeline2(segments)


class SponsorBlockClassificationPipeline(TextClassificationPipeline):
    """Text-classification pipeline that also accepts transcript segment requests."""

    def __init__(self, model, tokenizer):
        """Build the base pipeline with ``return_all_scores=True``."""
        super().__init__(model=model, tokenizer=tokenizer, return_all_scores=True)

    def preprocess(self, data, **tokenizer_kwargs):
        """Tokenize ``data``.

        A string is classified as given. Anything else is read as a mapping
        with ``'video_id'``, ``'start'`` and ``'end'``; the transcript words
        in that time range are joined with spaces to form the text.
        """
        if not isinstance(data, str):
            # Otherwise, get data from transcript
            # NOTE(review): get_words() can return None when no transcript is
            # available, which extract_segment() does not handle -- confirm
            # the desired behaviour for that case.
            transcript = get_words(data['video_id'])
            selected = extract_segment(transcript, data['start'], data['end'])
            text = ' '.join(entry['text'] for entry in selected)
        else:
            # If string, assume this is what user wants to classify
            text = data

        return self.tokenizer(
            text, return_tensors=self.framework, **tokenizer_kwargs)