Joshua Lochner committed on
Commit 90d1f68
1 Parent(s): 4678c9b

Add functionality to predict self-promo and interaction reminders

Files changed (7)
  1. src/evaluate.py +2 -4
  2. src/predict.py +31 -24
  3. src/preprocess.py +111 -55
  4. src/segment.py +3 -3
  5. src/shared.py +5 -6
  6. src/train.py +15 -15
  7. src/utils.py +6 -0
src/evaluate.py CHANGED
@@ -105,13 +105,13 @@ def calculate_metrics(labelled_words, predictions):
 
         if predicted_sponsor:
             # total_positive_time += duration
-            if word['sponsor']:  # Is actual sponsor
+            if word['category'] is not None:  # Is actual sponsor
                 metrics['true_positive'] += duration
             else:
                 metrics['false_positive'] += duration
         else:
             # total_negative_time += duration
-            if word['sponsor']:  # Is actual sponsor
+            if word['category'] is not None:  # Is actual sponsor
                 metrics['false_negative'] += duration
             else:
                 metrics['true_negative'] += duration
@@ -176,8 +176,6 @@ def main():
     with open(final_path) as fp:
         final_data = json.load(fp)
 
-    classifier, vectorizer = get_classifier_vectorizer(classifier_args)
-
     total_accuracy = 0
     total_precision = 0
     total_recall = 0
src/predict.py CHANGED
@@ -1,4 +1,4 @@
-from transformers.trainer_utils import get_last_checkpoint
+from utils import re_findall
 from shared import OutputArguments
 from typing import Optional
 from segment import (
@@ -11,21 +11,22 @@ from segment import (
     SegmentationArguments
 )
 import preprocess
-import re
 from errors import TranscriptError
 from model import get_classifier_vectorizer
 from transformers import (
     AutoModelForSeq2SeqLM,
-    AutoTokenizer
+    AutoTokenizer,
+    HfArgumentParser
 )
+from transformers.trainer_utils import get_last_checkpoint
 from dataclasses import dataclass, field
-from transformers import HfArgumentParser
 from shared import device
 import logging
 
 
 def seconds_to_time(seconds):
-    fractional = str(round(seconds % 1, 3))[1:]
+    fractional = round(seconds % 1, 3)
+    fractional = '' if fractional == 0 else str(fractional)[1:]
     h, remainder = divmod(abs(int(seconds)), 3600)
     m, s = divmod(remainder, 60)
     return f"{'-' if seconds < 0 else ''}{h:02}:{m:02}:{s:02}{fractional}"
@@ -64,7 +65,7 @@ class PredictArguments(TrainingOutputArguments):
     )
 
 
-SPONSOR_MATCH_RE = fr'(?<={CustomTokens.START_SPONSOR.value})\s*(.*?)\s*(?={CustomTokens.END_SPONSOR.value}|$)'
+SPONSOR_MATCH_RE = fr'(?<={CustomTokens.START_SEGMENT.value})\s*_(?P<category>\S+)\s*(?P<text>.*?)\s*(?={CustomTokens.END_SEGMENT.value}|$)'
 
 MATCH_WINDOW = 25       # Increase for accuracy, but takes longer: O(n^3)
 MERGE_TIME_WITHIN = 8   # Merge predictions if they are within x seconds
@@ -97,11 +98,13 @@ class ClassifierArguments:
         default=0.5, metadata={'help': 'Remove all predictions whose classification probability is below this threshold.'})
 
 
-def filter_predictions(predictions, classifier, vectorizer, classifier_args):
+def filter_predictions(predictions, classifier_args):  # classifier, vectorizer,
     """Use classifier to filter predictions"""
     if not predictions:
         return predictions
 
+    classifier, vectorizer = get_classifier_vectorizer(classifier_args)
+
     transformed_segments = vectorizer.transform([
         preprocess.clean_text(' '.join([x['text'] for x in pred['words']]))
         for pred in predictions
@@ -142,9 +145,7 @@ def predict(video_id, model, tokenizer, segmentation_args, words=None, classifie
             words, prediction['start'], prediction['end'])
 
     if classifier_args is not None:
-        classifier, vectorizer = get_classifier_vectorizer(classifier_args)
-        predictions = filter_predictions(
-            predictions, classifier, vectorizer, classifier_args)
+        predictions = filter_predictions(predictions, classifier_args)
 
     return predictions
 
@@ -166,13 +167,10 @@ def greedy_match(list, sublist):
     return best_i, best_j, best_k
 
 
-DEFAULT_TOKEN_PREFIX = 'summarize: '
-
-
 def predict_sponsor_text(text, model, tokenizer):
     """Given a body of text, predict the words which are part of the sponsor"""
     input_ids = tokenizer(
-        f'{DEFAULT_TOKEN_PREFIX}{text}', return_tensors='pt', truncation=True).input_ids.to(device())
+        f'{CustomTokens.EXTRACT_SEGMENTS_PREFIX.value} {text}', return_tensors='pt', truncation=True).input_ids.to(device())
 
     # Can't be longer than input length + SAFETY_TOKENS or model input dim
     max_out_len = min(len(input_ids[0]) + SAFETY_TOKENS, model.model_dim)
@@ -183,10 +181,11 @@ def predict_sponsor_text(text, model, tokenizer):
 
 def predict_sponsor_matches(text, model, tokenizer):
     sponsorship_text = predict_sponsor_text(text, model, tokenizer)
-    if CustomTokens.NO_SPONSOR.value in sponsorship_text:
+
+    if CustomTokens.NO_SEGMENT.value in sponsorship_text:
         return []
 
-    return re.findall(SPONSOR_MATCH_RE, sponsorship_text)
+    return re_findall(SPONSOR_MATCH_RE, sponsorship_text)
 
 
 def segments_to_prediction_times(segments, model, tokenizer):
@@ -202,7 +201,7 @@ def segments_to_prediction_times(segments, model, tokenizer):
         matches = predict_sponsor_matches(batch_text, model, tokenizer)
 
         for match in matches:
-            matched_text = match.split()
+            matched_text = match['text'].split()
            # TODO skip if too short
 
            i1, j1, k1 = greedy_match(
@@ -217,7 +216,8 @@ def segments_to_prediction_times(segments, model, tokenizer):
 
             predicted_time_ranges.append({
                 'start': word_start(extracted_words[0]),
-                'end': word_end(extracted_words[-1])
+                'end': word_end(extracted_words[-1]),
+                'category': match['category']
             })
 
     # Necessary to sort matches by start time
@@ -225,23 +225,29 @@ def segments_to_prediction_times(segments, model, tokenizer):
 
     # Merge overlapping predictions and sponsorships that are close together
    # Caused by model having max input size
-    last_end_time = -1
+
+    prev_prediction = None
+
     final_predicted_time_ranges = []
     for range in predicted_time_ranges:
         start_time = range['start']
         end_time = range['end']
 
-        if (start_time <= last_end_time <= end_time) or (last_end_time != -1 and start_time - last_end_time <= MERGE_TIME_WITHIN):
-            # Ending time of last segment is in this segment, so we extend last prediction range
+        if prev_prediction is not None and range['category'] == prev_prediction['category'] and (
+            start_time <= prev_prediction['end'] <= end_time or start_time -
+            prev_prediction['end'] <= MERGE_TIME_WITHIN
+        ):
+            # Ending time of last segment is in this segment or c, so we extend last prediction range
             final_predicted_time_ranges[-1]['end'] = end_time
 
         else:  # No overlap, is a new prediction
             final_predicted_time_ranges.append({
                 'start': start_time,
                 'end': end_time,
+                'category': range['category']
             })
 
-        last_end_time = end_time
+        prev_prediction = range
 
     return final_predicted_time_ranges
 
@@ -268,7 +274,7 @@ def main():
 
     predict_args.video_id = predict_args.video_id.strip()
     predictions = predict(predict_args.video_id, model, tokenizer,
-                          segmentation_args, classifier_args=classifier_args)
+                          segmentation_args)  # TODO add back , classifier_args=classifier_args
 
     video_url = f'https://www.youtube.com/watch?v={predict_args.video_id}'
     if not predictions:
@@ -282,7 +288,8 @@ def main():
              ' '.join([w['text'] for w in prediction['words']]), '"', sep='')
        print('Time:', seconds_to_time(
            prediction['start']), '-->', seconds_to_time(prediction['end']))
-       print('Probability:', prediction['probability'])
+       print('Probability:', prediction.get('probability'))
+       print('Category:', prediction.get('category'))
        print()
 
 
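For reference, a minimal sketch of how the reworked matching behaves with these changes: SPONSOR_MATCH_RE now captures the category and the text as named groups, and re_findall (added in src/utils.py) returns one dict per match. The output string below is hypothetical; it only assumes predict.py's names are in scope and the token values defined in src/shared.py:

    # Hypothetical model output using the new category-suffixed segment tokens
    output = 'START_SEGMENT_TOKEN_SPONSOR thanks to our sponsor for supporting the video END_SEGMENT_TOKEN_SPONSOR'
    re_findall(SPONSOR_MATCH_RE, output)
    # -> [{'category': 'SPONSOR', 'text': 'thanks to our sponsor for supporting the video'}]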
 
src/preprocess.py CHANGED
@@ -1,5 +1,6 @@
+from datetime import datetime
 import itertools
-from typing import Optional
+from typing import Optional, List
 from datasets import load_dataset
 from model import ModelArguments
 import segment
@@ -24,8 +25,10 @@ def find(s, ch):
     return [i for i, ltr in enumerate(s) if ltr == ch]
 
 
-def wordify(transcript):
+def wordify(transcript, maximum_wps=1):
     """Try to replicate format for automatically generated transcripts"""
+
+    # Do not allow segments to be on screen for too long using maximum_wps
     words = []
 
     for line_index, line in enumerate(transcript):
@@ -34,9 +37,14 @@ def wordify(transcript):
            continue
 
        start = line['start']
-       next_start = transcript[line_index +
-                               1]['start'] if line_index < len(transcript) - 1 else float('inf')
-       end = min(start + line['duration'], next_start)
+       next_start = transcript[line_index + 1]['start'] \
+           if line_index < len(transcript) - 1 else float('inf')
+
+       # Use maximum wps to calculate latest end (to avoid segments which stay on screen too long)
+       longest_duration = maximum_wps * text.count(' ')
+       latest_end = start + longest_duration
+       end = min(start + line['duration'], next_start, latest_end)
+
        duration = end - start
 
        indices = find(text, ' ') + [len(text)]
@@ -52,9 +60,9 @@ def wordify(transcript):
            w_start = start + percentage * duration
 
            words.append({
-               'start': round(w_start, 5),
-               'duration': round(w_duration, 5),
-               'end': round(w_start + w_duration, 5),
+               'start': round(w_start, 3),
+               'duration': round(w_duration, 3),
+               'end': round(w_start + w_duration, 3),
                'text': word,
            })
 
@@ -69,6 +77,10 @@ def get_manual_words(transcript_list):
     return wordify(transcript)
 
 
+PROFANITY_RAW = '[ __ ]'  # How YouTube transcribes profanity
+PROFANITY_CONVERTED = '*****'  # Safer version for tokenizing
+
+
 def get_auto_words(transcript_list):
     words = []
     transcript = transcript_list.find_generated_transcript(['en'])
@@ -82,7 +94,7 @@ def get_auto_words(transcript_list):
        offset_ms = word.get('tOffsetMs', 0)
 
        texts = word['utf8'].replace(
-           CustomTokens.PROFANITY_RAW.value, CustomTokens.PROFANITY_CONVERTED.value
+           PROFANITY_RAW, PROFANITY_CONVERTED
        ).strip().split()
 
        for text in texts:
@@ -94,7 +106,7 @@ def get_auto_words(transcript_list):
     return words
 
 
-def get_words(video_id, process=True, fallback=False, transcript_type='auto'):
+def get_words(video_id, process=True, fallback=True, transcript_type='auto'):
     """Get parsed video transcript with caching system
     returns None if not processed yet and process is False
     """
@@ -148,21 +160,31 @@ def extract_sponsors(words, min_sponsor_segment_length=5):
 
     paragraphs = []
     current = []
+    prev_category = None
     for word in words:
-        if not word.get('sponsor') and not current:
-            continue
+        if word['category'] is None:  # and not current:
+            continue  # Skip unimportant
 
-        if word['sponsor']:
+        if word['category'] == prev_category:
            current.append(word['text'])
        else:
-           paragraphs.append(current)
+           paragraphs.append({
+               'words': current,
+               'category': prev_category,
+           })
            current = []
-    if current:
-        paragraphs.append(current)
+
+        prev_category = word['category']
+
+    if current and prev_category is not None:
+        paragraphs.append({
+            'words': current,
+            'category': prev_category,
+        })
 
     # Remove all too short:
     paragraphs = list(filter(lambda x: len(
-        x) >= min_sponsor_segment_length, paragraphs))
+        x['words']) >= min_sponsor_segment_length, paragraphs))
 
     return paragraphs
 
@@ -203,10 +225,8 @@ def clean_text(text):
     text = re.sub(NUM_REGEX, CustomTokens.NUMBER.value, text)
 
     # Replace profanity with special token
-    text = text.replace(CustomTokens.PROFANITY_RAW.value,
-                        CustomTokens.PROFANITY.value)
-    text = text.replace(CustomTokens.PROFANITY_CONVERTED.value,
-                        CustomTokens.PROFANITY.value)
+    text = text.replace(PROFANITY_RAW, CustomTokens.PROFANITY.value)
+    text = text.replace(PROFANITY_CONVERTED, CustomTokens.PROFANITY.value)
 
     return text.strip()
 
@@ -254,11 +274,25 @@ class PreprocessArguments:
     do_create: bool = field(
         default=False, metadata={'help': 'Merge sponsor segments into single file'}
     )
+
     min_votes: int = field(
         default=0, metadata={'help': 'Minimum number of votes'})
     # Downvotes will make this negative.
     # 1 = At least one positive vote
 
+    min_date: str = field(
+        default='20/08/2021', metadata={'help': 'Only use submissions from after this date, defaults to the release of v3.0 (https://github.com/ajayyy/SponsorBlock/releases/tag/3.0)'})
+
+    categories: str = field(
+        default_factory=lambda: ['sponsor', 'selfpromo', 'interaction'],
+        metadata={
+            'nargs': '+',
+            'choices': ['intro', 'sponsor', 'interaction',
+                        'outro', 'selfpromo', 'preview',
+                        'poi_highlight', 'filler', 'music_offtopic']  # moreCategories
+        }
+    )
+
     do_transcribe: bool = field(
         default=False, metadata={'help': 'Get transcripts for videos'}
     )
@@ -266,7 +300,7 @@ class PreprocessArguments:
         default=4, metadata={'help': 'Number of transcripts to download in parallel'})
 
     overwrite: bool = field(
-        default=False, metadata={'help': 'Overwrite training, testing and validation data, if present.'}
+        default=True, metadata={'help': 'Overwrite training, testing and validation data, if present.'}
     )
 
     do_generate: bool = field(
@@ -447,14 +481,26 @@ def main():
         preprocess_args.raw_data_dir, preprocess_args.raw_data_file)
 
     def get_rows():
+
+        latest_time = datetime.strptime(preprocess_args.min_date, '%d/%m/%Y')
+
        with open(raw_dataset_path, newline='') as csvfile:
            reader = csv.DictReader(csvfile)
+
            for line in reader:
+               submitted_time = datetime.fromtimestamp(
+                   float(line['timeSubmitted'])/1e3)
+
+               if submitted_time < latest_time:
+                   continue
+
                if line['service'] != 'YouTube':
                    continue
+               if len(line['videoID']) != 11:
+                   continue  # Invalid youtube video ID
 
                # TODO add support for other categories and action types?
-               if line['category'] != 'sponsor':
+               if line['category'] not in preprocess_args.categories:
                    continue
                if line['actionType'] != 'skip':
                    continue
@@ -463,9 +509,6 @@ def main():
                if line['hidden'] == '1' or line['shadowHidden'] == '1':
                    continue
 
-               if len(line['videoID']) != 11:
-                   continue  # Invalid youtube video ID
-
                # Skip those that aren't highly voted
                line['votes'] = int(line['votes'])
                # incorrect_votes = int(line['incorrectVotes'])
@@ -494,6 +537,8 @@ def main():
        for row in data_rows:
            video_ids.add(row['videoID'])
 
+       # TODO first set - os.listdir and do rest
+
        print('Start transcribing')
        with tqdm(total=len(video_ids)) as progress:
            def on_job_complete(job):
@@ -517,21 +562,18 @@ def main():
     final_path = os.path.join(
         processed_args.processed_dir, processed_args.processed_file)
 
-    if os.path.exists(final_path) and not preprocess_args.do_create:
-        logging.info(f'{final_path} exists, opening file')
-        with open(final_path) as fp:
-            final_data = json.load(fp)
-    else:
+    if preprocess_args.do_create:
        print('Create final data')
 
        final_data = {}
 
        if data_rows is None:
            data_rows = get_rows()
+           # data_rows = itertools.islice(data_rows, 1000)  # TODO temp
 
        # TODO add progress bar
        # TODO parallelise?
-       for line in data_rows:
+       for index, line in enumerate(data_rows):
            video_id = line['videoID']
 
            if video_id not in final_data:
@@ -540,7 +582,10 @@ def main():
            segment_start = float(line['startTime'])
            segment_end = float(line['endTime'])
 
-           video_words = get_words(video_id, process=True)
+           video_words = get_words(video_id, process=False)
+           if not video_words:
+               continue
+
            segment_words = segment.extract_segment(
                video_words, segment_start, segment_end)
 
@@ -552,7 +597,8 @@ def main():
            wps = len(segment_words)/duration if duration > 0 else 0
 
            if wps < preprocess_args.min_wps:
-               print('bad segment in', video_id, '| wps =', wps)
+               print(index, 'Skipping bad segment in',
+                     video_id, '| wps =', wps)
                continue
 
            final_data[video_id].append({
@@ -580,10 +626,16 @@ def main():
        #     raw_dataset_path, final_path, preprocess_args.min_votes)
        # # TODO save metadata in final.json?
 
-    logging.info(f'Found {len(final_data)} videos')
+    elif os.path.exists(final_path):
+        # Already exists
+        logging.info(f'{final_path} exists, opening file')
+        with open(final_path) as fp:
+            final_data = json.load(fp)
+        logging.info(f'Found {len(final_data)} videos')
+    else:
+        return  # Do not continue
 
     # TODO shuffle final_data
-
     # if not os.path.exists(excess_path) or preprocess_args.overwrite
     # TODO use overwrite param
 
@@ -610,10 +662,8 @@ def main():
     write_mode = 'w' if preprocess_args.overwrite else 'a'
 
     get_all = preprocess_args.max_videos is None
-    if get_all:
-        total = len(final_data)
-    else:
-        total = preprocess_args.max_videos
+
+    total = len(final_data) if get_all else preprocess_args.max_videos
 
     index = 0
     data = final_data.items()
@@ -641,7 +691,7 @@ def main():
        elif count_videos >= preprocess_args.max_videos:
            break
 
-       words = get_words(video_id, False)
+       words = get_words(video_id, process=False)
        if not words:
            continue
 
@@ -662,34 +712,40 @@ def main():
                progress.update()
 
            for seg in segments:
-
-               segment_text = ' '.join((x['text'] for x in seg))
-
-               extracted_text = ''
-               for p in extract_sponsors(seg):
-                   p_text = ' '.join(p)
-                   extracted_text += f'{CustomTokens.START_SPONSOR.value} {p_text} {CustomTokens.END_SPONSOR.value}. '
-
               duration = segment.word_end(
                   seg[-1]) - segment.word_start(seg[0])
               wps = len(seg)/duration if duration > 0 else 0
+
               # Ignore segments with "not enough words" in the transcript
               if wps < preprocess_args.min_wps:
                   continue
 
+              segment_text = ' '.join((x['text'] for x in seg))
+              extracted_segments = extract_sponsors(seg)
               d = {
                   'video_index': index,
                   'video_id': video_id,
                   'text': clean_text(segment_text),
-                  'words_per_second': wps,
+                  'words_per_second': round(wps, 3),
               }
 
-              d['sponsor'] = bool(extracted_text)
-              d['extracted'] = clean_text(
-                  extracted_text) if d['sponsor'] else CustomTokens.NO_SPONSOR.value
+              if extracted_segments:
+                  extracted_texts = []
+                  for s in extracted_segments:
+                      w = ' '.join(s['words'])
+                      category = s['category'].upper()
+
+                      t = f"{CustomTokens.START_SEGMENT.value}_{category} {w} {CustomTokens.END_SEGMENT.value}_{category}"
+                      extracted_texts.append(t)
+
+                  extracted_text = '\n'.join(extracted_texts)
+
+                  d['extracted'] = clean_text(extracted_text)
+                  print(json.dumps(d), file=positive)
 
-              print(json.dumps(d), file=(
-                  positive if d['sponsor'] else negative))
+              else:
+                  d['extracted'] = CustomTokens.NO_SEGMENT.value
+                  print(json.dumps(d), file=negative)
 
     if preprocess_args.do_split:
         print('Splitting')
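For context, a short sketch (illustrative values only, not taken from the dataset) of the rows now written by do_generate: labelled spans are serialised with category-suffixed segment tokens (category upper-cased), unlabelled segments fall back to the NO_SEGMENT token, and train.py prepends EXTRACT_SEGMENTS_PREFIX to the source text via source_prefix:

    # Source text fed to the seq2seq model (prefix added at training time):
    #   EXTRACT_SEGMENTS: <cleaned transcript text of the segment>
    # Target for a span labelled 'selfpromo' (hypothetical wording):
    #   START_SEGMENT_TOKEN_SELFPROMO remember to check out my merch END_SEGMENT_TOKEN_SELFPROMO
    # Target when the segment contains no labelled words:
    #   NO_SEGMENT_FOUND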
src/segment.py CHANGED
@@ -25,7 +25,7 @@ def get_overlapping_chunks_of_tokens(tokens, size, overlap):
 
 
 # Generate up to max_tokens - SAFETY_TOKENS
-SAFETY_TOKENS = 8
+SAFETY_TOKENS = 12
 
 
 # TODO play around with this?
@@ -36,10 +36,10 @@ def add_labels_to_words(words, sponsor_segments):
 
     # TODO binary search
     for word in words:
-        word['sponsor'] = False
+        word['category'] = None
         for sponsor_segment in sponsor_segments:
             if sponsor_segment['start'] <= word['start'] <= sponsor_segment['end']:
-                word['sponsor'] = True
+                word['category'] = sponsor_segment['category']
 
     # TODO use extract_segment with mapping function?
     # TODO remove sponsor segments that contain mostly empty space?
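A small illustration (hypothetical values) of the relabelling in add_labels_to_words: each word now carries the category of the segment it falls inside, rather than a boolean 'sponsor' flag:

    words = [{'text': 'like', 'start': 12.3, 'end': 12.5}]
    sponsor_segments = [{'start': 10.0, 'end': 20.0, 'category': 'interaction'}]
    add_labels_to_words(words, sponsor_segments)
    # words[0]['category'] == 'interaction'; words outside every segment keep category None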
src/shared.py CHANGED
@@ -7,16 +7,17 @@ from typing import Optional
 from dataclasses import dataclass, field
 from enum import Enum
 
-
 class CustomTokens(Enum):
+    EXTRACT_SEGMENTS_PREFIX = 'EXTRACT_SEGMENTS: '
+
     URL = 'URL_TOKEN'
     HYPHENATED_URL = 'HYPHENATED_URL_TOKEN'
     NUMBER_PERCENTAGE = 'NUMBER_PERCENTAGE_TOKEN'
     NUMBER = 'NUMBER_TOKEN'
 
-    START_SPONSOR = 'START_SPONSOR'
-    END_SPONSOR = 'END_SPONSOR'
-    NO_SPONSOR = 'NO_SPONSOR_FOUND'
+    START_SEGMENT = 'START_SEGMENT_TOKEN'
+    END_SEGMENT = 'END_SEGMENT_TOKEN'
+    NO_SEGMENT = 'NO_SEGMENT_FOUND'
 
     SHORT_HYPHENATED = 'SHORT_HYPHENATED_TOKEN'
     LONG_WORD = 'LONG_WORD_TOKEN'
@@ -26,8 +27,6 @@ class CustomTokens(Enum):
     APPLAUSE = '[Applause]'
     LAUGHTER = '[Laughter]'
 
-    PROFANITY_RAW = '[ __ ]'  # How YouTube transcribes profanity
-    PROFANITY_CONVERTED = '*****'  # Safer version for tokenizing
     PROFANITY = 'PROFANITY_TOKEN'
 
     @classmethod
src/train.py CHANGED
@@ -1,9 +1,8 @@
 from preprocess import load_datasets, DatasetArguments
-from predict import ClassifierArguments, SPONSOR_MATCH_RE, DEFAULT_TOKEN_PREFIX
-from shared import device, GeneralArguments, OutputArguments
-from model import ModelArguments
+from predict import ClassifierArguments, SPONSOR_MATCH_RE
+from shared import CustomTokens, device, GeneralArguments, OutputArguments
+from model import ModelArguments, get_model, get_tokenizer
 import transformers
-from model import get_model, get_tokenizer
 import logging
 import os
 import sys
@@ -22,7 +21,7 @@ from transformers.utils import check_min_version
 from transformers.utils.versions import require_version
 from sklearn.linear_model import LogisticRegression
 from sklearn.feature_extraction.text import TfidfVectorizer
-
+from utils import re_findall
 import re
 
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
@@ -117,7 +116,7 @@ class DataTrainingArguments:
         },
     )
     source_prefix: Optional[str] = field(
-        default=DEFAULT_TOKEN_PREFIX, metadata={
+        default=CustomTokens.EXTRACT_SEGMENTS_PREFIX.value, metadata={
            'help': 'A prefix to add before every source text (useful for T5 models).'}
     )
 
@@ -135,11 +134,11 @@ class SequenceTrainingArguments(OutputArguments, Seq2SeqTrainingArguments):
     num_train_epochs: float = field(
         default=1, metadata={'help': 'Total number of training epochs to perform.'})
 
-    save_steps: int = field(default=2500, metadata={
+    save_steps: int = field(default=5000, metadata={
        'help': 'Save checkpoint every X updates steps.'})
-    eval_steps: int = field(default=2500, metadata={
+    eval_steps: int = field(default=5000, metadata={
        'help': 'Run an evaluation every X steps.'})
-    logging_steps: int = field(default=2500, metadata={
+    logging_steps: int = field(default=5000, metadata={
        'help': 'Log every X updates steps.'})
 
     skip_train_transformer: bool = field(default=False, metadata={
@@ -257,8 +256,8 @@ def main():
 
         ngram_range=(1, 2),  # best so far
         # max_features=8000  # remove for higher accuracy?
-        max_features=50000
-        # max_features=10000
+        # max_features=50000
+        max_features=10000
     )
 
     train_test_data = {
@@ -277,11 +276,12 @@ def main():
         dataset = raw_datasets[ds_type]
 
         for row in dataset:
-
             # Get matches:
-            if row['sponsor']:
-                matches = re.findall(SPONSOR_MATCH_RE, row['extracted'])
-            else:
+            matches = re_findall(SPONSOR_MATCH_RE, row['extracted'])
+
+            return  # TODO fix
+
+            if not matches:
                 matches = [row['text']]
 
             for match in matches:
src/utils.py CHANGED
@@ -1,6 +1,8 @@
+import re
 import asyncio
 import os
 
+
 class Job:
     def __init__(self, function, *args, **kwargs) -> None:
         self.function = function
@@ -84,3 +86,7 @@ class InterruptibleThreadPool:
         self.loop.close()
 
         return self.jobs
+
+
+def re_findall(pattern, string):
+    return [m.groupdict() for m in re.finditer(pattern, string)]
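Usage note: unlike re.findall, this helper returns the named groups of each match as a dict, e.g.:

    re_findall(r'(?P<key>\w+)=(?P<value>\w+)', 'a=1 b=2')
    # -> [{'key': 'a', 'value': '1'}, {'key': 'b', 'value': '2'}]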