Spaces:

mozilla-foundation
/

youtube_video_similarity

Build error

App Files Files Community

aapot committed on Sep 19, 2022

Commit

f3772cc

1 Parent(s): 8b86424

Add demo application

Browse files

Files changed (7) hide show

README.md +4 -1
app.py +94 -0
helpers.py +92 -0
requirements.txt +12 -0
utils/huggingface_model_wrapper.py +57 -0
utils/text_cleaning.py +131 -0
utils/unifiedmodel.py +348 -0

README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
 title: Youtube Video Similarity
-emoji: 👀
 colorFrom: purple
 colorTo: blue
 sdk: gradio
@@ -8,6 +8,9 @@ sdk_version: 3.3.1
 app_file: app.py
 pinned: false
 license: apache-2.0
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: Youtube Video Similarity
+emoji: ▶
 colorFrom: purple
 colorTo: blue
 sdk: gradio
 app_file: app.py
 pinned: false
 license: apache-2.0
+models:
+- mozilla-foundation/youtube_video_similarity_model_wt
+- mozilla-foundation/youtube_video_similarity_model_nt
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,94 @@

+import os
+import gradio as gr
+import torch
+from utils.unifiedmodel import RRUMDataset
+from utils.huggingface_model_wrapper import YoutubeVideoSimilarityModel
+from torch.utils.data import DataLoader
+from helpers import get_example_videos, update_youtube_embedded_html, get_input_data_df
+# URL of the RegretsReporter example pairs; can be overridden with the RR_EXAMPLES_URL environment variable.
+RR_EXAMPLES_URL = os.environ.get(
+    'RR_EXAMPLES_URL', 'https://public-data.telemetry.mozilla.org/api/v1/tables/telemetry_derived/regrets_reporter_study/v1/files/000000000000.json')
+# Number of RegretsReporter example pairs requested from get_example_videos.
+NUM_RR_EXAMPLES = 5
+# Example pairs are resolved once, when this module is imported.
+example_videos, example_videos_rr = get_example_videos(
+    RR_EXAMPLES_URL, NUM_RR_EXAMPLES)
+demo_title = 'Mozilla RegretsReporter YouTube video similarity'
+# Markdown rendered at the top of the demo page.
+demo_description = f'''
+# {demo_title}
+This demo showcases the YouTube video semantic similarity model developed as part of the RegretsReporter research project at Mozilla Foundation. You can read more about the project [here](https://foundation.mozilla.org/en/youtube/user-controls/) and about the semantic similarity model [here](https://foundation.mozilla.org/en/blog/the-regretsreporter-user-controls-study-machine-learning-to-measure-semantic-similarity-of-youtube-videos/). Note: the model is multilingual so you can try it with non-English videos too while it probably works the best with English videos.
+This demo works by inserting two YouTube video URLs below and clicking the Run button. After a few seconds, you will see model's predicted probability of how similar those two videos are. You can copy URLs from YouTube or also try out a few predefined examples by clicking them on the examples table.
+'''
+# HTML shown in a video preview slot while its URL textbox is empty.
+placeholder_youtube_embedded_html = '''
+    <p>Insert video URL first</p>
+'''
+# Two models are loaded at import time: model_wt is used when both videos have a
+# transcript and model_nt otherwise (see get_video_similarity).
+model_wt = YoutubeVideoSimilarityModel.from_pretrained(
+    'mozilla-foundation/youtube_video_similarity_model_wt', use_auth_token=True)
+model_nt = YoutubeVideoSimilarityModel.from_pretrained(
+    'mozilla-foundation/youtube_video_similarity_model_nt', use_auth_token=True)
+# The tokenizer checkpoint name is taken from model_wt and used for every request.
+cross_encoder_model_name_or_path = model_wt.cross_encoder_model_name_or_path
+def get_video_similarity(video1_url, video2_url):
+    """Predict how similar two YouTube videos are and format the result for the UI.
+
+    Args:
+        video1_url: URL of the first video.
+        video2_url: URL of the second video.
+
+    Returns:
+        A sentence with the predicted similarity as a percentage, or an
+        explanatory message when the fetched video data cannot be scored.
+    """
+    df = get_input_data_df(video1_url, video2_url)
+    # The transcript model is only usable when both videos have a transcript.
+    transcript_columns = ['regret_transcript', 'recommendation_transcript']
+    with_transcript = not df[transcript_columns].isna().any().any()
+    dataset = RRUMDataset(df, with_transcript=with_transcript, label_col=None,
+                          cross_encoder_model_name_or_path=cross_encoder_model_name_or_path)
+    data_loader = DataLoader(dataset.test_dataset, shuffle=False,
+                             batch_size=1, num_workers=0, pin_memory=False)
+    # RRUMDataset drops rows whose text fields are missing or empty, which can
+    # leave nothing to score; a bare next() would then raise StopIteration.
+    batch = next(iter(data_loader), None)
+    if batch is None:
+        return 'Could not compute similarity: a video is missing its title, description or transcript text'
+    model = model_wt if with_transcript else model_nt
+    with torch.inference_mode():
+        pred = model(batch)
+    # The model outputs a logit; the sigmoid turns it into a probability.
+    pred = torch.special.expit(pred).squeeze().tolist()
+    return f'YouTube videos are {pred:.0%} similar'
+# Build the Gradio UI: two URL inputs, a prediction label, embedded previews and clickable examples.
+with gr.Blocks(title=demo_title) as demo:
+    gr.Markdown(demo_description)
+    with gr.Row():
+        with gr.Column():
+            input_text1 = gr.Textbox(
+                label='Video 1', placeholder='Insert first YouTube video URL')
+            input_text2 = gr.Textbox(
+                label='Video 2', placeholder='Insert second YouTube video URL')
+            inputs = [input_text1, input_text2]
+            with gr.Row():
+                clear_btn = gr.Button('Clear', variant='secondary')
+                run_btn = gr.Button('Run', variant='primary')
+        with gr.Column():
+            outputs = [gr.Label(label='Model prediction')]
+    with gr.Accordion('See video details', open=False):
+        with gr.Row():
+            with gr.Column():
+                video_embedded = gr.HTML(
+                    value=placeholder_youtube_embedded_html)
+            with gr.Column():
+                video_embedded2 = gr.HTML(
+                    value=placeholder_youtube_embedded_html)
+    with gr.Column():
+        # Example tables are only rendered when at least one pair is available.
+        if example_videos:
+            examples = gr.Examples(examples=example_videos, inputs=inputs)
+        if example_videos_rr:
+            examples_rr = gr.Examples(examples=example_videos_rr, inputs=inputs,
+                                      label='Example bad recommendations from the RegretsReporter report')
+    run_btn.click(fn=get_video_similarity, inputs=inputs, outputs=outputs)
+    # Clear resets both textboxes and the prediction label.
+    clear_btn.click(lambda value_1, value_2, value_3: (
+        None, None, None), inputs=inputs + outputs, outputs=inputs + outputs)
+    # Refresh the embedded preview whenever a URL changes; show the placeholder for an empty box.
+    input_text1.change(lambda url: update_youtube_embedded_html(
+        url, 1) if url else placeholder_youtube_embedded_html, inputs=input_text1, outputs=video_embedded)
+    input_text2.change(lambda url: update_youtube_embedded_html(
+        url, 2) if url else placeholder_youtube_embedded_html, inputs=input_text2, outputs=video_embedded2)
+demo.launch()

helpers.py ADDED Viewed

	@@ -0,0 +1,92 @@

+import itertools
+import random
+import requests
+import pandas as pd
+from pytube import YouTube
+from youtube_transcript_api import YouTubeTranscriptApi
+from youtube_transcript_api.formatters import TextFormatter
+def is_youtube_video_available(url):
+    """Return True when pytube can resolve a title for ``url``, otherwise False.
+
+    Args:
+        url: YouTube video URL to probe.
+
+    Returns:
+        bool: False for malformed URLs and for videos whose title cannot be fetched.
+    """
+    try:
+        # Constructing YouTube() parses the URL and can itself raise on
+        # malformed input, so it has to sit inside the try block as well.
+        YouTube(url).title
+    except Exception:
+        # Best-effort probe: any failure means "not available".
+        return False
+    return True
+def get_example_videos(rr_examples_url, num_rr_examples):
+    """Build the example video pairs offered in the demo.
+
+    Args:
+        rr_examples_url: URL of a JSON document with RegretsReporter records
+            carrying ``rejected_video_id`` and ``recommendation_id`` fields.
+        num_rr_examples: maximum number of RegretsReporter pairs to return.
+
+    Returns:
+        tuple: ``(example_videos, example_videos_rr)``, two lists of
+        ``[url_1, url_2]`` pairs in which both videos are currently available.
+        ``example_videos_rr`` is empty when the records cannot be downloaded.
+    """
+    def pair_is_available(pair):
+        return is_youtube_video_available(pair[0]) and is_youtube_video_available(pair[1])
+    example_videos = [['https://www.youtube.com/watch?v=WfVF-Ec4naQ', 'https://www.youtube.com/watch?v=4hrNt28t7Cw'],
+                      ['https://www.youtube.com/watch?v=GbpjLP-UvIU',
+                       'https://www.youtube.com/watch?v=BlQ2mP2EE4A'],
+                      ['https://www.youtube.com/watch?v=fdzY1f2P91k',
+                       'https://www.youtube.com/watch?v=BlQ2mP2EE4A'],
+                      ['https://www.youtube.com/watch?v=fdzY1f2P91k', 'https://www.youtube.com/watch?v=9gIVGJQ3xWE']]
+    example_videos = [ex for ex in example_videos if pair_is_available(ex)]
+    try:
+        # The timeout keeps an unresponsive server from blocking app start-up.
+        response = requests.get(rr_examples_url, timeout=30)
+        response.raise_for_status()
+        rr_records = response.json()
+    except (requests.RequestException, ValueError):
+        rr_records = []
+    # A set removes duplicate pairs (there seems to be one duplicate);
+    # sorting afterwards keeps the order deterministic.
+    unique_pairs = set()
+    for record in rr_records:
+        try:
+            unique_pairs.add((f'https://www.youtube.com/watch?v={record["rejected_video_id"]}',
+                              f'https://www.youtube.com/watch?v={record["recommendation_id"]}'))
+        except (KeyError, TypeError):
+            # Skip malformed records instead of failing at import time.
+            continue
+    example_videos_rr = [list(pair) for pair in sorted(unique_pairs)]
+    example_videos_rr = [ex for ex in example_videos_rr if pair_is_available(ex)]
+    if len(example_videos_rr) > num_rr_examples:
+        example_videos_rr = random.sample(example_videos_rr, num_rr_examples)
+    return example_videos, example_videos_rr
+def get_youtube_embedded_html(embed_url, video_position):
+    """Return the HTML snippet (caption plus iframe player) for one embedded video.
+
+    Args:
+        embed_url: value placed in the iframe ``src`` attribute.
+        video_position: number shown in the "Video N" caption.
+    """
+    template = '''
+        <p>Video {position}</p>
+        <iframe width="100%" height="360px" src="{src}" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; fullscreen" allowfullscreen></iframe>
+    '''
+    return template.format(position=video_position, src=embed_url)
+def update_youtube_embedded_html(video_url, video_position):
+    """Return the embedded-player HTML for ``video_url``, or an error paragraph.
+
+    Args:
+        video_url: URL typed by the user; treated as untrusted text.
+        video_position: number shown in the "Video N" caption.
+
+    Returns:
+        str: HTML for the preview slot.
+    """
+    import html
+    try:
+        embed_url = YouTube(video_url).embed_url
+    except Exception:
+        # The URL is user input that ends up inside an HTML component, so it
+        # must be escaped before being echoed back.
+        return f'''
+            <p>There was error in fetching details for video with the URL: {html.escape(str(video_url))}</p>
+        '''
+    return get_youtube_embedded_html(embed_url, video_position)
+def get_youtube_video_data(url):
+    """Fetch channel id, title, description and transcript text of a YouTube video.
+
+    Args:
+        url: YouTube video URL.
+
+    Returns:
+        tuple: ``(channel_id, title, description, transcript)``. ``transcript``
+        is a single-line string, or None when no transcript can be listed or
+        downloaded.
+    """
+    video = YouTube(url)
+    channel_id = video.channel_id
+    video_title = video.title
+    video_description = video.description
+    # Preferred transcript languages are tried first; any other available language is a fallback.
+    preferred_langs = ['en', 'en-US', 'es', 'de']
+    try:
+        transcript_list = YouTubeTranscriptApi.list_transcripts(video.video_id)
+        available_non_common_langs = [
+            tr.language_code for tr in transcript_list if tr.language_code not in preferred_langs]
+        video_transcript = YouTubeTranscriptApi.get_transcript(
+            video.video_id, languages=preferred_langs + available_non_common_langs)
+    except Exception:
+        # Listing and downloading can both fail; either way the caller falls
+        # back to the no-transcript path instead of crashing.
+        return channel_id, video_title, video_description, None
+    video_transcript = TextFormatter().format_transcript(
+        video_transcript).replace('\n', ' ')
+    return channel_id, video_title, video_description, video_transcript
+def get_input_data_df(video1_url, video2_url):
+    """Build the one-row DataFrame for a pair of videos.
+
+    The first video fills the ``regret_*`` columns and the second the
+    ``recommendation_*`` columns; ``channel_sim`` is 1 when both videos have
+    the same channel id and 0 otherwise.
+    """
+    regret = get_youtube_video_data(video1_url)
+    recommendation = get_youtube_video_data(video2_url)
+    # Element 0 of each tuple is the channel id; the rest is title, description, transcript.
+    channel_sim = int(regret[0] == recommendation[0])
+    row = [*regret[1:], *recommendation[1:], channel_sim]
+    column_names = ['regret_title', 'regret_description', 'regret_transcript',
+                    'recommendation_title', 'recommendation_description',
+                    'recommendation_transcript', 'channel_sim']
+    return pd.DataFrame([row], columns=column_names)

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+datasets==2.4.0
+gradio==3.3.1
+huggingface_hub==0.9.1
+pandas==1.4.3
+pyarrow==9.0.0
+pytorch_lightning==1.7.6
+pytube==12.1.0
+requests==2.27.1
+torch==1.12.1
+torchmetrics==0.9.3
+transformers==4.22.1
+youtube_transcript_api==0.4.4

utils/huggingface_model_wrapper.py ADDED Viewed

	@@ -0,0 +1,57 @@

+from huggingface_hub import PyTorchModelHubMixin
+from huggingface_hub.constants import PYTORCH_WEIGHTS_NAME
+from huggingface_hub.file_download import hf_hub_download
+from .unifiedmodel import RRUM
+import os
+import torch
+class YoutubeVideoSimilarityModel(RRUM, PyTorchModelHubMixin):
+    """
+        `RRUM` model combined with Hugging Face's `PyTorchModelHubMixin`, so it can be
+        loaded from and saved to the Hugging Face model hub through the standard
+        `from_pretrained` and `save_pretrained` methods.
+    """
+    @classmethod
+    def _from_pretrained(
+        cls,
+        model_id,
+        revision,
+        cache_dir,
+        force_download,
+        proxies,
+        resume_download,
+        local_files_only,
+        use_auth_token,
+        map_location="cpu",
+        strict=False,
+        **model_kwargs,
+    ):
+        """Instantiate the model and load its weights from a local directory or the hub."""
+        device = torch.device(map_location)
+        if not os.path.isdir(model_id):
+            weights_path = hf_hub_download(
+                repo_id=model_id,
+                filename=PYTORCH_WEIGHTS_NAME,
+                revision=revision,
+                cache_dir=cache_dir,
+                force_download=force_download,
+                proxies=proxies,
+                resume_download=resume_download,
+                use_auth_token=use_auth_token,
+                local_files_only=local_files_only,
+            )
+        else:
+            print("Loading weights from local directory")
+            weights_path = os.path.join(model_id, PYTORCH_WEIGHTS_NAME)
+        # Flatten the Hugging Face config into RRUM constructor arguments;
+        # explicitly passed keyword arguments win over config entries.
+        if "config" in model_kwargs:
+            flattened = dict(model_kwargs["config"])
+            flattened.update(model_kwargs)
+            del flattened["config"]
+            model_kwargs = flattened
+        model = cls(**model_kwargs)
+        weights = torch.load(weights_path, map_location=device)
+        model.load_state_dict(weights, strict=strict)
+        model.eval()
+        return model

utils/text_cleaning.py ADDED Viewed

	@@ -0,0 +1,131 @@

+from fastcore.basics import listify
+from fastcore.utils import compose
+import unicodedata
+from string import punctuation
+import html
+from itertools import groupby
+import re
+# One or more consecutive carriage returns, newlines or tabs.
+control_char_regex = re.compile(r'[\r\n\t]+')
+# URL-like tokens: optional http(s):// scheme, host/path characters, a dot,
+# a 2-6 letter suffix, then optional trailing path/query characters.
+url_regex = re.compile(
+    r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*')
+# "@handle" of 1-15 word characters, at the start of the string or after a
+# character that is neither "@" nor a word character (that character is part of the match).
+username_regex = re.compile(r'(^|[^@\w])@(\w{1,15})\b')
+def fix_html(text):
+    """Repair common HTML/escape leftovers in each text and unescape HTML entities.
+
+    Accepts a string or a list of strings and always returns a list.
+    """
+    # Applied strictly in this order, one after the other.
+    replacements = (
+        ('#39;', "'"), ('amp;', '&'), ('#146;', "'"), ('nbsp;', ' '),
+        ('#36;', '$'), ('\\n', "\n"), ('quot;', "'"), ('<br />', "\n"),
+        ('\\"', '"'), ('<unk>', ' '), (' @.@ ', '.'), (' @-@ ', '-'), ('...', ' …'),
+    )
+    fixed = []
+    for item in listify(text):
+        for old, new in replacements:
+            item = item.replace(old, new)
+        fixed.append(html.unescape(item))
+    return fixed
+def remove_control_char(text):
+    """Replace every run of carriage returns, newlines or tabs with a single '.'.
+
+    Accepts a string or a list of strings and always returns a list.
+    """
+    return [control_char_regex.sub('.', item) for item in listify(text)]
+def remove_remaining_control_chars(text):
+    """Drop every character whose Unicode general category starts with 'C'.
+
+    Accepts a string or a list of strings and always returns a list.
+    """
+    def strip_item(item):
+        kept = [ch for ch in item if unicodedata.category(ch)[0] != 'C']
+        return ''.join(kept)
+    return [strip_item(item) for item in listify(text)]
+def remove_unicode_symbols(text):
+    """Drop characters in Unicode category 'So' (Symbol, other), e.g. emoji and dingbats.
+
+    Accepts a string or a list of strings and always returns a list.
+    """
+    # Compare the full two-letter category code. The previous check compared
+    # only its first letter with 'So', which can never match, so no character
+    # was ever removed.
+    # NOTE(review): this makes the step effective for the first time; if a model
+    # was trained on text cleaned with the old no-op, confirm the change is wanted.
+    return [''.join(ch for ch in item if unicodedata.category(ch) != 'So')
+            for item in listify(text)]
+def standardise_punc(text):
+    """Map typographic quotes, acute accents and en dashes to their plain ASCII forms.
+
+    Accepts a string or a list of strings and always returns a list.
+    """
+    table = str.maketrans(u"‘’´“”–-",  u"'''\"\"--")
+    return [item.translate(table) for item in listify(text)]
+def remove_news_tags(text):
+    """Remove opening and closing tags whose name starts with an uppercase letter.
+
+    Accepts a string or a list of strings and always returns a list.
+    """
+    tag_pattern = r"(<[A-Z].+?>)|(</[A-Z].+?>)"
+    return [re.sub(tag_pattern, "", item) for item in listify(text)]
+def replace_urls(text):
+    """Strip <a>/<ref> markup and replace URL-like tokens with an empty filler.
+
+    Accepts a string or a list of strings and always returns a list.
+    """
+    filler = ''
+    cleaned = []
+    for item in listify(text):
+        without_markup = re.sub(r"(<a.+?>)|(</a>)|(<ref.+?>)", "", item)
+        cleaned.append(url_regex.sub(filler, without_markup))
+    return cleaned
+def replace_usernames(text):
+    """Replace '@<user>' placeholders and @handles with an empty filler.
+
+    Accepts a string or a list of strings and always returns a list.
+    """
+    filler = ''
+    cleaned = []
+    for item in listify(text):
+        # One pass per '@' found in the original item.
+        for _ in range(item.count('@')):
+            item = item.replace('@<user>', f'{filler}')
+            # replace other user handles by filler
+            item = username_regex.sub(filler, item)
+        cleaned.append(item)
+    return cleaned
+def remove_duplicate_punctuation(text):
+    """Collapse immediately repeated words and runs of the same punctuation character.
+
+    Accepts a string or a list of strings and always returns a list.
+    """
+    cleaned = []
+    for item in listify(text):
+        # "word word word" -> "word"
+        item = re.sub(r'\b(\w+)( \1\b)+', r'\1', item)
+        punc = set(punctuation)
+        # A run of one punctuation character keeps a single copy; other runs are kept whole.
+        pieces = [char if char in punc else ''.join(run)
+                  for char, run in groupby(item)]
+        cleaned.append(''.join(pieces))
+    return cleaned
+def remove_multi_space(text):
+    """Collapse every whitespace run to one space and trim both ends.
+
+    Accepts a string or a list of strings and always returns a list.
+    """
+    return [' '.join(item.split()) for item in listify(text)]
+# Full text-cleaning pipeline built from the functions above with fastcore's compose.
+# NOTE(review): presumably the functions run in the listed order, each receiving
+# the previous one's list output - confirm against fastcore.compose.
+clean_text_funcs = compose(*[fix_html, remove_control_char, remove_remaining_control_chars, remove_unicode_symbols,
+                            standardise_punc, remove_news_tags, replace_urls, replace_usernames, remove_duplicate_punctuation, remove_multi_space])

utils/unifiedmodel.py ADDED Viewed

	@@ -0,0 +1,348 @@

+from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
+import datasets
+import pandas as pd
+import pyarrow
+import pytorch_lightning as pl
+import torchmetrics
+import torch.nn as nn
+import torch
+import types
+import multiprocessing
+from .text_cleaning import clean_text_funcs
+class RRUMDataset():
+    scalar_features = ['channel_sim']
+    _image_features = ['regret_thumbnail',
+                       'recommendation_thumbnail']  # not used atm
+    def __init__(self, data, with_transcript, cross_encoder_model_name_or_path, label_col="label", label_map=None, balance_label_counts=False, max_length=128, do_train_test_split=False, test_size=0.25, seed=42, keep_video_ids_for_predictions=False, encode_on_the_fly=False, clean_text=False, processing_batch_size=1000, processing_num_proc=1):
+        """Load ``data``, preprocess it and expose encoded ``train_dataset`` / ``test_dataset``.
+
+        Args:
+            data: a pandas DataFrame, a pyarrow Table, or a generator (per the
+                error message below, of pyarrow.RecordBatch) for streaming.
+            with_transcript: whether the transcript columns are used as text features.
+            cross_encoder_model_name_or_path: checkpoint passed to AutoTokenizer.
+            label_col: name of the label column; when falsy the data ends up in
+                ``test_dataset`` instead of ``train_dataset``.
+            label_map: optional mapping from string labels to encoded labels.
+            balance_label_counts: whether to downsample labels to equal counts.
+            max_length: tokenizer max length, also used as a word-count cap.
+            do_train_test_split: split non-streaming data into train and test.
+            test_size: test fraction used when splitting.
+            seed: seed for splitting and sampling.
+            keep_video_ids_for_predictions: keep ``regret_id`` / ``recommendation_id`` columns.
+            encode_on_the_fly: tokenize lazily through ``set_transform`` instead of up front.
+            clean_text: run the text-cleaning pipeline on the text features.
+            processing_batch_size: batch size for dataset ``map`` calls.
+            processing_num_proc: number of processes for ``map``; a falsy value
+                means all CPUs.
+
+        Raises:
+            ValueError: if ``data`` is of an unsupported type.
+        """
+        self._with_transcript = with_transcript
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            cross_encoder_model_name_or_path)
+        self.label_col = label_col
+        self.label_map = label_map
+        self.balance_label_counts = balance_label_counts
+        self.max_length = max_length
+        self.seed = seed
+        self.keep_video_ids_for_predictions = keep_video_ids_for_predictions
+        self.clean_text = clean_text
+        self.processing_batch_size = processing_batch_size
+        self.processing_num_proc = multiprocessing.cpu_count(
+        ) if not processing_num_proc else processing_num_proc
+        # Text types (and the matching regret/recommendation columns) include
+        # the transcript only when with_transcript is set.
+        self.text_types = ['title', 'description'] + \
+            (['transcript'] if self._with_transcript else [])
+        self._text_features = [
+            'regret_title', 'recommendation_title', 'regret_description',
+            'recommendation_description'] + (['regret_transcript', 'recommendation_transcript'] if self._with_transcript else [])
+        # LOAD DATA INTO DATASET
+        self.streaming_dataset = False
+        if isinstance(data, pd.DataFrame):
+            self.dataset = datasets.Dataset.from_pandas(data)
+        elif isinstance(data, types.GeneratorType):
+            examples_iterable = datasets.iterable_dataset.ExamplesIterable(
+                self._streaming_generate_examples, {"iterable": data})
+            self.dataset = datasets.IterableDataset(examples_iterable)
+            # Peek at one example to learn the column names of the stream.
+            self._stream_dataset_example = next(iter(self.dataset))
+            self._stream_dataset_column_names = list(
+                self._stream_dataset_example.keys())
+            self.streaming_dataset = True
+        elif isinstance(data, pyarrow.Table):
+            self.dataset = datasets.Dataset(data)
+        else:
+            raise ValueError(
+                f'Type of data is {type(data)} when pd.DataFrame, pyarrow.Table, or generator of pyarrow.RecordBatch is allowed')
+        # PREPROCESS DATASET
+        self._preprocess()
+        # ENCODE DATASET
+        self.train_dataset = None
+        self.test_dataset = None
+        if self.streaming_dataset:
+            # IterableDataset doesn't have train_test_split method
+            if self.label_col:
+                self.train_dataset = self._encode_streaming(self.dataset)
+                print('Streaming dataset available in .train_dataset')
+            else:
+                self.test_dataset = self._encode_streaming(self.dataset)
+                print(
+                    'Streaming dataset available in .test_dataset because label_col=None')
+        else:
+            # dataset into train_dataset and/or test_dataset
+            if do_train_test_split:
+                ds = self.dataset.train_test_split(
+                    test_size=test_size, shuffle=True, seed=self.seed, stratify_by_column=self.label_col)
+                self.train_dataset = ds['train']
+                self.test_dataset = ds['test']
+                print(
+                    f'Dataset was splitted into train and test with test_size={test_size}')
+            else:
+                # Without a split, labelled data is training data and unlabelled data is test data.
+                if self.label_col:
+                    self.train_dataset = self.dataset
+                else:
+                    self.test_dataset = self.dataset
+            # NOTE: the truthiness checks below also skip encoding when a dataset is None.
+            if encode_on_the_fly:
+                if self.train_dataset:
+                    self.train_dataset.set_transform(self._encode_on_the_fly)
+                    print('On-the-fly encoded dataset available in .train_dataset')
+                if self.test_dataset:
+                    self.test_dataset.set_transform(self._encode_on_the_fly)
+                    print('On-the-fly encoded dataset available in .test_dataset')
+            else:
+                if self.train_dataset:
+                    self.train_dataset = self._encode(self.train_dataset)
+                    print('Pre-encoded dataset available in .train_dataset')
+                if self.test_dataset:
+                    self.test_dataset = self._encode(self.test_dataset)
+                    print('Pre-encoded dataset available in .test_dataset')
+    def __len__(self):
+        """Return the number of rows; streaming datasets have no length and raise ValueError."""
+        if not self.streaming_dataset:
+            return len(self.dataset)
+        raise ValueError(
+            f'Streaming dataset does not support len() method')
+    def __getitem__(self, index):
+        """Return row ``index``; for a streaming dataset, the first example of a fresh iterator."""
+        if not self.streaming_dataset:
+            return self.dataset[index]
+        # Streaming datasets are not indexable, so ``index`` is ignored here.
+        stream = iter(self.dataset)
+        return next(stream)
+    def _streaming_generate_examples(self, iterable):
+        """Yield ``(running_id, example)`` for every row of every batch produced by a generator.
+
+        Each batch must offer ``to_pylist()``; inputs that are not generators yield nothing.
+        """
+        # TODO: make sure GeneratorType is pyarrow.RecordBatch
+        if not isinstance(iterable, types.GeneratorType):
+            return
+        rows = (row for batch in iterable for row in batch.to_pylist())
+        for id_, ex in enumerate(rows):
+            yield id_, ex
+    def _preprocess(self):
+        """Filter, label-encode, optionally balance and clean ``self.dataset`` in place.
+
+        Steps, in order: keep rows matching the transcript mode, encode string
+        labels, drop rows with missing or empty required fields, optionally
+        downsample every label to the smallest label count, optionally clean
+        the text, then truncate the text features.
+
+        Raises:
+            ValueError: for a streaming dataset with string labels and no ``label_map``.
+        """
+        if self._with_transcript:
+            self.dataset = self.dataset.filter(
+                lambda example: example['regret_transcript'] is not None and example['recommendation_transcript'] is not None)
+        else:
+            self.dataset = self.dataset.filter(
+                lambda example: example['regret_transcript'] is None or example['recommendation_transcript'] is None)
+        if self.label_col:
+            if self.streaming_dataset:
+                if self.label_col in self._stream_dataset_column_names and isinstance(self._stream_dataset_example[self.label_col], str):
+                    if not self.label_map:
+                        raise ValueError(
+                            f'"label_map" dict was not provided and is needed to encode string labels for streaming datasets')
+                    # cast_column method had issues with streaming dataset
+                    self.dataset = self.dataset.map(
+                        self._streaming_rename_labels)
+            else:
+                if self.dataset.features[self.label_col].dtype == 'string':
+                    if not self.label_map:
+                        self.label_map = {k: v for v, k in enumerate(
+                            self.dataset.unique(self.label_col))}
+                    self.dataset = self.dataset.filter(
+                        lambda example: example[self.label_col] in self.label_map.keys())
+                    self.dataset = self.dataset.cast_column(self.label_col, datasets.ClassLabel(
+                        num_classes=len(self.label_map), names=list(self.label_map.keys())))
+        self.dataset = self.dataset.filter(lambda example: not any(x in [None, ""] for x in [
+                                           example[key] for key in self._text_features + self.scalar_features + ([self.label_col] if self.label_col else [])]))  # dropna
+        if self.balance_label_counts and self.label_col and not self.streaming_dataset:
+            # Labels that were already numeric never get a label_map, so fall
+            # back to the values actually present in the column.
+            label_values = list(self.label_map.values(
+            )) if self.label_map else self.dataset.unique(self.label_col)
+            # Keep one dataset per label in a list. Keying them by their length
+            # (as before) silently dropped a label whenever two labels had the
+            # same number of rows.
+            label_datasets = [self.dataset.filter(
+                lambda example, label=label: example[self.label_col] == label) for label in label_values]
+            min_label_count = min(len(dataset) for dataset in label_datasets)
+            sampled_datasets = [dataset.train_test_split(train_size=min_label_count, shuffle=True, seed=self.seed)[
+                'train'] if len(dataset) != min_label_count else dataset for dataset in label_datasets]
+            self.dataset = datasets.concatenate_datasets(sampled_datasets)
+        if self.clean_text:
+            self.dataset = self.dataset.map(self._clean_text, batched=not self.streaming_dataset,
+                                            batch_size=self.processing_batch_size)
+        self.dataset = self.dataset.map(self._truncate_and_strip_text, batched=not self.streaming_dataset,
+                                        batch_size=self.processing_batch_size)
+    def _streaming_rename_labels(self, example):
+        """Encode the label(s) of ``example`` with ``self.label_map``.
+
+        Labels that are already encoded (already one of the map's values) are
+        kept as they are; labels missing from the map become None.
+
+        Args:
+            example: mapping holding ``self.label_col``, either a single label
+                or a list of labels.
+
+        Returns:
+            The same ``example`` with its label column rewritten.
+
+        Raises:
+            ValueError: if the label is neither a list, a string, nor an
+                already-encoded value.
+        """
+        label = example[self.label_col]
+        encoded_values = list(self.label_map.values())
+        if isinstance(label, list):
+            # Keep one output per input label. The old comprehension filtered
+            # out already-encoded labels, which shortened the list and broke
+            # its alignment with the other columns.
+            example[self.label_col] = [
+                ex if ex in encoded_values else self.label_map.get(ex, None) for ex in label]
+        elif label in encoded_values:
+            # Already encoded: nothing to rename.
+            pass
+        elif isinstance(label, str):
+            example[self.label_col] = self.label_map.get(label, None)
+        else:
+            raise ValueError(
+                f'Type of example label is {type(example[self.label_col])} when list or string is allowed')
+        return example
+    def _clean_text(self, example):
+        """Run the text-cleaning pipeline over every text feature of ``example``.
+
+        A string value is replaced by the first element of the pipeline's
+        output; any other value is replaced by the output itself.
+        """
+        for feat in self._text_features:
+            raw = example[feat]
+            cleaned = clean_text_funcs(raw)
+            if isinstance(raw, str):
+                example[feat] = cleaned[0]
+            else:
+                example[feat] = cleaned
+        return example
+    def _truncate_and_strip_text(self, example):
+        """Cut every text feature down to at most ``self.max_length`` whitespace-separated words.
+
+        Returns the updated ``example``, or None as soon as a text feature is None.
+
+        Raises:
+            ValueError: if a text feature is neither a list, a string nor None.
+        """
+        # The tokenizer truncates to max_length tokens anyway, and a word is
+        # usually at least one token, so cutting to max_length words beforehand
+        # saves RAM without losing information.
+        def shorten(text):
+            words = text.split()[:self.max_length]
+            return ' '.join(words).strip()
+        for feat in self._text_features:
+            value = example[feat]
+            if value is None:
+                return None
+            if isinstance(value, list):
+                # Falsy entries (None, "") are skipped, as before.
+                example[feat] = [shorten(text) for text in value if text]
+            elif isinstance(value, str):
+                example[feat] = shorten(value)
+            else:
+                raise ValueError(
+                    f'Type of example is {type(example[feat])} when list or string is allowed')
+        return example
+    def _encode(self, dataset):
+        """Tokenize ``dataset`` up front and return it in torch format.
+
+        For each text type, the regret/recommendation text pair is tokenized
+        together and the resulting columns are prefixed with the text type.
+        Scalar features, the label column (if any) and, optionally, the video
+        id columns are then copied over from ``dataset``.
+        """
+        encoded_dataset = None
+        for text_type in self.text_types:
+            # Tokenize the pair of columns for this text type; all original columns are dropped.
+            encoded_text_type = dataset.map(lambda regret, recommendation: self.tokenizer(regret, recommendation, padding="max_length", truncation=True, max_length=self.max_length), batched=True,
+                                            batch_size=self.processing_batch_size, num_proc=self.processing_num_proc, input_columns=[f'regret_{text_type}', f'recommendation_{text_type}'], remove_columns=dataset.column_names)
+            encoded_text_type = encoded_text_type.rename_columns(
+                {col: f'{text_type}_{col}' for col in encoded_text_type.column_names})  # e.g. input_ids -> title_input_ids so we have separate input_ids for each text_type
+            # Stitch the per-text-type encodings together column-wise.
+            if encoded_dataset:
+                encoded_dataset = datasets.concatenate_datasets(
+                    [encoded_dataset, encoded_text_type], axis=1)
+            else:
+                encoded_dataset = encoded_text_type
+        # copy scalar features and label from original dataset to the encoded dataset
+        for scalar_feat in self.scalar_features:
+            encoded_dataset = encoded_dataset.add_column(
+                name=scalar_feat, column=dataset[scalar_feat])
+        if self.label_col:
+            encoded_dataset = encoded_dataset.add_column(
+                name=self.label_col, column=dataset[self.label_col])
+        if self.keep_video_ids_for_predictions:
+            for id in ['regret_id', "recommendation_id"]:
+                encoded_dataset = encoded_dataset.add_column(
+                    name=id, column=dataset[id])
+        encoded_dataset.set_format(
+            type='torch', columns=encoded_dataset.column_names)
+        return encoded_dataset
+    def _encode_streaming(self, dataset):
+        """Attach on-the-fly tokenization to a streaming dataset and return it in torch format.
+
+        Every streamed column is removed except the scalar features, the label
+        column (if any) and, optionally, the video id columns.
+        """
+        encoded_dataset = dataset.map(self._encode_on_the_fly, batched=True,
+                                      batch_size=self.processing_batch_size, remove_columns=list(set(self._stream_dataset_column_names)-set(self.scalar_features + (
+                                          [self.label_col] if self.label_col else []) + (['regret_id', "recommendation_id"] if self.keep_video_ids_for_predictions else []))))  # IterableDataset doesn't have column_names attribute as normal Dataset
+        encoded_dataset = encoded_dataset.with_format("torch")
+        return encoded_dataset
+    def _encode_on_the_fly(self, batch):
+        """Tokenize the text columns of ``batch`` in place and convert scalars and labels to tensors.
+
+        The raw ``regret_*`` / ``recommendation_*`` text columns are replaced by
+        tokenizer outputs prefixed with the text type. In streaming mode every
+        produced tensor additionally has its leading dimension squeezed.
+        """
+        for text_type in self.text_types:
+            encoded_text_type = dict(self.tokenizer(
+                batch[f'regret_{text_type}'], batch[f'recommendation_{text_type}'], padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt"))
+            # Iterate over a copy because keys are popped and re-inserted under a new name.
+            for encoded_key in encoded_text_type.copy():
+                encoded_text_type[f"{text_type}_{encoded_key}"] = encoded_text_type.pop(encoded_key) if not self.streaming_dataset else encoded_text_type.pop(
+                    encoded_key).squeeze(0)  # e.g. input_ids -> title_input_ids so we have separate input_ids for each text_type
+            del batch[f'regret_{text_type}']
+            del batch[f'recommendation_{text_type}']
+            batch.update(encoded_text_type)
+        for scalar_feat in self.scalar_features:
+            batch[scalar_feat] = torch.as_tensor(
+                batch[scalar_feat]) if not self.streaming_dataset else torch.as_tensor(batch[scalar_feat]).squeeze(0)
+        if self.label_col:
+            batch[self.label_col] = torch.as_tensor(
+                batch[self.label_col]) if not self.streaming_dataset else torch.as_tensor(batch[self.label_col]).squeeze(0)
+        return batch
+class RRUM(pl.LightningModule):
+    def __init__(self, text_types, scalar_features, label_col, cross_encoder_model_name_or_path, optimizer_config=None, freeze_policy=None, pos_weight=None):
+        """Build one cross-encoder per text type plus a linear head over their logits and the scalar features.
+
+        Args:
+            text_types: names of the text inputs; one cross-encoder is created for each.
+            scalar_features: names of scalar batch entries appended to the cross-encoder logits.
+            label_col: name of the label entry in training/validation batches.
+            cross_encoder_model_name_or_path: checkpoint loaded for every cross-encoder.
+            optimizer_config: optional callable taking the model and returning the
+                optimizer configuration (used by ``configure_optimizers``).
+            freeze_policy: optional callable taking a parameter name; parameters
+                for which it returns a truthy value are frozen.
+            pos_weight: optional positive-class weight for the BCE-with-logits loss.
+        """
+        super().__init__()
+        self.save_hyperparameters()
+        self.text_types = text_types
+        self.scalar_features = scalar_features
+        self.label_col = label_col
+        self.optimizer_config = optimizer_config
+        self.cross_encoder_model_name_or_path = cross_encoder_model_name_or_path
+        self.cross_encoders = nn.ModuleDict({})
+        for t in self.text_types:
+            self.cross_encoders[t] = AutoModelForSequenceClassification.from_pretrained(
+                self.cross_encoder_model_name_or_path)
+        if freeze_policy is not None:
+            for xe in self.cross_encoders.values():
+                for name, param in xe.named_parameters():
+                    if freeze_policy(name):
+                        param.requires_grad = False
+        # Run a dummy input through the first cross-encoder to find out how many logits it outputs.
+        cross_encoder_out_features = list(self.cross_encoders.values())[0](
+            torch.randint(1, 2, (1, 2))).logits.size(dim=1)
+        # Linear head: all cross-encoder logits plus the scalar features -> a single logit.
+        self.lin1 = nn.Linear(len(self.cross_encoders) * cross_encoder_out_features +
+                              len(self.scalar_features), 1)
+        self.ac_metric = torchmetrics.Accuracy()
+        self.pr_metric = torchmetrics.Precision()
+        self.re_metric = torchmetrics.Recall()
+        self.auc_metric = torchmetrics.AUROC()
+        if pos_weight:
+            self.loss = nn.BCEWithLogitsLoss(
+                pos_weight=torch.Tensor([pos_weight]))
+        else:
+            self.loss = nn.BCEWithLogitsLoss()
+    def forward(self, x):
+        """Compute the similarity logit for a batch.
+
+        Args:
+            x: batch mapping with, for every text type, tokenizer outputs
+                stored under ``<text_type>_<name>`` keys, plus one entry per
+                scalar feature.
+
+        Returns:
+            Output of the linear head applied to the concatenated cross-encoder
+            logits and scalar features.
+        """
+        cross_logits = []
+        for text_type in self.text_types:
+            prefix = f'{text_type}_'
+            # e.g. title_input_ids -> input_ids since we have separate input_ids for each text_type.
+            # Match on the prefix only: the old substring test also picked up any
+            # key merely containing the text type and then failed with IndexError.
+            inputs = {key[len(prefix):]: x[key]
+                      for key in x if key.startswith(prefix)}
+            cross_logits.append(self.cross_encoders[text_type](**inputs).logits)
+        # Each scalar feature becomes one extra column next to the logits.
+        scalar_columns = [x[scalar][:, None]
+                          for scalar in self.scalar_features]
+        features = torch.cat(cross_logits + scalar_columns, 1)
+        return self.lin1(features)
+    def configure_optimizers(self):
+        """Return the optimizer setup: the custom one if configured, else AdamW with linear warmup.
+
+        The default schedule warms up over the first 5% of the trainer's
+        estimated stepping batches and is stepped after every batch.
+        """
+        if self.optimizer_config:
+            return self.optimizer_config(self)
+        optimizer = torch.optim.AdamW(self.parameters(), lr=5e-5)
+        warmup_steps = int(self.trainer.estimated_stepping_batches * 0.05)
+        total_steps = self.trainer.estimated_stepping_batches
+        linear_schedule = get_linear_schedule_with_warmup(
+            optimizer,
+            num_warmup_steps=warmup_steps,
+            num_training_steps=total_steps,
+        )
+        scheduler_config = {'scheduler': linear_schedule,
+                            'interval': 'step', 'frequency': 1}
+        return [optimizer], [scheduler_config]
+    def training_step(self, train_batch, batch_idx):
+        """Compute, log (as ``train_loss``) and return the loss for one training batch."""
+        targets = train_batch[self.label_col].unsqueeze(1).float()
+        predictions = self(train_batch)
+        batch_loss = self.loss(predictions, targets)
+        self.log('train_loss', batch_loss)
+        return batch_loss
+    def validation_step(self, val_batch, batch_idx):
+        """Update and log the validation metrics and loss for one validation batch."""
+        targets = val_batch[self.label_col].unsqueeze(1).float()
+        logits = self(val_batch)
+        loss = self.loss(logits, targets)
+        int_targets = targets.int()
+        named_metrics = (
+            ('validation_accuracy', self.ac_metric),
+            ('validation_precision', self.pr_metric),
+            ('validation_recall', self.re_metric),
+            ('validation_auc', self.auc_metric),
+        )
+        # Update every metric first, then log them, in the same order as before.
+        for _, metric in named_metrics:
+            metric(logits, int_targets)
+        for log_name, metric in named_metrics:
+            self.log(log_name, metric)
+        self.log('val_loss', loss, prog_bar=True)
+    def validation_epoch_end(self, outputs):
+        """Log the validation metrics under their ``*_ep`` names at the end of the epoch."""
+        epoch_metrics = (
+            ('validation_accuracy_ep', self.ac_metric),
+            ('validation_precision_ep', self.pr_metric),
+            ('validation_recall_ep', self.re_metric),
+            ('validation_auc_ep', self.auc_metric),
+        )
+        for log_name, metric in epoch_metrics:
+            self.log(log_name, metric)