File size: 4,240 Bytes
f3772cc
 
 
 
25bd6d3
f3772cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25bd6d3
 
 
 
f3772cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import html
import itertools
import random

import gradio as gr
import pandas as pd
import requests
from pytube import YouTube
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter


def is_youtube_video_available(url):
    """Report whether pytube can resolve a title for the video at *url*.

    Args:
        url: URL of the YouTube video to probe.

    Returns:
        True if building the pytube ``YouTube`` object and reading its
        ``title`` both succeed; False if either step raises.
    """
    try:
        # The constructor is inside the try as well: if building the object
        # fails (e.g. for a URL pytube rejects), the video is simply reported
        # as unavailable instead of the error escaping to the caller.
        _ = YouTube(url).title
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate.
        return False
    return True


def get_example_videos(rr_examples_url, num_rr_examples, request_timeout=10):
    """Build the example video pairs: a curated set and a remotely fetched set.

    Args:
        rr_examples_url: URL returning a JSON list of records, each with the
            keys ``rejected_video_id`` and ``recommendation_id``.
        num_rr_examples: Maximum number of remote pairs to return; when more
            are available a random sample of this size is drawn.
        request_timeout: Seconds to wait for ``rr_examples_url`` before giving
            up, so an unresponsive server cannot block the caller forever.

    Returns:
        A tuple ``(example_videos, example_videos_rr)``. Each element is a
        list of ``[video_url, video_url]`` pairs in which both videos passed
        ``is_youtube_video_available``. ``example_videos_rr`` is empty when
        the remote list cannot be fetched or parsed.
    """
    curated_pairs = [
        ['https://www.youtube.com/watch?v=WfVF-Ec4naQ',
         'https://www.youtube.com/watch?v=4hrNt28t7Cw'],
        ['https://www.youtube.com/watch?v=GbpjLP-UvIU',
         'https://www.youtube.com/watch?v=BlQ2mP2EE4A'],
        ['https://www.youtube.com/watch?v=fdzY1f2P91k',
         'https://www.youtube.com/watch?v=BlQ2mP2EE4A'],
        ['https://www.youtube.com/watch?v=fdzY1f2P91k',
         'https://www.youtube.com/watch?v=9gIVGJQ3xWE'],
    ]
    example_videos = [
        pair for pair in curated_pairs
        if all(is_youtube_video_available(url) for url in pair)
    ]

    try:
        rr_records = requests.get(
            rr_examples_url, timeout=request_timeout).json()
        # A set of tuples removes duplicate pairs (the remote list is known
        # to contain at least one).
        unique_pairs = {
            (f'https://www.youtube.com/watch?v={rec["rejected_video_id"]}',
             f'https://www.youtube.com/watch?v={rec["recommendation_id"]}')
            for rec in rr_records
        }
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Best effort: network failure, invalid JSON, or a payload that is
        # not a list of records with the expected keys all mean "no remote
        # examples" rather than a crash.
        unique_pairs = set()

    # Sort for a deterministic order before the availability check/sampling.
    example_videos_rr = [
        list(pair) for pair in sorted(unique_pairs)
        if all(is_youtube_video_available(url) for url in pair)
    ]
    if len(example_videos_rr) > num_rr_examples:
        example_videos_rr = random.sample(example_videos_rr, num_rr_examples)

    return example_videos, example_videos_rr


def get_youtube_embedded_html(embed_url, video_position):
    """Return the HTML fragment that labels and embeds one YouTube player.

    Args:
        embed_url: URL placed in the iframe's ``src`` attribute.
        video_position: Value shown after "Video" in the caption paragraph.

    Returns:
        A string holding a ``<p>`` caption followed by an ``<iframe>``.
    """
    caption = f'<p>Video {video_position}</p>'
    player = (
        f'<iframe width="100%" height="360px" src="{embed_url}" '
        'title="YouTube video player" frameborder="0" '
        'allow="accelerometer; autoplay; clipboard-write; encrypted-media; '
        'gyroscope; picture-in-picture; fullscreen" '
        'allowfullscreen></iframe>'
    )
    # Keep the exact surrounding whitespace of the fragment layout.
    return f'\n        {caption}\n        {player}\n    '


def update_youtube_embedded_html(video_url, video_position):
    """Return embed HTML for *video_url*, or an error paragraph on failure.

    Args:
        video_url: URL of the YouTube video to embed.
        video_position: Position label forwarded to
            ``get_youtube_embedded_html``.

    Returns:
        The HTML produced by ``get_youtube_embedded_html`` for the video's
        embed URL, or a ``<p>`` error message naming the URL when pytube
        cannot derive an embed URL from it.
    """
    try:
        embed_url = YouTube(video_url).embed_url
    except Exception:
        # The URL is echoed back inside HTML, so escape it: whatever the
        # caller passed must be displayed as text, not interpreted as markup.
        safe_url = html.escape(str(video_url))
        return f'''
            <p>There was error in fetching details for video with the URL: {safe_url}</p>
        '''
    return get_youtube_embedded_html(embed_url, video_position)


def get_youtube_video_data(url):
    """Fetch channel id, title, description and transcript text for a video.

    Args:
        url: URL of the YouTube video.

    Returns:
        A 4-tuple ``(channel_id, title, description, transcript)``.
        ``transcript`` is the transcript text with newlines replaced by
        spaces, or ``None`` when no transcript could be listed or fetched.

    Raises:
        gr.Error: If the pytube video object cannot be created or its
            metadata cannot be read.
    """
    try:
        video = YouTube(url)
        # The attribute reads stay inside the try so that a failure while
        # resolving the metadata is reported as the same gr.Error as a
        # failure to construct the object, not as a raw library exception.
        channel_id = video.channel_id
        video_title = video.title
        video_description = video.description
        video_id = video.video_id
    except Exception as err:
        raise gr.Error(
            f'Could not find YouTube video with the URL {url}') from err

    # Languages tried first, in this order; any other available language is
    # appended afterwards as a fallback.
    preferred_langs = ['en', 'en-US', 'es', 'de']
    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        fallback_langs = [
            tr.language_code for tr in transcript_list
            if tr.language_code not in preferred_langs
        ]
        raw_transcript = YouTubeTranscriptApi.get_transcript(
            video_id, languages=preferred_langs + fallback_langs)
    except Exception:
        # Transcript is optional: listing or fetching failures both fall
        # back to "no transcript" instead of aborting the whole lookup.
        return channel_id, video_title, video_description, None

    video_transcript = TextFormatter().format_transcript(
        raw_transcript).replace('\n', ' ')
    return channel_id, video_title, video_description, video_transcript


def get_input_data_df(video1_url, video2_url):
    """Assemble a one-row DataFrame describing a pair of videos.

    Args:
        video1_url: URL of the first video; fills the ``regret_*`` columns.
        video2_url: URL of the second video; fills the ``recommendation_*``
            columns.

    Returns:
        A ``pandas.DataFrame`` with a single row holding title, description
        and transcript of each video, plus ``channel_sim`` (1 when both
        videos report the same channel id, otherwise 0).
    """
    regret_channel, *regret_fields = get_youtube_video_data(video1_url)
    recommendation_channel, *recommendation_fields = get_youtube_video_data(
        video2_url)

    same_channel = int(regret_channel == recommendation_channel)

    column_names = [
        'regret_title',
        'regret_description',
        'regret_transcript',
        'recommendation_title',
        'recommendation_description',
        'recommendation_transcript',
        'channel_sim',
    ]
    row = [*regret_fields, *recommendation_fields, same_channel]
    return pd.DataFrame([row], columns=column_names)