File size: 3,897 Bytes
7971a7a
 
0112deb
7971a7a
 
bfd4fcf
7971a7a
 
 
 
bfd4fcf
 
 
7971a7a
 
 
 
 
 
 
 
 
 
bfd4fcf
7971a7a
bfd4fcf
 
7971a7a
bfd4fcf
 
 
 
 
 
 
 
 
 
 
7971a7a
bfd4fcf
 
 
7971a7a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bfd4fcf
 
 
 
 
 
 
7971a7a
 
 
 
 
 
 
 
 
867c45f
bfd4fcf
 
867c45f
 
bfd4fcf
867c45f
7971a7a
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import os
import urllib.request
import shutil
import logging
import hashlib
import time

from PIL import Image
import imagehash
from moviepy.editor import VideoFileClip
from moviepy.video.fx.all import crop
import numpy as np
from pytube import YouTube

from config import FPS, VIDEO_DIRECTORY


def filepath_from_url(url):
    """Return filepath based on a md5 hash of a url."""
    return os.path.join(VIDEO_DIRECTORY, hashlib.md5(url.encode()).hexdigest())

def download_video_from_url(url):
    """Download video from url or return md5 hash as video name"""
    start = time.time()
    filepath = filepath_from_url(url)

    # Check if it exists already
    if not os.path.exists(filepath):
        # For YouTube links
        if url.startswith('https://www.youtube.com') or url.startswith('youtube.com') or url.startswith('http://www.youtube.com'):
            file_dir = '/'.join(x for x in filepath.split('/')[:-1])
            filename = filepath.split('/')[-1]
            logging.info(f"file_dir = {file_dir}")
            logging.info(f"filename = {filename}")
            YouTube(url).streams.get_highest_resolution().download(file_dir, skip_existing = False, filename = filename)
            logging.info(f"Downloaded YouTube video from {url} to {filepath} in {time.time() - start:.1f} seconds.")
            return filepath

        # Works for basically all links, except youtube 
        with (urllib.request.urlopen(url)) as f, open(filepath, 'wb') as fileout:
            logging.info(f"Starting copyfileobj on {f}")
            shutil.copyfileobj(f, fileout, length=16*1024*1024)
        logging.info(f"Downloaded video from {url} to {filepath} in {time.time() - start:.1f} seconds.")
    else:
        logging.info(f"Skipping downloading from {url} because {filepath} already exists.")
    return filepath

def change_ffmpeg_fps(clip, fps=FPS):
    # Hacking the ffmpeg call based on 
    # https://github.com/Zulko/moviepy/blob/master/moviepy/video/io/ffmpeg_reader.py#L126
    import subprocess as sp

    cmd = [arg + ",fps=%d" % fps if arg.startswith("scale=") else arg for arg in clip.reader.proc.args]
    clip.reader.close()
    clip.reader.proc = sp.Popen(cmd, bufsize=clip.reader.bufsize, 
                                stdout=sp.PIPE, stderr=sp.PIPE, stdin=sp.DEVNULL)
    clip.fps = clip.reader.fps = fps
    clip.reader.lastread = clip.reader.read_frame()
    return clip

def crop_video(clip, crop_percentage=0.75, w=224, h=224):
    # Original width and height- which combined with crop_percentage determines the size of the new video
    ow, oh = clip.size 

    logging.info(f"Cropping and resizing video to ({w}, {h})")
    return crop(clip, x_center=ow/2, y_center=oh/2, width=int(ow*crop_percentage), height=int(crop_percentage*oh)).resize((w,h))

def compute_hash(frame, hash_size=16):
    image = Image.fromarray(np.array(frame))
    return imagehash.phash(image, hash_size)

def binary_array_to_uint8s(arr):
    bit_string = ''.join(str(1 * x) for l in arr for x in l)
    return [int(bit_string[i:i+8], 2) for i in range(0, len(bit_string), 8)]

def compute_hashes(url: str, fps=FPS):
    try:
        filepath = download_video_from_url(url)
        clip = crop_video(VideoFileClip(filepath))
    except IOError:
        logging.warn(f"Falling back to direct streaming from {url} because the downloaded video failed.")
        clip = crop_video(VideoFileClip(url))
        
    for index, frame in enumerate(change_ffmpeg_fps(clip, fps).iter_frames()):
        # Each frame is a triplet of size (height, width, 3) of the video since it is RGB
        # The hash itself is of size (hash_size, hash_size)
        # The uint8 version of the hash is of size (hash_size * highfreq_factor,) and represents the hash
        hashed = np.array(binary_array_to_uint8s(compute_hash(frame).hash), dtype='uint8')
        yield {"frame": 1+index*fps, "hash": hashed}