dodijk committed on
Commit
0112deb
1 Parent(s): b9cd4c4

Prepare for multiple indices

Browse files
Files changed (4) hide show
  1. apb2022.json +42 -0
  2. app.py +10 -7
  3. videohash.py +2 -1
  4. videomatch.py +19 -26
apb2022.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "url": "https://debatgemist.tweedekamer.nl/debatten/algemene-politieke-beschouwingen-1e-termijn-kamer-0",
4
+ "mp4": "https://amcpwetkms-euwe.streaming.media.azure.net/0010e470-ce9d-4424-9462-8e8e10efc5af/download-20220921082013Z.mp4"
5
+ },
6
+ {
7
+ "url": "https://debatgemist.tweedekamer.nl/debatten/mededelingen-1690",
8
+ "mp4": "https://amcpwetkms-euwe.streaming.media.azure.net/0010e470-ce9d-4424-9462-8e8e10efc5af/download-20220921133023Z.mp4"
9
+ },
10
+ {
11
+ "url": "https://debatgemist.tweedekamer.nl/debatten/regeling-van-werkzaamheden-1227",
12
+ "mp4": "https://amcpwetkms-euwe.streaming.media.azure.net/0010e470-ce9d-4424-9462-8e8e10efc5af/download-20220921133038Z.mp4"
13
+ },
14
+ {
15
+ "url": "https://debatgemist.tweedekamer.nl/debatten/algemene-politieke-beschouwingen-voortzetting-1e-termijn-kamer",
16
+ "mp4": "https://amcpwetkms-euwe.streaming.media.azure.net/a702d77f-49c5-4a69-a6f7-28e29ae4d1fe/20220921_pz_155828-010143_v2.mp4"
17
+ },
18
+ {
19
+ "url": "https://debatgemist.tweedekamer.nl/debatten/mededelingen-1691",
20
+ "mp4": "https://amcpwetkms-euwe.streaming.media.azure.net/74c2e164-212d-4eda-a3ef-2dc406eea19b/download-20220922081620Z.mp4"
21
+ },
22
+ {
23
+ "url": "https://debatgemist.tweedekamer.nl/debatten/algemene-politieke-beschouwingen-antwoord-1e-termijn-rest-deel-1",
24
+ "mp4": "https://amcpwetkms-euwe.streaming.media.azure.net/74c2e164-212d-4eda-a3ef-2dc406eea19b/download-20220922081653Z.mp4"
25
+ },
26
+ {
27
+ "url": "https://debatgemist.tweedekamer.nl/debatten/mededelingen-1692",
28
+ "mp4": "https://amcpwetkms-euwe.streaming.media.azure.net/74c2e164-212d-4eda-a3ef-2dc406eea19b/download-20220922125211Z.mp4"
29
+ },
30
+ {
31
+ "url": "https://debatgemist.tweedekamer.nl/debatten/regeling-van-werkzaamheden-1228",
32
+ "mp4": "https://amcpwetkms-euwe.streaming.media.azure.net/74c2e164-212d-4eda-a3ef-2dc406eea19b/download-20220922125220Z.mp4"
33
+ },
34
+ {
35
+ "url": "https://debatgemist.tweedekamer.nl/debatten/algemene-politieke-beschouwingen-antwoord-1e-termijn-rest-deel-2",
36
+ "mp4": "https://amcpwetkms-euwe.streaming.media.azure.net/74c2e164-212d-4eda-a3ef-2dc406eea19b/download-20220922125338Z.mp4"
37
+ },
38
+ {
39
+ "url": "https://debatgemist.tweedekamer.nl/debatten/stemmingen-852",
40
+ "mp4": "https://amcpwetkms-euwe.streaming.media.azure.net/74c2e164-212d-4eda-a3ef-2dc406eea19b/download-20220922213719Z.mp4"
41
+ }
42
+ ]
app.py CHANGED
@@ -4,7 +4,7 @@ import gradio as gr
4
 
5
  from config import *
6
  from videomatch import index_hashes_for_video, get_decent_distance, \
7
- get_video_indices, compare_videos, get_change_points, get_videomatch_df
8
  from plot import plot_comparison, plot_multi_comparison
9
 
10
  logging.basicConfig()
@@ -13,8 +13,9 @@ logging.getLogger().setLevel(logging.INFO)
13
 
14
  def get_comparison(url, target, MIN_DISTANCE = 4):
15
  """ Function for Gradio to combine all helper functions"""
16
- video_index, hash_vectors, target_indices = get_video_indices(url, target, MIN_DISTANCE = MIN_DISTANCE)
17
- lims, D, I, hash_vectors = compare_videos(hash_vectors, target_indices, MIN_DISTANCE = MIN_DISTANCE)
 
18
  fig = plot_comparison(lims, D, I, hash_vectors, MIN_DISTANCE = MIN_DISTANCE)
19
  return fig
20
 
@@ -24,8 +25,9 @@ def get_auto_comparison(url, target, smoothing_window_size=10, method="CUSUM"):
24
  if distance == None:
25
  return None
26
  raise gr.Error("No matches found!")
27
- video_index, hash_vectors, target_indices = get_video_indices(url, target, MIN_DISTANCE = distance)
28
- lims, D, I, hash_vectors = compare_videos(hash_vectors, target_indices, MIN_DISTANCE = distance)
 
29
  # fig = plot_comparison(lims, D, I, hash_vectors, MIN_DISTANCE = distance)
30
  df = get_videomatch_df(url, target, min_distance=MIN_DISTANCE, vanilla_df=False)
31
  change_points = get_change_points(df, smoothing_window_size=smoothing_window_size, method=method)
@@ -38,8 +40,9 @@ def get_auto_edit_decision(url, target, smoothing_window_size=10):
38
  if distance == None:
39
  return None
40
  raise gr.Error("No matches found!")
41
- video_index, hash_vectors, target_indices = get_video_indices(url, target, MIN_DISTANCE = distance)
42
- lims, D, I, hash_vectors = compare_videos(hash_vectors, target_indices, MIN_DISTANCE = distance)
 
43
 
44
  df = get_videomatch_df(url, target, min_distance=MIN_DISTANCE, vanilla_df=False)
45
  change_points = get_change_points(df, smoothing_window_size=smoothing_window_size, method="ROBUST")
 
4
 
5
  from config import *
6
  from videomatch import index_hashes_for_video, get_decent_distance, \
7
+ get_video_index, compare_videos, get_change_points, get_videomatch_df
8
  from plot import plot_comparison, plot_multi_comparison
9
 
10
  logging.basicConfig()
 
13
 
14
  def get_comparison(url, target, MIN_DISTANCE = 4):
15
  """ Function for Gradio to combine all helper functions"""
16
+ video_index, hash_vectors = get_video_index(url)
17
+ target_index, _ = get_video_index(target)
18
+ lims, D, I, hash_vectors = compare_videos(hash_vectors, target_index, MIN_DISTANCE = MIN_DISTANCE)
19
  fig = plot_comparison(lims, D, I, hash_vectors, MIN_DISTANCE = MIN_DISTANCE)
20
  return fig
21
 
 
25
  if distance == None:
26
  return None
27
  raise gr.Error("No matches found!")
28
+ video_index, hash_vectors = get_video_index(url)
29
+ target_index, _ = get_video_index(target)
30
+ lims, D, I, hash_vectors = compare_videos(hash_vectors, target_index, MIN_DISTANCE = distance)
31
  # fig = plot_comparison(lims, D, I, hash_vectors, MIN_DISTANCE = distance)
32
  df = get_videomatch_df(url, target, min_distance=MIN_DISTANCE, vanilla_df=False)
33
  change_points = get_change_points(df, smoothing_window_size=smoothing_window_size, method=method)
 
40
  if distance == None:
41
  return None
42
  raise gr.Error("No matches found!")
43
+ video_index, hash_vectors = get_video_index(url)
44
+ target_index, _ = get_video_index(target)
45
+ lims, D, I, hash_vectors = compare_videos(hash_vectors, target_index, MIN_DISTANCE = distance)
46
 
47
  df = get_videomatch_df(url, target, min_distance=MIN_DISTANCE, vanilla_df=False)
48
  change_points = get_change_points(df, smoothing_window_size=smoothing_window_size, method="ROBUST")
videohash.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
  import urllib.request
 
3
  import logging
4
  import hashlib
5
 
@@ -20,7 +21,7 @@ def download_video_from_url(url):
20
  filepath = filepath_from_url(url)
21
  if not os.path.exists(filepath):
22
  with (urllib.request.urlopen(url)) as f, open(filepath, 'wb') as fileout:
23
- fileout.write(f.read())
24
  logging.info(f"Downloaded video from {url} to {filepath}.")
25
  else:
26
  logging.info(f"Skipping downloading from {url} because {filepath} already exists.")
 
1
  import os
2
  import urllib.request
3
+ import shutil
4
  import logging
5
  import hashlib
6
 
 
21
  filepath = filepath_from_url(url)
22
  if not os.path.exists(filepath):
23
  with (urllib.request.urlopen(url)) as f, open(filepath, 'wb') as fileout:
24
+ shutil.copyfileobj(f, fileout, length=16*1024)
25
  logging.info(f"Downloaded video from {url} to {filepath}.")
26
  else:
27
  logging.info(f"Skipping downloading from {url} because {filepath} already exists.")
videomatch.py CHANGED
@@ -38,45 +38,37 @@ def index_hashes_for_video(url: str) -> faiss.IndexBinaryIVF:
38
  logging.info(f"Indexed hashes for {index.ntotal} frames to {filepath}.index.")
39
  return index
40
 
41
- def get_video_indices(filepath: str, target: str, MIN_DISTANCE: int = 4):
42
- """" The comparison between the target and the original video will be plotted based
43
- on the matches between the target and the original video over time. The matches are determined
44
- based on the minimum distance between hashes (as computed by faiss-vectors) before they're considered a match.
45
-
46
  args:
47
- - url: url of the source video (short video which you want to be checked)
48
- - target: url of the target video (longer video which is a superset of the source video)
49
- - MIN_DISTANCE: integer representing the minimum distance between hashes on bit-level before its considered a match
50
  """
51
- # TODO: Fix crash if no matches are found
52
-
53
  # Url (short video)
54
- video_index = index_hashes_for_video(filepath)
55
  video_index.make_direct_map() # Make sure the index is indexable
56
  hash_vectors = np.array([video_index.reconstruct(i) for i in range(video_index.ntotal)]) # Retrieve original indices
57
 
58
- # Target video (long video)
59
- target_indices = [index_hashes_for_video(x) for x in [target]]
60
 
61
- return video_index, hash_vectors, target_indices
62
-
63
- def compare_videos(hash_vectors, target_indices, MIN_DISTANCE = 3):
64
- """ Search for matches between the indices of the target video (long video)
65
- and the given hash vectors of a video"""
66
  # The results are returned as a triplet of 1D arrays
67
  # lims, D, I, where result for query i is in I[lims[i]:lims[i+1]]
68
  # (indices of neighbors), D[lims[i]:lims[i+1]] (distances).
69
- for index in target_indices:
70
- lims, D, I = index.range_search(hash_vectors, MIN_DISTANCE)
71
- return lims, D, I, hash_vectors
72
 
73
- def get_decent_distance(url, target, MIN_DISTANCE, MAX_DISTANCE):
74
 """ To get a decent heuristic for a base distance check every distance from MIN_DISTANCE to MAX_DISTANCE
75
  until the number of matches found is equal to or higher than the number of frames in the source video"""
76
  for distance in np.arange(start = MIN_DISTANCE - 2, stop = MAX_DISTANCE + 2, step = 2, dtype=int):
77
  distance = int(distance)
78
- video_index, hash_vectors, target_indices = get_video_indices(url, target, MIN_DISTANCE = distance)
79
- lims, D, I, hash_vectors = compare_videos(hash_vectors, target_indices, MIN_DISTANCE = distance)
 
80
  nr_source_frames = video_index.ntotal
81
  nr_matches = len(D)
82
  logging.info(f"{(nr_matches/nr_source_frames) * 100.0:.1f}% of frames have a match for distance '{distance}' ({nr_matches} matches for {nr_source_frames} frames)")
@@ -103,8 +95,9 @@ def get_change_points(df, smoothing_window_size=10, method='CUSUM'):
103
 
104
  def get_videomatch_df(url, target, min_distance=MIN_DISTANCE, vanilla_df=False):
105
  distance = get_decent_distance(url, target, MIN_DISTANCE, MAX_DISTANCE)
106
- video_index, hash_vectors, target_indices = get_video_indices(url, target, MIN_DISTANCE = distance)
107
- lims, D, I, hash_vectors = compare_videos(hash_vectors, target_indices, MIN_DISTANCE = distance)
 
108
 
109
  target = [(lims[i+1]-lims[i]) * [i] for i in range(hash_vectors.shape[0])]
110
  target_s = [i/FPS for j in target for i in j]
 
38
  logging.info(f"Indexed hashes for {index.ntotal} frames to {filepath}.index.")
39
  return index
40
 
41
+ def get_video_index(url: str):
42
+ """ Builds up a FAISS index for a video.
 
 
 
43
  args:
44
+ - url: location of the source video
 
 
45
  """
 
 
46
  # Url (short video)
47
+ video_index = index_hashes_for_video(url)
48
  video_index.make_direct_map() # Make sure the index is indexable
49
  hash_vectors = np.array([video_index.reconstruct(i) for i in range(video_index.ntotal)]) # Retrieve original indices
50
 
51
+ return video_index, hash_vectors
 
52
 
53
+ def compare_videos(hash_vectors, target_index, MIN_DISTANCE = 3):
54
+ """ The comparison between the target and the original video will be plotted based
55
+ on the matches between the target and the original video over time. The matches are determined
56
+ based on the minimum distance between hashes (as computed by faiss-vectors) before they're considered a match.
57
+ """
58
  # The results are returned as a triplet of 1D arrays
59
  # lims, D, I, where result for query i is in I[lims[i]:lims[i+1]]
60
  # (indices of neighbors), D[lims[i]:lims[i+1]] (distances).
61
+ lims, D, I = target_index.range_search(hash_vectors, MIN_DISTANCE)
62
+ return lims, D, I, hash_vectors
 
63
 
64
+ def get_decent_distance(filepath, target, MIN_DISTANCE, MAX_DISTANCE):
65
 """ To get a decent heuristic for a base distance check every distance from MIN_DISTANCE to MAX_DISTANCE
66
  until the number of matches found is equal to or higher than the number of frames in the source video"""
67
  for distance in np.arange(start = MIN_DISTANCE - 2, stop = MAX_DISTANCE + 2, step = 2, dtype=int):
68
  distance = int(distance)
69
+ video_index, hash_vectors = get_video_index(filepath)
70
+ target_index, _ = get_video_index(target)
71
+ lims, D, I, hash_vectors = compare_videos(hash_vectors, target_index, MIN_DISTANCE = distance)
72
  nr_source_frames = video_index.ntotal
73
  nr_matches = len(D)
74
  logging.info(f"{(nr_matches/nr_source_frames) * 100.0:.1f}% of frames have a match for distance '{distance}' ({nr_matches} matches for {nr_source_frames} frames)")
 
95
 
96
  def get_videomatch_df(url, target, min_distance=MIN_DISTANCE, vanilla_df=False):
97
  distance = get_decent_distance(url, target, MIN_DISTANCE, MAX_DISTANCE)
98
+ _, hash_vectors = get_video_index(url)
99
+ target_index, _ = get_video_index(target)
100
+ lims, D, I, hash_vectors = compare_videos(hash_vectors, target_index, MIN_DISTANCE = distance)
101
 
102
  target = [(lims[i+1]-lims[i]) * [i] for i in range(hash_vectors.shape[0])]
103
  target_s = [i/FPS for j in target for i in j]