rxavier committed on
Commit
f00d508
1 Parent(s): fd9138f

Update off_topic.py

Browse files
Files changed (1) hide show
  1. off_topic.py +7 -1
off_topic.py CHANGED
@@ -9,6 +9,7 @@ import matplotlib.pyplot as plt
9
  import numpy as np
10
  import torch
11
  import PIL
 
12
  from transformers import CLIPModel, CLIPProcessor
13
  from PIL import Image
14
 
@@ -95,7 +96,12 @@ class OffTopicDetector:
95
  domain = re.sub("_", " ", response["domain_id"].split("-")[-1]).lower()
96
  img_urls = [x["url"] for x in response["pictures"]]
97
  images = self.get_images(img_urls)
98
- return images, domain
 
 
 
 
 
99
 
100
  def get_images(self, urls: List[str]):
101
  start = time.time()
 
9
  import numpy as np
10
  import torch
11
  import PIL
12
+ import imagehash
13
  from transformers import CLIPModel, CLIPProcessor
14
  from PIL import Image
15
 
 
96
  domain = re.sub("_", " ", response["domain_id"].split("-")[-1]).lower()
97
  img_urls = [x["url"] for x in response["pictures"]]
98
  images = self.get_images(img_urls)
99
+ hashes = {}
100
+ for img in images:
101
+ hashes.update({str(imagehash.average_hash(img)): img})
102
+ dedup_hashes = list(dict.fromkeys(hashes))
103
+ dedup_images = [img for hash, img in hashes.items() if hash in dedup_hashes]
104
+ return dedup_images, domain
105
 
106
  def get_images(self, urls: List[str]):
107
  start = time.time()