ncoop57 committed on
Commit
021b099
1 Parent(s): f400687

add initial code

Files changed (3)
  1. app.py +87 -0
  2. clip.py +80 -0
  3. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,87 @@
+ import ffmpeg
+ import numpy as np
+ import requests  # only needed for the commented-out sanity check below
+ import torch
+ import youtube_dl
+
+ from PIL import Image
+ from sentence_transformers import SentenceTransformer, util
+ from clip import CLIPModel  # local wrapper; replaces sentence_transformers.models.CLIPModel
+
+ # Wrap the CLIP module in a SentenceTransformer so that model.encode()
+ # accepts both text strings and PIL images.
+ clip = CLIPModel()
+ model = SentenceTransformer(modules=[clip]).to(dtype=torch.float32, device=torch.device('cpu'))
+
+
+ def get_embedding(query, video):
+     # Encode the text query.
+     text_emb = model.encode(query, device='cpu')
+
+     # Encode the sampled video frames as PIL images.
+     images = [Image.fromarray(frame) for frame in video]
+     img_embs = model.encode(images, device='cpu')
+
+     return text_emb, img_embs
+
+
+ # Standalone sanity check (commented out): encode a sample COCO image and a few
+ # captions, then compare them with cosine similarity.
+ # url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ # img = Image.open(requests.get(url, stream=True).raw).convert('RGB')
+ # img_emb = model.encode([img, img], device='cpu')
+ # text_emb = model.encode(['Two dogs in the snow', 'Two cats laying on a sofa',
+ #                          'A picture of London at night'], device='cpu')
+ # cos_scores = util.cos_sim(img_emb, text_emb)
+ # print(cos_scores)
+
+
+ def my_hook(d):
+     # youtube_dl progress hook: runs once the download has finished.
+     if d['status'] == 'finished':
+         print(d)
+         print('Done downloading, now extracting frames ...')
+
+         # Read the frame size from the container metadata.
+         probe = ffmpeg.probe(d["filename"])
+         video_stream = next((stream for stream in probe['streams'] if stream['codec_type'] == 'video'), None)
+         width = int(video_stream['width'])
+         height = int(video_stream['height'])
+
+         # Decode the video to raw RGB frames and keep every 10th frame.
+         out, _ = (
+             ffmpeg
+             .input(d["filename"])
+             .output('pipe:', format='rawvideo', pix_fmt='rgb24')
+             .run(capture_stdout=True)
+         )
+         video = (
+             np
+             .frombuffer(out, np.uint8)
+             .reshape([-1, height, width, 3])
+         )[::10]
+
+         print(video.shape)
+         txt_embd, img_embds = get_embedding("two white puppies", video)
+         cos_scores = util.cos_sim(txt_embd, img_embds)
+         print(cos_scores)
+
+
+ ydl_opts = {"format": "mp4", "progress_hooks": [my_hook]}
+ with youtube_dl.YoutubeDL(ydl_opts) as ydl:
+     ydl.download(['https://youtu.be/I3AaW9ZevIU'])
clip.py ADDED
@@ -0,0 +1,80 @@
+ from torch import nn
+ import transformers
+ import torch
+ from PIL import Image
+
+
+ class CLIPModel(nn.Module):
+     """Wraps Hugging Face CLIP as a sentence-transformers module that embeds
+     both PIL images and text strings into the same vector space."""
+
+     def __init__(self, model_name: str = "openai/clip-vit-base-patch32", processor_name=None):
+         super(CLIPModel, self).__init__()
+
+         if processor_name is None:
+             processor_name = model_name
+
+         self.model = transformers.CLIPModel.from_pretrained(model_name)
+         self.processor = transformers.CLIPProcessor.from_pretrained(processor_name)
+
+     def __repr__(self):
+         return "CLIPModel()"
+
+     def forward(self, features):
+         image_embeds = []
+         text_embeds = []
+
+         if 'pixel_values' in features:
+             # Pooled vision output projected into the shared embedding space.
+             vision_outputs = self.model.vision_model(pixel_values=features['pixel_values'])
+             image_embeds = self.model.visual_projection(vision_outputs[1])
+
+         if 'input_ids' in features:
+             # Pooled text output projected into the shared embedding space.
+             text_outputs = self.model.text_model(
+                 input_ids=features.get('input_ids'),
+                 attention_mask=features.get('attention_mask', None),
+                 position_ids=features.get('position_ids', None),
+                 output_attentions=features.get('output_attentions', None),
+                 output_hidden_states=features.get('output_hidden_states', None),
+             )
+             text_embeds = self.model.text_projection(text_outputs[1])
+
+         # Re-interleave image and text embeddings in the original input order.
+         sentence_embedding = []
+         image_features = iter(image_embeds)
+         text_features = iter(text_embeds)
+
+         for idx, input_type in enumerate(features['image_text_info']):
+             if input_type == 0:
+                 sentence_embedding.append(next(image_features))
+             else:
+                 sentence_embedding.append(next(text_features))
+
+         features['sentence_embedding'] = torch.stack(sentence_embedding).float()
+
+         return features
+
+     def tokenize(self, texts):
+         # Split the mixed input into images and texts, recording the order
+         # (0 = image, 1 = text) so forward() can restore it.
+         images = []
+         texts_values = []
+         image_text_info = []
+
+         for idx, data in enumerate(texts):
+             if isinstance(data, Image.Image):  # an image
+                 images.append(data)
+                 image_text_info.append(0)
+             else:  # a text string
+                 texts_values.append(data)
+                 image_text_info.append(1)
+
+         if len(texts_values) == 0:
+             texts_values = None
+         if len(images) == 0:
+             images = None
+
+         inputs = self.processor(text=texts_values, images=images, return_tensors="pt", padding=True)
+         inputs['image_text_info'] = image_text_info
+         return inputs
+
+     def save(self, output_path: str):
+         self.model.save_pretrained(output_path)
+         self.processor.save_pretrained(output_path)
+
+     @staticmethod
+     def load(input_path: str):
+         return CLIPModel(model_name=input_path)
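clip.py follows the sentence-transformers custom-module pattern: tokenize() routes PIL images to the CLIP processor and strings to the tokenizer, and forward() projects both through CLIP and re-interleaves the embeddings in the original input order under 'sentence_embedding'. A minimal usage sketch, assuming the pinned sentence-transformers fork from requirements.txt (which lets encode() accept images and strings together); 'frame.jpg' is a placeholder path:

from PIL import Image
from sentence_transformers import SentenceTransformer, util
from clip import CLIPModel

model = SentenceTransformer(modules=[CLIPModel()])
img_emb = model.encode([Image.open('frame.jpg')])                 # image -> 512-dim CLIP embedding
txt_emb = model.encode(['two white puppies', 'a city at night'])  # texts -> same embedding space
print(util.cos_sim(img_emb, txt_emb))                             # higher score = closer match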
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ ffmpeg-python
+ numpy
+ pillow
+ torch
+ git+https://github.com/ncoop57/sentence-transformers@clip-image-check
+ youtube_dl
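Note: clip.py also imports transformers, which arrives as a dependency of the pinned sentence-transformers fork rather than being listed here. ffmpeg-python only wraps the ffmpeg command line, so the ffmpeg binary itself must be installed separately for the frame extraction in app.py to work.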