Upload get_embeds.py
Browse files- get_embeds.py +32 -0
get_embeds.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import Owlv2TextModel, Owlv2Processor, AutoTokenizer
|
2 |
+
import json
|
3 |
+
import torch
|
4 |
+
from torch import nn
|
5 |
+
import tqdm
|
6 |
+
|
7 |
+
# Compute OWLv2 text embeddings for every class name in id_to_str.json.
#
# Each id -> name entry has its underscores replaced by spaces, is run through
# the OWLv2 text tower in batches, and the pooled embedding is stored under the
# original id.  The resulting {id: 1-D tensor} mapping is saved to embeds.pt
# (same payload shape that ParameterDict.state_dict() previously produced).

# Plain dict of {id: embedding tensor}.  Nothing here is trained, so an
# nn.ParameterDict (which chokes on non-leaf activation slices) is unnecessary.
embed_dict = {}
bsz = 8  # names embedded per forward pass

with open("id_to_str.json") as f:
    # assumes {id: "class_name_with_underscores"} -- TODO confirm against producer
    data = json.load(f)

keys = list(data.keys())
# Ceil-divide so the final partial batch is processed; the previous
# len(keys) // bsz silently dropped the last len(keys) % bsz entries.
num_batches = (len(keys) + bsz - 1) // bsz
bar = tqdm.tqdm(range(num_batches))

proc = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
tokenizer = AutoTokenizer.from_pretrained("google/owlv2-base-patch16-ensemble")
model = Owlv2TextModel.from_pretrained("google/owlv2-base-patch16-ensemble")
model.eval()  # inference only: disable dropout etc.

for i in bar:
    batch_keys = keys[i * bsz:(i + 1) * bsz]
    batch = [data[key].replace("_", " ") for key in batch_keys]

    # OWLv2 text inputs are truncated to 16 tokens; surface any name that
    # would be cut short (the original decoded it but discarded the result).
    tokenized = tokenizer(batch)
    for k in range(len(batch)):  # len(batch), not bsz: last batch may be short
        ids = tokenized["input_ids"][k]
        if len(ids) > 16:
            print(f"warning: name exceeds 16 tokens: {tokenizer.decode(ids)}")

    inputs = proc(text=batch, return_tensors="pt")
    with torch.no_grad():  # feature extraction: no autograd graph needed
        output = model(**inputs)
    for k, key in enumerate(batch_keys):
        # clone so the saved slice does not keep the whole batch tensor alive
        embed_dict[key] = output.pooler_output[k, :].clone()

torch.save(embed_dict, "embeds.pt")
|