Spaces:
Runtime error
Runtime error
fix batching v2
Browse files
utils.py
CHANGED
@@ -211,6 +211,17 @@ def tokenize(
|
|
211 |
)
|
212 |
|
213 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
214 |
@torch.inference_mode()
|
215 |
def batch_embed(
|
216 |
ds: datasets.IterableDataset,
|
@@ -308,18 +319,20 @@ def batch_embed(
|
|
308 |
ds,
|
309 |
batch_size=inference_bs,
|
310 |
shuffle=False,
|
311 |
-
num_workers=
|
312 |
pin_memory=True,
|
313 |
drop_last=False,
|
314 |
):
|
315 |
-
|
316 |
-
|
|
|
|
|
317 |
t_ids = torch.zeros_like(ids)
|
318 |
|
319 |
outputs = model(input_ids=ids, attention_mask=mask, token_type_ids=t_ids)
|
320 |
|
321 |
embeds.extend(mean_pooling(outputs[0], mask).cpu().tolist())
|
322 |
-
texts.extend(
|
323 |
|
324 |
current_count += ids.shape[0]
|
325 |
|
|
|
211 |
)
|
212 |
|
213 |
|
214 |
+
def collate_fn(examples, tokenizer=None, padding=None, device=None):
    """Collate a list of example dicts into a single batch dict.

    Transposes a list-of-dicts into a dict-of-lists, then converts the
    model-input fields ("input_ids", "attention_mask") into long tensors,
    optionally placed on *device*. All other fields (e.g. raw text) are
    returned as plain Python lists.

    Args:
        examples: list of dicts that all share the same keys.
        tokenizer: unused; kept for interface compatibility with callers.
        padding: unused; kept for interface compatibility with callers.
        device: target device for the tensor fields, or None for CPU.

    Returns:
        dict mapping each key to a ``torch.LongTensor`` (tokenized fields)
        or a list (everything else). An empty input yields an empty dict.

    NOTE(review): assumes the tokenized sequences are already padded to a
    uniform length, otherwise ``torch.tensor`` on ragged lists fails —
    confirm against the tokenize step upstream.
    """
    # Guard: examples[0] below would raise IndexError on an empty batch.
    if not examples:
        return {}

    # Transpose list-of-dicts -> dict-of-lists.
    batch = {k: [] for k in examples[0]}
    for example in examples:
        for k, v in example.items():
            batch[k].append(v)

    # Only the model inputs become tensors; text/metadata stay as lists.
    tensor_keys = {"attention_mask", "input_ids"}
    return {
        k: torch.tensor(v, dtype=torch.long, device=device) if k in tensor_keys else v
        for k, v in batch.items()
    }
|
224 |
+
|
225 |
@torch.inference_mode()
|
226 |
def batch_embed(
|
227 |
ds: datasets.IterableDataset,
|
|
|
319 |
ds,
|
320 |
batch_size=inference_bs,
|
321 |
shuffle=False,
|
322 |
+
num_workers=1,
|
323 |
pin_memory=True,
|
324 |
drop_last=False,
|
325 |
):
|
326 |
+
batch = collate_fn(batch, device=device)
|
327 |
+
ids = batch["input_ids"]
|
328 |
+
mask = batch["attention_mask"]
|
329 |
+
|
330 |
t_ids = torch.zeros_like(ids)
|
331 |
|
332 |
outputs = model(input_ids=ids, attention_mask=mask, token_type_ids=t_ids)
|
333 |
|
334 |
embeds.extend(mean_pooling(outputs[0], mask).cpu().tolist())
|
335 |
+
texts.extend([b[column_name] for b in batch])
|
336 |
|
337 |
current_count += ids.shape[0]
|
338 |
|