kz209 committed on
Commit
309f86b
1 Parent(s): f276c92
Files changed (1)
  1. utils/model.py +37 -30
utils/model.py CHANGED
@@ -60,35 +60,42 @@ class Model(torch.nn.Module):
         input_ids = self.tokenizer(content_list, return_tensors="pt", padding=True, truncation=True).input_ids.to(self.model.device)

         if streaming:
-            # Process each input separately
-            for single_input_ids in input_ids:
-                # Set up the initial generation parameters
-                gen_kwargs = {
-                    "input_ids": single_input_ids.unsqueeze(0),
-                    "max_new_tokens": max_length,
-                    "do_sample": True,
-                    "temperature": temp,
-                    "eos_token_id": self.tokenizer.eos_token_id,
-                }
-
-                # Generate and yield tokens one by one
-                unfinished_sequences = single_input_ids.unsqueeze(0)
-                while unfinished_sequences.shape[1] < gen_kwargs["max_new_tokens"]:
-                    with torch.no_grad():
-                        output = self.model.generate(**gen_kwargs, max_new_tokens=1, return_dict_in_generate=True, output_scores=True)
-
-                    next_token_logits = output.scores[0][0]
-                    next_token = torch.argmax(next_token_logits, dim=-1).unsqueeze(0)
-                    unfinished_sequences = torch.cat([unfinished_sequences, next_token], dim=-1)
-
-                    # Yield the newly generated token
-                    yield self.tokenizer.decode(next_token[0], skip_special_tokens=True)
-
-                    if next_token.item() == self.tokenizer.eos_token_id:
-                        break
-
-                    # Update input_ids for the next iteration
-                    gen_kwargs["input_ids"] = unfinished_sequences
+            # Set up the initial generation parameters
+            gen_kwargs = {
+                "input_ids": input_ids,
+                "do_sample": True,
+                "temperature": temp,
+                "eos_token_id": self.tokenizer.eos_token_id,
+                "max_new_tokens": 1,  # Generate one token at a time
+                "return_dict_in_generate": True,
+                "output_scores": True
+            }
+
+            # Generate and yield tokens one by one
+            generated_tokens = 0
+            batch_size = input_ids.shape[0]
+            active_sequences = torch.arange(batch_size)
+
+            while generated_tokens < max_length and len(active_sequences) > 0:
+                with torch.no_grad():
+                    output = self.model.generate(**gen_kwargs)
+
+                next_tokens = output.sequences[:, -1].unsqueeze(-1)
+
+                # Yield the newly generated tokens for each sequence in the batch
+                for i, token in zip(active_sequences, next_tokens):
+                    yield i, self.tokenizer.decode(token[0], skip_special_tokens=True)
+
+                # Update input_ids for the next iteration
+                gen_kwargs["input_ids"] = torch.cat([gen_kwargs["input_ids"], next_tokens], dim=-1)
+                generated_tokens += 1
+
+                # Check for completed sequences
+                completed = (next_tokens.squeeze(-1) == self.tokenizer.eos_token_id).nonzero().squeeze(-1)
+                active_sequences = torch.tensor([i for i in active_sequences if i not in completed])
+                if len(active_sequences) > 0:
+                    gen_kwargs["input_ids"] = gen_kwargs["input_ids"][active_sequences]
+
         else:
             # Non-streaming generation (unchanged)
             outputs = self.model.generate(
@@ -98,4 +105,4 @@ class Model(torch.nn.Module):
                 temperature=temp,
                 eos_token_id=self.tokenizer.eos_token_id,
             )
-        return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
+            return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
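
Note on consuming the new streaming path: the streaming branch now yields (sequence_index, token_text) pairs for the whole batch instead of one prompt's tokens at a time, so callers have to regroup tokens by index. Below is a minimal consumption sketch, not part of this commit: the import path, the Model() construction, and the method name gen are assumptions (the commit does not show the method's name or full signature); only the names content_list, streaming, temp, and max_length appear in the diff above.

from collections import defaultdict

from utils.model import Model  # assumed import path matching utils/model.py

model = Model()  # hypothetical construction; __init__ is not shown in this commit
prompts = ["Summarize the first report.", "Summarize the second report."]

pieces = defaultdict(list)
# The generator yields (sequence_index, token_text); indices come from torch.arange(batch_size)
for idx, token_text in model.gen(prompts, streaming=True, temp=0.7, max_length=128):
    pieces[int(idx)].append(token_text)
    print(f"[seq {int(idx)}] {token_text}", flush=True)

# Naive reassembly; exact spacing may differ from batch_decode depending on the tokenizer
full_texts = {i: "".join(tokens) for i, tokens in pieces.items()}

Yielding (index, token) pairs keeps a single batched generation loop on the model side, at the cost of pushing per-prompt reassembly onto the caller.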