verbit
/

hebrew_punctuation

Transformers

PyTorch

Hebrew

Inference Endpoints

Model card Files Files and versions Community

nirraviv89 committed on Sep 30, 2024

Commit

fbc3442

1 Parent(s): 6f8733e

rename and documentation

Browse files

Files changed (3) hide show

src/config.py +9 -11
src/inference.py +26 -40
src/models.py +3 -3

src/config.py CHANGED Viewed

@@ -1,9 +1,9 @@
 from transformers import BertConfig
-class CustomBertConfig(BertConfig):
     r"""
-    This is the configuration class to store the configuration of a [`BertForPunctuation`]. It is based on BERT config
      to the specified arguments, defining the model architecture.
      Args:
         backward_context (`int`, *optional*, defaults to 15):
@@ -20,7 +20,7 @@ class CustomBertConfig(BertConfig):
     >>> from transformers import BertConfig, BertModel
     >>> # Initializing a BERT google-bert/bert-base-uncased style configuration
-    >>> configuration = CustomBertConfig()
     >>> # Initializing a model (with random weights) from the google-bert/bert-base-uncased style configuration
     >>> model = BertForPunctuation(configuration)
@@ -29,15 +29,13 @@ class CustomBertConfig(BertConfig):
     >>> configuration = model.config
     ```"""
-    model_type = "custom_bert"
     def __init__(
-            self,
-            backward_context=15,
-            forward_context=16,
-            output_size=4,
-            dropout=0.3,
-            **kwargs,
     ):
         super().__init__(**kwargs)
         self.backward_context = backward_context

 from transformers import BertConfig
+class PunctuationBertConfig(BertConfig):
     r"""
+    This is the configuration class to store the configuration of a [`BertForPunctuation`]. It is based on BERT config
      to the specified arguments, defining the model architecture.
      Args:
         backward_context (`int`, *optional*, defaults to 15):
     >>> from transformers import BertConfig, BertModel
     >>> # Initializing a BERT google-bert/bert-base-uncased style configuration
+    >>> configuration = PunctuationBertConfig()
     >>> # Initializing a model (with random weights) from the google-bert/bert-base-uncased style configuration
     >>> model = BertForPunctuation(configuration)
     >>> configuration = model.config
     ```"""
     def __init__(
+        self,
+        backward_context=15,
+        forward_context=16,
+        output_size=4,
+        dropout=0.3,
+        **kwargs,
     ):
         super().__init__(**kwargs)
         self.backward_context = backward_context

src/inference.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from typing import Dict, List, Optional, Tuple
 import numpy as np
 import torch
@@ -8,10 +8,12 @@ from transformers import BertTokenizer
 from src.models import BertForPunctuation
 PUNCTUATION_SIGNS = ['', ',', '.', '?']
 def tokenize_text(
-        word_list: List[str], pause_list: List[float], tokenizer: BertTokenizer
 ) -> Tuple[List[int], List[int], List[float]]:
     """
     Tokenizes text and generates pause list for each word
@@ -47,11 +49,10 @@ def tokenize_text(
 def gen_model_inputs(
-        x: List[int],
-        pause: List[float],
-        forward_context: int,
-        backward_context: int,
-        pause_tokens: Optional[Dict[Tuple, int]] = None,
 ) -> torch.Tensor:
     """
     Generates inputs for model out of list of indexed words.
@@ -60,23 +61,17 @@ def gen_model_inputs(
         x: list of indexed words
         pause: list of corresponding pauses
         forward_context: size of the forward context window
-        backward_context: size of the backward context window (without the pivot token)`
-        pause_tokens: dictionary of pause ranges and corresponding tokens from bert tokenizer
     Returns:
         A tensor of model inputs for each indexed word in x
     """
-    if pause_tokens is None:
-        pause_tokens = {(-1000, 1000): 0}
     model_input = []
-    tokenized_pause = []
     x_pad = [0] * backward_context + x + [0] * forward_context
-    for i, p in enumerate(pause):
-        tokenized_pause.append(next(value for key, value in pause_tokens.items() if key[0] < p <= key[1]))
     for i in range(len(x)):
-        segment = x_pad[i:i + backward_context + forward_context + 1]
         segment.insert(backward_context + 1, tokenized_pause[i])
         model_input.append(segment)
     return torch.tensor(model_input)
@@ -109,16 +104,15 @@ def add_punctuation_to_text(text: str, punct_prob: np.ndarray) -> str:
 def get_prediction(
-        model: BertForPunctuation,
-        text: str,
-        tokenizer: BertTokenizer,
-        batch_size: int = 16,
-        backward_context: int = 15,
-        forward_context: int = 16,
-        pause_list: Optional[List[float]] = None,
-        device: str = 'cpu',
-        return_prob: bool = False,
-):
     """
     Generates predictions for given list of words.
     Args:
@@ -130,18 +124,15 @@ def get_prediction(
         forward_context: size of the forward context window
         pause_list: list of pauses after each word in seconds
         device: device to run model on
-        return_prob: if True returns probabilities, if False returns text with punctuation
     Returns:
-        matrix of probabilities for each punctuation class or text with punctuation
     """
     word_list = text.split()
     if not pause_list:
         # make default pauses if pauses are not provided
         pause_list = [0.0] * len(word_list)
-    # prepare text
-    # we need original word idx since after tokenize number of tokens might not be equal to number of words
     word_idx, x, pause = tokenize_text(word_list=word_list, pause_list=pause_list, tokenizer=tokenizer)
     model_inputs = gen_model_inputs(x, pause, forward_context, backward_context)
@@ -151,35 +142,30 @@ def get_prediction(
     output = []
     with torch.no_grad():
         for ndx in range(0, inputs_length, batch_size):
-            o = model(model_inputs[ndx: min(ndx + batch_size, inputs_length)])
             o = F.softmax(o, dim=1)
             output.append(o.cpu().data.numpy())
     punct_probabilities_matrix = np.concatenate(output, axis=0)
-    if return_prob:
-        return punct_probabilities_matrix
     punct_text = add_punctuation_to_text(text, punct_probabilities_matrix)
     return punct_text
 def main():
-    model = BertForPunctuation.from_pretrained("verbit/hebrew_punctuation")
-    tokenizer = BertTokenizer.from_pretrained("verbit/hebrew_punctuation")
     model.eval()
-    text = ("חברת ורביט פיתחה מערכת לתמלול המבוססת על בינה מלאכותית וגורם אנושי ושוקדת על תמלול עדויות ניצולי שואה את "
-            "התוצאות אפשר לראות כבר ברשת בהן חלקים מעדותו של טוביה ביילסקי שהיה מפקד גדוד הפרטיזנים היהודים "
-            "בביילורוסיה")
     punct_text = get_prediction(
         model=model,
         text=text,
         tokenizer=tokenizer,
         backward_context=model.config.backward_context,
         forward_context=model.config.forward_context,
-        return_prob=False
     )
     print(punct_text)

+from typing import List, Optional, Tuple
 import numpy as np
 import torch
 from src.models import BertForPunctuation
 PUNCTUATION_SIGNS = ['', ',', '.', '?']
+PAUSE_TOKEN = 0
+MODEL_NAME = "verbit/hebrew_punctuation"
 def tokenize_text(
+    word_list: List[str], pause_list: List[float], tokenizer: BertTokenizer
 ) -> Tuple[List[int], List[int], List[float]]:
     """
     Tokenizes text and generates pause list for each word
 def gen_model_inputs(
+    x: List[int],
+    pause: List[float],
+    forward_context: int,
+    backward_context: int,
 ) -> torch.Tensor:
     """
     Generates inputs for model out of list of indexed words.
         x: list of indexed words
         pause: list of corresponding pauses
         forward_context: size of the forward context window
+        backward_context: size of the backward context window (without the predicted token)
     Returns:
         A tensor of model inputs for each indexed word in x
     """
     model_input = []
+    tokenized_pause = [PAUSE_TOKEN] * len(pause)
     x_pad = [0] * backward_context + x + [0] * forward_context
     for i in range(len(x)):
+        segment = x_pad[i : i + backward_context + forward_context + 1]
         segment.insert(backward_context + 1, tokenized_pause[i])
         model_input.append(segment)
     return torch.tensor(model_input)
 def get_prediction(
+    model: BertForPunctuation,
+    text: str,
+    tokenizer: BertTokenizer,
+    batch_size: int = 16,
+    backward_context: int = 15,
+    forward_context: int = 16,
+    pause_list: Optional[List[float]] = None,
+    device: str = 'cpu',
+) -> str:
     """
     Generates predictions for given list of words.
     Args:
         forward_context: size of the forward context window
         pause_list: list of pauses after each word in seconds
         device: device to run model on
     Returns:
+        text with punctuation
     """
     word_list = text.split()
     if not pause_list:
         # make default pauses if pauses are not provided
         pause_list = [0.0] * len(word_list)
     word_idx, x, pause = tokenize_text(word_list=word_list, pause_list=pause_list, tokenizer=tokenizer)
     model_inputs = gen_model_inputs(x, pause, forward_context, backward_context)
     output = []
     with torch.no_grad():
         for ndx in range(0, inputs_length, batch_size):
+            o = model(model_inputs[ndx : min(ndx + batch_size, inputs_length)])
             o = F.softmax(o, dim=1)
             output.append(o.cpu().data.numpy())
     punct_probabilities_matrix = np.concatenate(output, axis=0)
     punct_text = add_punctuation_to_text(text, punct_probabilities_matrix)
     return punct_text
 def main():
+    model = BertForPunctuation.from_pretrained(MODEL_NAME)
+    tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
     model.eval()
+    text = """חברת ורביט פיתחה מערכת לתמלול המבוססת על בינה מלאכותית וגורם אנושי ושוקדת על תמלול עדויות ניצולי שואה
+    את התוצאות אפשר לראות כבר ברשת בהן חלקים מעדותו של טוביה ביילסקי שהיה מפקד גדוד הפרטיזנים היהודים בביילורוסיה"""
     punct_text = get_prediction(
         model=model,
         text=text,
         tokenizer=tokenizer,
         backward_context=model.config.backward_context,
         forward_context=model.config.forward_context,
     )
     print(punct_text)

src/models.py CHANGED Viewed

@@ -1,15 +1,15 @@
 from torch import nn
 from transformers import BertForMaskedLM, PreTrainedModel
-from src.config import CustomBertConfig
 class BertForPunctuation(PreTrainedModel):
-    config_class = CustomBertConfig
     def __init__(self, config):
         super().__init__(config)
-        # backward_context + forward_context + pivot token + pause token
         segment_size = config.backward_context + config.forward_context + 2
         bert_vocab_size = config.vocab_size
         self.bert = BertForMaskedLM(config)

 from torch import nn
 from transformers import BertForMaskedLM, PreTrainedModel
+from src.config import PunctuationBertConfig
 class BertForPunctuation(PreTrainedModel):
+    config_class = PunctuationBertConfig
     def __init__(self, config):
         super().__init__(config)
+        # segment_size equals backward_context + forward_context + predicted token + pause token
         segment_size = config.backward_context + config.forward_context + 2
         bert_vocab_size = config.vocab_size
         self.bert = BertForMaskedLM(config)