Spaces:

mebubo
/

gpted

Sleeping

App Files Files Community

mebubo committed on Oct 8, 2024

Commit

426b33e

1 Parent(s): 98f1760

Snapshot

Browse files

Files changed (2) hide show

app.py +38 -24
text_processing.py +3 -3

app.py CHANGED Viewed

@@ -3,31 +3,36 @@ import time
 from tqdm import tqdm
 from text_processing import split_into_words, Word
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedModel, PreTrainedTokenizer
-from pprint import pprint
-def load_model_and_tokenizer(model_name):
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForCausalLM.from_pretrained(model_name)
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     model.to(device)
-    return model, tokenizer, device
-def process_input_text(input_text, tokenizer, device):
-    """Process input text to obtain input IDs and attention mask."""
-    inputs = tokenizer(input_text, return_tensors="pt").to(device)
-    input_ids = inputs["input_ids"]
-    attention_mask = inputs["attention_mask"]
-    return inputs, input_ids, attention_mask
-def calculate_log_probabilities(model, tokenizer, inputs, input_ids, attention_mask):
     with torch.no_grad():
         outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
-    logits = outputs.logits[0, :-1, :]
-    log_probs = torch.log_softmax(logits, dim=-1)
-    token_log_probs = log_probs[range(log_probs.shape[0]), input_ids[0][1:]]
-    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
-    return list(zip(tokens[1:], token_log_probs.tolist()))
 def generate_replacements(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix_tokens: list[int], device: torch.device, num_samples: int = 5) -> list[str]:
@@ -53,15 +58,24 @@ def generate_replacements(model: PreTrainedModel, tokenizer: PreTrainedTokenizer
     return new_words
 #%%
 model_name = "mistralai/Mistral-7B-v0.1"
-model, tokenizer, device = load_model_and_tokenizer(model_name)
 input_text = "He asked me to prostrate myself before the king, but I rifused."
-inputs, input_ids, attention_mask = process_input_text(input_text, tokenizer, device)
-result = calculate_log_probabilities(model, tokenizer, inputs, input_ids, attention_mask)
-words = split_into_words([token for token, _ in result], [logprob for _, logprob in result])
 log_prob_threshold = -5.0
 low_prob_words = [word for word in words if word.logprob < log_prob_threshold]
@@ -72,7 +86,7 @@ start_time = time.time()
 for word in tqdm(low_prob_words, desc="Processing words"):
     iteration_start_time = time.time()
     prefix_index = word.first_token_index
-    prefix_tokens = tokenizer.convert_tokens_to_ids([token for token, _ in result][:prefix_index + 1])
     replacements = generate_replacements(model, tokenizer, prefix_tokens, device)
     print(f"Original word: {word.text}, Log Probability: {word.logprob:.4f}")
     print(f"Proposed replacements: {replacements}")

 from tqdm import tqdm
 from text_processing import split_into_words, Word
 import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast, BatchEncoding
+from tokenizers import Encoding
+from typing import cast
+type Tokenizer = PreTrainedTokenizer | PreTrainedTokenizerFast
+def load_model_and_tokenizer(model_name: str, device: torch.device) -> tuple[PreTrainedModel, Tokenizer]:
+    tokenizer: Tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model: PreTrainedModel = AutoModelForCausalLM.from_pretrained(model_name)
     model.to(device)
+    return model, tokenizer
+def tokenize(input_text: str, tokenizer: Tokenizer, device: torch.device) -> tuple[torch.Tensor, torch.Tensor]:
+    inputs: BatchEncoding = tokenizer(input_text, return_tensors="pt").to(device)
+    input_ids = cast(torch.Tensor, inputs["input_ids"])
+    attention_mask = cast(torch.Tensor, inputs["attention_mask"])
+    return input_ids, attention_mask
+def calculate_log_probabilities(model: PreTrainedModel, tokenizer: Tokenizer, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> list[tuple[str, float]]:
     with torch.no_grad():
         outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
+    # B x (T - 1) x V
+    logits: torch.Tensor = outputs.logits[:, :-1, :]
+    # B x (T - 1) x V
+    log_probs: torch.Tensor = torch.log_softmax(logits, dim=-1)
+    # T - 1
+    token_log_probs: torch.Tensor = log_probs[0, range(log_probs.shape[1]), input_ids[0][1:]]
+    # T - 1
+    tokens: list[str] = tokenizer.convert_ids_to_tokens(input_ids[0])[1:]
+    return list(zip(tokens, token_log_probs.tolist()))
 def generate_replacements(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix_tokens: list[int], device: torch.device, num_samples: int = 5) -> list[str]:
     return new_words
 #%%
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model_name = "mistralai/Mistral-7B-v0.1"
+model, tokenizer = load_model_and_tokenizer(model_name, device)
+#%%
 input_text = "He asked me to prostrate myself before the king, but I rifused."
+input_ids, attention_mask = tokenize(input_text, tokenizer, device)
+#%%
+token_probs: list[tuple[str, float]] = calculate_log_probabilities(model, tokenizer, input_ids, attention_mask)
+#%%
+words = split_into_words(token_probs)
 log_prob_threshold = -5.0
 low_prob_words = [word for word in words if word.logprob < log_prob_threshold]
 for word in tqdm(low_prob_words, desc="Processing words"):
     iteration_start_time = time.time()
     prefix_index = word.first_token_index
+    prefix_tokens = tokenizer.convert_tokens_to_ids([token for token, _ in token_probs][:prefix_index + 1])
     replacements = generate_replacements(model, tokenizer, prefix_tokens, device)
     print(f"Original word: {word.text}, Log Probability: {word.logprob:.4f}")
     print(f"Proposed replacements: {replacements}")

text_processing.py CHANGED Viewed

@@ -2,18 +2,18 @@ from dataclasses import dataclass
 @dataclass
 class Word:
-    tokens: list[int]
     text: str
     logprob: float
     first_token_index: int
-def split_into_words(tokens, log_probs) -> list[Word]:
     words = []
     current_word = []
     current_log_probs = []
     current_word_first_token_index = 0
-    for i, (token, logprob) in enumerate(zip(tokens, log_probs)):
         if not token.startswith(chr(9601)) and token.isalpha():
             current_word.append(token)
             current_log_probs.append(logprob)

 @dataclass
 class Word:
+    tokens: list[str]
     text: str
     logprob: float
     first_token_index: int
+def split_into_words(token_probs: list[tuple[str, float]]) -> list[Word]:
     words = []
     current_word = []
     current_log_probs = []
     current_word_first_token_index = 0
+    for i, (token, logprob) in enumerate(token_probs):
         if not token.startswith(chr(9601)) and token.isalpha():
             current_word.append(token)
             current_log_probs.append(logprob)