Spaces:

mebubo
/

gpted

Sleeping

mebubo committed on Oct 8, 2024

Commit

e72ea09

1 Parent(s): 230a441

Snapshot

Files changed (1) hide show

text_processing.py CHANGED Viewed

@@ -6,13 +6,14 @@ class Word:
     tokens: list[int]
     text: str
     logprob: float
-    first_token_index: int
 def split_into_words(token_probs: list[tuple[int, float]], tokenizer: Tokenizer) -> list[Word]:
     words: list[Word] = []
     current_word: list[int] = []
     current_log_probs: list[float] = []
     current_word_first_token_index: int = 0
     for i, (token_id, logprob) in enumerate(token_probs):
         token: str = tokenizer.decode([token_id])
@@ -21,12 +22,18 @@ def split_into_words(token_probs: list[tuple[int, float]], tokenizer: Tokenizer)
             current_log_probs.append(logprob)
         else:
             if current_word:
-                words.append(Word(current_word, tokenizer.decode(current_word), sum(current_log_probs), current_word_first_token_index))
             current_word = [token_id]
             current_log_probs = [logprob]
             current_word_first_token_index = i
     if current_word:
-        words.append(Word(current_word, tokenizer.decode(current_word), sum(current_log_probs), current_word_first_token_index))
     return words

     tokens: list[int]
     text: str
     logprob: float
+    context: list[int]
 def split_into_words(token_probs: list[tuple[int, float]], tokenizer: Tokenizer) -> list[Word]:
     words: list[Word] = []
     current_word: list[int] = []
     current_log_probs: list[float] = []
     current_word_first_token_index: int = 0
+    all_tokens: list[int] = [token_id for token_id, _ in token_probs]
     for i, (token_id, logprob) in enumerate(token_probs):
         token: str = tokenizer.decode([token_id])
             current_log_probs.append(logprob)
         else:
             if current_word:
+                words.append(Word(current_word,
+                                  tokenizer.decode(current_word),
+                                  sum(current_log_probs),
+                                  all_tokens[:current_word_first_token_index]))
             current_word = [token_id]
             current_log_probs = [logprob]
             current_word_first_token_index = i
     if current_word:
+        words.append(Word(current_word,
+                          tokenizer.decode(current_word),
+                          sum(current_log_probs),
+                          all_tokens[:current_word_first_token_index]))
     return words