Spaces:

ak5005
/

derrobot

Sleeping

App Files Files Community

Aidan Phillips committed on Apr 4

Commit

dc76b04

1 Parent(s): b837a10

sussy math works with default sentence

Browse files

Files changed (3) hide show

categories/fluency.py +74 -50
requirements.txt +2 -1
scorer.ipynb +33 -20

categories/fluency.py CHANGED Viewed

@@ -3,6 +3,7 @@ from transformers import AutoTokenizer, AutoModelForMaskedLM
 import torch
 import numpy as np
 import spacy
 tool = language_tool_python.LanguageTool('en-US')
 model_name="distilbert-base-multilingual-cased"
@@ -12,7 +13,10 @@ model.eval()
 nlp = spacy.load("en_core_web_sm")
def pseudo_perplexity(text, max_len=128):
    """
    Score the fluency of ``text`` with a masked-LM pseudo-perplexity.

    Every token between [CLS] and [SEP] is masked in turn and the negative
    log-probability the model assigns to the true token is recorded. The
    exponential of the mean loss is passed to ``__fluency_score_from_ppl``.

    We want to return
    {
        "score": <fluency score derived from the pseudo-perplexity>,
        "errors": [
            {
                "start": <first index of the longest high-loss run>,
                "end": <index one past the last token of that run>,
                "message": <description including the peak loss of the run>
            }
        ]
    }

    ``start``/``end`` index into the per-token loss list, i.e. they are the
    token position minus one because [CLS] is skipped. ``errors`` is empty
    when no token's loss exceeds the threshold.

    Raises:
        ValueError: if the encoded text is longer than ``max_len`` tokens.
    """
    input_ids = tokenizer.encode(text, return_tensors="pt")[0]
    if len(input_ids) > max_len:
        raise ValueError(f"Input too long for model (>{max_len} tokens).")

    loss_values = []
    for i in range(1, len(input_ids) - 1):  # skip [CLS] and [SEP]
        masked_input = input_ids.clone()
        masked_input[i] = tokenizer.mask_token_id

        with torch.no_grad():
            outputs = model(masked_input.unsqueeze(0))
            logits = outputs.logits[0, i]
            probs = torch.softmax(logits, dim=-1)

        prob_true_token = probs[input_ids[i].item()].item()
        # The epsilon keeps log() finite when the model assigns ~0 probability.
        loss_values.append(-np.log(prob_true_token + 1e-12))

    # Find the longest run of consecutive tokens whose loss is over the
    # threshold; the first such run wins when several share the same length.
    threshold = 12
    best_start, best_end = 0, 0
    run_start = None
    for i, loss in enumerate(loss_values):
        if loss > threshold:
            if run_start is None:
                run_start = i
            if i + 1 - run_start > best_end - best_start:
                best_start, best_end = run_start, i + 1
        else:
            run_start = None

    errors = []
    if best_end > best_start:
        # Report the worst loss inside the run that is actually returned,
        # rather than whichever over-threshold loss happened to come last.
        peak_loss = max(loss_values[best_start:best_end])
        errors.append({
            "start": best_start,
            "end": best_end,
            "message": f"Perplexity above threshold: {peak_loss}"
        })

    ppl = np.exp(np.mean(loss_values))

    res = {
        "score": __fluency_score_from_ppl(ppl),
        "errors": errors
    }

    return res
-def __fluency_score_from_ppl(ppl, midpoint=20, steepness=0.3):
     """
     Use a logistic function to map perplexity to 0–100.
     Midpoint is the PPL where score is 50.
@@ -135,12 +159,12 @@ def grammar_errors(text) -> tuple[int, list[str]]:
     return res
def __grammar_score_from_prob(error_ratio, steepness=10):
    """
    Transform the number of errors divided by words into a score from 0 to 100.
    Steepness controls how quickly the score drops as errors increase.

    The logistic term equals 0.5 at an error ratio of zero, so it is scaled
    by 200 (not 100) to make an error-free text score 100 instead of 50.
    Negative ratios are treated as zero so the score never exceeds 100.
    """
    ratio = max(error_ratio, 0)
    score = 200 / (1 + np.exp(steepness * ratio))
    return round(score, 2)

 import torch
 import numpy as np
 import spacy
+import wordfreq
 tool = language_tool_python.LanguageTool('en-US')
 model_name="distilbert-base-multilingual-cased"
 nlp = spacy.load("en_core_web_sm")
def __get_word_pr_score(word, lang="en") -> float:
    """
    Return the negative natural log of the frequency ``wordfreq`` reports
    for ``word`` in language ``lang``.

    A small epsilon (1e-12) is added before taking the log so that a
    frequency of zero yields a large finite value instead of infinity.
    """
    return -np.log(wordfreq.word_frequency(word, lang) + 1e-12)
def pseudo_perplexity(text, threshold=20, max_len=128):
    """
    Score the fluency of ``text`` word by word with a masked language model.

    Tokens are grouped into words using the tokenizer's character offsets.
    Each word is masked as a whole, and the mean negative log-probability of
    its true tokens is reduced by 0.6 times the word's frequency score from
    ``__get_word_pr_score``. Words whose adjusted loss reaches ``threshold``
    are reported as errors.

    We want to return
    {
        "score": <__fluency_score_from_ppl applied to the mean per-token loss>,
        "errors": [
            {
                "start": <index of the word in the list of word groups>,
                "end": <same index as "start">,
                "message": <adjusted loss and the threshold it reached>
            }
        ]
    }

    NOTE: ``max_len`` is accepted for backward compatibility but is not
    read anywhere in this function.
    """
    encoding = tokenizer(text, return_tensors="pt", return_offsets_mapping=True)
    input_ids = encoding["input_ids"][0]
    # Plain Python ints make the offset comparisons and slicing below cheap.
    offset_mapping = encoding["offset_mapping"][0].tolist()
    special_ids = set(tokenizer.all_special_ids)

    # Group token indices by word: a gap between the previous token's end
    # offset and this token's start offset marks a word boundary.
    word_groups = []
    current_group = []
    prev_end = None

    for i, (start, end) in enumerate(offset_mapping):
        if input_ids[i].item() in special_ids:
            continue  # skip special tokens like [CLS] and [SEP]

        if prev_end is not None and start > prev_end:
            word_groups.append(current_group)
            current_group = [i]
        else:
            current_group.append(i)

        prev_end = end

    # Append final group
    if current_group:
        word_groups.append(current_group)

    loss_values = []  # one adjusted loss per word
    tok_loss = []     # one raw loss per token, used for the overall score

    for group in word_groups:
        masked = input_ids.clone()
        for i in group:
            masked[i] = tokenizer.mask_token_id

        with torch.no_grad():
            logits = model(masked.unsqueeze(0)).logits[0]

        group_losses = []
        for i in group:
            probs = torch.softmax(logits[i], dim=-1)
            prob = probs[input_ids[i].item()].item()
            # The epsilon keeps log() finite when the model assigns ~0 probability.
            group_losses.append(-np.log(prob + 1e-12))
        tok_loss.extend(group_losses)

        word_loss = np.sum(group_losses) / len(group_losses)

        # Look up the frequency of the whole surface word, not just its first
        # sub-token, so multi-piece words are not scored as a fragment.
        word = text[offset_mapping[group[0]][0]:offset_mapping[group[-1]][1]]
        word_loss -= 0.6 * __get_word_pr_score(word)
        loss_values.append(word_loss)

    errors = [
        {
            "start": i,
            "end": i,
            "message": f"Perplexity {l} over threshold {threshold}"
        }
        for i, l in enumerate(loss_values)
        if not l < threshold
    ]

    s_ppl = np.mean(tok_loss)

    res = {
        "score": __fluency_score_from_ppl(s_ppl),
        "errors": errors
    }

    return res
+def __fluency_score_from_ppl(ppl, midpoint=8, steepness=0.3):
     """
     Use a logistic function to map perplexity to 0–100.
     Midpoint is the PPL where score is 50.
     return res
def __grammar_score_from_prob(error_ratio):
    """
    Transform the number of errors divided by words into a score from 0 to 100.

    The score falls linearly from 100 (no errors) to 0 (one error per word).
    The ratio is clamped to [0, 1] so that a text with more reported errors
    than words cannot produce a negative score.
    """
    ratio = min(max(error_ratio, 0), 1)
    score = 100 * (1 - ratio)
    return round(score, 2)

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 language_tool_python
 transformers
-torch

 language_tool_python
 transformers
+torch
+wordfreq

scorer.ipynb CHANGED Viewed

@@ -4,16 +4,7 @@
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/opt/anaconda3/envs/teach-bs/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      "  from .autonotebook import tqdm as notebook_tqdm\n"
-     ]
-    }
-   ],
    "source": [
     "from categories.fluency import *"
    ]
@@ -27,7 +18,25 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Sentence: The car hit the cone.\n"
      ]
     }
    ],
@@ -40,7 +49,7 @@
     "print(\"Sentence:\", s)  # Print the input sentence\n",
     "\n",
     "err = grammar_errors(s)  # Call the function to execute the grammar error checking\n",
-    "flu = pseudo_perplexity(s)  # Call the function to execute the fluency checking"
    ]
   },
   {
@@ -52,8 +61,12 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Perplexity above threshold: 0: The\n",
-      "[{'start': 0, 'end': 0, 'message': 'Perplexity above threshold: 0'}]\n"
      ]
     }
    ],
@@ -62,26 +75,26 @@
     "\n",
     "for e in combined_err:\n",
     "    substr = \" \".join(s.split(\" \")[e[\"start\"]:e[\"end\"]+1])\n",
-    "    print(f\"{e['message']}: {substr}\")  # Print the error messages\n",
-    "\n",
-    "print(combined_err)\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Fluency Score: 30.0\n"
      ]
     }
    ],
    "source": [
-    "fluency_score = 0.6 * err[\"score\"] + 0.4 * flu[\"score\"]  # Calculate the fluency score\n",
     "print(\"Fluency Score:\", fluency_score)  # Print the fluency score"
    ]
   }

    "cell_type": "code",
    "execution_count": 1,
    "metadata": {},
+   "outputs": [],
    "source": [
     "from categories.fluency import *"
    ]
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "Sentence: The cat sat the quickly up apples banana.\n",
+      "tensor([  101, 10117, 41163, 20694, 10105, 23590, 10741, 72894, 11268, 99304,\n",
+      "        10219,   119,   102])\n",
+      "tensor([[ 0,  0],\n",
+      "        [ 0,  3],\n",
+      "        [ 4,  7],\n",
+      "        [ 8, 11],\n",
+      "        [12, 15],\n",
+      "        [16, 23],\n",
+      "        [24, 26],\n",
+      "        [27, 30],\n",
+      "        [30, 33],\n",
+      "        [34, 38],\n",
+      "        [38, 40],\n",
+      "        [40, 41],\n",
+      "        [ 0,  0]])\n",
+      "[np.float64(0.00905743383887514), np.float64(1.1257066968185931), np.float64(4.8056646935577145), np.float64(4.473408069089179), np.float64(4.732453441503642), np.float64(3.028744414819041), np.float64(5.1115574262487735), np.float64(-0.6523823890571343)]\n",
+      "[np.float64(1.7636628003080927), np.float64(6.955413759407024), np.float64(10.828562153345375), np.float64(6.228013435558396), np.float64(10.258657658689351), np.float64(6.635744767229443), np.float64(11.163667119285972), np.float64(10.499412826924114), np.float64(11.96113847381264), np.float64(10.010973250156082), np.float64(2.470404176100153)]\n",
+      "0.5208035409471965\n"
      ]
     }
    ],
     "print(\"Sentence:\", s)  # Print the input sentence\n",
     "\n",
     "err = grammar_errors(s)  # Call the function to execute the grammar error checking\n",
+    "flu = pseudo_perplexity(s, threshold=2.5)  # Call the function to execute the fluency checking"
    ]
   },
   {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "An apostrophe may be missing.: apples banana.\n",
+      "Perplexity 4.8056646935577145 over threshold 2.5: sat\n",
+      "Perplexity 4.473408069089179 over threshold 2.5: the\n",
+      "Perplexity 4.732453441503642 over threshold 2.5: quickly\n",
+      "Perplexity 3.028744414819041 over threshold 2.5: up\n",
+      "Perplexity 5.1115574262487735 over threshold 2.5: apples\n"
      ]
     }
    ],
     "\n",
     "for e in combined_err:\n",
     "    substr = \" \".join(s.split(\" \")[e[\"start\"]:e[\"end\"]+1])\n",
+    "    print(f\"{e['message']}: {substr}\")  # Print the error messages\n"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "87.5 99.71\n",
+      "Fluency Score: 92.384\n"
      ]
     }
    ],
    "source": [
+    "fluency_score = 0.7 * err[\"score\"] + 0.3 * flu[\"score\"]  # Calculate the fluency score\n",
+    "print(err[\"score\"], flu[\"score\"])  # Print the individual scores\n",
     "print(\"Fluency Score:\", fluency_score)  # Print the fluency score"
    ]
   }