import language_tool_python
import numpy as np
import en_core_web_sm
import torch
import wordfreq
from transformers import AutoModelForMaskedLM, AutoTokenizer
# setup global variables on import (bad practice, but whatever)
# --------------------------------------------------------------
# grammar checker
tool = language_tool_python.LanguageTool("en-US")
# masked language model and tokenizer from huggingface
model_name = "xlm-roberta-base"
model = AutoModelForMaskedLM.from_pretrained(model_name)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_name) # tokenizer
# spacy model for parsing
nlp = en_core_web_sm.load()
def __get_rarity(word: str, lang: str = "en") -> float:
"""
    Returns the rarity of a word in the given language. wordfreq returns a value
between 0 and 1, where 1 is the most common word. Therefore, taking the log results
in a value between 0 (log 1 = 0) and -27.63 (log 1e-12). We then negate it so super
rare words have a high score and common words have a low score.
Parameters:
word (str): The word to check.
lang (str): The language to check. Default is "en".
Returns:
float: The rarity of the word.
"""
return -np.log(wordfreq.word_frequency(word, lang) + 1e-12)
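
# A rough feel for the scale (illustrative only; exact numbers depend on wordfreq's
# frequency data):
#   __get_rarity("the")     -> roughly 3, since "the" has a frequency on the order of 0.05
#   __get_rarity("zyzzyva") -> close to the ceiling of -log(1e-12) ~= 27.63 if the word
#                              is absent from wordfreq's lists
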
def __produce_groupings(offset_mapping: list, input_ids: list) -> list:
"""
Produce groupings of tokens that are part of the same word.
Parameters:
offset_mapping (list): The offset mapping of the tokens.
input_ids (list): The input ids of the tokens.
Returns:
list: A list of groupings of tokens.
"""
# Produce groupings of tokens that are part of the same word
res = []
current_group = []
prev_end = None
for i, (start, end) in enumerate(offset_mapping):
if input_ids[i] in tokenizer.all_special_ids:
continue # skip special tokens like [CLS] and [SEP]
if prev_end is not None and start > prev_end:
# Word boundary detected → start new group
res.append(current_group)
current_group = [i]
else:
current_group.append(i)
prev_end = end
# Append final group
if current_group:
res.append(current_group)
return res
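
# Example of the grouping behaviour, assuming a hypothetical fast-tokenizer output for
# "unbelievable stuff" where indices 0 and 5 are the special tokens and "unbelievable"
# is split into three sub-tokens:
#   offset_mapping = [(0, 0), (0, 2), (2, 8), (8, 12), (13, 18), (0, 0)]
#   __produce_groupings(offset_mapping, input_ids) -> [[1, 2, 3], [4]]
# Indices 1-3 have contiguous character spans, so they form one word; the gap between
# character 12 and 13 (the space) starts a new group for "stuff".
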
def pseudo_perplexity(text: str, threshold: float = 4.0, max_len: int = 128) -> dict:
"""
Calculate the pseudo-perplexity of a text using a masked language model. Return all
words that exceed a threshold of "adjusted awkwardness". The threshold is a measure
in terms of log probability of the word.
Parameters:
text (str): The text to check.
        threshold (float): The threshold for awkwardness. Default is 4.0.
        max_len (int): The maximum number of tokens; longer inputs are truncated. Default is 128.
Returns:
dict: A dictionary containing the score and errors.
"""
    # Tokenize the text (truncating to max_len tokens) and produce word groupings
    encoding = tokenizer(
        text, return_tensors="pt", return_offsets_mapping=True, truncation=True, max_length=max_len
    )
input_ids = encoding["input_ids"][0]
offset_mapping = encoding["offset_mapping"][0]
tokens = tokenizer.convert_ids_to_tokens(input_ids)
word_groups = __produce_groupings(offset_mapping, input_ids)
    # Calculate the loss for each word group
    loss_values = []
    words = []  # decoded word for each group, kept in step with loss_values
for group in word_groups:
        # Defensive check: skip any group that touches the special tokens (<s>/</s>)
if group[0] == 0 or group[-1] == len(input_ids) - 1:
continue
# Mask the word group
masked = input_ids.clone()
for i in group:
masked[i] = tokenizer.mask_token_id
# Get the model output distribution
with torch.no_grad():
outputs = model(masked.unsqueeze(0))
logits = outputs.logits[0]
log_probs = []
for i in group:
# Get the probability of the true token
probs = torch.softmax(logits[i], dim=-1)
true_token_id = input_ids[i].item()
prob = probs[true_token_id].item()
# Append the loss of the true token
log_probs.append(np.log(prob + 1e-12))
# Calculate the loss for the entire word group
word_loss = -np.sum(log_probs) / len(log_probs)
        # Adjust the loss based on the rarity of the word. Decode every sub-token in
        # the group so multi-token words are scored (and reported) as whole words;
        # subtracting rarity means rare words are not unfairly flagged as awkward.
        word = tokenizer.decode([input_ids[j].item() for j in group]).strip()
        word_loss -= 0.6 * __get_rarity(word)
        loss_values.append(word_loss)
        words.append(word)
# Structure the results for output
    average_loss = np.mean(loss_values) if loss_values else 0.0
errors = []
for i, l in enumerate(loss_values):
if l < threshold:
continue
errors.append(
{
"start": i,
"end": i,
"message": f"Adjusted liklihood {round(l, 2)} over threshold {threshold} for word {text.split()[i]}",
}
)
res = {"score": __fluency_score(average_loss), "errors": errors}
return res
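
# Sketch of how this might be called (output values are illustrative, not reproduced
# from a real run):
#   result = pseudo_perplexity("He go to the store yesterday.")
#   result["score"]  -> a fluency score between 0 and 100
#   result["errors"] -> e.g. [{"start": 1, "end": 1, "message": "Adjusted negative
#                       log-likelihood 5.21 exceeds threshold 4.0 for word go"}]
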
def __fluency_score(
loss: float, midpoint: float = 5.0, steepness: float = 0.3
) -> float:
"""
Transform the loss into a score from 0 to 100. Steepness controls how quickly the
score drops as loss increases and midpoint controls the loss at which the score is
50.
Parameters:
loss (float): The loss to transform.
midpoint (float): The loss at which the score is 50. Default is 5.
steepness (float): The steepness of the curve. Default is 0.3.
Returns:
float: The score from 0 to 100.
"""
score = 100 / (1 + np.exp(steepness * (loss - midpoint)))
return round(score, 2)
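
# With the default midpoint of 5.0 and steepness of 0.3, the sigmoid works out to:
#   __fluency_score(0.0)  -> 81.76
#   __fluency_score(5.0)  -> 50.0
#   __fluency_score(10.0) -> 18.24
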
def grammar_errors(text: str) -> dict:
"""
Check the grammar of a text using a grammar checker and a structural grammar check.
Parameters:
text (str): The text to check.
Returns:
dict: A dictionary containing the score and errors.
"""
    matches = tool.check(text)
    # Build the character-position -> word-index map once; it does not depend on the
    # individual matches, so there is no need to rebuild it inside the loop
    words = text.split()
    char_to_word = []
    current_char = 0
    for i, word in enumerate(words):
        for _ in range(len(word)):
            char_to_word.append(i)
        current_char += len(word)
        if current_char < len(text):  # Account for spaces between words
            char_to_word.append(i)
            current_char += 1
    r = []
    for match in matches:
        # Translate LanguageTool's character offsets into word indices
        start = char_to_word[match.offset]
        end = char_to_word[match.offset + match.errorLength - 1] + 1
        r.append({"start": start, "end": end, "message": match.message})
struct_err = __check_structural_grammar(text)
for e in struct_err:
r.append(e)
    error_ratio = len(r) / len(text.split())
    res = {"score": __grammar_score_from_prob(error_ratio), "errors": r}
return res
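
# Sketch of a call (the exact matches depend on the installed LanguageTool rule set):
#   result = grammar_errors("She go to school every day.")
#   result["errors"] -> e.g. [{"start": 1, "end": 2, "message": "...agreement..."}],
#                       with start/end given as word indices
#   result["score"]  -> 100 * (1 - errors_per_word), e.g. ~83.33 for 1 error in 6 words
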
def __grammar_score_from_prob(error_ratio: float) -> float:
"""
Transform the number of errors divided by words into a score from 0 to 100.
Parameters:
error_ratio (float): The ratio of errors to words.
Returns:
float: The score from 0 to 100.
"""
    score = 100 * max(0.0, 1 - error_ratio)  # clamp so ratios above 1 cannot go negative
return round(score, 2)
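
# Simple arithmetic examples:
#   __grammar_score_from_prob(0.0)  -> 100.0 (no errors)
#   __grammar_score_from_prob(0.25) -> 75.0  (one error for every four words)
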
def __check_structural_grammar(text: str) -> list:
"""
Check the structural grammar of a text using spaCy.
Parameters:
text (str): The text to check.
Returns:
list: A list of structural grammar errors.
"""
doc = nlp(text)
issues = []
# 1. Missing main verb (ROOT)
root_verbs = [
tok for tok in doc if tok.dep_ == "ROOT" and tok.pos_ in {"VERB", "AUX"}
]
if not root_verbs:
root_root = [tok for tok in doc if tok.dep_ == "ROOT"]
token = root_root[0] if root_root else doc[0]
issues.append(
{
"start": token.i,
"end": token.i + 1,
"message": "Sentence is missing a main verb (no ROOT verb).",
}
)
# 2. Verb(s) present but no subject
verbs = [tok for tok in doc if tok.pos_ in {"VERB", "AUX"}]
subjects = [tok for tok in doc if tok.dep_ in {"nsubj", "nsubjpass"}]
if verbs and not subjects:
for verb in verbs:
issues.append(
{
"start": verb.i,
"end": verb.i + 1,
"message": "Sentence has verb(s) but no subject (possible fragment).",
}
)
# 3. Dangling prepositions
for tok in doc:
if tok.pos_ == "ADP" and len(list(tok.children)) == 0:
issues.append(
{
"start": tok.i,
"end": tok.i + 1,
"message": f"Dangling preposition '{tok.text}' (no object or complement).",
}
)
# 4. Noun pile-up (no verbs, all tokens are nominal)
if not any(tok.pos_ in {"VERB", "AUX"} for tok in doc) and all(
tok.pos_ in {"NOUN", "PROPN", "ADJ", "DET", "NUM"}
for tok in doc
if tok.is_alpha
):
token = doc[0]
issues.append(
{
"start": token.i,
"end": token.i + 1,
"message": "Sentence lacks a verb or any verbal structure (nominal phrase pile-up).",
}
)
# 5. Multiple ROOTs (possible run-on)
root_count = sum(1 for tok in doc if tok.dep_ == "ROOT")
if root_count > 1:
for tok in doc:
if tok.dep_ == "ROOT":
issues.append(
{
"start": tok.i,
"end": tok.i + 1,
"message": "Sentence has multiple ROOTs — possible run-on sentence.",
}
)
return issues
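
# Example of the kind of fragment this is meant to flag (illustrative; the exact
# findings depend on spaCy's parse):
#   __check_structural_grammar("The big red house on the hill.")
#   -> likely reports a missing main verb, since the parse has no VERB/AUX ROOT
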
# Unit tests can go here eventually
def main():
pass
if __name__ == "__main__":
main()