|
import language_tool_python |
|
import numpy as np |
|
import en_core_web_sm |
|
import torch |
|
import wordfreq |
|
from transformers import AutoModelForMaskedLM, AutoTokenizer |
|
|
|
|
|
|
|
|
|
|
|
tool = language_tool_python.LanguageTool("en-US") |
|
|
|
|
|
model_name = "xlm-roberta-base" |
|
model = AutoModelForMaskedLM.from_pretrained(model_name) |
|
model.eval() |
|
tokenizer = AutoTokenizer.from_pretrained(model_name) |
|
|
|
|
|
nlp = en_core_web_sm.load() |
|
|
|
def __get_rarity(word: str, lang: str = "en") -> float: |
|
""" |
|
    Returns the rarity of a word in the given language. wordfreq returns a value
    between 0 and 1, where 1 is the most common word, so taking the log results
    in a value between 0 (log 1 = 0) and -27.63 (log 1e-12, the floor added to
    avoid log 0). We then negate it so rare words get a high score and common
    words get a low score.
|
|
|
Parameters: |
|
word (str): The word to check. |
|
lang (str): The language to check. Default is "en". |
|
|
|
Returns: |
|
float: The rarity of the word. |
|
""" |
|
return -np.log(wordfreq.word_frequency(word, lang) + 1e-12) |
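
# Illustrative ballpark (actual values depend on wordfreq's frequency tables):
# a very common word like "the" has a frequency on the order of 5e-2, giving a
# rarity near 3, while an unseen string bottoms out at -log(1e-12) ≈ 27.63.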
|
|
|
|
|
def __produce_groupings(offset_mapping: list, input_ids: list) -> list: |
|
""" |
|
Produce groupings of tokens that are part of the same word. |
|
|
|
Parameters: |
|
offset_mapping (list): The offset mapping of the tokens. |
|
input_ids (list): The input ids of the tokens. |
|
|
|
Returns: |
|
list: A list of groupings of tokens. |
|
""" |
|
|
|
res = [] |
|
current_group = [] |
|
prev_end = None |
|
for i, (start, end) in enumerate(offset_mapping): |
|
if input_ids[i] in tokenizer.all_special_ids: |
|
continue |
|
if prev_end is not None and start > prev_end: |
|
|
|
res.append(current_group) |
|
current_group = [i] |
|
else: |
|
current_group.append(i) |
|
prev_end = end |
|
|
|
if current_group: |
|
res.append(current_group) |
|
|
|
return res |
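
# Worked example (token splits are hypothetical and tokenizer-dependent): for
# "unbelievably good", XLM-R might emit pieces covering character spans
# (0, 2), (2, 8), (8, 12) for "unbelievably" and (13, 17) for "good". The
# first three abut (each start equals the previous end), so they form one
# group; "good" starts past the preceding end and opens a new group,
# yielding [[1, 2, 3], [4]].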
|
|
|
|
|
def pseudo_perplexity(text: str, threshold: float = 4.0, max_len: int = 128) -> dict:
|
""" |
|
    Calculate the pseudo-perplexity of a text using a masked language model and
    return all words that exceed a threshold of "adjusted awkwardness", i.e. a
    rarity-adjusted negative log probability of the word.
|
|
|
Parameters: |
|
text (str): The text to check. |
|
        threshold (float): The threshold for awkwardness. Default is 4.0.
        max_len (int): The maximum token length; longer texts are truncated.
            Default is 128.
|
|
|
Returns: |
|
dict: A dictionary containing the score and errors. |
|
""" |
|
|
|
|
|
    encoding = tokenizer(
        text,
        return_tensors="pt",
        return_offsets_mapping=True,
        truncation=True,
        max_length=max_len,
    )
|
input_ids = encoding["input_ids"][0] |
|
offset_mapping = encoding["offset_mapping"][0] |
|
tokens = tokenizer.convert_ids_to_tokens(input_ids) |
|
word_groups = __produce_groupings(offset_mapping, input_ids) |
|
|
|
|
|
    loss_values = []
    words = []  # decoded word for each entry in loss_values
|
for group in word_groups: |
|
|
|
        # Skip any group touching the special tokens at the sequence edges.
        if group[0] == 0 or group[-1] == len(input_ids) - 1:
|
continue |
|
|
|
|
|
        # Mask every sub-token of the word jointly so the model cannot
        # reconstruct the word from its own unmasked pieces.
        masked = input_ids.clone()
        for i in group:
            masked[i] = tokenizer.mask_token_id
|
|
|
|
|
with torch.no_grad(): |
|
outputs = model(masked.unsqueeze(0)) |
|
logits = outputs.logits[0] |
|
|
|
log_probs = [] |
|
for i in group: |
|
|
|
probs = torch.softmax(logits[i], dim=-1) |
|
true_token_id = input_ids[i].item() |
|
prob = probs[true_token_id].item() |
|
|
|
log_probs.append(np.log(prob + 1e-12)) |
|
|
|
|
|
        # Mean negative log-probability over the word's sub-tokens.
        word_loss = -np.sum(log_probs) / len(log_probs)
|
|
|
        # Decode the whole group, not just its first sub-token, so multi-token
        # words are looked up under their full surface form.
        word = tokenizer.decode([input_ids[i].item() for i in group]).strip()
        # Discount rare words: rarity alone should not count as awkwardness.
        word_loss -= 0.6 * __get_rarity(word)
        loss_values.append(word_loss)
        words.append(word)
|
|
|
|
|
    average_loss = np.mean(loss_values) if loss_values else 0.0
|
|
|
errors = [] |
|
    for i, l in enumerate(loss_values):
        if l < threshold:
            continue
        errors.append(
            {
                "start": i,
                "end": i + 1,
                "message": f"Adjusted likelihood {round(l, 2)} over threshold "
                f"{threshold} for word {words[i]}",
            }
        )
|
|
|
res = {"score": __fluency_score(average_loss), "errors": errors} |
|
|
|
return res |
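
# Example usage (a sketch; exact scores depend on the model checkpoint):
#     result = pseudo_perplexity("He go to the store yesterday.")
#     result["score"]   # fluency score in [0, 100]
#     result["errors"]  # words whose adjusted loss exceeds `threshold`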
|
|
|
|
|
def __fluency_score( |
|
loss: float, midpoint: float = 5.0, steepness: float = 0.3 |
|
) -> float: |
|
""" |
|
Transform the loss into a score from 0 to 100. Steepness controls how quickly the |
|
score drops as loss increases and midpoint controls the loss at which the score is |
|
50. |
|
|
|
Parameters: |
|
loss (float): The loss to transform. |
|
midpoint (float): The loss at which the score is 50. Default is 5. |
|
steepness (float): The steepness of the curve. Default is 0.3. |
|
|
|
Returns: |
|
float: The score from 0 to 100. |
|
""" |
|
score = 100 / (1 + np.exp(steepness * (loss - midpoint))) |
|
return round(score, 2) |
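
# Worked examples: at the midpoint the sigmoid gives exactly 50, and the
# score decays smoothly as loss grows:
#     __fluency_score(5.0)   -> 50.0
#     __fluency_score(10.0)  -> 100 / (1 + e**1.5) ≈ 18.24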
|
|
|
|
|
def grammar_errors(text: str) -> dict: |
|
""" |
|
Check the grammar of a text using a grammar checker and a structural grammar check. |
|
|
|
Parameters: |
|
text (str): The text to check. |
|
|
|
Returns: |
|
dict: A dictionary containing the score and errors. |
|
""" |
|
matches = tool.check(text) |
|
|
|
    # Build the character-to-word-index mapping once and reuse it for every
    # match; it depends only on the text, not on the match. (Assumes words
    # are separated by single whitespace characters.)
    words = text.split()
    char_to_word = []
    current_char = 0
    for i, word in enumerate(words):
        for _ in range(len(word)):
            char_to_word.append(i)
        current_char += len(word)
        if current_char < len(text):
            char_to_word.append(i)
            current_char += 1

    r = []
    for match in matches:
        start = char_to_word[match.offset]
        end = char_to_word[match.offset + match.errorLength - 1] + 1
        r.append({"start": start, "end": end, "message": match.message})
|
|
|
struct_err = __check_structural_grammar(text) |
|
for e in struct_err: |
|
r.append(e) |
|
|
|
    word_count = len(text.split())
    grammar_score = len(r) / word_count if word_count else 0.0
|
|
|
res = {"score": __grammar_score_from_prob(grammar_score), "errors": r} |
|
|
|
return res |
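
# Example usage (LanguageTool's matches vary by version, so treat the shape
# of the output, not the exact messages, as the contract):
#     out = grammar_errors("She go to school every days.")
#     out["score"]   # 100 minus the per-word error penalty
#     out["errors"]  # [{"start": ..., "end": ..., "message": ...}, ...]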
|
|
|
|
|
def __grammar_score_from_prob(error_ratio: float) -> float: |
|
""" |
|
    Transform the ratio of errors to words into a score from 0 to 100,
    clamped at 0 for ratios above 1.
|
|
|
Parameters: |
|
error_ratio (float): The ratio of errors to words. |
|
|
|
Returns: |
|
float: The score from 0 to 100. |
|
""" |
|
    score = max(0.0, 100 * (1 - error_ratio))
|
return round(score, 2) |
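
# For example, one error per ten words gives __grammar_score_from_prob(0.1)
# -> 90.0, and the max(0.0, ...) clamp keeps ratios above 1.0 from producing
# a negative score.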
|
|
|
|
|
def __check_structural_grammar(text: str) -> list: |
|
""" |
|
Check the structural grammar of a text using spaCy. |
|
|
|
Parameters: |
|
text (str): The text to check. |
|
|
|
Returns: |
|
list: A list of structural grammar errors. |
|
""" |
|
doc = nlp(text) |
|
issues = [] |
|
|
|
|
|
root_verbs = [ |
|
tok for tok in doc if tok.dep_ == "ROOT" and tok.pos_ in {"VERB", "AUX"} |
|
] |
|
if not root_verbs: |
|
root_root = [tok for tok in doc if tok.dep_ == "ROOT"] |
|
token = root_root[0] if root_root else doc[0] |
|
issues.append( |
|
{ |
|
"start": token.i, |
|
"end": token.i + 1, |
|
"message": "Sentence is missing a main verb (no ROOT verb).", |
|
} |
|
) |
|
|
|
|
|
verbs = [tok for tok in doc if tok.pos_ in {"VERB", "AUX"}] |
|
subjects = [tok for tok in doc if tok.dep_ in {"nsubj", "nsubjpass"}] |
|
if verbs and not subjects: |
|
for verb in verbs: |
|
issues.append( |
|
{ |
|
"start": verb.i, |
|
"end": verb.i + 1, |
|
"message": "Sentence has verb(s) but no subject (possible fragment).", |
|
} |
|
) |
|
|
|
|
|
for tok in doc: |
|
if tok.pos_ == "ADP" and len(list(tok.children)) == 0: |
|
issues.append( |
|
{ |
|
"start": tok.i, |
|
"end": tok.i + 1, |
|
"message": f"Dangling preposition '{tok.text}' (no object or complement).", |
|
} |
|
) |
|
|
|
|
|
if not any(tok.pos_ in {"VERB", "AUX"} for tok in doc) and all( |
|
tok.pos_ in {"NOUN", "PROPN", "ADJ", "DET", "NUM"} |
|
for tok in doc |
|
if tok.is_alpha |
|
): |
|
token = doc[0] |
|
issues.append( |
|
{ |
|
"start": token.i, |
|
"end": token.i + 1, |
|
"message": "Sentence lacks a verb or any verbal structure (nominal phrase pile-up).", |
|
} |
|
) |
|
|
|
|
|
    # spaCy assigns one ROOT per sentence, so multiple ROOTs here usually
    # means multiple sentences or a run-on in the parsed text.
    root_count = sum(1 for tok in doc if tok.dep_ == "ROOT")
|
if root_count > 1: |
|
for tok in doc: |
|
if tok.dep_ == "ROOT": |
|
issues.append( |
|
{ |
|
"start": tok.i, |
|
"end": tok.i + 1, |
|
"message": "Sentence has multiple ROOTs — possible run-on sentence.", |
|
} |
|
) |
|
|
|
return issues |
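
# Illustrative behaviour (spaCy parses vary by model version): a verbless
# fragment such as "The big red house." should be flagged both for a missing
# main verb and as a nominal phrase pile-up, since its alphabetic tokens are
# all determiners, adjectives, or nouns.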
|
|
|
|
|
|
|
def main(): |
|
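    # Minimal smoke test (a sketch; outputs depend on the installed model
    # and LanguageTool versions).
    sample = "He go to the store yesterday."
    print(grammar_errors(sample))
    print(pseudo_perplexity(sample))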
|
|
|
|
|
if __name__ == "__main__": |
|
main() |
|
|