import language_tool_python
import numpy as np
import en_core_web_sm
import torch
import wordfreq
from transformers import AutoModelForMaskedLM, AutoTokenizer

# set up module-level globals on import (heavyweight, but they only load once)
# --------------------------------------------------------------

# grammar checker
tool = language_tool_python.LanguageTool("en-US")

# masked language model and tokenizer from huggingface
model_name = "xlm-roberta-base"
model = AutoModelForMaskedLM.from_pretrained(model_name)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)

# spacy model for parsing
nlp = en_core_web_sm.load()

def __get_rarity(word: str, lang: str = "en") -> float:
    """
    Returns the rarity of a word in the given language. word_freq retuns a value
    between 0 and 1, where 1 is the most common word. Therefore, taking the log results
    in a value between 0 (log 1 = 0) and -27.63 (log 1e-12). We then negate it so super
    rare words have a high score and common words have a low score.

    Parameters:
        word (str): The word to check.
        lang (str): The language to check. Default is "en".

    Returns:
        float: The rarity of the word.
    """
    return -np.log(wordfreq.word_frequency(word, lang) + 1e-12)
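
# Worked example (hedged: exact values depend on the wordfreq data shipped
# with your install): a very common word like "the" has a frequency around
# 0.05, giving a rarity of -log(0.05) ~= 3, while a word wordfreq has never
# seen falls back to the 1e-12 floor and scores -log(1e-12) ~= 27.63.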


def __produce_groupings(offset_mapping: list, input_ids: list) -> list:
    """
    Produce groupings of tokens that are part of the same word.

    Parameters:
        offset_mapping (list): Character (start, end) offsets for each token.
        input_ids (list): Token ids, used to detect and skip special tokens.

    Returns:
        list: A list of groups, each a list of indices of tokens in one word.
    """
    res = []
    current_group = []
    prev_end = None
    for i, (start, end) in enumerate(offset_mapping):
        if input_ids[i] in tokenizer.all_special_ids:
            continue  # skip special tokens like <s> and </s>
        if prev_end is not None and start > prev_end:
            # Word boundary detected → start new group
            res.append(current_group)
            current_group = [i]
        else:
            current_group.append(i)
        prev_end = end
    # Append final group
    if current_group:
        res.append(current_group)

    return res
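
# Illustrative sketch: given XLM-R-style offsets [(0, 0), (0, 5), (6, 8),
# (8, 11), (0, 0)] for "<s> Hello wo rld </s>", the special tokens at indices
# 0 and 4 are skipped and the result is [[1], [2, 3]]: "Hello" forms its own
# group, and the two subword pieces of "world" share a group because the
# second piece starts exactly where the first one ends.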


def pseudo_perplexity(text: str, threshold: float = 4.0, max_len: int = 128) -> dict:
    """
    Calculate the pseudo-perplexity of a text using a masked language model. Return all
    words whose "adjusted awkwardness" exceeds a threshold, where awkwardness is the
    word's negative log probability under the model, discounted by its rarity.

    Parameters:
        text (str): The text to check.
        threshold (float): The threshold for awkwardness. Default is 4.0.
        max_len (int): Maximum token length; longer inputs are truncated. Default is 128.

    Returns:
        dict: A dictionary containing the score and errors.
    """

    # Tokenize the text (truncating to max_len) and produce word groupings
    encoding = tokenizer(
        text,
        return_tensors="pt",
        return_offsets_mapping=True,
        truncation=True,
        max_length=max_len,
    )
    input_ids = encoding["input_ids"][0]
    offset_mapping = encoding["offset_mapping"][0]
    word_groups = __produce_groupings(offset_mapping, input_ids)

    # Calculate the loss for each word group
    loss_values = []
    words = []
    for group in word_groups:
        # Defensive skip: groups touching the special-token positions
        # (these are already excluded during grouping above)
        if group[0] == 0 or group[-1] == len(input_ids) - 1:
            continue

        # Mask the word group
        masked = input_ids.clone()
        for i in group:
            masked[i] = tokenizer.mask_token_id

        # Get the model output distribution
        with torch.no_grad():
            outputs = model(masked.unsqueeze(0))
            logits = outputs.logits[0]

        log_probs = []
        for i in group:
            # Get the probability of the true token
            probs = torch.softmax(logits[i], dim=-1)
            true_token_id = input_ids[i].item()
            prob = probs[true_token_id].item()
            # Append the loss of the true token
            log_probs.append(np.log(prob + 1e-12))

        # Calculate the loss for the entire word group
        word_loss = -np.sum(log_probs) / len(log_probs)
        # Adjust the loss based on the rarity of the word. Decode the full
        # group so multi-token words are scored (and reported) as whole words.
        word = tokenizer.decode([input_ids[i].item() for i in group]).strip()
        word_loss -= 0.6 * __get_rarity(word)  # rare words reduce the loss
        words.append(word)
        loss_values.append(word_loss)

    # Structure the results for output
    average_loss = np.mean(loss_values) if loss_values else 0.0

    errors = []
    for i, (word, loss) in enumerate(zip(words, loss_values)):
        if loss < threshold:
            continue
        errors.append(
            {
                "start": i,
                "end": i + 1,
                "message": f"Adjusted likelihood {round(loss, 2)} over threshold {threshold} for word {word}",
            }
        )

    res = {"score": __fluency_score(average_loss), "errors": errors}

    return res
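
# Hedged usage sketch: one full forward pass runs per word, so this is slow
# on CPU, and the exact numbers depend on the downloaded xlm-roberta-base
# weights.
# >>> result = pseudo_perplexity("This are a awkward sentence.")
# >>> result["score"]   # fluency score in [0, 100]
# >>> result["errors"]  # words whose rarity-adjusted loss exceeded `threshold`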


def __fluency_score(
    loss: float, midpoint: float = 5.0, steepness: float = 0.3
) -> float:
    """
    Transform the loss into a score from 0 to 100. Steepness controls how quickly the
    score drops as loss increases and midpoint controls the loss at which the score is
    50.

    Parameters:
        loss (float): The loss to transform.
        midpoint (float): The loss at which the score is 50. Default is 5.
        steepness (float): The steepness of the curve. Default is 0.3.

    Returns:
        float: The score from 0 to 100.
    """
    score = 100 / (1 + np.exp(steepness * (loss - midpoint)))
    return round(score, 2)
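
# Worked examples of the logistic mapping with the defaults (midpoint=5.0,
# steepness=0.3):
#   loss = 5.0  -> 100 / (1 + e^0)     = 50.0   (score is 50 at the midpoint)
#   loss = 0.0  -> 100 / (1 + e^-1.5) ~= 81.76
#   loss = 10.0 -> 100 / (1 + e^1.5)  ~= 18.24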


def grammar_errors(text: str) -> dict:
    """
    Check the grammar of a text using a grammar checker and a structural grammar check.

    Parameters:
        text (str): The text to check.

    Returns:
        dict: A dictionary containing the score and errors.
    """
    matches = tool.check(text)

    # Build a character-index -> word-index map once (each space maps to the
    # word before it; assumes single spaces between words, matching
    # text.split()) so LanguageTool's character offsets can be converted to
    # word offsets.
    words = text.split()
    char_to_word = []
    current_char = 0
    for i, word in enumerate(words):
        for _ in range(len(word)):
            char_to_word.append(i)
        current_char += len(word)
        if current_char < len(text):  # account for spaces between words
            char_to_word.append(i)
            current_char += 1

    r = []
    for match in matches:
        start = char_to_word[match.offset]
        end = char_to_word[match.offset + match.errorLength - 1] + 1
        r.append({"start": start, "end": end, "message": match.message})

    r.extend(__check_structural_grammar(text))

    # Ratio of errors to words (guard against empty input)
    error_ratio = len(r) / max(len(text.split()), 1)

    res = {"score": __grammar_score_from_prob(error_ratio), "errors": r}

    return res
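
# Hedged usage sketch (the matched rules, and hence the score, depend on the
# installed LanguageTool version):
# >>> grammar_errors("She go to school every day.")
# might flag subject-verb agreement on "go" and return something like
# {'score': 83.33, 'errors': [{'start': 1, 'end': 2, 'message': '...'}]}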


def __grammar_score_from_prob(error_ratio: float) -> float:
    """
    Transform the number of errors divided by words into a score from 0 to 100.

    Parameters:
        error_ratio (float): The ratio of errors to words.

    Returns:
        float: The score from 0 to 100.
    """
    score = max(0.0, 100 * (1 - error_ratio))  # floor at 0 when errors exceed words
    return round(score, 2)


def __check_structural_grammar(text: str) -> list:
    """
    Check the structural grammar of a text using spaCy.

    Parameters:
        text (str): The text to check.

    Returns:
        list: A list of structural grammar errors.
    """
    doc = nlp(text)
    issues = []

    # 1. Missing main verb (ROOT)
    root_verbs = [
        tok for tok in doc if tok.dep_ == "ROOT" and tok.pos_ in {"VERB", "AUX"}
    ]
    if not root_verbs:
        root_root = [tok for tok in doc if tok.dep_ == "ROOT"]
        token = root_root[0] if root_root else doc[0]
        issues.append(
            {
                "start": token.i,
                "end": token.i + 1,
                "message": "Sentence is missing a main verb (no ROOT verb).",
            }
        )

    # 2. Verb(s) present but no subject
    verbs = [tok for tok in doc if tok.pos_ in {"VERB", "AUX"}]
    subjects = [tok for tok in doc if tok.dep_ in {"nsubj", "nsubjpass"}]
    if verbs and not subjects:
        for verb in verbs:
            issues.append(
                {
                    "start": verb.i,
                    "end": verb.i + 1,
                    "message": "Sentence has verb(s) but no subject (possible fragment).",
                }
            )

    # 3. Dangling prepositions
    for tok in doc:
        if tok.pos_ == "ADP" and len(list(tok.children)) == 0:
            issues.append(
                {
                    "start": tok.i,
                    "end": tok.i + 1,
                    "message": f"Dangling preposition '{tok.text}' (no object or complement).",
                }
            )

    # 4. Noun pile-up (no verbs, all tokens are nominal)
    if not any(tok.pos_ in {"VERB", "AUX"} for tok in doc) and all(
        tok.pos_ in {"NOUN", "PROPN", "ADJ", "DET", "NUM"}
        for tok in doc
        if tok.is_alpha
    ):
        token = doc[0]
        issues.append(
            {
                "start": token.i,
                "end": token.i + 1,
                "message": "Sentence lacks a verb or any verbal structure (nominal phrase pile-up).",
            }
        )

    # 5. Multiple ROOTs (possible run-on; spaCy assigns one ROOT per sentence,
    #    so this also fires for any multi-sentence input)
    root_count = sum(1 for tok in doc if tok.dep_ == "ROOT")
    if root_count > 1:
        for tok in doc:
            if tok.dep_ == "ROOT":
                issues.append(
                    {
                        "start": tok.i,
                        "end": tok.i + 1,
                        "message": "Sentence has multiple ROOTs — possible run-on sentence.",
                    }
                )

    return issues
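
# Hedged sketch of what the structural check flags (parses, and therefore the
# exact issues, depend on the en_core_web_sm version):
# >>> __check_structural_grammar("The big red ball.")
# typically reports both a missing main verb (the ROOT is the noun "ball")
# and a nominal pile-up, since no token parses as VERB or AUX.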


# Unit tests can go here eventually
def main():
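    # Minimal smoke test (hedged): exact scores depend on the installed
    # LanguageTool rules, spaCy model, and transformer weights, so we just
    # print the outputs rather than assert on them.
    sample = "He go to the store for buy milk."

    grammar = grammar_errors(sample)
    print("grammar score:", grammar["score"])
    for err in grammar["errors"]:
        print("  grammar:", err["message"])

    fluency = pseudo_perplexity(sample)
    print("fluency score:", fluency["score"])
    for err in fluency["errors"]:
        print("  fluency:", err["message"])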


if __name__ == "__main__":
    main()