minko186 committed
Commit fcfb880
Parents: cf5bf4c 69b471b

Merge remote-tracking branch 'origin/main' into minko

Files changed (5):
  1. analysis.py +172 -72
  2. app.py +53 -0
  3. explainability.py +0 -119
  4. requirements.txt +4 -1
  5. writing_analysis.py +138 -65
analysis.py CHANGED
@@ -1,31 +1,42 @@
-import requests
-import httpx
-import torch
-import re
-from bs4 import BeautifulSoup
-import numpy as np
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
-import asyncio
-from scipy.special import softmax
-from evaluate import load
-from datetime import date
-import nltk
-import fitz
-from transformers import GPT2LMHeadModel, GPT2TokenizerFast
-import nltk, spacy, subprocess, torch
-import plotly.graph_objects as go
-import torch.nn.functional as F
-import nltk
-from unidecode import unidecode
-import time
 import yaml
-import nltk
-import os
-from explainability import *
 import subprocess
+import nltk
+from nltk import word_tokenize
+from nltk.corpus import cmudict, stopwords
+import spacy
+import torch
+from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+import matplotlib.pyplot as plt
+import numpy as np
+
+from matplotlib.patches import Circle, RegularPolygon
+from matplotlib.path import Path
+from matplotlib.projections import register_projection
+from matplotlib.projections.polar import PolarAxes
+from matplotlib.spines import Spine
+from matplotlib.transforms import Affine2D
+from writing_analysis import (
+    estimated_slightly_difficult_words_ratio,
+    entity_density,
+    determiners_frequency,
+    punctuation_diversity,
+    type_token_ratio,
+    calculate_perplexity,
+    calculate_syntactic_tree_depth,
+    hapax_legomena_ratio,
+    mtld,
+)
 
+nltk.download("cmudict")
 nltk.download("punkt")
 nltk.download("stopwords")
+nltk.download("wordnet")
+d = cmudict.dict()
+command = ["python3", "-m", "spacy", "download", "en_core_web_sm"]
+subprocess.run(command)
+nlp = spacy.load("en_core_web_sm")
+
+
 with open("config.yaml", "r") as file:
     params = yaml.safe_load(file)
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -33,64 +44,153 @@ readability_model_id = params["READABILITY_MODEL_ID"]
 gpt2_model = GPT2LMHeadModel.from_pretrained(readability_model_id).to(device)
 gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(readability_model_id)
 
-command = ["python3", "-m", "spacy", "download", "en_core_web_sm"]
-subprocess.run(command)
-nlp = spacy.load("en_core_web_sm")
+
+def normalize(value, min_value, max_value):
+    normalized_value = ((value - min_value) * 100) / (max_value - min_value)
+    return max(0, min(100, normalized_value))
 
 
 def depth_analysis(input_text):
-    processed_words = preprocess_text1(input_text)
-    ttr_value = vocabulary_richness_ttr(processed_words)
-    gunning_fog = calculate_gunning_fog(input_text)
-    gunning_fog_norm = normalize(gunning_fog, min_value=0, max_value=20)
-    words, sentences = preprocess_text2(input_text)
-    average_sentence_length = calculate_average_sentence_length(sentences)
-    average_word_length = calculate_average_word_length(words)
-    average_sentence_length_norm = normalize(
-        average_sentence_length, min_value=0, max_value=40
+
+    usual_ranges = {
+        "estimated_slightly_difficult_words_ratio": (
+            0.2273693623058005,
+            0.557383692351033,
+        ),
+        "entity_density": (-0.07940776754145815, 0.23491038179986615),
+        "determiners_frequency": (0.012461059190031154, 0.15700934579439252),
+        "punctuation_diversity": (-0.21875, 0.53125),
+        "type_token_ratio": (0.33002482852189063, 1.0894414982357028),
+        "calculate_perplexity": (-25.110544681549072, 82.4620680809021),
+        "calculate_syntactic_tree_depth": (1.8380681818181812, 10.997159090909092),
+        "hapax_legomena_ratio": (0.0830971690138207, 1.0302715687215778),
+        "mtld": (-84.03125000000001, 248.81875000000002),
+    }
+
+    vocabulary_level = estimated_slightly_difficult_words_ratio(input_text, d)
+    entity_ratio = entity_density(input_text, nlp)
+    determiner_use = determiners_frequency(input_text, nlp)
+    punctuation_variety = punctuation_diversity(input_text)
+    sentence_depth = calculate_syntactic_tree_depth(input_text, nlp)
+    perplexity = calculate_perplexity(input_text, gpt2_model, gpt2_tokenizer, device)
+    lexical_diversity = type_token_ratio(input_text)
+    unique_words = hapax_legomena_ratio(input_text)
+    vocabulary_stability = mtld(input_text)
+
+    # normalize between 0 and 100
+    vocabulary_level_norm = normalize(
+        vocabulary_level, *usual_ranges["estimated_slightly_difficult_words_ratio"]
+    )
+    entity_ratio_norm = normalize(entity_ratio, *usual_ranges["entity_density"])
+    determiner_use_norm = normalize(
+        determiner_use, *usual_ranges["determiners_frequency"]
     )
-    average_word_length_norm = normalize(
-        average_word_length, min_value=0, max_value=8
+    punctuation_variety_norm = normalize(
+        punctuation_variety, *usual_ranges["punctuation_diversity"]
    )
-    average_tree_depth = calculate_syntactic_tree_depth(nlp, input_text)
-    average_tree_depth_norm = normalize(
-        average_tree_depth, min_value=0, max_value=10
+    lexical_diversity_norm = normalize(
+        lexical_diversity, *usual_ranges["type_token_ratio"]
     )
-    perplexity = calculate_perplexity(
-        input_text, gpt2_model, gpt2_tokenizer, device
+    unique_words_norm = normalize(unique_words, *usual_ranges["hapax_legomena_ratio"])
+    vocabulary_stability_norm = normalize(vocabulary_stability, *usual_ranges["mtld"])
+    sentence_depth_norm = normalize(
+        sentence_depth, *usual_ranges["calculate_syntactic_tree_depth"]
     )
-    perplexity_norm = normalize(perplexity, min_value=0, max_value=30)
+    perplexity_norm = normalize(perplexity, *usual_ranges["calculate_perplexity"])
 
     features = {
-        "readability": gunning_fog_norm,
-        "syntactic tree depth": average_tree_depth_norm,
-        "vocabulary richness": ttr_value,
-        "perplexity": perplexity_norm,
-        "average sentence length": average_sentence_length_norm,
-        "average word length": average_word_length_norm,
+        "Lexical Diversity": lexical_diversity_norm,
+        "Vocabulary Level": vocabulary_level_norm,
+        "Unique Words": unique_words_norm,
+        "Determiner Use": determiner_use_norm,
+        "Punctuation Variety": punctuation_variety_norm,
+        "Sentence Depth": sentence_depth_norm,
+        "Vocabulary Stability": vocabulary_stability_norm,
+        "Entity Ratio": entity_ratio_norm,
+        "Perplexity": perplexity_norm,
     }
-    fig = go.Figure()
-    fig.add_trace(
-        go.Scatterpolar(
-            r=list(features.values()),
-            theta=list(features.keys()),
-            fill="toself",
-            name="Radar Plot",
-        )
-    )
-    fig.update_layout(
-        polar=dict(
-            radialaxis=dict(
-                visible=True,
-                range=[0, 100],
-            )
-        ),
-        showlegend=False,
-        margin=dict(
-            l=10,
-            r=20,
-            b=10,
-            t=10,
-        ),
+
+    def radar_factory(num_vars, frame="circle"):
+        theta = np.linspace(0, 2 * np.pi, num_vars, endpoint=False)
+
+        class RadarTransform(PolarAxes.PolarTransform):
+            def transform_path_non_affine(self, path):
+                if path._interpolation_steps > 1:
+                    path = path.interpolated(num_vars)
+                return Path(self.transform(path.vertices), path.codes)
+
+        class RadarAxes(PolarAxes):
+            name = "radar"
+            PolarTransform = RadarTransform
+
+            def __init__(self, *args, **kwargs):
+                super().__init__(*args, **kwargs)
+                self.set_theta_zero_location("N")
+
+            def fill(self, *args, closed=True, **kwargs):
+                return super().fill(closed=closed, *args, **kwargs)
+
+            def plot(self, *args, **kwargs):
+                lines = super().plot(*args, **kwargs)
+                for line in lines:
+                    self._close_line(line)
+
+            def _close_line(self, line):
+                x, y = line.get_data()
+                if x[0] != x[-1]:
+                    x = np.append(x, x[0])
+                    y = np.append(y, y[0])
+                    line.set_data(x, y)
+
+            def set_varlabels(self, labels):
+                self.set_thetagrids(np.degrees(theta), labels)
+
+            def _gen_axes_patch(self):
+                if frame == "circle":
+                    return Circle((0.5, 0.5), 0.5)
+                elif frame == "polygon":
+                    return RegularPolygon(
+                        (0.5, 0.5), num_vars, radius=0.5, edgecolor="k"
+                    )
+
+            def _gen_axes_spines(self):
+                if frame == "polygon":
+                    spine = Spine(
+                        axes=self,
+                        spine_type="circle",
+                        path=Path.unit_regular_polygon(num_vars),
+                    )
+                    spine.set_transform(
+                        Affine2D().scale(0.5).translate(0.5, 0.5) + self.transAxes
+                    )
+                    return {"polar": spine}
+
+        register_projection(RadarAxes)
+        return theta
+
+    N = 9
+    theta = radar_factory(N, frame="polygon")
+    data = features.values()
+    labels = features.keys()
+    fig, ax = plt.subplots(subplot_kw=dict(projection="radar"), figsize=(7.5, 5))
+    ax.plot(theta, data)
+    ax.fill(theta, data, alpha=0.4)
+    ax.set_varlabels(labels)
+
+    rgrids = np.linspace(0, 100, num=6)
+    ax.set_rgrids(
+        rgrids, labels=[f"{round(r)}%" for r in rgrids], fontsize=8, color="black"
     )
+    ax.grid(True, color="black", linestyle="-", linewidth=0.5, alpha=0.5)
+
+    for dd, (label, value) in enumerate(zip(labels, data)):
+        ax.text(
+            theta[dd] + 0.1,
+            value + 5,
+            f"{value:.0f}",
+            horizontalalignment="left",
+            verticalalignment="bottom",
+            fontsize=8,
+        )
+
     return fig
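
Note: the rewritten `depth_analysis` swaps the Plotly `Scatterpolar` chart for a matplotlib radar projection and now returns a matplotlib Figure, which is why the `gr.Plot` component in app.py keeps working unchanged. For reviewers who want to smoke-test it outside Gradio, here is a minimal sketch; the sample text is illustrative, and it assumes `config.yaml` points `READABILITY_MODEL_ID` at a GPT-2-style checkpoint such as `gpt2` (not something this commit guarantees):

```python
# Sketch: exercise the new depth_analysis() locally (not part of this commit).
# Assumes config.yaml contains READABILITY_MODEL_ID (e.g. "gpt2") and that
# writing_analysis.py from this commit is on the import path.
from analysis import depth_analysis

sample = (
    "The quick brown fox jumps over the lazy dog. "
    "Readable prose mixes short sentences with longer, more elaborate ones."
)
fig = depth_analysis(sample)         # matplotlib Figure with a 9-axis radar plot
fig.savefig("writing_analysis.png")  # each axis is normalize()d into [0, 100]
```

Since `normalize` clamps its output to [0, 100], a metric that falls outside its hard-coded `usual_ranges` simply saturates at the plot boundary rather than stretching the axis.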
app.py CHANGED
@@ -232,6 +232,59 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
             writing_analysis_plot = gr.Plot(label="Writing Analysis Plot")
+        with gr.Column():
+            interpretation = """
+            <h2>Writing Analysis Interpretation</h2>
+            <ul>
+                <li><b>Lexical Diversity</b>: This feature measures the range of unique words used in a text.
+                    <ul>
+                        <li>🤖 Higher tends to be AI.</li>
+                    </ul>
+                </li>
+                <li><b>Vocabulary Level</b>: This feature assesses the complexity of the words used in a text.
+                    <ul>
+                        <li>🤖 Higher tends to be AI.</li>
+                    </ul>
+                </li>
+                <li><b>Unique Words</b>: This feature counts the number of words that appear only once within the text.
+                    <ul>
+                        <li>🤖 Higher tends to be AI.</li>
+                    </ul>
+                </li>
+                <li><b>Determiner Use</b>: This feature tracks the frequency of articles and quantifiers in the text.
+                    <ul>
+                        <li>🤖 Higher tends to be AI.</li>
+                    </ul>
+                </li>
+                <li><b>Punctuation Variety</b>: This feature indicates the diversity of punctuation marks used in the text.
+                    <ul>
+                        <li>👤 Higher tends to be Human.</li>
+                    </ul>
+                </li>
+                <li><b>Sentence Depth</b>: This feature evaluates the complexity of the sentence structures used in the text.
+                    <ul>
+                        <li>🤖 Higher tends to be AI.</li>
+                    </ul>
+                </li>
+                <li><b>Vocabulary Stability</b>: This feature measures the consistency of vocabulary use throughout the text.
+                    <ul>
+                        <li>🤖 Higher tends to be AI.</li>
+                    </ul>
+                </li>
+                <li><b>Entity Ratio</b>: This feature calculates the proportion of named entities, such as names and places, within the text.
+                    <ul>
+                        <li>👤 Higher tends to be Human.</li>
+                    </ul>
+                </li>
+                <li><b>Perplexity</b>: This feature assesses the predictability of the text based on the sequence of words.
+                    <ul>
+                        <li>👤 Higher tends to be Human.</li>
+                    </ul>
+                </li>
+            </ul>
+
+            """
+            gr.HTML(interpretation, label="Interpretation of Writing Analysis")
 
     full_check_btn.click(
         fn=main,
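
The directional hints in the new panel correspond one-to-one to the radar axes computed in analysis.py. A quick way to spot-check a single axis is to call the underlying feature function directly; this sketch uses `type_token_ratio` from this commit's writing_analysis.py, with sample strings of my own invention:

```python
# Sketch: sanity-check the "Lexical Diversity" axis from the panel above.
from writing_analysis import type_token_ratio

repetitive = "the cat sat on the mat and the cat sat on the mat"
varied = "a sleek tabby lounged beside an embroidered cushion near the hearth"
print(type_token_ratio(repetitive))  # -> 0.5 with NLTK's default stopword list
print(type_token_ratio(varied))      # -> 1.0 (every content token is unique)
```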
explainability.py DELETED
@@ -1,119 +0,0 @@
-import re, textstat
-from nltk import FreqDist
-from nltk.corpus import stopwords
-from nltk.tokenize import word_tokenize, sent_tokenize
-import torch
-import nltk
-from tqdm import tqdm
-
-nltk.download("punkt")
-
-
-def normalize(value, min_value, max_value):
-    normalized_value = ((value - min_value) * 100) / (max_value - min_value)
-    return max(0, min(100, normalized_value))
-
-
-def preprocess_text1(text):
-    text = text.lower()
-    text = re.sub(r"[^\w\s]", "", text)  # remove punctuation
-    stop_words = set(stopwords.words("english"))  # remove stopwords
-    words = [word for word in text.split() if word not in stop_words]
-    words = [word for word in words if not word.isdigit()]  # remove numbers
-    return words
-
-
-def vocabulary_richness_ttr(words):
-    unique_words = set(words)
-    ttr = len(unique_words) / len(words) * 100
-    return ttr
-
-
-def calculate_gunning_fog(text):
-    """range 0-20"""
-    gunning_fog = textstat.gunning_fog(text)
-    return gunning_fog
-
-
-def calculate_automated_readability_index(text):
-    """range 1-20"""
-    ari = textstat.automated_readability_index(text)
-    return ari
-
-
-def calculate_flesch_reading_ease(text):
-    """range 0-100"""
-    fre = textstat.flesch_reading_ease(text)
-    return fre
-
-
-def preprocess_text2(text):
-    sentences = sent_tokenize(text)
-    words = [
-        word.lower()
-        for sent in sentences
-        for word in word_tokenize(sent)
-        if word.isalnum()
-    ]
-    stop_words = set(stopwords.words("english"))
-    words = [word for word in words if word not in stop_words]
-    return words, sentences
-
-
-def calculate_average_sentence_length(sentences):
-    """range 0-40 or 50 based on the histogram"""
-    total_words = sum(len(word_tokenize(sent)) for sent in sentences)
-    average_sentence_length = total_words / (len(sentences) + 0.0000001)
-    return average_sentence_length
-
-
-def calculate_average_word_length(words):
-    """range 0-8 based on the histogram"""
-    total_characters = sum(len(word) for word in words)
-    average_word_length = total_characters / (len(words) + 0.0000001)
-    return average_word_length
-
-
-def calculate_max_depth(sent):
-    return max(len(list(token.ancestors)) for token in sent)
-
-
-def calculate_syntactic_tree_depth(nlp, text):
-    """0-10 based on the histogram"""
-    doc = nlp(text)
-    sentence_depths = [calculate_max_depth(sent) for sent in doc.sents]
-    average_depth = (
-        sum(sentence_depths) / len(sentence_depths) if sentence_depths else 0
-    )
-    return average_depth
-
-
-def calculate_perplexity(text, model, tokenizer, device, stride=512):
-    """range 0-30 based on the histogram"""
-    encodings = tokenizer(text, return_tensors="pt")
-    max_length = model.config.n_positions
-    seq_len = encodings.input_ids.size(1)
-
-    nlls = []
-    prev_end_loc = 0
-    for begin_loc in tqdm(range(0, seq_len, stride)):
-        end_loc = min(begin_loc + max_length, seq_len)
-        trg_len = (
-            end_loc - prev_end_loc
-        )  # may be different from stride on last loop
-        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
-        target_ids = input_ids.clone()
-        target_ids[:, :-trg_len] = -100
-
-        with torch.no_grad():
-            outputs = model(input_ids, labels=target_ids)
-            neg_log_likelihood = outputs.loss
-
-        nlls.append(neg_log_likelihood)
-
-        prev_end_loc = end_loc
-        if end_loc == seq_len:
-            break
-
-    ppl = torch.exp(torch.stack(nlls).mean())
-    return ppl.item()
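
None of these helpers is lost outright: `calculate_max_depth`, `calculate_syntactic_tree_depth`, and `calculate_perplexity` reappear in writing_analysis.py below (the perplexity routine merely drops its tqdm progress bar). The routine follows the standard strided sliding-window recipe from the Hugging Face perplexity guide: collect one mean negative log-likelihood per window, then exponentiate the mean over windows. A toy restatement of that final step, with invented NLL values purely for illustration:

```python
# Sketch: the final arithmetic of calculate_perplexity, with made-up NLLs.
import torch

nlls = [torch.tensor(3.2), torch.tensor(2.9), torch.tensor(3.1)]  # hypothetical per-window NLLs
ppl = torch.exp(torch.stack(nlls).mean())  # perplexity = exp(mean NLL)
print(ppl.item())                          # exp(3.0667) ≈ 21.5
```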
requirements.txt CHANGED
@@ -25,4 +25,7 @@ sentence-transformers
 Unidecode
 python-dotenv
 lime
-joblib
+joblib
+emoji==1.6.1
+matplotlib
+seaborn
writing_analysis.py CHANGED
@@ -1,85 +1,153 @@
-import re, textstat
-from nltk import FreqDist
+import string
+from collections import Counter
+from nltk import word_tokenize
 from nltk.corpus import stopwords
-from nltk.tokenize import word_tokenize, sent_tokenize
+from nltk.stem import WordNetLemmatizer
+from nltk.probability import FreqDist
 import torch
-from tqdm import tqdm
-
-
-def normalize(value, min_value, max_value):
-    normalized_value = ((value - min_value) * 100) / (max_value - min_value)
-    return max(0, min(100, normalized_value))
-
-# vocabulary richness
-def preprocess_text1(text):
-    text = text.lower()
-    text = re.sub(r'[^\w\s]', '', text) # remove punctuation
-    stop_words = set(stopwords.words('english')) # remove stopwords
-    words = [word for word in text.split() if word not in stop_words]
-    words = [word for word in words if not word.isdigit()] # remove numbers
-    return words
-
-def vocabulary_richness_ttr(words):
-    unique_words = set(words)
-    ttr = len(unique_words) / len(words) * 100
-    return ttr
-
-def calculate_gunning_fog(text):
-    """range 0-20"""
-    gunning_fog = textstat.gunning_fog(text)
-    return gunning_fog
-
-def calculate_automated_readability_index(text):
-    """range 1-20"""
-    ari = textstat.automated_readability_index(text)
-    return ari
-
-def calculate_flesch_reading_ease(text):
-    """range 0-100"""
-    fre = textstat.flesch_reading_ease(text)
-    return fre
-
-def preprocess_text2(text):
-    # tokenize into words and remove punctuation
-    sentences = sent_tokenize(text)
-    words = [word.lower() for sent in sentences for word in word_tokenize(sent) if word.isalnum()]
-    # remove stopwords
-    stop_words = set(stopwords.words('english'))
-    words = [word for word in words if word not in stop_words]
-    return words, sentences
-
-def calculate_average_sentence_length(sentences):
-    """range 0-40 or 50 based on the histogram"""
-    total_words = sum(len(word_tokenize(sent)) for sent in sentences)
-    average_sentence_length = total_words / (len(sentences) + 0.0000001)
-    return average_sentence_length
-
-def calculate_average_word_length(words):
-    """range 0-8 based on the histogram"""
-    total_characters = sum(len(word) for word in words)
-    average_word_length = total_characters / (len(words) + 0.0000001)
-    return average_word_length
 
+
+def preprocess_text(text, remove_stopwords=True, use_lemmatization=True):
+    tokens = word_tokenize(text.lower())
+    tokens = [token for token in tokens if token.isalpha()]
+    if remove_stopwords:
+        stop_words = set(stopwords.words("english"))
+        tokens = [token for token in tokens if token not in stop_words]
+    if use_lemmatization:
+        lemmatizer = WordNetLemmatizer()
+        tokens = [lemmatizer.lemmatize(token) for token in tokens]
+    return tokens
+
+
+def get_special_chars():
+    import emoji  # Use version emoji==1.6.1, otherwise it won't have UNICODE_EMOJI
+
+    main_special_characters = string.punctuation + string.digits + string.whitespace
+    other_special_characters = (
+        "’ “— ™ – •‘œ    ˜ ‚ƒ„’“”–ー一▬…✦�­£​•€«»°·═"
+        "×士^˘⇓↓↑←→()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰……‑≤≥‖"
+        "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†上ン:∼⁄・♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚"
+        "゜ʼ≖ʼ¤ッツシ℃√!【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖"
+        "」﴾》"
+    )
+    emoji = list(emoji.UNICODE_EMOJI["en"].keys())
+    special_characters_default = set(main_special_characters + other_special_characters)
+    special_characters_default.update(emoji)
+    return special_characters_default
+
+special_characters_default = get_special_chars()
+
+
+# -------------------- Features --------------------
+def syllable_count(word, d):
+    return [len(list(y for y in x if y[-1].isdigit())) for x in d.get(word, [])]
+
+
+def estimated_slightly_difficult_words_ratio(text, d):
+    words = word_tokenize(text.lower())
+    total_words = len(words)
+    # Considering words with 3 or more syllables as difficult
+    difficult_count = sum(
+        1 for word in words if sum(1 for _ in syllable_count(word, d)) >= 2
+    )
+    return difficult_count / total_words if total_words > 0 else 0
+
+
+# -------------------- Features --------------------
+def entity_density(text, nlp):
+    doc = nlp(text)
+    return len(doc.ents) / len(doc)
+
+
+# -------------------- Features --------------------
+def determiners_frequency(text, nlp):
+    doc = nlp(text)
+    determiners = sum(1 for token in doc if token.pos_ == "DET")
+    total_words = len(doc)
+    return determiners / total_words if total_words else 0
+
+
+# -------------------- Features --------------------
+def punctuation_diversity(text):
+    punctuation_counts = Counter(
+        char for char in text if char in special_characters_default
+    )
+    diversity_score = (
+        len(punctuation_counts) / len(special_characters_default)
+        if special_characters_default
+        else 0
+    )
+    return diversity_score
+
+
+# -------------------- Features --------------------
+def type_token_ratio(text, remove_stopwords=True, use_lemmatization=True):
+    tokens = preprocess_text(text, remove_stopwords, use_lemmatization)
+    unique_words = set(tokens)
+    return len(unique_words) / len(tokens) if tokens else 0
+
+
+# -------------------- Features --------------------
+def hapax_legomena_ratio(text, remove_stopwords=True, use_lemmatization=True):
+    tokens = word_tokenize(text.lower())
+    tokens = [token for token in tokens if token.isalpha()]
+
+    if remove_stopwords:
+        stop_words = set(stopwords.words("english"))
+        tokens = [token for token in tokens if token not in stop_words]
+
+    if use_lemmatization:
+        lemmatizer = WordNetLemmatizer()
+        tokens = [lemmatizer.lemmatize(token) for token in tokens]
+
+    freq_dist = FreqDist(tokens)
+    hapaxes = freq_dist.hapaxes()
+    return len(hapaxes) / len(tokens) if tokens else 0
+
+
+# -------------------- Features --------------------
+def mtld(text, threshold=0.72, remove_stopwords=True, use_lemmatization=True):
+    tokens = preprocess_text(text, remove_stopwords, use_lemmatization)
+
+    def mtld_calc(direction):
+        token_length, factor_count = 0, 0
+        types = set()
+        for token in tokens if direction == "forward" else reversed(tokens):
+            types.add(token)
+            token_length += 1
+            if len(types) / token_length < threshold:
+                factor_count += 1
+                types = set()
+                token_length = 0
+        factor_count += 1  # For the last segment, even if it didn't reach the threshold
+        return len(tokens) / factor_count if factor_count != 0 else 0
+
+    return (mtld_calc("forward") + mtld_calc("backward")) / 2
+
+
+# -------------------- Features --------------------
 def calculate_max_depth(sent):
     return max(len(list(token.ancestors)) for token in sent)
 
-def calculate_syntactic_tree_depth(nlp, text):
-    """0-10 based on the histogram"""
+
+def calculate_syntactic_tree_depth(text, nlp):
     doc = nlp(text)
     sentence_depths = [calculate_max_depth(sent) for sent in doc.sents]
-    average_depth = sum(sentence_depths) / len(sentence_depths) if sentence_depths else 0
+    average_depth = (
+        sum(sentence_depths) / len(sentence_depths) if sentence_depths else 0
+    )
     return average_depth
 
-# reference: https://huggingface.co/docs/transformers/perplexity
+
+# -------------------- Features --------------------
 def calculate_perplexity(text, model, tokenizer, device, stride=512):
-    """range 0-30 based on the histogram"""
     encodings = tokenizer(text, return_tensors="pt")
     max_length = model.config.n_positions
     seq_len = encodings.input_ids.size(1)
 
     nlls = []
     prev_end_loc = 0
-    for begin_loc in tqdm(range(0, seq_len, stride)):
+    for begin_loc in range(0, seq_len, stride):
         end_loc = min(begin_loc + max_length, seq_len)
         trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
         input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
@@ -88,6 +156,10 @@ def calculate_perplexity(text, model, tokenizer, device, stride=512):
 
         with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
+
+            # loss is calculated using CrossEntropyLoss which averages over valid labels
+            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
+            # to the left by 1.
            neg_log_likelihood = outputs.loss
 
        nlls.append(neg_log_likelihood)
@@ -98,3 +170,4 @@ def calculate_perplexity(text, model, tokenizer, device, stride=512):
 
     ppl = torch.exp(torch.stack(nlls).mean())
     return ppl.item()
+
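
Of the new features, `mtld` (Measure of Textual Lexical Diversity) is the least self-explanatory: each pass walks the tokens, closes a "factor" whenever the running type-token ratio of the current segment drops below 0.72, divides the token count by the number of factors, and the forward and backward passes are averaged. A hand-traced micro-example of the forward pass, with tokens invented for illustration and the loop body copied from `mtld_calc` above:

```python
# Sketch: hand-trace of the forward MTLD pass implemented above.
tokens = ["red", "blue", "red", "green", "blue", "red"]
threshold = 0.72

token_length, factor_count = 0, 0
types = set()
for token in tokens:
    types.add(token)
    token_length += 1
    if len(types) / token_length < threshold:  # fires at the 3rd token: 2/3 < 0.72
        factor_count += 1                      # close this factor...
        types, token_length = set(), 0         # ...and start a fresh segment
factor_count += 1                              # trailing partial segment counts once
print(len(tokens) / factor_count)              # 6 / 2 = 3.0 -> forward MTLD
```

Higher MTLD means the text sustains lexical variety over longer stretches before repeating itself, which is why the corresponding radar axis is labeled "Vocabulary Stability".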