Spaces: aliasgerovs (Space status: Sleeping)
aliasgerovs committed: Merge branch 'main' into demo

Changed files:
- analysis.py          +172 -72
- app.py                +73 -12
- explainability.py       +0 -119
- plagiarism.py         +141 -92
- requirements.txt        +4 -1
- writing_analysis.py   +138 -65
analysis.py  CHANGED

@@ -1,31 +1,42 @@
-import requests
-import httpx
-import torch
-import re
-from bs4 import BeautifulSoup
-import numpy as np
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
-import asyncio
-from scipy.special import softmax
-from evaluate import load
-from datetime import date
-import nltk
-import fitz
-from transformers import GPT2LMHeadModel, GPT2TokenizerFast
-import nltk, spacy, subprocess, torch
-import plotly.graph_objects as go
-import torch.nn.functional as F
-import nltk
-from unidecode import unidecode
-import time
 import yaml
-import nltk
-import os
-from explainability import *
 import subprocess
+import nltk
+from nltk import word_tokenize
+from nltk.corpus import cmudict, stopwords
+import spacy
+import torch
+from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+import matplotlib.pyplot as plt
+import numpy as np
+
+from matplotlib.patches import Circle, RegularPolygon
+from matplotlib.path import Path
+from matplotlib.projections import register_projection
+from matplotlib.projections.polar import PolarAxes
+from matplotlib.spines import Spine
+from matplotlib.transforms import Affine2D
+from writing_analysis import (
+    estimated_slightly_difficult_words_ratio,
+    entity_density,
+    determiners_frequency,
+    punctuation_diversity,
+    type_token_ratio,
+    calculate_perplexity,
+    calculate_syntactic_tree_depth,
+    hapax_legomena_ratio,
+    mtld,
+)

+nltk.download("cmudict")
 nltk.download("punkt")
 nltk.download("stopwords")
+nltk.download("wordnet")
+d = cmudict.dict()
+command = ["python3", "-m", "spacy", "download", "en_core_web_sm"]
+subprocess.run(command)
+nlp = spacy.load("en_core_web_sm")
+
+
 with open("config.yaml", "r") as file:
     params = yaml.safe_load(file)
 device = "cuda" if torch.cuda.is_available() else "cpu"

@@ -33,64 +44,153 @@ readability_model_id = params["READABILITY_MODEL_ID"]
 gpt2_model = GPT2LMHeadModel.from_pretrained(readability_model_id).to(device)
 gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(readability_model_id)

+
+def normalize(value, min_value, max_value):
+    normalized_value = ((value - min_value) * 100) / (max_value - min_value)
+    return max(0, min(100, normalized_value))


 def depth_analysis(input_text):
+    usual_ranges = {
+        "estimated_slightly_difficult_words_ratio": (
+            0.2273693623058005,
+            0.557383692351033,
+        ),
+        "entity_density": (-0.07940776754145815, 0.23491038179986615),
+        "determiners_frequency": (0.012461059190031154, 0.15700934579439252),
+        "punctuation_diversity": (-0.21875, 0.53125),
+        "type_token_ratio": (0.33002482852189063, 1.0894414982357028),
+        "calculate_perplexity": (-25.110544681549072, 82.4620680809021),
+        "calculate_syntactic_tree_depth": (1.8380681818181812, 10.997159090909092),
+        "hapax_legomena_ratio": (0.0830971690138207, 1.0302715687215778),
+        "mtld": (-84.03125000000001, 248.81875000000002),
+    }
+
+    vocabulary_level = estimated_slightly_difficult_words_ratio(input_text, d)
+    entity_ratio = entity_density(input_text, nlp)
+    determiner_use = determiners_frequency(input_text, nlp)
+    punctuation_variety = punctuation_diversity(input_text)
+    sentence_depth = calculate_syntactic_tree_depth(input_text, nlp)
+    perplexity = calculate_perplexity(input_text, gpt2_model, gpt2_tokenizer, device)
+    lexical_diversity = type_token_ratio(input_text)
+    unique_words = hapax_legomena_ratio(input_text)
+    vocabulary_stability = mtld(input_text)
+
+    # normalize between 0 and 100
+    vocabulary_level_norm = normalize(
+        vocabulary_level, *usual_ranges["estimated_slightly_difficult_words_ratio"]
+    )
+    entity_ratio_norm = normalize(entity_ratio, *usual_ranges["entity_density"])
+    determiner_use_norm = normalize(
+        determiner_use, *usual_ranges["determiners_frequency"]
     )
+    punctuation_variety_norm = normalize(
+        punctuation_variety, *usual_ranges["punctuation_diversity"]
     )
+    lexical_diversity_norm = normalize(
+        lexical_diversity, *usual_ranges["type_token_ratio"]
     )
+    unique_words_norm = normalize(unique_words, *usual_ranges["hapax_legomena_ratio"])
+    vocabulary_stability_norm = normalize(vocabulary_stability, *usual_ranges["mtld"])
+    sentence_depth_norm = normalize(
+        sentence_depth, *usual_ranges["calculate_syntactic_tree_depth"]
     )
+    perplexity_norm = normalize(perplexity, *usual_ranges["calculate_perplexity"])

     features = {
+        "Lexical Diversity": lexical_diversity_norm,
+        "Vocabulary Level": vocabulary_level_norm,
+        "Unique Words": unique_words_norm,
+        "Determiner Use": determiner_use_norm,
+        "Punctuation Variety": punctuation_variety_norm,
+        "Sentence Depth": sentence_depth_norm,
+        "Vocabulary Stability": vocabulary_stability_norm,
+        "Entity Ratio": entity_ratio_norm,
+        "Perplexity": perplexity_norm,
     }
+
+    def radar_factory(num_vars, frame="circle"):
+        theta = np.linspace(0, 2 * np.pi, num_vars, endpoint=False)
+
+        class RadarTransform(PolarAxes.PolarTransform):
+            def transform_path_non_affine(self, path):
+                if path._interpolation_steps > 1:
+                    path = path.interpolated(num_vars)
+                return Path(self.transform(path.vertices), path.codes)
+
+        class RadarAxes(PolarAxes):
+            name = "radar"
+            PolarTransform = RadarTransform
+
+            def __init__(self, *args, **kwargs):
+                super().__init__(*args, **kwargs)
+                self.set_theta_zero_location("N")
+
+            def fill(self, *args, closed=True, **kwargs):
+                return super().fill(closed=closed, *args, **kwargs)
+
+            def plot(self, *args, **kwargs):
+                lines = super().plot(*args, **kwargs)
+                for line in lines:
+                    self._close_line(line)
+
+            def _close_line(self, line):
+                x, y = line.get_data()
+                if x[0] != x[-1]:
+                    x = np.append(x, x[0])
+                    y = np.append(y, y[0])
+                    line.set_data(x, y)
+
+            def set_varlabels(self, labels):
+                self.set_thetagrids(np.degrees(theta), labels)
+
+            def _gen_axes_patch(self):
+                if frame == "circle":
+                    return Circle((0.5, 0.5), 0.5)
+                elif frame == "polygon":
+                    return RegularPolygon(
+                        (0.5, 0.5), num_vars, radius=0.5, edgecolor="k"
+                    )
+
+            def _gen_axes_spines(self):
+                if frame == "polygon":
+                    spine = Spine(
+                        axes=self,
+                        spine_type="circle",
+                        path=Path.unit_regular_polygon(num_vars),
+                    )
+                    spine.set_transform(
+                        Affine2D().scale(0.5).translate(0.5, 0.5) + self.transAxes
+                    )
+                    return {"polar": spine}
+
+        register_projection(RadarAxes)
+        return theta
+
+    N = 9
+    theta = radar_factory(N, frame="polygon")
+    data = features.values()
+    labels = features.keys()
+    fig, ax = plt.subplots(subplot_kw=dict(projection="radar"), figsize=(7.5, 5))
+    ax.plot(theta, data)
+    ax.fill(theta, data, alpha=0.4)
+    ax.set_varlabels(labels)
+
+    rgrids = np.linspace(0, 100, num=6)
+    ax.set_rgrids(
+        rgrids, labels=[f"{round(r)}%" for r in rgrids], fontsize=8, color="black"
    )
+    ax.grid(True, color="black", linestyle="-", linewidth=0.5, alpha=0.5)
+
+    for dd, (label, value) in enumerate(zip(labels, data)):
+        ax.text(
+            theta[dd] + 0.1,
+            value + 5,
+            f"{value:.0f}",
+            horizontalalignment="left",
+            verticalalignment="bottom",
+            fontsize=8,
+        )
+
     return fig
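For reference, every raw feature in the new depth_analysis is pushed through normalize() against the hard-coded usual_ranges, i.e. linearly rescaled to a 0-100 band and clamped at both ends. A minimal standalone sketch of that scaling (the raw values below are made up, not outputs of the real pipeline):

def normalize(value, min_value, max_value):
    # linear rescale to 0-100, clamped at both ends (same as the committed helper)
    normalized_value = ((value - min_value) * 100) / (max_value - min_value)
    return max(0, min(100, normalized_value))

# hypothetical raw scores checked against the committed type_token_ratio range
ttr_range = (0.33002482852189063, 1.0894414982357028)
print(normalize(0.60, *ttr_range))  # ~35.5: mid-range lexical diversity
print(normalize(0.10, *ttr_range))  # 0: clamped below the usual range
print(normalize(1.50, *ttr_range))  # 100: clamped above the usual range

Out-of-range inputs therefore saturate at the edge of the radar chart instead of stretching its scale.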
app.py  CHANGED

@@ -5,7 +5,7 @@ from predictors import predict_bc_scores, predict_mc_scores
 from predictors import update, correct_text, split_text
 from analysis import depth_analysis
 from predictors import predict_quillbot
-from plagiarism import plagiarism_check, build_date
+from plagiarism import plagiarism_check, build_date, html_highlight
 from highlighter import analyze_and_highlight
 from utils import extract_text_from_pdf, len_validator
 import yaml

@@ -21,7 +21,9 @@ model_list = params["MC_OUTPUT_LABELS"]


 analyze_and_highlight_bc = partial(analyze_and_highlight, model_type="bc")
+analyze_and_highlight_quillbot = partial(
+    analyze_and_highlight, model_type="quillbot"
+)


 def ai_generated_test(option, input, models):

@@ -47,7 +49,18 @@ def main(
     domains_to_skip,
 ):

-    formatted_tokens = plagiarism_check(
+    # formatted_tokens = plagiarism_check(
+    #     plag_option,
+    #     input,
+    #     year_from,
+    #     month_from,
+    #     day_from,
+    #     year_to,
+    #     month_to,
+    #     day_to,
+    #     domains_to_skip,
+    # )
+    formatted_tokens = html_highlight(
         plag_option,
         input,
         year_from,

@@ -218,20 +231,67 @@ with gr.Blocks() as demo:

     with gr.Row():
         with gr.Column():
-                color_map={
-                    "[1]": "red",
-                    "[2]": "orange",
-                    "[3]": "yellow",
-                    "[4]": "green",
-                },
+            sentenceBreakdown = gr.HTML(
                 label="Source Detection Sentence Breakdown",
+                value="Source Detection Sentence Breakdown",
             )

     with gr.Row():
         with gr.Column():
             writing_analysis_plot = gr.Plot(label="Writing Analysis Plot")
+        with gr.Column():
+            interpretation = """
+            <h2>Writing Analysis Interpretation</h2>
+            <ul>
+                <li><b>Lexical Diversity</b>: This feature measures the range of unique words used in a text.
+                    <ul>
+                        <li>🤖 Higher tends to be AI.</li>
+                    </ul>
+                </li>
+                <li><b>Vocabulary Level</b>: This feature assesses the complexity of the words used in a text.
+                    <ul>
+                        <li>🤖 Higher tends to be AI.</li>
+                    </ul>
+                </li>
+                <li><b>Unique Words</b>: This feature counts the number of words that appear only once within the text.
+                    <ul>
+                        <li>🤖 Higher tends to be AI.</li>
+                    </ul>
+                </li>
+                <li><b>Determiner Use</b>: This feature tracks the frequency of articles and quantifiers in the text.
+                    <ul>
+                        <li>🤖 Higher tends to be AI.</li>
+                    </ul>
+                </li>
+                <li><b>Punctuation Variety</b>: This feature indicates the diversity of punctuation marks used in the text.
+                    <ul>
+                        <li>👤 Higher tends to be Human.</li>
+                    </ul>
+                </li>
+                <li><b>Sentence Depth</b>: This feature evaluates the complexity of the sentence structures used in the text.
+                    <ul>
+                        <li>🤖 Higher tends to be AI.</li>
+                    </ul>
+                </li>
+                <li><b>Vocabulary Stability</b>: This feature measures the consistency of vocabulary use throughout the text.
+                    <ul>
+                        <li>🤖 Higher tends to be AI.</li>
+                    </ul>
+                </li>
+                <li><b>Entity Ratio</b>: This feature calculates the proportion of named entities, such as names and places, within the text.
+                    <ul>
+                        <li>👤 Higher tends to be Human.</li>
+                    </ul>
+                </li>
+                <li><b>Perplexity</b>: This feature assesses the predictability of the text based on the sequence of words.
+                    <ul>
+                        <li>👤 Higher tends to be Human.</li>
+                    </ul>
+                </li>
+            </ul>
+
+            """
+            gr.HTML(interpretation, label="Interpretation of Writing Analysis")

     full_check_btn.click(
         fn=main,

@@ -275,7 +335,8 @@ with gr.Blocks() as demo:
     )

     only_plagiarism_btn.click(
-        fn=plagiarism_check,
+        # fn=plagiarism_check,
+        fn=html_highlight,
         inputs=[
             plag_option,
             input_text,
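The UI change above replaces the old color-mapped sentence breakdown with a gr.HTML component whose value is the string returned by html_highlight. A minimal sketch of that wiring pattern, assuming gradio is installed; render_report here is a hypothetical stand-in for html_highlight, not the real function:

import gradio as gr


def render_report(text):
    # stand-in: the real html_highlight returns a styled <div> with [n] source tags
    return f"<div style='padding: 10px;'><p>{text} <span>[1]</span></p></div>"


with gr.Blocks() as demo:
    input_text = gr.Textbox(label="Input text")
    check_btn = gr.Button("Check Plagiarism")
    breakdown = gr.HTML(label="Source Detection Sentence Breakdown")
    check_btn.click(fn=render_report, inputs=[input_text], outputs=[breakdown])

# demo.launch()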
explainability.py  DELETED

@@ -1,119 +0,0 @@
-import re, textstat
-from nltk import FreqDist
-from nltk.corpus import stopwords
-from nltk.tokenize import word_tokenize, sent_tokenize
-import torch
-import nltk
-from tqdm import tqdm
-
-nltk.download("punkt")
-
-
-def normalize(value, min_value, max_value):
-    normalized_value = ((value - min_value) * 100) / (max_value - min_value)
-    return max(0, min(100, normalized_value))
-
-
-def preprocess_text1(text):
-    text = text.lower()
-    text = re.sub(r"[^\w\s]", "", text)  # remove punctuation
-    stop_words = set(stopwords.words("english"))  # remove stopwords
-    words = [word for word in text.split() if word not in stop_words]
-    words = [word for word in words if not word.isdigit()]  # remove numbers
-    return words
-
-
-def vocabulary_richness_ttr(words):
-    unique_words = set(words)
-    ttr = len(unique_words) / len(words) * 100
-    return ttr
-
-
-def calculate_gunning_fog(text):
-    """range 0-20"""
-    gunning_fog = textstat.gunning_fog(text)
-    return gunning_fog
-
-
-def calculate_automated_readability_index(text):
-    """range 1-20"""
-    ari = textstat.automated_readability_index(text)
-    return ari
-
-
-def calculate_flesch_reading_ease(text):
-    """range 0-100"""
-    fre = textstat.flesch_reading_ease(text)
-    return fre
-
-
-def preprocess_text2(text):
-    sentences = sent_tokenize(text)
-    words = [
-        word.lower()
-        for sent in sentences
-        for word in word_tokenize(sent)
-        if word.isalnum()
-    ]
-    stop_words = set(stopwords.words("english"))
-    words = [word for word in words if word not in stop_words]
-    return words, sentences
-
-
-def calculate_average_sentence_length(sentences):
-    """range 0-40 or 50 based on the histogram"""
-    total_words = sum(len(word_tokenize(sent)) for sent in sentences)
-    average_sentence_length = total_words / (len(sentences) + 0.0000001)
-    return average_sentence_length
-
-
-def calculate_average_word_length(words):
-    """range 0-8 based on the histogram"""
-    total_characters = sum(len(word) for word in words)
-    average_word_length = total_characters / (len(words) + 0.0000001)
-    return average_word_length
-
-
-def calculate_max_depth(sent):
-    return max(len(list(token.ancestors)) for token in sent)
-
-
-def calculate_syntactic_tree_depth(nlp, text):
-    """0-10 based on the histogram"""
-    doc = nlp(text)
-    sentence_depths = [calculate_max_depth(sent) for sent in doc.sents]
-    average_depth = (
-        sum(sentence_depths) / len(sentence_depths) if sentence_depths else 0
-    )
-    return average_depth
-
-
-def calculate_perplexity(text, model, tokenizer, device, stride=512):
-    """range 0-30 based on the histogram"""
-    encodings = tokenizer(text, return_tensors="pt")
-    max_length = model.config.n_positions
-    seq_len = encodings.input_ids.size(1)
-
-    nlls = []
-    prev_end_loc = 0
-    for begin_loc in tqdm(range(0, seq_len, stride)):
-        end_loc = min(begin_loc + max_length, seq_len)
-        trg_len = (
-            end_loc - prev_end_loc
-        )  # may be different from stride on last loop
-        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
-        target_ids = input_ids.clone()
-        target_ids[:, :-trg_len] = -100
-
-        with torch.no_grad():
-            outputs = model(input_ids, labels=target_ids)
-            neg_log_likelihood = outputs.loss
-
-        nlls.append(neg_log_likelihood)
-
-        prev_end_loc = end_loc
-        if end_loc == seq_len:
-            break
-
-    ppl = torch.exp(torch.stack(nlls).mean())
-    return ppl.item()
plagiarism.py  CHANGED

@@ -19,7 +19,6 @@ model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
 # returns cosine similarity of two vectors
 # input: two vectors
 # output: integer between 0 and 1.
-
 def get_cosine(vec1, vec2):
     intersection = set(vec1.keys()) & set(vec2.keys())

@@ -75,9 +74,9 @@ def sentence_similarity(text1, text2):
 def google_search(
     plag_option,
     sentences,
+    url_count,
+    score_array,
+    url_list,
     sorted_date,
     domains_to_skip,
     api_key,

@@ -112,30 +111,30 @@ def google_search(

             # update cosine similarity between snippet and given text
             url = link["link"]
+            if url not in url_list:
+                url_list.append(url)
+                score_array.append([0] * len(sentences))
+            url_count[url] = url_count[url] + 1 if url in url_count else 1
             if plag_option == "Standard":
+                score_array[url_list.index(url)][i] = cosineSim(
                     sentence, snippet
                 )
             else:
+                score_array[url_list.index(url)][i] = sentence_similarity(
                     sentence, snippet
                 )
+    return url_count, score_array


 def split_sentence_blocks(text):
-    sents = sent_tokenize(text)
     two_sents = []
+    for para in text.split("\n\n"):
+        sents = sent_tokenize(para)
+        for i in range(len(sents)):
+            if (i % 2) == 0:
+                two_sents.append(sents[i])
+            else:
+                two_sents[len(two_sents) - 1] += " " + sents[i]
     return two_sents

@@ -191,7 +190,6 @@ async def parallel_scrap(urls):
     return results


-
 def matching_score(sentence_content_tuple):
     sentence, content = sentence_content_tuple
     if sentence in content:

@@ -204,11 +202,99 @@ def matching_score(sentence_content_tuple):
     matched = [x for x in ngrams if " ".join(x) in content]
     return len(matched) / len(ngrams)

+
 def process_with_multiprocessing(input_data):
     with Pool(processes=4) as pool:
         scores = pool.map(matching_score, input_data)
     return scores
-
+
+
+def print2d(array):
+    for row in array:
+        print(row)
+
+
+def map_sentence_url(sentences, score_array):
+    sentenceToMaxURL = [-1] * len(sentences)
+    for j in range(len(sentences)):
+        if j > 0:
+            maxScore = score_array[sentenceToMaxURL[j - 1]][j]
+            sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
+        else:
+            maxScore = -1
+        for i in range(len(score_array)):
+            margin = (
+                0.05
+                if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
+                else 0
+            )
+            if score_array[i][j] - maxScore > margin:
+                maxScore = score_array[i][j]
+                sentenceToMaxURL[j] = i
+    return sentenceToMaxURL
+
+
+def html_highlight(
+    plag_option,
+    input,
+    year_from,
+    month_from,
+    day_from,
+    year_to,
+    month_to,
+    day_to,
+    domains_to_skip,
+):
+    sentence_scores, url_scores = plagiarism_check(
+        plag_option,
+        input,
+        year_from,
+        month_from,
+        day_from,
+        year_to,
+        month_to,
+        day_to,
+        domains_to_skip,
+    )
+    color_map = [
+        "#cf2323",
+        "#eb9d59",
+        "#c2ad36",
+        "#e1ed72",
+        "#c2db76",
+        "#a2db76",
+    ]
+    font = "Roboto"
+    html_content = "<link href='https://fonts.googleapis.com/css?family=Roboto' rel='stylesheet'>\n<div style='font-family: {font}; border: 2px solid black; background-color: #333333; padding: 10px; color: #FFFFFF;'>"
+    prev_idx = None
+    combined_sentence = ""
+    for sentence, _, _, idx in sentence_scores:
+        if idx != prev_idx and prev_idx is not None:
+            color = color_map[prev_idx - 1]
+            index_part = f'<span style="background-color: {color}; padding: 2px;">[{prev_idx}]</span>'
+            formatted_sentence = f"<p>{combined_sentence} {index_part}</p>"
+            html_content += formatted_sentence
+            combined_sentence = ""
+        combined_sentence += " " + sentence
+        prev_idx = idx
+
+    if combined_sentence:
+        color = color_map[prev_idx - 1]
+        index_part = f'<span style="background-color: {color}; padding: 2px;">[{prev_idx}]</span>'
+        formatted_sentence = f"<p>{combined_sentence} {index_part}</p>"
+        html_content += formatted_sentence
+
+    html_content += "<hr>"
+    for url, score, idx in url_scores:
+        color = color_map[idx - 1]
+        formatted_url = f'<p style="background-color: {color}; padding: 5px;">({idx}) <b>{url}</b></p><p> --- Matching Score: {score}%</p>'
+        html_content += formatted_url
+
+    html_content += "</div>"
+
+    return html_content
+
+
 def plagiarism_check(
     plag_option,
     input,

@@ -222,116 +308,79 @@ def plagiarism_check(
 ):
     api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
     api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
-    api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
+    # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
     # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
-    api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
+    # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
     cse_id = "851813e81162b4ed4"

+    url_scores = []
+    sentence_scores = []
     sentences = split_sentence_blocks(input)
+    url_count = {}
+    score_array = []
+    url_list = []
     date_from = build_date(year_from, month_from, day_from)
     date_to = build_date(year_to, month_to, day_to)
     sort_date = f"date:r:{date_from}:{date_to}"
     # get list of URLS to check
+    url_count, score_array = google_search(
         plag_option,
         sentences,
+        url_count,
+        score_array,
+        url_list,
         sort_date,
         domains_to_skip,
         api_key,
         cse_id,
     )
     # Scrape URLs in list
-    soups = asyncio.run(parallel_scrap(urlList))
+    soups = asyncio.run(parallel_scrap(url_list))

-    # # Populate matching scores for scrapped pages
-    # for i, soup in enumerate(soups):
-    #     print(f"Analyzing {i+1} of {len(soups)} soups........................")
-    #     if soup:
-    #         page_content = soup.text
-
-    #         for j, sent in enumerate(sentences):
-    #             args_list = (sent, page_content)
-    #             score = matching_score(args_list)
-    #             # score = cos_sim_torch(embed_text(sent), source_embeddings[i])
-    #             ScoreArray[i][j] = score
-
     input_data = []
     for i, soup in enumerate(soups):
         if soup:
             page_content = soup.text
             for j, sent in enumerate(sentences):
                 input_data.append((sent, page_content))
     scores = process_with_multiprocessing(input_data)
+
+    k = 0
+    # Update score array for each (soup, sentence)
     for i, soup in enumerate(soups):
         if soup:
             for j, _ in enumerate(sentences):
+                score_array[i][j] = scores[k]
+                k += 1

-    sentenceToMaxURL = [-1] * len(sentences)
-    for j in range(len(sentences)):
-        if j > 0:
-            maxScore = ScoreArray[sentenceToMaxURL[j - 1]][j]
-            sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
-        else:
-            maxScore = -1
-        for i in range(len(ScoreArray)):
-            margin = (
-                0.1
-                if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
-                else 0
-            )
-            if ScoreArray[i][j] - maxScore > margin:
-                maxScore = ScoreArray[i][j]
-                sentenceToMaxURL[j] = i
+    sentenceToMaxURL = map_sentence_url(sentences, score_array)
     index = np.unique(sentenceToMaxURL)

+    url_source = {}
     for url in index:
         s = [
+            score_array[url][sen]
             for sen in range(len(sentences))
             if sentenceToMaxURL[sen] == url
         ]
+        url_source[url] = sum(s) / len(s)
-    index_descending = sorted(urlScore, key=urlScore.get, reverse=True)
+    index_descending = sorted(url_source, key=url_source.get, reverse=True)
     urlMap = {}
     for count, i in enumerate(index_descending):
         urlMap[i] = count + 1
+
+    # build results
     for i, sent in enumerate(sentences):
+        ind = sentenceToMaxURL[i]
+        if url_source[ind] > 0.1:
+            sentence_scores.append(
+                [sent, url_source[ind], url_list[ind], urlMap[ind]]
+            )
+        else:
+            sentence_scores.append([sent, None, url_list[ind], -1])
     for ind in index_descending:
+        if url_source[ind] > 0.1:
+            url_scores.append(
+                [url_list[ind], round(url_source[ind] * 100, 2), urlMap[ind]]
             )
-    formatted_tokens.append(("\n", None))

+    return sentence_scores, url_scores
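The new map_sentence_url helper factors the URL-assignment loop out of plagiarism_check and tightens the switch margin from 0.1 to 0.05: a sentence keeps the previous sentence's best URL unless another source beats it by that margin. Restating the committed logic so it can be run standalone on a toy score_array (2 candidate URLs x 3 sentence blocks, made-up scores):

def map_sentence_url(sentences, score_array):
    sentenceToMaxURL = [-1] * len(sentences)
    for j in range(len(sentences)):
        if j > 0:
            maxScore = score_array[sentenceToMaxURL[j - 1]][j]
            sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
        else:
            maxScore = -1
        for i in range(len(score_array)):
            # sticky assignment: another URL must beat the inherited one by 0.05
            margin = (
                0.05
                if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
                else 0
            )
            if score_array[i][j] - maxScore > margin:
                maxScore = score_array[i][j]
                sentenceToMaxURL[j] = i
    return sentenceToMaxURL


sentences = ["s0", "s1", "s2"]
score_array = [
    [0.90, 0.40, 0.10],  # URL 0
    [0.20, 0.42, 0.80],  # URL 1
]
print(map_sentence_url(sentences, score_array))
# [0, 0, 1]: s1 stays on URL 0 because 0.42 beats 0.40 by less than the margin,
# while s2 switches to URL 1 because 0.80 clears it easily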
requirements.txt  CHANGED

@@ -28,4 +28,7 @@ lime
 joblib
 optimum
 clean-text
-optimum[onnxruntime]
+optimum[onnxruntime]
+emoji==1.6.1
+matplotlib
+seaborn
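The emoji==1.6.1 pin exists because get_special_chars() in writing_analysis.py reads emoji.UNICODE_EMOJI["en"], an attribute that later major versions of the emoji package dropped, while matplotlib backs the new radar plot in analysis.py. A quick check of that assumption, runnable only with the pinned version installed:

import emoji  # expects emoji==1.6.1 as pinned in requirements.txt

# On 1.6.1 this is a dict keyed by language code; newer releases removed it.
emoji_chars = list(emoji.UNICODE_EMOJI["en"].keys())
print(len(emoji_chars) > 0)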
writing_analysis.py  CHANGED

@@ -1,85 +1,153 @@
+import string
+from collections import Counter
+from nltk import word_tokenize
 from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+from nltk.probability import FreqDist
 import torch
-from tqdm import tqdm
-
-
-def normalize(value, min_value, max_value):
-    normalized_value = ((value - min_value) * 100) / (max_value - min_value)
-    return max(0, min(100, normalized_value))
-
-# vocabulary richness
-def preprocess_text1(text):
-    text = text.lower()
-    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
-    stop_words = set(stopwords.words('english'))  # remove stopwords
-    words = [word for word in text.split() if word not in stop_words]
-    words = [word for word in words if not word.isdigit()]  # remove numbers
-    return words
-
-def vocabulary_richness_ttr(words):
-    unique_words = set(words)
-    ttr = len(unique_words) / len(words) * 100
-    return ttr
-
-def calculate_gunning_fog(text):
-    """range 0-20"""
-    gunning_fog = textstat.gunning_fog(text)
-    return gunning_fog
-
-def calculate_automated_readability_index(text):
-    """range 1-20"""
-    ari = textstat.automated_readability_index(text)
-    return ari
-
-def calculate_flesch_reading_ease(text):
-    """range 0-100"""
-    fre = textstat.flesch_reading_ease(text)
-    return fre
-
-def preprocess_text2(text):
-    # tokenize into words and remove punctuation
-    sentences = sent_tokenize(text)
-    words = [word.lower() for sent in sentences for word in word_tokenize(sent) if word.isalnum()]
-    # remove stopwords
-    stop_words = set(stopwords.words('english'))
-    words = [word for word in words if word not in stop_words]
-    return words, sentences
-
-def calculate_average_sentence_length(sentences):
-    """range 0-40 or 50 based on the histogram"""
-    total_words = sum(len(word_tokenize(sent)) for sent in sentences)
-    average_sentence_length = total_words / (len(sentences) + 0.0000001)
-    return average_sentence_length
-
-def calculate_average_word_length(words):
-    """range 0-8 based on the histogram"""
-    total_characters = sum(len(word) for word in words)
-    average_word_length = total_characters / (len(words) + 0.0000001)
-    return average_word_length
+
+
+def preprocess_text(text, remove_stopwords=True, use_lemmatization=True):
+    tokens = word_tokenize(text.lower())
+    tokens = [token for token in tokens if token.isalpha()]
+    if remove_stopwords:
+        stop_words = set(stopwords.words("english"))
+        tokens = [token for token in tokens if token not in stop_words]
+    if use_lemmatization:
+        lemmatizer = WordNetLemmatizer()
+        tokens = [lemmatizer.lemmatize(token) for token in tokens]
+    return tokens
+
+
+def get_special_chars():
+    import emoji  # Use version emoji==1.6.1, otherwise it won't have UNICODE_EMOJI
+
+    main_special_characters = string.punctuation + string.digits + string.whitespace
+    other_special_characters = (
+        " ’“”–ー一▬…✦�£•€«»°·═"
+        "×士^˘⇓↓↑←→()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰‑≤≥‖"
+        "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†上ン:∼⁄・♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚"
+        "゜ʼ≖ʼ¤ッツシ℃√!【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖"
+        "」﴾》"
+    )
+    emoji = list(emoji.UNICODE_EMOJI["en"].keys())
+    special_characters_default = set(main_special_characters + other_special_characters)
+    special_characters_default.update(emoji)
+    return special_characters_default
+
+
+special_characters_default = get_special_chars()
+
+
+# -------------------- Features --------------------
+def syllable_count(word, d):
+    return [len(list(y for y in x if y[-1].isdigit())) for x in d.get(word, [])]
+
+
+def estimated_slightly_difficult_words_ratio(text, d):
+    words = word_tokenize(text.lower())
+    total_words = len(words)
+    # Considering words with 3 or more syllables as difficult
+    difficult_count = sum(
+        1 for word in words if sum(1 for _ in syllable_count(word, d)) >= 2
+    )
+    return difficult_count / total_words if total_words > 0 else 0
+
+
+# -------------------- Features --------------------
+def entity_density(text, nlp):
+    doc = nlp(text)
+    return len(doc.ents) / len(doc)
+
+
+# -------------------- Features --------------------
+def determiners_frequency(text, nlp):
+    doc = nlp(text)
+    determiners = sum(1 for token in doc if token.pos_ == "DET")
+    total_words = len(doc)
+    return determiners / total_words if total_words else 0
+
+
+# -------------------- Features --------------------
+def punctuation_diversity(text):
+    punctuation_counts = Counter(
+        char for char in text if char in special_characters_default
+    )
+    diversity_score = (
+        len(punctuation_counts) / len(special_characters_default)
+        if special_characters_default
+        else 0
+    )
+    return diversity_score
+
+
+# -------------------- Features --------------------
+def type_token_ratio(text, remove_stopwords=True, use_lemmatization=True):
+    tokens = preprocess_text(text, remove_stopwords, use_lemmatization)
+    unique_words = set(tokens)
+    return len(unique_words) / len(tokens) if tokens else 0
+
+
+# -------------------- Features --------------------
+def hapax_legomena_ratio(text, remove_stopwords=True, use_lemmatization=True):
+    tokens = word_tokenize(text.lower())
+    tokens = [token for token in tokens if token.isalpha()]
+
+    if remove_stopwords:
+        stop_words = set(stopwords.words("english"))
+        tokens = [token for token in tokens if token not in stop_words]
+
+    if use_lemmatization:
+        lemmatizer = WordNetLemmatizer()
+        tokens = [lemmatizer.lemmatize(token) for token in tokens]
+
+    freq_dist = FreqDist(tokens)
+    hapaxes = freq_dist.hapaxes()
+    return len(hapaxes) / len(tokens) if tokens else 0
+
+
+# -------------------- Features --------------------
+def mtld(text, threshold=0.72, remove_stopwords=True, use_lemmatization=True):
+    tokens = preprocess_text(text, remove_stopwords, use_lemmatization)
+
+    def mtld_calc(direction):
+        token_length, factor_count = 0, 0
+        types = set()
+        for token in tokens if direction == "forward" else reversed(tokens):
+            types.add(token)
+            token_length += 1
+            if len(types) / token_length < threshold:
+                factor_count += 1
+                types = set()
+                token_length = 0
+        factor_count += 1  # For the last segment, even if it didn't reach the threshold
+        return len(tokens) / factor_count if factor_count != 0 else 0
+
+    return (mtld_calc("forward") + mtld_calc("backward")) / 2
+
+
+# -------------------- Features --------------------
 def calculate_max_depth(sent):
     return max(len(list(token.ancestors)) for token in sent)

-
+
+def calculate_syntactic_tree_depth(text, nlp):
     doc = nlp(text)
     sentence_depths = [calculate_max_depth(sent) for sent in doc.sents]
+    average_depth = (
+        sum(sentence_depths) / len(sentence_depths) if sentence_depths else 0
+    )
     return average_depth

+
+# -------------------- Features --------------------
 def calculate_perplexity(text, model, tokenizer, device, stride=512):
-    """range 0-30 based on the histogram"""
     encodings = tokenizer(text, return_tensors="pt")
     max_length = model.config.n_positions
     seq_len = encodings.input_ids.size(1)

     nlls = []
     prev_end_loc = 0
+    for begin_loc in range(0, seq_len, stride):
         end_loc = min(begin_loc + max_length, seq_len)
         trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
         input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)

@@ -88,6 +156,10 @@ def calculate_perplexity(text, model, tokenizer, device, stride=512):

         with torch.no_grad():
             outputs = model(input_ids, labels=target_ids)
+
+            # loss is calculated using CrossEntropyLoss which averages over valid labels
+            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
+            # to the left by 1.
             neg_log_likelihood = outputs.loss

         nlls.append(neg_log_likelihood)

@@ -98,3 +170,4 @@ def calculate_perplexity(text, model, tokenizer, device, stride=512):

     ppl = torch.exp(torch.stack(nlls).mean())
     return ppl.item()
+
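Among the new features, mtld (Measure of Textual Lexical Diversity, surfaced in the UI as "Vocabulary Stability") is the least obvious: it counts how many times the running type/token ratio collapses below a threshold and divides the token count by that factor count, averaged over a forward and a backward pass. A stripped-down sketch of the factor counting on a pre-tokenized list, omitting the NLTK preprocessing the committed function performs:

def mtld_sketch(tokens, threshold=0.72):
    def mtld_calc(direction):
        token_length, factor_count = 0, 0
        types = set()
        for token in tokens if direction == "forward" else reversed(tokens):
            types.add(token)
            token_length += 1
            # a "factor" closes whenever the running type/token ratio drops
            # below the threshold; a fresh segment then starts
            if len(types) / token_length < threshold:
                factor_count += 1
                types = set()
                token_length = 0
        factor_count += 1  # count the trailing partial segment
        return len(tokens) / factor_count if factor_count != 0 else 0

    return (mtld_calc("forward") + mtld_calc("backward")) / 2


repetitive = "the cat sat on the mat the cat sat on the mat".split()
varied = "every author leaves subtle statistical fingerprints in prose".split()
print(mtld_sketch(repetitive))  # lower: the vocabulary repeats quickly
print(mtld_sketch(varied))      # higher: the type/token ratio never collapses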