Merge branch 'minko'

Files changed:
- __pycache__/analysis.cpython-311.pyc +0 -0
- __pycache__/app.cpython-311.pyc +0 -0
- __pycache__/explainability.cpython-311.pyc +0 -0
- __pycache__/plagiarism.cpython-311.pyc +0 -0
- __pycache__/predictors.cpython-311.pyc +0 -0
- __pycache__/utils.cpython-311.pyc +0 -0
- analysis.py +98 -0
- app.py +43 -518
- explainability.py +119 -0
- plagiarism.py +344 -0
- predictors.py +246 -0
- requirements.txt +2 -2
- utils.py +327 -257
__pycache__/analysis.cpython-311.pyc
ADDED
Binary file (4.75 kB)

__pycache__/app.cpython-311.pyc
ADDED
Binary file (10.9 kB)

__pycache__/explainability.cpython-311.pyc
ADDED
Binary file (7.89 kB)

__pycache__/plagiarism.cpython-311.pyc
ADDED
Binary file (14.1 kB)

__pycache__/predictors.cpython-311.pyc
ADDED
Binary file (12 kB)

__pycache__/utils.cpython-311.pyc
ADDED
Binary file (3.76 kB)
analysis.py
ADDED
@@ -0,0 +1,98 @@
+import requests
+import httpx
+import torch
+import re
+from bs4 import BeautifulSoup
+import numpy as np
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import asyncio
+from scipy.special import softmax
+from evaluate import load
+from datetime import date
+import nltk
+import fitz
+from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+import nltk, spacy, subprocess, torch
+import plotly.graph_objects as go
+import torch.nn.functional as F
+import nltk
+from unidecode import unidecode
+import time
+import yaml
+import nltk
+import os
+from explainability import *
+from dotenv import load_dotenv
+import subprocess
+
+nltk.download("punkt")
+nltk.download("stopwords")
+load_dotenv()
+with open("config.yaml", "r") as file:
+    params = yaml.safe_load(file)
+device = "cuda" if torch.cuda.is_available() else "cpu"
+readability_model_id = params["READABILITY_MODEL_ID"]
+gpt2_model = GPT2LMHeadModel.from_pretrained(readability_model_id).to(device)
+gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(readability_model_id)
+
+command = ["python", "-m", "spacy", "download", "en_core_web_sm"]
+subprocess.run(command)
+nlp = spacy.load("en_core_web_sm")
+
+
+def depth_analysis(input_text):
+    processed_words = preprocess_text1(input_text)
+    ttr_value = vocabulary_richness_ttr(processed_words)
+    gunning_fog = calculate_gunning_fog(input_text)
+    gunning_fog_norm = normalize(gunning_fog, min_value=0, max_value=20)
+    words, sentences = preprocess_text2(input_text)
+    average_sentence_length = calculate_average_sentence_length(sentences)
+    average_word_length = calculate_average_word_length(words)
+    average_sentence_length_norm = normalize(
+        average_sentence_length, min_value=0, max_value=40
+    )
+    average_word_length_norm = normalize(
+        average_word_length, min_value=0, max_value=8
+    )
+    average_tree_depth = calculate_syntactic_tree_depth(nlp, input_text)
+    average_tree_depth_norm = normalize(
+        average_tree_depth, min_value=0, max_value=10
+    )
+    perplexity = calculate_perplexity(
+        input_text, gpt2_model, gpt2_tokenizer, device
+    )
+    perplexity_norm = normalize(perplexity, min_value=0, max_value=30)
+
+    features = {
+        "readability": gunning_fog_norm,
+        "syntactic tree depth": average_tree_depth_norm,
+        "vocabulary richness": ttr_value,
+        "perplexity": perplexity_norm,
+        "average sentence length": average_sentence_length_norm,
+        "average word length": average_word_length_norm,
+    }
+    fig = go.Figure()
+    fig.add_trace(
+        go.Scatterpolar(
+            r=list(features.values()),
+            theta=list(features.keys()),
+            fill="toself",
+            name="Radar Plot",
+        )
+    )
+    fig.update_layout(
+        polar=dict(
+            radialaxis=dict(
+                visible=True,
+                range=[0, 100],
+            )
+        ),
+        showlegend=False,
+        margin=dict(
+            l=10,
+            r=20,
+            b=10,
+            t=10,
+        ),
+    )
+    return fig
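For orientation (not part of the commit): a minimal sketch of how the new depth_analysis entry point can be exercised, assuming config.yaml and the models loaded above are available; the sample text below is hypothetical.

from analysis import depth_analysis

sample_text = "The quick brown fox jumps over the lazy dog. " * 20  # hypothetical input
fig = depth_analysis(sample_text)  # Plotly radar chart of six normalized writing features
fig.write_html("writing_analysis.html")  # or hand the figure to a gr.Plot output in app.py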
app.py
CHANGED
@@ -1,405 +1,24 @@
-from utils import cosineSim, googleSearch, getSentences, parallel_scrap, matchingScore
 import gradio as gr
-from urllib.request import urlopen, Request
-from googleapiclient.discovery import build
-import requests
-import httpx
-import torch
-import re
-from bs4 import BeautifulSoup
 import numpy as np
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
-import asyncio
-from scipy.special import softmax
-from evaluate import load
 from datetime import date
-import
-import
-from
-
-import
-import torch.nn.functional as F
-import nltk
-from unidecode import unidecode
-import time
-from utils import cos_sim_torch, embed_text
-import multiprocessing
-from functools import partial
-import concurrent.futures
-
-nltk.download('punkt')
-
-from writing_analysis import (
-    normalize,
-    preprocess_text1,
-    preprocess_text2,
-    vocabulary_richness_ttr,
-    calculate_gunning_fog,
-    calculate_average_sentence_length,
-    calculate_average_word_length,
-    calculate_syntactic_tree_depth,
-    calculate_perplexity,
-
-)
+from predictors import predict_bc_scores, predict_mc_scores
+from analysis import depth_analysis
+from predictors import predict_quillbot
+from plagiarism import plagiarism_check, build_date
+from utils import extract_text_from_pdf, len_validator

 np.set_printoptions(suppress=True)


-def
-
-
-
-
-
-
-    month_to,
-    day_to,
-    domains_to_skip,
-):
-    api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
-    api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
-    api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
-    # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
-    api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
-
-    cse_id = "851813e81162b4ed4"
-
-    time1 = time.perf_counter()
-    start = time.perf_counter()
-    sentences = getSentences(input)
-    urlCount = {}
-    ScoreArray = []
-    urlList = []
-
-    date_from = build_date(year_from, month_from, day_from)
-    date_to = build_date(year_to, month_to, day_to)
-    sort_date = f"date:r:{date_from}:{date_to}"
-
-    # get list of URLS to check
-    urlCount, ScoreArray = googleSearch(
-        plag_option,
-        sentences,
-        urlCount,
-        ScoreArray,
-        urlList,
-        sort_date,
-        domains_to_skip,
-        api_key,
-        cse_id,
-    )
-    print(f"Time for google search: {time.perf_counter()-time1}")
-    time1 = time.perf_counter()
-
-    print("Number of URLs: ", len(urlCount))
-    print(urlList)
-
-    # Scrape URLs in list
-    formatted_tokens = []
-    soups = asyncio.run(parallel_scrap(urlList))
-
-    print(f"Time for scraping: {time.perf_counter()-time1}")
-    time1 = time.perf_counter()
-    print(len(soups))
-    print(
-        "Successful scraping: "
-        + str(len([x for x in soups if x is not None]))
-        + "out of "
-        + str(len(urlList))
-    )
-
-    source_embeddings = []
-    for i, soup in enumerate(soups):
-        if soup:
-            page_content = soup.text
-            source_embeddings.append(embed_text(page_content))
-        else:
-            source_embeddings.append(None)
-
-    # Populate matching scores for scrapped pages
-    for i, soup in enumerate(soups):
-        print(f"Analyzing {i+1} of {len(soups)} soups........................")
-        if soup:
-            page_content = soup.text
-            for j, sent in enumerate(sentences):
-                score = matchingScore(sent, page_content)
-                score = matchingScore(sent, page_content)
-                # score = cos_sim_torch(embed_text(sent), source_embeddings[i])
-                ScoreArray[i][j] = score
-
-
-    def compute_cosine_similarity(args):
-        sent, source_embedding, i, j = args
-        score = cos_sim_torch(embed_text(sent), source_embedding)
-        return i, j, score
-
-    def main(soups, sentences):
-        source_embeddings = [preprocess(soup) for soup in soups]
-        ScoreArray = [[0 for _ in sentences] for _ in soups]
-        args_list = []
-        for i, soup in enumerate(soups):
-            if soup:
-                for j, sent in enumerate(sentences):
-                    args_list.append((sent, source_embeddings[i], i, j))
-        with concurrent.futures.ProcessPoolExecutor() as executor:
-            results = executor.map(compute_cosine_similarity, args_list)
-        for i, j, score in results:
-            ScoreArray[i][j] = score
-        return ScoreArray
-
-    ScoreArray = main(soups, sentences)
-
-
-
-    print(f"Time for matching score: {time.perf_counter()-time1}")
-    time1 = time.perf_counter()
-
-    # ScoreArray = asyncio.run(parallel_analyze_2(soups, sentences, ScoreArray))
-    # print("New Score Array:\n")
-    # print2D(ScoreArray)
-
-    # Gradio formatting section
-    sentencePlag = [False] * len(sentences)
-    sentenceToMaxURL = [-1] * len(sentences)
-    for j in range(len(sentences)):
-        if j > 0:
-            maxScore = ScoreArray[sentenceToMaxURL[j - 1]][j]
-            sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
-        else:
-            maxScore = -1
-        for i in range(len(ScoreArray)):
-            margin = (
-                0.1
-                if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
-                else 0
-            )
-            if ScoreArray[i][j] - maxScore > margin:
-                maxScore = ScoreArray[i][j]
-                sentenceToMaxURL[j] = i
-        if maxScore > 0.5:
-            sentencePlag[j] = True
-
-    if (
-        (len(sentences) > 1)
-        and (sentenceToMaxURL[1] != sentenceToMaxURL[0])
-        and (
-            ScoreArray[sentenceToMaxURL[0]][0]
-            - ScoreArray[sentenceToMaxURL[1]][0]
-            < 0.1
-        )
-    ):
-        sentenceToMaxURL[0] = sentenceToMaxURL[1]
-
-    index = np.unique(sentenceToMaxURL)
-
-    urlScore = {}
-    for url in index:
-        s = [
-            ScoreArray[url][sen]
-            for sen in range(len(sentences))
-            if sentenceToMaxURL[sen] == url
-        ]
-        urlScore[url] = sum(s) / len(s)
-
-    index_descending = sorted(urlScore, key=urlScore.get, reverse=True)
-
-    urlMap = {}
-    for count, i in enumerate(index_descending):
-        urlMap[i] = count + 1
-    for i, sent in enumerate(sentences):
-        formatted_tokens.append(
-            (sent, "[" + str(urlMap[sentenceToMaxURL[i]]) + "]")
+def ai_generated_test(option, input):
+    if option == "Human vs AI":
+        return predict_bc_scores(input), None
+    else:
+        return (
+            predict_bc_scores(input),
+            predict_mc_scores(input),
         )

-    formatted_tokens.append(("\n", None))
-    formatted_tokens.append(("\n", None))
-    formatted_tokens.append(("\n", None))
-
-    print(formatted_tokens)
-    print(index_descending)
-
-    for ind in index_descending:
-        formatted_tokens.append(
-            (
-                urlList[ind] + " --- Matching Score: " + f"{str(round(urlScore[ind] * 100, 2))}%",
-                "[" + str(urlMap[ind]) + "]",
-            )
-        )
-        formatted_tokens.append(("\n", None))
-
-    print(f"Formatted Tokens: {formatted_tokens}")
-
-    print(f"Time for plagiarism check: {time.perf_counter()-start}")
-
-    return formatted_tokens
-
-
-"""
-AI DETECTION SECTION
-"""
-device = "cuda" if torch.cuda.is_available() else "cpu"
-
-text_bc_model_path = "polygraf-ai/text-detect-bc-v11-4m"
-text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
-text_bc_model = AutoModelForSequenceClassification.from_pretrained(text_bc_model_path).to(device)
-
-text_mc_model_path = "polygraf-ai/ai-text-detection-mc-robert-open-ai-detector-v4"
-text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
-text_mc_model = AutoModelForSequenceClassification.from_pretrained(text_mc_model_path).to(device)
-
-quillbot_labels = ["Original", "QuillBot"]
-quillbot_tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
-quillbot_model = AutoModelForSequenceClassification.from_pretrained("polygraf-ai/quillbot-detector-28k").to(device)
-
-def remove_accents(input_str):
-    text_no_accents = unidecode(input_str)
-    return text_no_accents
-
-def remove_special_characters(text):
-    text = remove_accents(text)
-    pattern = r'[^\w\s\d.,!?\'"()-;]+'
-    text = re.sub(pattern, '', text)
-    return text
-
-def remove_special_characters_2(text):
-    pattern = r'[^a-zA-Z0-9 ]+'
-    text = re.sub(pattern, '', text)
-    return text
-
-def update_character_count(text):
-    return f"{len(text)} characters"
-
-
-def split_text_allow_complete_sentences_nltk(text, max_length=256, tolerance=30, min_last_segment_length=100, type_det='bc'):
-    sentences = nltk.sent_tokenize(text)
-    segments = []
-    current_segment = []
-    current_length = 0
-
-    if type_det == 'bc':
-        tokenizer = text_bc_tokenizer
-        max_length = 333
-
-    elif type_det == 'mc':
-        tokenizer = text_mc_tokenizer
-        max_length = 256
-
-    for sentence in sentences:
-        tokens = tokenizer.tokenize(sentence)
-        sentence_length = len(tokens)
-
-        if current_length + sentence_length <= max_length + tolerance - 2:
-            current_segment.append(sentence)
-            current_length += sentence_length
-        else:
-            if current_segment:
-                encoded_segment = tokenizer.encode(' '.join(current_segment), add_special_tokens=True, max_length=max_length+tolerance, truncation=True)
-                segments.append((current_segment, len(encoded_segment)))
-            current_segment = [sentence]
-            current_length = sentence_length
-
-    if current_segment:
-        encoded_segment = tokenizer.encode(' '.join(current_segment), add_special_tokens=True, max_length=max_length+tolerance, truncation=True)
-        segments.append((current_segment, len(encoded_segment)))
-
-    final_segments = []
-    for i, (seg, length) in enumerate(segments):
-        if i == len(segments) - 1:
-            if length < min_last_segment_length and len(final_segments) > 0:
-                prev_seg, prev_length = final_segments[-1]
-                combined_encoded = tokenizer.encode(' '.join(prev_seg + seg), add_special_tokens=True, max_length=max_length+tolerance, truncation=True)
-                if len(combined_encoded) <= max_length + tolerance:
-                    final_segments[-1] = (prev_seg + seg, len(combined_encoded))
-                else:
-                    final_segments.append((seg, length))
-            else:
-                final_segments.append((seg, length))
-        else:
-            final_segments.append((seg, length))
-
-    decoded_segments = []
-    encoded_segments = []
-    for seg, _ in final_segments:
-        encoded_segment = tokenizer.encode(' '.join(seg), add_special_tokens=True, max_length=max_length+tolerance, truncation=True)
-        decoded_segment = tokenizer.decode(encoded_segment)
-        decoded_segments.append(decoded_segment)
-    return decoded_segments
-
-def predict_quillbot(text):
-    with torch.no_grad():
-        quillbot_model.eval()
-        tokenized_text = quillbot_tokenizer(text, padding="max_length", truncation=True, max_length=256, return_tensors="pt").to(device)
-        output = quillbot_model(**tokenized_text)
-        output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
-        q_score = {"QuillBot": output_norm[1].item(), "Original": output_norm[0].item()}
-        return q_score
-
-def predict_bc(model, tokenizer, text):
-    with torch.no_grad():
-        model.eval()
-        tokens = text_bc_tokenizer(
-            text, padding='max_length', truncation=True, max_length=333, return_tensors="pt"
-        ).to(device)
-        output = model(**tokens)
-        output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
-        print("BC Score: ", output_norm)
-        return output_norm
-
-def predict_mc(model, tokenizer, text):
-    with torch.no_grad():
-        model.eval()
-        tokens = text_mc_tokenizer(
-            text, padding='max_length', truncation=True, return_tensors="pt", max_length=256
-        ).to(device)
-        output = model(**tokens)
-        output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
-        print("MC Score: ", output_norm)
-        return output_norm
-
-def ai_generated_test(ai_option, input):
-
-    bc_scores = []
-    mc_scores = []
-    samples_len_bc = len(split_text_allow_complete_sentences_nltk(input, type_det = 'bc'))
-    samples_len_mc = len(split_text_allow_complete_sentences_nltk(input, type_det = 'mc'))
-    segments_bc = split_text_allow_complete_sentences_nltk(input, type_det = 'bc')
-    segments_mc = split_text_allow_complete_sentences_nltk(input, type_det = 'bc')
-
-    for i in range(samples_len_bc):
-        cleaned_text_bc = remove_special_characters(segments_bc[i])
-        bc_score = predict_bc(text_bc_model, text_bc_tokenizer,cleaned_text_bc )
-        bc_scores.append(bc_score)
-
-    for i in range(samples_len_mc):
-        cleaned_text_mc = remove_special_characters(segments_mc[i])
-        mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
-        mc_scores.append(mc_score)
-
-    bc_scores_array = np.array(bc_scores)
-    mc_scores_array = np.array(mc_scores)
-    average_bc_scores = np.mean(bc_scores_array, axis=0)
-    average_mc_scores = np.mean(mc_scores_array, axis=0)
-    bc_score_list = average_bc_scores.tolist()
-    mc_score_list = average_mc_scores.tolist()
-
-    bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}
-    mc_score = {}
-    label_map = ["OpenAI GPT", "Mistral", "CLAUDE", "Gemini", "LLAMA 2"]
-
-    for score, label in zip(mc_score_list, label_map):
-        mc_score[label.upper()] = score
-
-    sum_prob = 1 - bc_score["HUMAN"]
-    for key, value in mc_score.items():
-        mc_score[key] = value * sum_prob
-
-    if ai_option == "Human vs AI":
-        mc_score = {}
-
-    if sum_prob < 0.01 :
-        mc_score = {}
-        return bc_score, mc_score
-    else:
-        return bc_score, mc_score

 # COMBINED
 def main(
@@ -428,117 +47,18 @@ def main(
         domains_to_skip,
     )
     depth_analysis_plot = depth_analysis(input)
-    bc_score
+    bc_score = predict_bc_scores(input)
+    mc_score = predict_mc_scores(input)
     quilscore = predict_quillbot(input)
-
-    return (
-        bc_score,
-        mc_score,
-        formatted_tokens,
-        depth_analysis_plot,
-        quilscore
-    )

-
-
-
-
-
-
-    lengt = len(text_bc_tokenizer.tokenize(text = text, return_tensors="pt"))
-    if lengt < min_tokens:
-        return f"Warning! Input length is {lengt}. Please input a text that is greater than {min_tokens} tokens long. Recommended length {min_tokens*2} tokens."
-    else :
-        return f"Input length ({lengt}) is satisified."
-
-def extract_text_from_pdf(pdf_path):
-    doc = fitz.open(pdf_path)
-    text = ""
-    for page in doc:
-        text += page.get_text()
-    return text
-
-
-# DEPTH ANALYSIS
-print("loading depth analysis")
-nltk.download('stopwords')
-nltk.download('punkt')
-command = ['python3', '-m', 'spacy', 'download', 'en_core_web_sm']
-# Execute the command
-subprocess.run(command)
-nlp = spacy.load("en_core_web_sm")
-
-# for perplexity
-model_id = "gpt2"
-gpt2_model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
-gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
-
-def depth_analysis(input_text):
-
-    # vocanulary richness
-    processed_words = preprocess_text1(input_text)
-    ttr_value = vocabulary_richness_ttr(processed_words)
-
-    # readability
-    gunning_fog = calculate_gunning_fog(input_text)
-    gunning_fog_norm = normalize(gunning_fog, min_value=0, max_value=20)
-
-    # average sentence length and average word length
-    words, sentences = preprocess_text2(input_text)
-    average_sentence_length = calculate_average_sentence_length(sentences)
-    average_word_length = calculate_average_word_length(words)
-    average_sentence_length_norm = normalize(average_sentence_length, min_value=0, max_value=40)
-    average_word_length_norm = normalize(average_word_length, min_value=0, max_value=8)
-
-    # syntactic_tree_depth
-    average_tree_depth = calculate_syntactic_tree_depth(nlp, input_text)
-    average_tree_depth_norm = normalize(average_tree_depth, min_value=0, max_value=10)
-
-    # perplexity
-    perplexity = calculate_perplexity(input_text, gpt2_model, gpt2_tokenizer, device)
-    perplexity_norm = normalize(perplexity, min_value=0, max_value=30)
-
-    features = {
-        "readability": gunning_fog_norm,
-        "syntactic tree depth": average_tree_depth_norm,
-        "vocabulary richness": ttr_value,
-        "perplexity": perplexity_norm,
-        "average sentence length": average_sentence_length_norm,
-        "average word length": average_word_length_norm,
-    }
-
-    print(features)
-
-    fig = go.Figure()
-
-    fig.add_trace(go.Scatterpolar(
-        r=list(features.values()),
-        theta=list(features.keys()),
-        fill='toself',
-        name='Radar Plot'
-    ))
-
-    fig.update_layout(
-        polar=dict(
-            radialaxis=dict(
-                visible=True,
-                range=[0, 100],
-            )),
-        showlegend=False,
-        # autosize=False,
-        # width=600,
-        # height=600,
-        margin=dict(
-            l=10,
-            r=20,
-            b=10,
-            t=10,
-            # pad=100
-        ),
+    return (
+        bc_score,
+        mc_score,
+        formatted_tokens,
+        depth_analysis_plot,
+        quilscore,
     )

-    return fig
-

 # START OF GRADIO

@@ -575,16 +95,23 @@ with gr.Blocks() as demo:
     with gr.Row():
         input_text = gr.Textbox(label="Input text", lines=6, placeholder="")
         file_input = gr.File(label="Upload PDF")
-        file_input.change(
+        file_input.change(
+            fn=extract_text_from_pdf, inputs=file_input, outputs=input_text
+        )

-    char_count = gr.Textbox(label="Minumum Character Limit Check")
+    char_count = gr.Textbox(label="Minumum Character Limit Check")
     input_text.change(fn=len_validator, inputs=input_text, outputs=char_count)

     with gr.Row():
         with gr.Column():
-            ai_option = gr.Radio(
+            ai_option = gr.Radio(
+                ["Human vs AI", "Human vs AI Source Models"],
+                label="Choose an option please.",
+            )
         with gr.Column():
-            plag_option = gr.Radio(
+            plag_option = gr.Radio(
+                ["Standard", "Advanced"], label="Choose an option please."
+            )

     with gr.Row():
         with gr.Column():
@@ -594,7 +121,7 @@ with gr.Blocks() as demo:
             only_plagiarism_btn = gr.Button("Source Check")

     with gr.Row():
-        quillbot_check = gr.Button("Humanized Text Check
+        quillbot_check = gr.Button("Humanized Text Check")

     with gr.Row():
         depth_analysis_btn = gr.Button("Detailed Writing Analysis")
@@ -607,14 +134,14 @@ with gr.Blocks() as demo:
     ## Output
     """
     )
-
+
     # models = gr.Dropdown(
-
-
-
-
-
-
+    #     model_list,
+    #     value=model_list,
+    #     multiselect=True,
+    #     label="Models to test against",
+    # )
+
     with gr.Row():
         with gr.Column():
             bcLabel = gr.Label(label="Source")
@@ -666,9 +193,7 @@ with gr.Blocks() as demo:

     with gr.Row():
         with gr.Column():
-            writing_analysis_plot = gr.Plot(
-                label="Writing Analysis Plot"
-            )
+            writing_analysis_plot = gr.Plot(label="Writing Analysis Plot")

     full_check_btn.click(
         fn=main,
@@ -690,7 +215,7 @@ with gr.Blocks() as demo:
             mcLabel,
             sentenceBreakdown,
             writing_analysis_plot,
-            QLabel
+            QLabel,
         ],
         api_name="main",
     )
@@ -740,5 +265,5 @@ with gr.Blocks() as demo:

     date_from = ""
     date_to = ""
-
-    demo.launch(share=True,
+
+    demo.launch(share=True, auth=("polygraf-admin", "test@aisd"))
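For orientation (not part of the commit): after this rewrite the Gradio callbacks in app.py only dispatch to the new modules. A minimal sketch of the refactored detection path, assuming the model paths configured in predictors.py resolve; the input string is hypothetical.

from predictors import predict_bc_scores, predict_mc_scores, predict_quillbot

text = "A few sentences of input long enough to form one segment."  # hypothetical
bc = predict_bc_scores(text)   # {"AI": ..., "HUMAN": ...}
mc = predict_mc_scores(text)   # per-source-model scores, scaled by 1 - P(HUMAN)
q = predict_quillbot(text)     # {"Humanized": ..., "Original": ...}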
explainability.py
ADDED
@@ -0,0 +1,119 @@
+import re, textstat
+from nltk import FreqDist
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize, sent_tokenize
+import torch
+import nltk
+from tqdm import tqdm
+
+nltk.download("punkt")
+
+
+def normalize(value, min_value, max_value):
+    normalized_value = ((value - min_value) * 100) / (max_value - min_value)
+    return max(0, min(100, normalized_value))
+
+
+def preprocess_text1(text):
+    text = text.lower()
+    text = re.sub(r"[^\w\s]", "", text)  # remove punctuation
+    stop_words = set(stopwords.words("english"))  # remove stopwords
+    words = [word for word in text.split() if word not in stop_words]
+    words = [word for word in words if not word.isdigit()]  # remove numbers
+    return words
+
+
+def vocabulary_richness_ttr(words):
+    unique_words = set(words)
+    ttr = len(unique_words) / len(words) * 100
+    return ttr
+
+
+def calculate_gunning_fog(text):
+    """range 0-20"""
+    gunning_fog = textstat.gunning_fog(text)
+    return gunning_fog
+
+
+def calculate_automated_readability_index(text):
+    """range 1-20"""
+    ari = textstat.automated_readability_index(text)
+    return ari
+
+
+def calculate_flesch_reading_ease(text):
+    """range 0-100"""
+    fre = textstat.flesch_reading_ease(text)
+    return fre
+
+
+def preprocess_text2(text):
+    sentences = sent_tokenize(text)
+    words = [
+        word.lower()
+        for sent in sentences
+        for word in word_tokenize(sent)
+        if word.isalnum()
+    ]
+    stop_words = set(stopwords.words("english"))
+    words = [word for word in words if word not in stop_words]
+    return words, sentences
+
+
+def calculate_average_sentence_length(sentences):
+    """range 0-40 or 50 based on the histogram"""
+    total_words = sum(len(word_tokenize(sent)) for sent in sentences)
+    average_sentence_length = total_words / (len(sentences) + 0.0000001)
+    return average_sentence_length
+
+
+def calculate_average_word_length(words):
+    """range 0-8 based on the histogram"""
+    total_characters = sum(len(word) for word in words)
+    average_word_length = total_characters / (len(words) + 0.0000001)
+    return average_word_length
+
+
+def calculate_max_depth(sent):
+    return max(len(list(token.ancestors)) for token in sent)
+
+
+def calculate_syntactic_tree_depth(nlp, text):
+    """0-10 based on the histogram"""
+    doc = nlp(text)
+    sentence_depths = [calculate_max_depth(sent) for sent in doc.sents]
+    average_depth = (
+        sum(sentence_depths) / len(sentence_depths) if sentence_depths else 0
+    )
+    return average_depth
+
+
+def calculate_perplexity(text, model, tokenizer, device, stride=512):
+    """range 0-30 based on the histogram"""
+    encodings = tokenizer(text, return_tensors="pt")
+    max_length = model.config.n_positions
+    seq_len = encodings.input_ids.size(1)
+
+    nlls = []
+    prev_end_loc = 0
+    for begin_loc in tqdm(range(0, seq_len, stride)):
+        end_loc = min(begin_loc + max_length, seq_len)
+        trg_len = (
+            end_loc - prev_end_loc
+        )  # may be different from stride on last loop
+        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
+        target_ids = input_ids.clone()
+        target_ids[:, :-trg_len] = -100
+
+        with torch.no_grad():
+            outputs = model(input_ids, labels=target_ids)
+            neg_log_likelihood = outputs.loss
+
+        nlls.append(neg_log_likelihood)
+
+        prev_end_loc = end_loc
+        if end_loc == seq_len:
+            break
+
+    ppl = torch.exp(torch.stack(nlls).mean())
+    return ppl.item()
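For orientation (not part of the commit): these helpers feed the radar plot in analysis.py. A minimal sketch of computing and normalizing one feature, assuming the NLTK data downloaded above is present; the sample string is hypothetical.

from explainability import calculate_gunning_fog, normalize, preprocess_text1, vocabulary_richness_ttr

text = "Readable prose tends to score low on the Gunning fog index."
fog_norm = normalize(calculate_gunning_fog(text), min_value=0, max_value=20)  # mapped onto the 0-100 radar scale
ttr = vocabulary_richness_ttr(preprocess_text1(text))  # type-token ratio, already a percentage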
plagiarism.py
ADDED
@@ -0,0 +1,344 @@
+import time
+from nltk.tokenize import sent_tokenize
+from googleapiclient.discovery import build
+from collections import Counter
+import re, math
+from sentence_transformers import SentenceTransformer, util
+import asyncio
+import httpx
+from bs4 import BeautifulSoup
+import numpy as np
+import concurrent
+
+
+WORD = re.compile(r"\w+")
+model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+
+
+# returns cosine similarity of two vectors
+# input: two vectors
+# output: integer between 0 and 1.
+def get_cosine(vec1, vec2):
+    intersection = set(vec1.keys()) & set(vec2.keys())
+
+    # calculating numerator
+    numerator = sum([vec1[x] * vec2[x] for x in intersection])
+
+    # calculating denominator
+    sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
+    sum2 = sum([vec2[x] ** 2 for x in vec2.keys()])
+    denominator = math.sqrt(sum1) * math.sqrt(sum2)
+
+    # checking for divide by zero
+    if denominator == 0:
+        return 0.0
+    else:
+        return float(numerator) / denominator
+
+
+# converts given text into a vector
+def text_to_vector(text):
+    # uses the Regular expression above and gets all words
+    words = WORD.findall(text)
+    # returns a counter of all the words (count of number of occurences)
+    return Counter(words)
+
+
+# returns cosine similarity of two words
+# uses: text_to_vector(text) and get_cosine(v1,v2)
+def cosineSim(text1, text2):
+    vector1 = text_to_vector(text1)
+    vector2 = text_to_vector(text2)
+    # print vector1,vector2
+    cosine = get_cosine(vector1, vector2)
+    return cosine
+
+
+def cos_sim_torch(embedding_1, embedding_2):
+    return util.pytorch_cos_sim(embedding_1, embedding_2).item()
+
+
+def embed_text(text):
+    return model.encode(text, convert_to_tensor=True)
+
+
+def sentence_similarity(text1, text2):
+    embedding_1 = model.encode(text1, convert_to_tensor=True)
+    embedding_2 = model.encode(text2, convert_to_tensor=True)
+
+    o = util.pytorch_cos_sim(embedding_1, embedding_2)
+    return o.item()
+
+
+def google_search(
+    plag_option,
+    sentences,
+    urlCount,
+    scoreArray,
+    urlList,
+    sorted_date,
+    domains_to_skip,
+    api_key,
+    cse_id,
+    **kwargs,
+):
+    service = build("customsearch", "v1", developerKey=api_key)
+    for i, sentence in enumerate(sentences):
+        results = (
+            service.cse()
+            .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
+            .execute()
+        )
+        if "items" in results and len(results["items"]) > 0:
+            for count, link in enumerate(results["items"]):
+                # stop after 3 pages
+                if count >= 3:
+                    break
+                # skip user selected domains
+                if any(
+                    ("." + domain) in link["link"] for domain in domains_to_skip
+                ):
+                    continue
+                # clean up snippet of '...'
+                snippet = link["snippet"]
+                ind = snippet.find("...")
+                if ind < 20 and ind > 9:
+                    snippet = snippet[ind + len("... ") :]
+                ind = snippet.find("...")
+                if ind > len(snippet) - 5:
+                    snippet = snippet[:ind]
+
+                # update cosine similarity between snippet and given text
+                url = link["link"]
+                if url not in urlList:
+                    urlList.append(url)
+                    scoreArray.append([0] * len(sentences))
+                urlCount[url] = urlCount[url] + 1 if url in urlCount else 1
+                if plag_option == "Standard":
+                    scoreArray[urlList.index(url)][i] = cosineSim(
+                        sentence, snippet
+                    )
+                else:
+                    scoreArray[urlList.index(url)][i] = sentence_similarity(
+                        sentence, snippet
+                    )
+    return urlCount, scoreArray
+
+
+def split_sentence_blocks(text):
+
+    sents = sent_tokenize(text)
+    two_sents = []
+    for i in range(len(sents)):
+        if (i % 4) == 0:
+            two_sents.append(sents[i])
+        else:
+            two_sents[len(two_sents) - 1] += " " + sents[i]
+    return two_sents
+
+
+months = {
+    "January": "01",
+    "February": "02",
+    "March": "03",
+    "April": "04",
+    "May": "05",
+    "June": "06",
+    "July": "07",
+    "August": "08",
+    "September": "09",
+    "October": "10",
+    "November": "11",
+    "December": "12",
+}
+
+
+def build_date(year=2024, month="March", day=1):
+    return f"{year}{months[month]}{day}"
+
+
+async def get_url_data(url, client):
+    try:
+        r = await client.get(url)
+        # print(r.status_code)
+        if r.status_code == 200:
+            # print("in")
+            soup = BeautifulSoup(r.content, "html.parser")
+            return soup
+    except Exception:
+        return None
+
+
+def remove_punc(text):
+    res = re.sub(r"[^\w\s]", "", text)
+    return res
+
+
+def split_ngrams(text, n):
+    # return n-grams of size n
+    words = text.split()
+    return [words[i : i + n] for i in range(len(words) - n + 1)]
+
+
+async def parallel_scrap(urls):
+    async with httpx.AsyncClient(timeout=30) as client:
+        tasks = []
+        for url in urls:
+            tasks.append(get_url_data(url=url, client=client))
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+    return results
+
+
+def matching_score(args_list):
+    sentence = remove_punc(args_list[0])
+    content = remove_punc(args_list[1])
+    if sentence in content:
+        return 1
+    else:
+        n = 5
+        ngrams = split_ngrams(sentence, n)
+        if len(ngrams) == 0:
+            return 0
+        matched = [x for x in ngrams if " ".join(x) in content]
+        return len(matched) / len(ngrams)
+
+
+def plagiarism_check(
+    plag_option,
+    input,
+    year_from,
+    month_from,
+    day_from,
+    year_to,
+    month_to,
+    day_to,
+    domains_to_skip,
+):
+    api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
+    api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
+    api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
+    # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
+    api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
+    cse_id = "851813e81162b4ed4"
+
+    sentences = split_sentence_blocks(input)
+    urlCount = {}
+    ScoreArray = []
+    urlList = []
+    date_from = build_date(year_from, month_from, day_from)
+    date_to = build_date(year_to, month_to, day_to)
+    sort_date = f"date:r:{date_from}:{date_to}"
+    # get list of URLS to check
+    urlCount, ScoreArray = google_search(
+        plag_option,
+        sentences,
+        urlCount,
+        ScoreArray,
+        urlList,
+        sort_date,
+        domains_to_skip,
+        api_key,
+        cse_id,
+    )
+
+    # Scrape URLs in list
+    formatted_tokens = []
+    soups = asyncio.run(parallel_scrap(urlList))
+
+    # Populate matching scores for scrapped pages
+    for i, soup in enumerate(soups):
+        print(f"Analyzing {i+1} of {len(soups)} soups........................")
+        if soup:
+            page_content = soup.text
+            for j, sent in enumerate(sentences):
+                args_list = (sent, page_content)
+                score = matching_score(args_list)
+                # score = cos_sim_torch(embed_text(sent), source_embeddings[i])
+                ScoreArray[i][j] = score
+
+    # with concurrent.futures.ProcessPoolExecutor() as executor:
+    #     results = executor.map(matching_score, args_list)
+
+    # *****IF THIS IS TO BE USED, PLEASE PROVIDE "preprocess()" FUNCTION IN LINE 248**************
+    # source_embeddings = []
+    # for i, soup in enumerate(soups):
+    #     if soup:
+    #         page_content = soup.text
+    #         source_embeddings.append(embed_text(page_content))
+    #     else:
+    #         source_embeddings.append(None)
+
+    # def compute_cosine_similarity(args):
+    #     sent, source_embedding, i, j = args
+    #     score = cos_sim_torch(embed_text(sent), source_embedding)
+    #     return i, j, score
+
+    # def main(soups, sentences):
+    #     source_embeddings = [preprocess(soup) for soup in soups]
+    #     ScoreArray = [[0 for _ in sentences] for _ in soups]
+    #     args_list = []
+    #     for i, soup in enumerate(soups):
+    #         if soup:
+    #             for j, sent in enumerate(sentences):
+    #                 args_list.append((sent, source_embeddings[i], i, j))
+    #     with concurrent.futures.ProcessPoolExecutor() as executor:
+    #         results = executor.map(compute_cosine_similarity, args_list)
+    #     for i, j, score in results:
+    #         ScoreArray[i][j] = score
+    #     return ScoreArray
+
+    # # Populate matching scores for scrapped pages
+    # ScoreArray = main(soups, sentences)
+    # *******************************************************************************************
+
+    # Calculate URL of max matching score for each sentence chunk
+    sentenceToMaxURL = [-1] * len(sentences)
+    for j in range(len(sentences)):
+        if j > 0:
+            maxScore = ScoreArray[sentenceToMaxURL[j - 1]][j]
+            sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
+        else:
+            maxScore = -1
+
+        for i in range(len(ScoreArray)):
+            margin = (
+                0.1
+                if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
+                else 0
+            )
+            if ScoreArray[i][j] - maxScore > margin:
+                maxScore = ScoreArray[i][j]
+                sentenceToMaxURL[j] = i
+
+    index = np.unique(sentenceToMaxURL)
+
+    urlScore = {}
+    for url in index:
+        s = [
+            ScoreArray[url][sen]
+            for sen in range(len(sentences))
+            if sentenceToMaxURL[sen] == url
+        ]
+        urlScore[url] = sum(s) / len(s)
+
+    index_descending = sorted(urlScore, key=urlScore.get, reverse=True)
+
+    urlMap = {}
+    for count, i in enumerate(index_descending):
+        urlMap[i] = count + 1
+    for i, sent in enumerate(sentences):
+        formatted_tokens.append(
+            (sent, "[" + str(urlMap[sentenceToMaxURL[i]]) + "]")
+        )
+    for ind in index_descending:
+        formatted_tokens.append(
+            (
+                urlList[ind]
+                + " --- Matching Score: "
+                + f"{str(round(urlScore[ind] * 100, 2))}%",
+                "[" + str(urlMap[ind]) + "]",
+            )
+        )
+        formatted_tokens.append(("\n", None))

+    return formatted_tokens
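For orientation (not part of the commit): a minimal sketch of the n-gram matcher and sentence blocking introduced above; both strings are hypothetical.

from plagiarism import matching_score, split_sentence_blocks

page = "the quick brown fox jumps over the lazy dog and then runs away"
chunk = "quick brown fox jumps over"
score = matching_score((chunk, page))  # 1 when the chunk appears verbatim, else the share of its 5-grams found in the page
blocks = split_sentence_blocks("One. Two. Three. Four. Five. Six.")  # groups sentences into blocks of four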
predictors.py
ADDED
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
import httpx
|
3 |
+
import torch
|
4 |
+
import re
|
5 |
+
from bs4 import BeautifulSoup
|
6 |
+
import numpy as np
|
7 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
8 |
+
import asyncio
|
9 |
+
from evaluate import load
|
10 |
+
from datetime import date
|
11 |
+
import nltk
|
12 |
+
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
|
13 |
+
import plotly.graph_objects as go
|
14 |
+
import torch.nn.functional as F
|
15 |
+
import nltk
|
16 |
+
from unidecode import unidecode
|
17 |
+
import time
|
18 |
+
from scipy.special import softmax
|
19 |
+
import yaml
|
20 |
+
import os
|
21 |
+
from utils import *
|
22 |
+
from dotenv import load_dotenv
|
23 |
+
|
24 |
+
with open("config.yaml", "r") as file:
|
25 |
+
params = yaml.safe_load(file)
|
26 |
+
nltk.download("punkt")
|
27 |
+
nltk.download("stopwords")
|
28 |
+
load_dotenv()
|
29 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
30 |
+
text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
|
31 |
+
text_mc_model_path = params["TEXT_MC_MODEL_PATH"]
|
32 |
+
text_quillbot_model_path = params["TEXT_QUILLBOT_MODEL_PATH"]
|
33 |
+
quillbot_labels = params["QUILLBOT_LABELS"]
|
34 |
+
mc_label_map = params["MC_OUTPUT_LABELS"]
|
35 |
+
mc_token_size = int(params["MC_TOKEN_SIZE"])
|
36 |
+
bc_token_size = int(params["BC_TOKEN_SIZE"])
|
37 |
+
text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
|
38 |
+
text_bc_model = AutoModelForSequenceClassification.from_pretrained(
|
39 |
+
text_bc_model_path
|
40 |
+
).to(device)
|
41 |
+
text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
|
42 |
+
text_mc_model = AutoModelForSequenceClassification.from_pretrained(
|
43 |
+
text_mc_model_path
|
44 |
+
).to(device)
|
45 |
+
quillbot_tokenizer = AutoTokenizer.from_pretrained(text_quillbot_model_path)
|
46 |
+
quillbot_model = AutoModelForSequenceClassification.from_pretrained(
|
47 |
+
text_quillbot_model_path
|
48 |
+
).to(device)
|
49 |
+
|
50 |
+
|
51 |
+
def split_text_allow_complete_sentences_nltk(
|
52 |
+
text,
|
53 |
+
max_length=256,
|
54 |
+
tolerance=30,
|
55 |
+
min_last_segment_length=100,
|
56 |
+
type_det="bc",
|
57 |
+
):
|
58 |
+
sentences = nltk.sent_tokenize(text)
|
59 |
+
segments = []
|
60 |
+
current_segment = []
|
61 |
+
current_length = 0
|
62 |
+
if type_det == "bc":
|
63 |
+
tokenizer = text_bc_tokenizer
|
64 |
+
max_length = bc_token_size
|
65 |
+
elif type_det == "mc":
|
66 |
+
tokenizer = text_mc_tokenizer
|
67 |
+
max_length = mc_token_size
|
68 |
+
for sentence in sentences:
|
69 |
+
tokens = tokenizer.tokenize(sentence)
|
70 |
+
sentence_length = len(tokens)
|
71 |
+
|
72 |
+
if current_length + sentence_length <= max_length + tolerance - 2:
|
73 |
+
current_segment.append(sentence)
|
74 |
+
current_length += sentence_length
|
75 |
+
else:
|
76 |
+
if current_segment:
|
77 |
+
encoded_segment = tokenizer.encode(
|
78 |
+
" ".join(current_segment),
|
79 |
+
add_special_tokens=True,
|
80 |
+
max_length=max_length + tolerance,
|
81 |
+
truncation=True,
|
82 |
+
)
|
83 |
+
segments.append((current_segment, len(encoded_segment)))
|
84 |
+
current_segment = [sentence]
|
85 |
+
current_length = sentence_length
|
86 |
+
|
87 |
+
if current_segment:
|
88 |
+
encoded_segment = tokenizer.encode(
|
89 |
+
" ".join(current_segment),
|
90 |
+
add_special_tokens=True,
|
91 |
+
max_length=max_length + tolerance,
|
92 |
+
truncation=True,
|
93 |
+
)
|
94 |
+
segments.append((current_segment, len(encoded_segment)))
|
95 |
+
|
96 |
+
final_segments = []
|
97 |
+
for i, (seg, length) in enumerate(segments):
|
98 |
+
if i == len(segments) - 1:
|
99 |
+
if length < min_last_segment_length and len(final_segments) > 0:
|
100 |
+
prev_seg, prev_length = final_segments[-1]
|
101 |
+
combined_encoded = tokenizer.encode(
|
102 |
+
" ".join(prev_seg + seg),
|
103 |
+
add_special_tokens=True,
|
104 |
+
                    max_length=max_length + tolerance,
                    truncation=True,
                )
                if len(combined_encoded) <= max_length + tolerance:
                    final_segments[-1] = (prev_seg + seg, len(combined_encoded))
                else:
                    final_segments.append((seg, length))
            else:
                final_segments.append((seg, length))
        else:
            final_segments.append((seg, length))

    decoded_segments = []
    encoded_segments = []
    for seg, _ in final_segments:
        encoded_segment = tokenizer.encode(
            " ".join(seg),
            add_special_tokens=True,
            max_length=max_length + tolerance,
            truncation=True,
        )
        decoded_segment = tokenizer.decode(encoded_segment)
        decoded_segments.append(decoded_segment)
    return decoded_segments


def predict_quillbot(text):
    with torch.no_grad():
        quillbot_model.eval()
        tokenized_text = quillbot_tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=256,
            return_tensors="pt",
        ).to(device)
        output = quillbot_model(**tokenized_text)
        output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
        q_score = {
            "Humanized": output_norm[1].item(),
            "Original": output_norm[0].item(),
        }
        return q_score


def predict_bc(model, tokenizer, text):
    with torch.no_grad():
        model.eval()
        tokens = text_bc_tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=bc_token_size,
            return_tensors="pt",
        ).to(device)
        output = model(**tokens)
        output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
        return output_norm


def predict_mc(model, tokenizer, text):
    with torch.no_grad():
        model.eval()
        tokens = text_mc_tokenizer(
            text,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            max_length=mc_token_size,
        ).to(device)
        output = model(**tokens)
        output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
        return output_norm


def predict_mc_scores(input):
    bc_scores = []
    mc_scores = []

    samples_len_bc = len(
        split_text_allow_complete_sentences_nltk(input, type_det="bc")
    )
    segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
    for i in range(samples_len_bc):
        cleaned_text_bc = remove_special_characters(segments_bc[i])
        bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
        bc_scores.append(bc_score)
    bc_scores_array = np.array(bc_scores)
    average_bc_scores = np.mean(bc_scores_array, axis=0)
    bc_score_list = average_bc_scores.tolist()
    bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}
    segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="mc")
    samples_len_mc = len(
        split_text_allow_complete_sentences_nltk(input, type_det="mc")
    )
    for i in range(samples_len_mc):
        cleaned_text_mc = remove_special_characters(segments_mc[i])
        mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
        mc_scores.append(mc_score)
    mc_scores_array = np.array(mc_scores)
    average_mc_scores = np.mean(mc_scores_array, axis=0)
    mc_score_list = average_mc_scores.tolist()
    mc_score = {}
    for score, label in zip(mc_score_list, mc_label_map):
        mc_score[label.upper()] = score

    sum_prob = 1 - bc_score["HUMAN"]
    for key, value in mc_score.items():
        mc_score[key] = value * sum_prob
    if sum_prob < 0.01:
        mc_score = {}

    return mc_score


def predict_bc_scores(input):
    bc_scores = []
    mc_scores = []
    samples_len_bc = len(
        split_text_allow_complete_sentences_nltk(input, type_det="bc")
    )
    segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
    for i in range(samples_len_bc):
        cleaned_text_bc = remove_special_characters(segments_bc[i])
        bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
        bc_scores.append(bc_score)
    bc_scores_array = np.array(bc_scores)
    average_bc_scores = np.mean(bc_scores_array, axis=0)
    bc_score_list = average_bc_scores.tolist()
    bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}
    return bc_score


# def predict_1on1(input):
#     models = ['bard', 'claude', 'gpt4', 'mistral_ai', 'llama2']
#     text = str(row["text"])
#     predictions = {}
#     prediction = predict(text, bard_model, bard_tokenizer) predictions['bard'] = prediction[1]
#     prediction = predict(text, claude_model, claude_tokenizer) predictions['claude'] = prediction[1]
#     prediction = predict(text, gpt4_model, gpt4_tokenizer) predictions['gpt4'] = prediction[1]
#     prediction = predict(text, mistral_ai_model, mistral_ai_tokenizer) predictions['mistral_ai'] = prediction[1]
#     prediction = predict(text, llama2_model, llama2_tokenizer) predictions['llama2'] = prediction[1]
#     max_key = max(predictions, key=predictions.get)
requirements.txt
CHANGED
@@ -6,8 +6,8 @@ BeautifulSoup4
 scrapingbee
 requests
 numpy
-torch
-transformers
+torch
+transformers
 transformers-interpret
 textstat
 scipy
utils.py
CHANGED
@@ -11,284 +11,354 @@ import asyncio
 import nltk
 from sentence_transformers import SentenceTransformer, util
 import threading

-nltk.download('punkt')

 WORD = re.compile(r"\w+")
-model = SentenceTransformer(
-    "sentence-transformers/all-MiniLM-L6-v2")

 # returns cosine similarity of two vectors
 # input: two vectors
 # output: integer between 0 and 1.
-def get_cosine(vec1, vec2):
-    intersection = set(vec1.keys()) & set(vec2.keys())
-
-    # calculating numerator
-    numerator = sum([vec1[x] * vec2[x] for x in intersection])
-
-    # calculating denominator
-    sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
-    sum2 = sum([vec2[x] ** 2 for x in vec2.keys()])
-    denominator = math.sqrt(sum1) * math.sqrt(sum2)
-
-    # checking for divide by zero
-    if denominator == 0:
-        return 0.0
-    else:
-        return float(numerator) / denominator
-
-
-# converts given text into a vector
-def text_to_vector(text):
-    # uses the Regular expression above and gets all words
-    words = WORD.findall(text)
-    # returns a counter of all the words (count of number of occurences)
-    return Counter(words)
-
-
-# returns cosine similarity of two words
-# uses: text_to_vector(text) and get_cosine(v1,v2)
-def cosineSim(text1, text2):
-    vector1 = text_to_vector(text1)
-    vector2 = text_to_vector(text2)
-    # print vector1,vector2
-    cosine = get_cosine(vector1, vector2)
-    return cosine
-
-def cos_sim_torch(embedding_1, embedding_2):
-    return util.pytorch_cos_sim(embedding_1, embedding_2).item()
-
-def embed_text(text):
-    return model.encode(text, convert_to_tensor=True)
-
-def sentence_similarity(text1, text2):
-    embedding_1= model.encode(text1, convert_to_tensor=True)
-    embedding_2 = model.encode(text2, convert_to_tensor=True)
-
-    o = util.pytorch_cos_sim(embedding_1, embedding_2)
-    return o.item()
-
-def get_soup_requests(url):
-    page = requests.get(url)
-    if page.status_code == 200:
-        soup = BeautifulSoup(page.content, "html.parser")
-        return soup
-    print("HTML soup failed")
-    return None
-
-
-def get_soup_httpx(url):
-    client = httpx.Client(timeout=30)
-    try:
-        page = client.get(url)
-        if page.status_code == httpx.codes.OK:
-            soup = BeautifulSoup(page.content, "html.parser")
-            return soup
-    except:
-        print("HTTPx soup failed")
-        return None
-
-def getSentences(text):
-    from nltk.tokenize import sent_tokenize
-
-    sents = sent_tokenize(text)
-    two_sents = []
-    for i in range(len(sents)):
-        if (i % 2) == 0:
-            two_sents.append(sents[i])
-        else:
-            two_sents[len(two_sents) - 1] += " " + sents[i]
-    return two_sents
-
-
-def googleSearch(
-    plag_option,
-    sentences,
-    urlCount,
-    scoreArray,
-    urlList,
-    sorted_date,
-    domains_to_skip,
-    api_key,
-    cse_id,
-    **kwargs,
-):
-    service = build("customsearch", "v1", developerKey=api_key)
-    for i, sentence in enumerate(sentences):
-        results = (
-            service.cse()
-            .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
-            .execute()
-        )
-        if "items" in results and len(results["items"]) > 0:
-            for count, link in enumerate(results["items"]):
-                # stop after 3 pages
-                if count >= 3:
-                    break
-                # skip user selected domains
-                if any(
-                    ("." + domain) in link["link"]
-                    for domain in domains_to_skip
-                ):
-                    continue
-                # clean up snippet of '...'
-                snippet = link["snippet"]
-                ind = snippet.find("...")
-                if ind < 20 and ind > 9:
-                    snippet = snippet[ind + len("... ") :]
-                ind = snippet.find("...")
-                if ind > len(snippet) - 5:
-                    snippet = snippet[:ind]
-
-                # update cosine similarity between snippet and given text
-                url = link["link"]
-                if url not in urlList:
-                    urlList.append(url)
-                    scoreArray.append([0] * len(sentences))
-                urlCount[url] = urlCount[url] + 1 if url in urlCount else 1
-                if plag_option == 'Standard':
-                    scoreArray[urlList.index(url)][i] = cosineSim(
-                        sentence, snippet)
-                else :
-                    scoreArray[urlList.index(url)][i] = sentence_similarity(
-                        sentence, snippet
-                    )
-        else:
-            print("Google Search failed")
-    return urlCount, scoreArray
-
-
-def getQueries(text, n):
-    # return n-grams of size n
-    words = text.split()
-    return [words[i : i + n] for i in range(len(words) - n + 1)]
-
-
-def print2D(array):
-    print(np.array(array))
-
-
-def removePunc(text):
-    res = re.sub(r"[^\w\s]", "", text)
-    return res
-
-
-async def get_url_data(url, client):
-    try:
-        r = await client.get(url)
-        # print(r.status_code)
-        if r.status_code == 200:
-            # print("in")
-            soup = BeautifulSoup(r.content, "html.parser")
-            return soup
-    except Exception:
-        print("HTTPx parallel soup failed")
-    return None
-
-
-async def parallel_scrap(urls):
-    async with httpx.AsyncClient(timeout=30) as client:
-        tasks = []
-        for url in urls:
-            tasks.append(get_url_data(url=url, client=client))
-        results = await asyncio.gather(*tasks, return_exceptions=True)
-        return results
-
-
-class TimeoutError(Exception):
-    pass
-
-
-def matchingScore(sentence, content):
-    if sentence in content:
-        return 1
-    sentence = removePunc(sentence)
-    content = removePunc(content)
-    if sentence in content:
-        return 1
-    else:
-        n = 5
-        ngrams = getQueries(sentence, n)
-        if len(ngrams) == 0:
-            return 0
-        matched = [x for x in ngrams if " ".join(x) in content]
-        return len(matched) / len(ngrams)

-# def matchingScoreWithTimeout(sentence, content):
-#     def timeout_handler():
-#         raise TimeoutError("Function timed out")
 # try:
-# except
 # content = removePunc(content)
 # for j, sentence in enumerate(sentences):
 #     sentence = removePunc(sentence)
-#         else:
-#             n = 5
-#             ngrams = getQueries(sentence, n)
-#             if len(ngrams) == 0:
-#                 return 0
-#             matched = [x for x in ngrams if " ".join(x) in content]
-#             ScoreArray[content_idx][j] = len(matched) / len(ngrams)
 # print(
-#         f"Analyzed {content_idx+1} of
 # )
 # return ScoreArray

-                    f"Analyzing {i+1} of {len(soups)} soups with {j+1} of {len(sentences)} sentences........................"
-                )
-                tasks[i][j] = sentence_similarity(sent, page_content)
-        else:
-            print(
-                f"Analyzed {i+1} of soups (SOUP FAILED)........................"
-            )
-    ScoreArray = await asyncio.gather(*tasks, return_exceptions=True)
-    return ScoreArray
 import nltk
 from sentence_transformers import SentenceTransformer, util
 import threading
+import torch
+import re
+import numpy as np
+import asyncio
+from datetime import date
+import nltk
+from unidecode import unidecode
+from scipy.special import softmax
+from transformers import AutoTokenizer
+import yaml
+import fitz
+import os


+def remove_accents(input_str):
+    text_no_accents = unidecode(input_str)
+    return text_no_accents


+def remove_special_characters(text):
+    text = remove_accents(text)
+    pattern = r'[^\w\s\d.,!?\'"()-;]+'
+    text = re.sub(pattern, "", text)
+    return text


+def remove_special_characters_2(text):
+    pattern = r"[^a-zA-Z0-9 ]+"
+    text = re.sub(pattern, "", text)
+    return text


+def update_character_count(text):
+    return f"{len(text)} characters"


+nltk.download("punkt")


+with open("config.yaml", "r") as file:
+    params = yaml.safe_load(file)

+text_bc_model_path = params["TEXT_BC_MODEL_PATH"]

+text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)


+def len_validator(text):
+    min_tokens = 200
+    lengt = len(text_bc_tokenizer.tokenize(text=text, return_tensors="pt"))
+    if lengt < min_tokens:
+        return f"Warning! Input length is {lengt}. Please input a text that is greater than {min_tokens} tokens long. Recommended length {min_tokens*2} tokens."
+    else:
+        return f"Input length ({lengt}) is satisified."


+def extract_text_from_pdf(pdf_path):
+    doc = fitz.open(pdf_path)
+    text = ""
+    for page in doc:
+        text += page.get_text()
+    return text

 WORD = re.compile(r"\w+")
+model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

 # returns cosine similarity of two vectors
 # input: two vectors
 # output: integer between 0 and 1.
+# def get_cosine(vec1, vec2):
+#     intersection = set(vec1.keys()) & set(vec2.keys())

+#     # calculating numerator
+#     numerator = sum([vec1[x] * vec2[x] for x in intersection])

+#     # calculating denominator
+#     sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
+#     sum2 = sum([vec2[x] ** 2 for x in vec2.keys()])
+#     denominator = math.sqrt(sum1) * math.sqrt(sum2)

+#     # checking for divide by zero
+#     if denominator == 0:
+#         return 0.0
+#     else:
+#         return float(numerator) / denominator


+# # converts given text into a vector
+# def text_to_vector(text):
+#     # uses the Regular expression above and gets all words
+#     words = WORD.findall(text)
+#     # returns a counter of all the words (count of number of occurences)
+#     return Counter(words)

+# # returns cosine similarity of two words
+# # uses: text_to_vector(text) and get_cosine(v1,v2)
+# def cosineSim(text1, text2):
+#     vector1 = text_to_vector(text1)
+#     vector2 = text_to_vector(text2)
+#     # print vector1,vector2
+#     cosine = get_cosine(vector1, vector2)
+#     return cosine

+# def cos_sim_torch(embedding_1, embedding_2):
+#     return util.pytorch_cos_sim(embedding_1, embedding_2).item()


+# def embed_text(text):
+#     return model.encode(text, convert_to_tensor=True)


+# def sentence_similarity(text1, text2):
+#     embedding_1 = model.encode(text1, convert_to_tensor=True)
+#     embedding_2 = model.encode(text2, convert_to_tensor=True)

+#     o = util.pytorch_cos_sim(embedding_1, embedding_2)
+#     return o.item()


+# def get_soup_requests(url):
+#     page = requests.get(url)
+#     if page.status_code == 200:
+#         soup = BeautifulSoup(page.content, "html.parser")
+#         return soup
+#     print("HTML soup failed")
+#     return None


+# def get_soup_httpx(url):
+#     client = httpx.Client(timeout=30)
 #     try:
+#         page = client.get(url)
+#         if page.status_code == httpx.codes.OK:
+#             soup = BeautifulSoup(page.content, "html.parser")
+#             return soup
+#     except:
+#         print("HTTPx soup failed")
+#         return None


+# def getSentences(text):
+#     from nltk.tokenize import sent_tokenize

+#     sents = sent_tokenize(text)
+#     two_sents = []
+#     for i in range(len(sents)):
+#         if (i % 2) == 0:
+#             two_sents.append(sents[i])
+#         else:
+#             two_sents[len(two_sents) - 1] += " " + sents[i]
+#     return two_sents


+# def googleSearch(
+#     plag_option,
+#     sentences,
+#     urlCount,
+#     scoreArray,
+#     urlList,
+#     sorted_date,
+#     domains_to_skip,
+#     api_key,
+#     cse_id,
+#     **kwargs,
+# ):
+#     service = build("customsearch", "v1", developerKey=api_key)
+#     for i, sentence in enumerate(sentences):
+#         results = (
+#             service.cse()
+#             .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
+#             .execute()
+#         )
+#         if "items" in results and len(results["items"]) > 0:
+#             for count, link in enumerate(results["items"]):
+#                 # stop after 3 pages
+#                 if count >= 3:
+#                     break
+#                 # skip user selected domains
+#                 if any(
+#                     ("." + domain) in link["link"] for domain in domains_to_skip
+#                 ):
+#                     continue
+#                 # clean up snippet of '...'
+#                 snippet = link["snippet"]
+#                 ind = snippet.find("...")
+#                 if ind < 20 and ind > 9:
+#                     snippet = snippet[ind + len("... ") :]
+#                 ind = snippet.find("...")
+#                 if ind > len(snippet) - 5:
+#                     snippet = snippet[:ind]

+#                 # update cosine similarity between snippet and given text
+#                 url = link["link"]
+#                 if url not in urlList:
+#                     urlList.append(url)
+#                     scoreArray.append([0] * len(sentences))
+#                 urlCount[url] = urlCount[url] + 1 if url in urlCount else 1
+#                 if plag_option == "Standard":
+#                     scoreArray[urlList.index(url)][i] = cosineSim(
+#                         sentence, snippet
+#                     )
+#                 else:
+#                     scoreArray[urlList.index(url)][i] = sentence_similarity(
+#                         sentence, snippet
+#                     )
+#         else:
+#             print("Google Search failed")
+#     return urlCount, scoreArray


+# def getQueries(text, n):
+#     # return n-grams of size n
+#     words = text.split()
+#     return [words[i : i + n] for i in range(len(words) - n + 1)]


+# def print2D(array):
+#     print(np.array(array))


+# def removePunc(text):
+#     res = re.sub(r"[^\w\s]", "", text)
+#     return res


+# async def get_url_data(url, client):
+#     try:
+#         r = await client.get(url)
+#         # print(r.status_code)
+#         if r.status_code == 200:
+#             # print("in")
+#             soup = BeautifulSoup(r.content, "html.parser")
+#             return soup
+#     except Exception:
+#         print("HTTPx parallel soup failed")
+#     return None


+# async def parallel_scrap(urls):
+#     async with httpx.AsyncClient(timeout=30) as client:
+#         tasks = []
+#         for url in urls:
+#             tasks.append(get_url_data(url=url, client=client))
+#         results = await asyncio.gather(*tasks, return_exceptions=True)
+#         return results


+# class TimeoutError(Exception):
+#     pass


+# def matchingScore(sentence, content):
+#     if sentence in content:
+#         return 1
+#     sentence = removePunc(sentence)
+#     content = removePunc(content)
+#     if sentence in content:
+#         return 1
+#     else:
+#         n = 5
+#         ngrams = getQueries(sentence, n)
+#         if len(ngrams) == 0:
+#             return 0
+#         matched = [x for x in ngrams if " ".join(x) in content]
+#         return len(matched) / len(ngrams)


+# # def matchingScoreWithTimeout(sentence, content):
+# #     def timeout_handler():
+# #         raise TimeoutError("Function timed out")

+# #     timer = threading.Timer(10, timeout_handler) # Set a timer for 2 seconds
+# #     timer.start()
+# #     try:
+# #         score = sentence_similarity(sentence, content)
+# #         # score = matchingScore(sentence, content)
+# #         timer.cancel() # Cancel the timer if calculation completes before timeout
+# #         return score
+# #     except TimeoutError:
+# #         return 0


+# # async def matchingScoreAsync(sentences, content, content_idx, ScoreArray):
+# #     content = removePunc(content)
+# #     for j, sentence in enumerate(sentences):
+# #         sentence = removePunc(sentence)
+# #         if sentence in content:
+# #             ScoreArray[content_idx][j] = 1
+# #         else:
+# #             n = 5
+# #             ngrams = getQueries(sentence, n)
+# #             if len(ngrams) == 0:
+# #                 return 0
+# #             matched = [x for x in ngrams if " ".join(x) in content]
+# #             ScoreArray[content_idx][j] = len(matched) / len(ngrams)
+# #     print(
+# #         f"Analyzed {content_idx+1} of soups (SOUP SUCCEEDED)........................"
+# #     )
+# #     return ScoreArray


+# async def matchingScoreAsync(
+#     sentences, content, content_idx, ScoreArray, model, util
+# ):
 #     content = removePunc(content)
 #     for j, sentence in enumerate(sentences):
 #         sentence = removePunc(sentence)
+#         similarity_score = sentence_similarity(sentence, content, model, util)
+#         ScoreArray[content_idx][j] = similarity_score
 #     print(
+#         f"Analyzed {content_idx+1} of contents (CONTENT ANALYZED)........................"
 #     )
 #     return ScoreArray


+# async def parallel_analyze(soups, sentences, ScoreArray):
+#     tasks = []
+#     for i, soup in enumerate(soups):
+#         if soup:
+#             page_content = soup.text
+#             tasks.append(
+#                 matchingScoreAsync(sentences, page_content, i, ScoreArray)
+#             )
+#         else:
+#             print(
+#                 f"Analyzed {i+1} of soups (SOUP FAILED)........................"
+#             )
+#     ScoreArray = await asyncio.gather(*tasks, return_exceptions=True)
+#     return ScoreArray


+# async def parallel_analyze_2(soups, sentences, ScoreArray):
+#     tasks = [[0] * len(ScoreArray[0]) for i in range(len(ScoreArray))]
+#     for i, soup in enumerate(soups):
+#         if soup:
+#             page_content = soup.text
+#             for j, sent in enumerate(sentences):
+#                 print(
+#                     f"Analyzing {i+1} of {len(soups)} soups with {j+1} of {len(sentences)} sentences........................"
+#                 )
+#                 tasks[i][j] = sentence_similarity(sent, page_content)
+#         else:
+#             print(
+#                 f"Analyzed {i+1} of soups (SOUP FAILED)........................"
+#             )
+#     ScoreArray = await asyncio.gather(*tasks, return_exceptions=True)
+#     return ScoreArray
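After this change, the active part of utils.py is limited to input preparation (accent stripping, character cleanup, length validation, PDF extraction); the scraping and plagiarism helpers are retained only as comments. Below is a rough sketch, not part of the commit, of how the remaining helpers might be chained for a PDF upload; the file name is illustrative, and it assumes config.yaml points TEXT_BC_MODEL_PATH at a valid tokenizer so the module imports cleanly.

# Illustrative flow only; every helper used here is defined in utils.py above.
from utils import (
    extract_text_from_pdf,
    remove_special_characters,
    update_character_count,
    len_validator,
)

raw_text = extract_text_from_pdf("essay.pdf")      # PyMuPDF (fitz) text extraction
clean_text = remove_special_characters(raw_text)   # unidecode + drop unusual symbols
print(update_character_count(clean_text))          # e.g. "10432 characters"
print(len_validator(clean_text))                   # warns if under 200 tokenizer tokens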