aliasgerovs committed on
Commit
71d19c5
2 Parent(s): 25f5a14 8e98c8e
__pycache__/analysis.cpython-311.pyc ADDED
Binary file (4.75 kB). View file
 
__pycache__/app.cpython-311.pyc ADDED
Binary file (10.9 kB). View file
 
__pycache__/explainability.cpython-311.pyc ADDED
Binary file (7.89 kB). View file
 
__pycache__/plagiarism.cpython-311.pyc ADDED
Binary file (14.1 kB). View file
 
__pycache__/predictors.cpython-311.pyc ADDED
Binary file (12 kB). View file
 
__pycache__/utils.cpython-311.pyc ADDED
Binary file (3.76 kB). View file
 
analysis.py ADDED
@@ -0,0 +1,98 @@
1
+ import requests
2
+ import httpx
3
+ import torch
4
+ import re
5
+ from bs4 import BeautifulSoup
6
+ import numpy as np
7
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
8
+ import asyncio
9
+ from scipy.special import softmax
10
+ from evaluate import load
11
+ from datetime import date
12
+ import nltk
13
+ import fitz
14
+ from transformers import GPT2LMHeadModel, GPT2TokenizerFast
15
+ import nltk, spacy, subprocess, torch
16
+ import plotly.graph_objects as go
17
+ import torch.nn.functional as F
18
+ import nltk
19
+ from unidecode import unidecode
20
+ import time
21
+ import yaml
22
+ import nltk
23
+ import os
24
+ from explainability import *
25
+ from dotenv import load_dotenv
26
+ import subprocess
27
+
28
+ nltk.download("punkt")
29
+ nltk.download("stopwords")
30
+ load_dotenv()
31
+ with open("config.yaml", "r") as file:
32
+ params = yaml.safe_load(file)
33
+ device = "cuda" if torch.cuda.is_available() else "cpu"
34
+ readability_model_id = params["READABILITY_MODEL_ID"]
35
+ gpt2_model = GPT2LMHeadModel.from_pretrained(readability_model_id).to(device)
36
+ gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(readability_model_id)
37
+
38
+ command = ["python", "-m", "spacy", "download", "en_core_web_sm"]
39
+ subprocess.run(command)
40
+ nlp = spacy.load("en_core_web_sm")
41
+
42
+
43
+ def depth_analysis(input_text):
44
+ processed_words = preprocess_text1(input_text)
45
+ ttr_value = vocabulary_richness_ttr(processed_words)
46
+ gunning_fog = calculate_gunning_fog(input_text)
47
+ gunning_fog_norm = normalize(gunning_fog, min_value=0, max_value=20)
48
+ words, sentences = preprocess_text2(input_text)
49
+ average_sentence_length = calculate_average_sentence_length(sentences)
50
+ average_word_length = calculate_average_word_length(words)
51
+ average_sentence_length_norm = normalize(
52
+ average_sentence_length, min_value=0, max_value=40
53
+ )
54
+ average_word_length_norm = normalize(
55
+ average_word_length, min_value=0, max_value=8
56
+ )
57
+ average_tree_depth = calculate_syntactic_tree_depth(nlp, input_text)
58
+ average_tree_depth_norm = normalize(
59
+ average_tree_depth, min_value=0, max_value=10
60
+ )
61
+ perplexity = calculate_perplexity(
62
+ input_text, gpt2_model, gpt2_tokenizer, device
63
+ )
64
+ perplexity_norm = normalize(perplexity, min_value=0, max_value=30)
65
+
66
+ features = {
67
+ "readability": gunning_fog_norm,
68
+ "syntactic tree depth": average_tree_depth_norm,
69
+ "vocabulary richness": ttr_value,
70
+ "perplexity": perplexity_norm,
71
+ "average sentence length": average_sentence_length_norm,
72
+ "average word length": average_word_length_norm,
73
+ }
74
+ fig = go.Figure()
75
+ fig.add_trace(
76
+ go.Scatterpolar(
77
+ r=list(features.values()),
78
+ theta=list(features.keys()),
79
+ fill="toself",
80
+ name="Radar Plot",
81
+ )
82
+ )
83
+ fig.update_layout(
84
+ polar=dict(
85
+ radialaxis=dict(
86
+ visible=True,
87
+ range=[0, 100],
88
+ )
89
+ ),
90
+ showlegend=False,
91
+ margin=dict(
92
+ l=10,
93
+ r=20,
94
+ b=10,
95
+ t=10,
96
+ ),
97
+ )
98
+ return fig
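
As a quick orientation for reviewers, here is a minimal, hypothetical driver for the new module; the sample text and output filename are assumptions, not part of the commit:

# hypothetical driver for analysis.depth_analysis (not included in the commit)
from analysis import depth_analysis

sample = (
    "The committee reviewed the proposal in detail. "
    "Its findings were summarized in a report that was circulated to every member."
)

fig = depth_analysis(sample)             # plotly Figure: radar chart of six 0-100 features
fig.write_html("writing_analysis.html")  # or fig.show() in an interactive session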
app.py CHANGED
@@ -1,366 +1,24 @@
1
- from utils import cosineSim, googleSearch, getSentences, parallel_scrap, matchingScore, matchingScoreWithTimeout
2
  import gradio as gr
3
- from urllib.request import urlopen, Request
4
- from googleapiclient.discovery import build
5
- import requests
6
- import httpx
7
- import torch
8
- import re
9
- from bs4 import BeautifulSoup
10
  import numpy as np
11
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
12
- import asyncio
13
- from scipy.special import softmax
14
- from evaluate import load
15
  from datetime import date
16
- import nltk
17
- import fitz
18
- from transformers import GPT2LMHeadModel, GPT2TokenizerFast
19
- import nltk, spacy, subprocess, torch
20
- import plotly.graph_objects as go
21
- import torch.nn.functional as F
22
- import nltk
23
- from unidecode import unidecode
24
- import time
25
-
26
- nltk.download('punkt')
27
-
28
- from writing_analysis import (
29
- normalize,
30
- preprocess_text1,
31
- preprocess_text2,
32
- vocabulary_richness_ttr,
33
- calculate_gunning_fog,
34
- calculate_average_sentence_length,
35
- calculate_average_word_length,
36
- calculate_syntactic_tree_depth,
37
- calculate_perplexity,
38
-
39
- )
40
 
41
  np.set_printoptions(suppress=True)
42
 
43
 
44
- def plagiarism_check(
45
- plag_option,
46
- input,
47
- year_from,
48
- month_from,
49
- day_from,
50
- year_to,
51
- month_to,
52
- day_to,
53
- domains_to_skip,
54
- ):
55
- api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
56
- api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
57
- api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
58
- # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
59
- cse_id = "851813e81162b4ed4"
60
-
61
- time1 = time.perf_counter()
62
- start = time.perf_counter()
63
- sentences = getSentences(input)
64
- urlCount = {}
65
- ScoreArray = []
66
- urlList = []
67
-
68
- date_from = build_date(year_from, month_from, day_from)
69
- date_to = build_date(year_to, month_to, day_to)
70
- sort_date = f"date:r:{date_from}:{date_to}"
71
-
72
- # get list of URLS to check
73
- urlCount, ScoreArray = googleSearch(
74
- plag_option,
75
- sentences,
76
- urlCount,
77
- ScoreArray,
78
- urlList,
79
- sort_date,
80
- domains_to_skip,
81
- api_key,
82
- cse_id,
83
- )
84
- print(f"Time for google search: {time.perf_counter()-time1}")
85
- time1 = time.perf_counter()
86
-
87
- print("Number of URLs: ", len(urlCount))
88
- print(urlList)
89
-
90
- # Scrape URLs in list
91
- formatted_tokens = []
92
- soups = asyncio.run(parallel_scrap(urlList))
93
-
94
- print(f"Time for scraping: {time.perf_counter()-time1}")
95
- time1 = time.perf_counter()
96
- print(len(soups))
97
- print(
98
- "Successful scraping: "
99
- + str(len([x for x in soups if x is not None]))
100
- + "out of "
101
- + str(len(urlList))
102
- )
103
-
104
- # Populate matching scores for scrapped pages
105
- for i, soup in enumerate(soups):
106
- print(f"Analyzing {i+1} of {len(soups)} soups........................")
107
- if soup:
108
- page_content = soup.text
109
- for j, sent in enumerate(sentences):
110
- # score = matchingScore(sent, page_content)
111
- score = matchingScoreWithTimeout(sent, page_content)
112
- ScoreArray[i][j] = score
113
-
114
- print(f"Time for matching score: {time.perf_counter()-time1}")
115
- time1 = time.perf_counter()
116
-
117
- # ScoreArray = asyncio.run(parallel_analyze_2(soups, sentences, ScoreArray))
118
- # print("New Score Array:\n")
119
- # print2D(ScoreArray)
120
-
121
- # Gradio formatting section
122
- sentencePlag = [False] * len(sentences)
123
- sentenceToMaxURL = [-1] * len(sentences)
124
- for j in range(len(sentences)):
125
- if j > 0:
126
- maxScore = ScoreArray[sentenceToMaxURL[j - 1]][j]
127
- sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
128
- else:
129
- maxScore = -1
130
- for i in range(len(ScoreArray)):
131
- margin = (
132
- 0.1
133
- if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
134
- else 0
135
- )
136
- if ScoreArray[i][j] - maxScore > margin:
137
- maxScore = ScoreArray[i][j]
138
- sentenceToMaxURL[j] = i
139
- if maxScore > 0.5:
140
- sentencePlag[j] = True
141
-
142
- if (
143
- (len(sentences) > 1)
144
- and (sentenceToMaxURL[1] != sentenceToMaxURL[0])
145
- and (
146
- ScoreArray[sentenceToMaxURL[0]][0]
147
- - ScoreArray[sentenceToMaxURL[1]][0]
148
- < 0.1
149
- )
150
- ):
151
- sentenceToMaxURL[0] = sentenceToMaxURL[1]
152
-
153
- index = np.unique(sentenceToMaxURL)
154
-
155
- urlScore = {}
156
- for url in index:
157
- s = [
158
- ScoreArray[url][sen]
159
- for sen in range(len(sentences))
160
- if sentenceToMaxURL[sen] == url
161
- ]
162
- urlScore[url] = sum(s) / len(s)
163
-
164
- index_descending = sorted(urlScore, key=urlScore.get, reverse=True)
165
-
166
- urlMap = {}
167
- for count, i in enumerate(index_descending):
168
- urlMap[i] = count + 1
169
- for i, sent in enumerate(sentences):
170
- formatted_tokens.append(
171
- (sent, "[" + str(urlMap[sentenceToMaxURL[i]]) + "]")
172
- )
173
-
174
- formatted_tokens.append(("\n", None))
175
- formatted_tokens.append(("\n", None))
176
- formatted_tokens.append(("\n", None))
177
-
178
- print(formatted_tokens)
179
- print(index_descending)
180
-
181
- for ind in index_descending:
182
- formatted_tokens.append(
183
- (
184
- urlList[ind] + " --- Matching Score: " + f"{str(round(urlScore[ind] * 100, 2))}%",
185
- "[" + str(urlMap[ind]) + "]",
186
- )
187
  )
188
- formatted_tokens.append(("\n", None))
189
-
190
- print(f"Formatted Tokens: {formatted_tokens}")
191
-
192
- print(f"Time for plagiarism check: {time.perf_counter()-start}")
193
 
194
- return formatted_tokens
195
-
196
-
197
- """
198
- AI DETECTION SECTION
199
- """
200
- device = "cuda" if torch.cuda.is_available() else "cpu"
201
-
202
- text_bc_model_path = "polygraf-ai/text-detect-bc-v11-4m"
203
- text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
204
- text_bc_model = AutoModelForSequenceClassification.from_pretrained(text_bc_model_path).to(device)
205
-
206
- text_mc_model_path = "polygraf-ai/ai-text-detection-mc-robert-open-ai-detector-v4"
207
- text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
208
- text_mc_model = AutoModelForSequenceClassification.from_pretrained(text_mc_model_path).to(device)
209
-
210
- quillbot_labels = ["Original", "QuillBot"]
211
- quillbot_tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
212
- quillbot_model = AutoModelForSequenceClassification.from_pretrained("polygraf-ai/quillbot-detector-28k").to(device)
213
-
214
- def remove_accents(input_str):
215
- text_no_accents = unidecode(input_str)
216
- return text_no_accents
217
-
218
- def remove_special_characters(text):
219
- text = remove_accents(text)
220
- pattern = r'[^\w\s\d.,!?\'"()-;]+'
221
- text = re.sub(pattern, '', text)
222
- return text
223
-
224
- def remove_special_characters_2(text):
225
- pattern = r'[^a-zA-Z0-9 ]+'
226
- text = re.sub(pattern, '', text)
227
- return text
228
-
229
- def update_character_count(text):
230
- return f"{len(text)} characters"
231
-
232
-
233
- def split_text_allow_complete_sentences_nltk(text, max_length=256, tolerance=30, min_last_segment_length=100, type_det='bc'):
234
- sentences = nltk.sent_tokenize(text)
235
- segments = []
236
- current_segment = []
237
- current_length = 0
238
-
239
- if type_det == 'bc':
240
- tokenizer = text_bc_tokenizer
241
- max_length = 333
242
-
243
- elif type_det == 'mc':
244
- tokenizer = text_mc_tokenizer
245
- max_length = 256
246
-
247
- for sentence in sentences:
248
- tokens = tokenizer.tokenize(sentence)
249
- sentence_length = len(tokens)
250
-
251
- if current_length + sentence_length <= max_length + tolerance - 2:
252
- current_segment.append(sentence)
253
- current_length += sentence_length
254
- else:
255
- if current_segment:
256
- encoded_segment = tokenizer.encode(' '.join(current_segment), add_special_tokens=True, max_length=max_length+tolerance, truncation=True)
257
- segments.append((current_segment, len(encoded_segment)))
258
- current_segment = [sentence]
259
- current_length = sentence_length
260
-
261
- if current_segment:
262
- encoded_segment = tokenizer.encode(' '.join(current_segment), add_special_tokens=True, max_length=max_length+tolerance, truncation=True)
263
- segments.append((current_segment, len(encoded_segment)))
264
-
265
- final_segments = []
266
- for i, (seg, length) in enumerate(segments):
267
- if i == len(segments) - 1:
268
- if length < min_last_segment_length and len(final_segments) > 0:
269
- prev_seg, prev_length = final_segments[-1]
270
- combined_encoded = tokenizer.encode(' '.join(prev_seg + seg), add_special_tokens=True, max_length=max_length+tolerance, truncation=True)
271
- if len(combined_encoded) <= max_length + tolerance:
272
- final_segments[-1] = (prev_seg + seg, len(combined_encoded))
273
- else:
274
- final_segments.append((seg, length))
275
- else:
276
- final_segments.append((seg, length))
277
- else:
278
- final_segments.append((seg, length))
279
-
280
- decoded_segments = []
281
- encoded_segments = []
282
- for seg, _ in final_segments:
283
- encoded_segment = tokenizer.encode(' '.join(seg), add_special_tokens=True, max_length=max_length+tolerance, truncation=True)
284
- decoded_segment = tokenizer.decode(encoded_segment)
285
- decoded_segments.append(decoded_segment)
286
- return decoded_segments
287
-
288
- def predict_quillbot(text):
289
- with torch.no_grad():
290
- quillbot_model.eval()
291
- tokenized_text = quillbot_tokenizer(text, padding="max_length", truncation=True, max_length=256, return_tensors="pt").to(device)
292
- output = quillbot_model(**tokenized_text)
293
- output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
294
- q_score = {"QuillBot": output_norm[1].item(), "Original": output_norm[0].item()}
295
- return q_score
296
-
297
- def predict_bc(model, tokenizer, text):
298
- with torch.no_grad():
299
- model.eval()
300
- tokens = text_bc_tokenizer(
301
- text, padding='max_length', truncation=True, max_length=333, return_tensors="pt"
302
- ).to(device)
303
- output = model(**tokens)
304
- output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
305
- print("BC Score: ", output_norm)
306
- return output_norm
307
-
308
- def predict_mc(model, tokenizer, text):
309
- with torch.no_grad():
310
- model.eval()
311
- tokens = text_mc_tokenizer(
312
- text, padding='max_length', truncation=True, return_tensors="pt", max_length=256
313
- ).to(device)
314
- output = model(**tokens)
315
- output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
316
- print("MC Score: ", output_norm)
317
- return output_norm
318
-
319
- def ai_generated_test(ai_option, input):
320
-
321
- bc_scores = []
322
- mc_scores = []
323
- samples_len_bc = len(split_text_allow_complete_sentences_nltk(input, type_det = 'bc'))
324
- samples_len_mc = len(split_text_allow_complete_sentences_nltk(input, type_det = 'mc'))
325
- segments_bc = split_text_allow_complete_sentences_nltk(input, type_det = 'bc')
326
- segments_mc = split_text_allow_complete_sentences_nltk(input, type_det = 'bc')
327
-
328
- for i in range(samples_len_bc):
329
- cleaned_text_bc = remove_special_characters(segments_bc[i])
330
- bc_score = predict_bc(text_bc_model, text_bc_tokenizer,cleaned_text_bc )
331
- bc_scores.append(bc_score)
332
-
333
- for i in range(samples_len_mc):
334
- cleaned_text_mc = remove_special_characters(segments_mc[i])
335
- mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
336
- mc_scores.append(mc_score)
337
-
338
- bc_scores_array = np.array(bc_scores)
339
- mc_scores_array = np.array(mc_scores)
340
- average_bc_scores = np.mean(bc_scores_array, axis=0)
341
- average_mc_scores = np.mean(mc_scores_array, axis=0)
342
- bc_score_list = average_bc_scores.tolist()
343
- mc_score_list = average_mc_scores.tolist()
344
-
345
- bc_score = {"AI": bc_score[1].item(), "HUMAN": bc_score[0].item()}
346
- mc_score = {}
347
- label_map = ["OpenAI GPT", "Mistral", "CLAUDE", "Gemini", "LLAMA 2"]
348
-
349
- for score, label in zip(mc_score_list, label_map):
350
- mc_score[label.upper()] = score
351
-
352
- sum_prob = 1 - bc_score["HUMAN"]
353
- for key, value in mc_score.items():
354
- mc_score[key] = value * sum_prob
355
-
356
- if ai_option == "Human vs AI":
357
- mc_score = {}
358
-
359
- if sum_prob < 0.01 :
360
- mc_score = {}
361
- return bc_score, mc_score
362
- else:
363
- return bc_score, mc_score
364
 
365
  # COMBINED
366
  def main(
@@ -389,117 +47,18 @@ def main(
389
  domains_to_skip,
390
  )
391
  depth_analysis_plot = depth_analysis(input)
392
- bc_score, mc_score = ai_generated_test(ai_option,input)
 
393
  quilscore = predict_quillbot(input)
394
-
395
- return (
396
- bc_score,
397
- mc_score,
398
- formatted_tokens,
399
- depth_analysis_plot,
400
- quilscore
401
- )
402
 
403
-
404
- def build_date(year, month, day):
405
- return f"{year}{months[month]}{day}"
406
-
407
- def len_validator(text):
408
- min_tokens = 200
409
- lengt = len(text_bc_tokenizer.tokenize(text = text, return_tensors="pt"))
410
- if lengt < min_tokens:
411
- return f"Warning! Input length is {lengt}. Please input a text that is greater than {min_tokens} tokens long. Recommended length {min_tokens*2} tokens."
412
- else :
413
- return f"Input length ({lengt}) is satisified."
414
-
415
- def extract_text_from_pdf(pdf_path):
416
- doc = fitz.open(pdf_path)
417
- text = ""
418
- for page in doc:
419
- text += page.get_text()
420
- return text
421
-
422
-
423
- # DEPTH ANALYSIS
424
- print("loading depth analysis")
425
- nltk.download('stopwords')
426
- nltk.download('punkt')
427
- command = ['python3', '-m', 'spacy', 'download', 'en_core_web_sm']
428
- # Execute the command
429
- subprocess.run(command)
430
- nlp = spacy.load("en_core_web_sm")
431
-
432
- # for perplexity
433
- model_id = "gpt2"
434
- gpt2_model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
435
- gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
436
-
437
- def depth_analysis(input_text):
438
-
439
- # vocanulary richness
440
- processed_words = preprocess_text1(input_text)
441
- ttr_value = vocabulary_richness_ttr(processed_words)
442
-
443
- # readability
444
- gunning_fog = calculate_gunning_fog(input_text)
445
- gunning_fog_norm = normalize(gunning_fog, min_value=0, max_value=20)
446
-
447
- # average sentence length and average word length
448
- words, sentences = preprocess_text2(input_text)
449
- average_sentence_length = calculate_average_sentence_length(sentences)
450
- average_word_length = calculate_average_word_length(words)
451
- average_sentence_length_norm = normalize(average_sentence_length, min_value=0, max_value=40)
452
- average_word_length_norm = normalize(average_word_length, min_value=0, max_value=8)
453
-
454
- # syntactic_tree_depth
455
- average_tree_depth = calculate_syntactic_tree_depth(nlp, input_text)
456
- average_tree_depth_norm = normalize(average_tree_depth, min_value=0, max_value=10)
457
-
458
- # perplexity
459
- perplexity = calculate_perplexity(input_text, gpt2_model, gpt2_tokenizer, device)
460
- perplexity_norm = normalize(perplexity, min_value=0, max_value=30)
461
-
462
- features = {
463
- "readability": gunning_fog_norm,
464
- "syntactic tree depth": average_tree_depth_norm,
465
- "vocabulary richness": ttr_value,
466
- "perplexity": perplexity_norm,
467
- "average sentence length": average_sentence_length_norm,
468
- "average word length": average_word_length_norm,
469
- }
470
-
471
- print(features)
472
-
473
- fig = go.Figure()
474
-
475
- fig.add_trace(go.Scatterpolar(
476
- r=list(features.values()),
477
- theta=list(features.keys()),
478
- fill='toself',
479
- name='Radar Plot'
480
- ))
481
-
482
- fig.update_layout(
483
- polar=dict(
484
- radialaxis=dict(
485
- visible=True,
486
- range=[0, 100],
487
- )),
488
- showlegend=False,
489
- # autosize=False,
490
- # width=600,
491
- # height=600,
492
- margin=dict(
493
- l=10,
494
- r=20,
495
- b=10,
496
- t=10,
497
- # pad=100
498
- ),
499
  )
500
 
501
- return fig
502
-
503
 
504
  # START OF GRADIO
505
 
@@ -536,16 +95,23 @@ with gr.Blocks() as demo:
536
  with gr.Row():
537
  input_text = gr.Textbox(label="Input text", lines=6, placeholder="")
538
  file_input = gr.File(label="Upload PDF")
539
- file_input.change(fn=extract_text_from_pdf, inputs=file_input, outputs=input_text)
 
 
540
 
541
- char_count = gr.Textbox(label="Minumum Character Limit Check")
542
  input_text.change(fn=len_validator, inputs=input_text, outputs=char_count)
543
 
544
  with gr.Row():
545
  with gr.Column():
546
- ai_option = gr.Radio(["Human vs AI", "Human vs AI Source Models"], label="Choose an option please.")
 
 
 
547
  with gr.Column():
548
- plag_option = gr.Radio(["Standard", "Advanced"], label="Choose an option please.")
 
 
549
 
550
  with gr.Row():
551
  with gr.Column():
@@ -555,7 +121,7 @@ with gr.Blocks() as demo:
555
  only_plagiarism_btn = gr.Button("Source Check")
556
 
557
  with gr.Row():
558
- quillbot_check = gr.Button("Humanized Text Check (Quillbot)")
559
 
560
  with gr.Row():
561
  depth_analysis_btn = gr.Button("Detailed Writing Analysis")
@@ -568,14 +134,14 @@ with gr.Blocks() as demo:
568
  ## Output
569
  """
570
  )
571
-
572
  # models = gr.Dropdown(
573
- # model_list,
574
- # value=model_list,
575
- # multiselect=True,
576
- # label="Models to test against",
577
- # )
578
-
579
  with gr.Row():
580
  with gr.Column():
581
  bcLabel = gr.Label(label="Source")
@@ -627,9 +193,7 @@ with gr.Blocks() as demo:
627
 
628
  with gr.Row():
629
  with gr.Column():
630
- writing_analysis_plot = gr.Plot(
631
- label="Writing Analysis Plot"
632
- )
633
 
634
  full_check_btn.click(
635
  fn=main,
@@ -651,7 +215,7 @@ with gr.Blocks() as demo:
651
  mcLabel,
652
  sentenceBreakdown,
653
  writing_analysis_plot,
654
- QLabel
655
  ],
656
  api_name="main",
657
  )
@@ -702,4 +266,4 @@ with gr.Blocks() as demo:
702
  date_from = ""
703
  date_to = ""
704
 
705
- demo.launch(share=True, server_name="0.0.0.0", server_port =80, auth=("polygraf-admin", "test@aisd"))
 
 
1
  import gradio as gr
2
  import numpy as np
3
  from datetime import date
4
+ from predictors import predict_bc_scores, predict_mc_scores
5
+ from analysis import depth_analysis
6
+ from predictors import predict_quillbot
7
+ from plagiarism import plagiarism_check, build_date
8
+ from utils import extract_text_from_pdf, len_validator
9
 
10
  np.set_printoptions(suppress=True)
11
 
12
 
13
+ def ai_generated_test(option, input):
14
+ if option == "Human vs AI":
15
+ return predict_bc_scores(input), None
16
+ else:
17
+ return (
18
+ predict_bc_scores(input),
19
+ predict_mc_scores(input),
20
  )
 
 
 
 
 
21
 
22
 
23
  # COMBINED
24
  def main(
 
47
  domains_to_skip,
48
  )
49
  depth_analysis_plot = depth_analysis(input)
50
+ bc_score = predict_bc_scores(input)
51
+ mc_score = predict_mc_scores(input)
52
  quilscore = predict_quillbot(input)
53
 
54
+ return (
55
+ bc_score,
56
+ mc_score,
57
+ formatted_tokens,
58
+ depth_analysis_plot,
59
+ quilscore,
60
  )
61
 
 
 
62
 
63
  # START OF GRADIO
64
 
 
95
  with gr.Row():
96
  input_text = gr.Textbox(label="Input text", lines=6, placeholder="")
97
  file_input = gr.File(label="Upload PDF")
98
+ file_input.change(
99
+ fn=extract_text_from_pdf, inputs=file_input, outputs=input_text
100
+ )
101
 
102
+ char_count = gr.Textbox(label="Minimum Character Limit Check")
103
  input_text.change(fn=len_validator, inputs=input_text, outputs=char_count)
104
 
105
  with gr.Row():
106
  with gr.Column():
107
+ ai_option = gr.Radio(
108
+ ["Human vs AI", "Human vs AI Source Models"],
109
+ label="Please choose an option.",
110
+ )
111
  with gr.Column():
112
+ plag_option = gr.Radio(
113
+ ["Standard", "Advanced"], label="Choose an option please."
114
+ )
115
 
116
  with gr.Row():
117
  with gr.Column():
 
121
  only_plagiarism_btn = gr.Button("Source Check")
122
 
123
  with gr.Row():
124
+ quillbot_check = gr.Button("Humanized Text Check")
125
 
126
  with gr.Row():
127
  depth_analysis_btn = gr.Button("Detailed Writing Analysis")
 
134
  ## Output
135
  """
136
  )
137
+
138
  # models = gr.Dropdown(
139
+ # model_list,
140
+ # value=model_list,
141
+ # multiselect=True,
142
+ # label="Models to test against",
143
+ # )
144
+
145
  with gr.Row():
146
  with gr.Column():
147
  bcLabel = gr.Label(label="Source")
 
193
 
194
  with gr.Row():
195
  with gr.Column():
196
+ writing_analysis_plot = gr.Plot(label="Writing Analysis Plot")
 
 
197
 
198
  full_check_btn.click(
199
  fn=main,
 
215
  mcLabel,
216
  sentenceBreakdown,
217
  writing_analysis_plot,
218
+ QLabel,
219
  ],
220
  api_name="main",
221
  )
 
266
  date_from = ""
267
  date_to = ""
268
 
269
+ demo.launch(share=True, server_name="0.0.0.0", server_port=80, auth=("polygraf-admin", "test@aisd"))
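
The net effect of this diff is that app.py keeps only the Gradio wiring while detection, plagiarism, and analysis logic move into dedicated modules. A rough sketch (not part of the commit) of exercising those modules directly, bypassing the UI, with a placeholder input:

# illustrative only: call the refactored modules without Gradio
from predictors import predict_bc_scores, predict_mc_scores, predict_quillbot
from analysis import depth_analysis

text = "..."  # placeholder; any passage of roughly 200+ tokens

bc_score = predict_bc_scores(text)    # {"AI": ..., "HUMAN": ...}
mc_score = predict_mc_scores(text)    # per-source scores, scaled by 1 - P(HUMAN)
quill = predict_quillbot(text)        # {"Humanized": ..., "Original": ...}
radar = depth_analysis(text)          # plotly radar chart of writing features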
explainability.py ADDED
@@ -0,0 +1,119 @@
1
+ import re, textstat
2
+ from nltk import FreqDist
3
+ from nltk.corpus import stopwords
4
+ from nltk.tokenize import word_tokenize, sent_tokenize
5
+ import torch
6
+ import nltk
7
+ from tqdm import tqdm
8
+
9
+ nltk.download("punkt")
10
+
11
+
12
+ def normalize(value, min_value, max_value):
13
+ normalized_value = ((value - min_value) * 100) / (max_value - min_value)
14
+ return max(0, min(100, normalized_value))
15
+
16
+
17
+ def preprocess_text1(text):
18
+ text = text.lower()
19
+ text = re.sub(r"[^\w\s]", "", text) # remove punctuation
20
+ stop_words = set(stopwords.words("english")) # remove stopwords
21
+ words = [word for word in text.split() if word not in stop_words]
22
+ words = [word for word in words if not word.isdigit()] # remove numbers
23
+ return words
24
+
25
+
26
+ def vocabulary_richness_ttr(words):
27
+ unique_words = set(words)
28
+ ttr = len(unique_words) / len(words) * 100
29
+ return ttr
30
+
31
+
32
+ def calculate_gunning_fog(text):
33
+ """range 0-20"""
34
+ gunning_fog = textstat.gunning_fog(text)
35
+ return gunning_fog
36
+
37
+
38
+ def calculate_automated_readability_index(text):
39
+ """range 1-20"""
40
+ ari = textstat.automated_readability_index(text)
41
+ return ari
42
+
43
+
44
+ def calculate_flesch_reading_ease(text):
45
+ """range 0-100"""
46
+ fre = textstat.flesch_reading_ease(text)
47
+ return fre
48
+
49
+
50
+ def preprocess_text2(text):
51
+ sentences = sent_tokenize(text)
52
+ words = [
53
+ word.lower()
54
+ for sent in sentences
55
+ for word in word_tokenize(sent)
56
+ if word.isalnum()
57
+ ]
58
+ stop_words = set(stopwords.words("english"))
59
+ words = [word for word in words if word not in stop_words]
60
+ return words, sentences
61
+
62
+
63
+ def calculate_average_sentence_length(sentences):
64
+ """range 0-40 or 50 based on the histogram"""
65
+ total_words = sum(len(word_tokenize(sent)) for sent in sentences)
66
+ average_sentence_length = total_words / (len(sentences) + 0.0000001)
67
+ return average_sentence_length
68
+
69
+
70
+ def calculate_average_word_length(words):
71
+ """range 0-8 based on the histogram"""
72
+ total_characters = sum(len(word) for word in words)
73
+ average_word_length = total_characters / (len(words) + 0.0000001)
74
+ return average_word_length
75
+
76
+
77
+ def calculate_max_depth(sent):
78
+ return max(len(list(token.ancestors)) for token in sent)
79
+
80
+
81
+ def calculate_syntactic_tree_depth(nlp, text):
82
+ """0-10 based on the histogram"""
83
+ doc = nlp(text)
84
+ sentence_depths = [calculate_max_depth(sent) for sent in doc.sents]
85
+ average_depth = (
86
+ sum(sentence_depths) / len(sentence_depths) if sentence_depths else 0
87
+ )
88
+ return average_depth
89
+
90
+
91
+ def calculate_perplexity(text, model, tokenizer, device, stride=512):
92
+ """range 0-30 based on the histogram"""
93
+ encodings = tokenizer(text, return_tensors="pt")
94
+ max_length = model.config.n_positions
95
+ seq_len = encodings.input_ids.size(1)
96
+
97
+ nlls = []
98
+ prev_end_loc = 0
99
+ for begin_loc in tqdm(range(0, seq_len, stride)):
100
+ end_loc = min(begin_loc + max_length, seq_len)
101
+ trg_len = (
102
+ end_loc - prev_end_loc
103
+ ) # may be different from stride on last loop
104
+ input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
105
+ target_ids = input_ids.clone()
106
+ target_ids[:, :-trg_len] = -100
107
+
108
+ with torch.no_grad():
109
+ outputs = model(input_ids, labels=target_ids)
110
+ neg_log_likelihood = outputs.loss
111
+
112
+ nlls.append(neg_log_likelihood)
113
+
114
+ prev_end_loc = end_loc
115
+ if end_loc == seq_len:
116
+ break
117
+
118
+ ppl = torch.exp(torch.stack(nlls).mean())
119
+ return ppl.item()
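
A hedged usage sketch for the sliding-window perplexity helper; the "gpt2" checkpoint and the sample sentence are assumptions (the app itself reads READABILITY_MODEL_ID from config.yaml):

# standalone check of explainability.calculate_perplexity (illustrative)
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from explainability import calculate_perplexity, normalize

device = "cuda" if torch.cuda.is_available() else "cpu"
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)   # assumed checkpoint
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

text = "Perplexity measures how well a language model predicts a passage."
ppl = calculate_perplexity(text, model, tokenizer, device)
print(ppl)                                        # raw perplexity
print(normalize(ppl, min_value=0, max_value=30))  # clamped to the 0-100 scale used by the radar plot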
plagiarism.py ADDED
@@ -0,0 +1,344 @@
1
+ import time
2
+ from nltk.tokenize import sent_tokenize
3
+ from googleapiclient.discovery import build
4
+ from collections import Counter
5
+ import re, math
6
+ from sentence_transformers import SentenceTransformer, util
7
+ import asyncio
8
+ import httpx
9
+ from bs4 import BeautifulSoup
10
+ import numpy as np
11
+ import concurrent
12
+
13
+
14
+ WORD = re.compile(r"\w+")
15
+ model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
16
+
17
+
18
+ # returns cosine similarity of two vectors
19
+ # input: two vectors
20
+ # output: integer between 0 and 1.
21
+ def get_cosine(vec1, vec2):
22
+ intersection = set(vec1.keys()) & set(vec2.keys())
23
+
24
+ # calculating numerator
25
+ numerator = sum([vec1[x] * vec2[x] for x in intersection])
26
+
27
+ # calculating denominator
28
+ sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
29
+ sum2 = sum([vec2[x] ** 2 for x in vec2.keys()])
30
+ denominator = math.sqrt(sum1) * math.sqrt(sum2)
31
+
32
+ # checking for divide by zero
33
+ if denominator == 0:
34
+ return 0.0
35
+ else:
36
+ return float(numerator) / denominator
37
+
38
+
39
+ # converts given text into a vector
40
+ def text_to_vector(text):
41
+ # uses the Regular expression above and gets all words
42
+ words = WORD.findall(text)
43
+ # returns a counter of all the words (count of number of occurences)
44
+ return Counter(words)
45
+
46
+
47
+ # returns cosine similarity of two words
48
+ # uses: text_to_vector(text) and get_cosine(v1,v2)
49
+ def cosineSim(text1, text2):
50
+ vector1 = text_to_vector(text1)
51
+ vector2 = text_to_vector(text2)
52
+ # print vector1,vector2
53
+ cosine = get_cosine(vector1, vector2)
54
+ return cosine
55
+
56
+
57
+ def cos_sim_torch(embedding_1, embedding_2):
58
+ return util.pytorch_cos_sim(embedding_1, embedding_2).item()
59
+
60
+
61
+ def embed_text(text):
62
+ return model.encode(text, convert_to_tensor=True)
63
+
64
+
65
+ def sentence_similarity(text1, text2):
66
+ embedding_1 = model.encode(text1, convert_to_tensor=True)
67
+ embedding_2 = model.encode(text2, convert_to_tensor=True)
68
+
69
+ o = util.pytorch_cos_sim(embedding_1, embedding_2)
70
+ return o.item()
71
+
72
+
73
+ def google_search(
74
+ plag_option,
75
+ sentences,
76
+ urlCount,
77
+ scoreArray,
78
+ urlList,
79
+ sorted_date,
80
+ domains_to_skip,
81
+ api_key,
82
+ cse_id,
83
+ **kwargs,
84
+ ):
85
+ service = build("customsearch", "v1", developerKey=api_key)
86
+ for i, sentence in enumerate(sentences):
87
+ results = (
88
+ service.cse()
89
+ .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
90
+ .execute()
91
+ )
92
+ if "items" in results and len(results["items"]) > 0:
93
+ for count, link in enumerate(results["items"]):
94
+ # stop after 3 pages
95
+ if count >= 3:
96
+ break
97
+ # skip user selected domains
98
+ if any(
99
+ ("." + domain) in link["link"] for domain in domains_to_skip
100
+ ):
101
+ continue
102
+ # clean up snippet of '...'
103
+ snippet = link["snippet"]
104
+ ind = snippet.find("...")
105
+ if ind < 20 and ind > 9:
106
+ snippet = snippet[ind + len("... ") :]
107
+ ind = snippet.find("...")
108
+ if ind > len(snippet) - 5:
109
+ snippet = snippet[:ind]
110
+
111
+ # update cosine similarity between snippet and given text
112
+ url = link["link"]
113
+ if url not in urlList:
114
+ urlList.append(url)
115
+ scoreArray.append([0] * len(sentences))
116
+ urlCount[url] = urlCount[url] + 1 if url in urlCount else 1
117
+ if plag_option == "Standard":
118
+ scoreArray[urlList.index(url)][i] = cosineSim(
119
+ sentence, snippet
120
+ )
121
+ else:
122
+ scoreArray[urlList.index(url)][i] = sentence_similarity(
123
+ sentence, snippet
124
+ )
125
+ return urlCount, scoreArray
126
+
127
+
128
+ def split_sentence_blocks(text):
129
+
130
+ sents = sent_tokenize(text)
131
+ two_sents = []
132
+ for i in range(len(sents)):
133
+ if (i % 4) == 0:
134
+ two_sents.append(sents[i])
135
+ else:
136
+ two_sents[len(two_sents) - 1] += " " + sents[i]
137
+ return two_sents
138
+
139
+
140
+ months = {
141
+ "January": "01",
142
+ "February": "02",
143
+ "March": "03",
144
+ "April": "04",
145
+ "May": "05",
146
+ "June": "06",
147
+ "July": "07",
148
+ "August": "08",
149
+ "September": "09",
150
+ "October": "10",
151
+ "November": "11",
152
+ "December": "12",
153
+ }
154
+
155
+
156
+ def build_date(year=2024, month="March", day=1):
157
+ return f"{year}{months[month]}{day}"
158
+
159
+
160
+ async def get_url_data(url, client):
161
+ try:
162
+ r = await client.get(url)
163
+ # print(r.status_code)
164
+ if r.status_code == 200:
165
+ # print("in")
166
+ soup = BeautifulSoup(r.content, "html.parser")
167
+ return soup
168
+ except Exception:
169
+ return None
170
+
171
+
172
+ def remove_punc(text):
173
+ res = re.sub(r"[^\w\s]", "", text)
174
+ return res
175
+
176
+
177
+ def split_ngrams(text, n):
178
+ # return n-grams of size n
179
+ words = text.split()
180
+ return [words[i : i + n] for i in range(len(words) - n + 1)]
181
+
182
+
183
+ async def parallel_scrap(urls):
184
+ async with httpx.AsyncClient(timeout=30) as client:
185
+ tasks = []
186
+ for url in urls:
187
+ tasks.append(get_url_data(url=url, client=client))
188
+ results = await asyncio.gather(*tasks, return_exceptions=True)
189
+ return results
190
+
191
+
192
+ def matching_score(args_list):
193
+ sentence = remove_punc(args_list[0])
194
+ content = remove_punc(args_list[1])
195
+ if sentence in content:
196
+ return 1
197
+ else:
198
+ n = 5
199
+ ngrams = split_ngrams(sentence, n)
200
+ if len(ngrams) == 0:
201
+ return 0
202
+ matched = [x for x in ngrams if " ".join(x) in content]
203
+ return len(matched) / len(ngrams)
204
+
205
+
206
+ def plagiarism_check(
207
+ plag_option,
208
+ input,
209
+ year_from,
210
+ month_from,
211
+ day_from,
212
+ year_to,
213
+ month_to,
214
+ day_to,
215
+ domains_to_skip,
216
+ ):
217
+ api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
218
+ api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
219
+ api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
220
+ # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
221
+ api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
222
+ cse_id = "851813e81162b4ed4"
223
+
224
+ sentences = split_sentence_blocks(input)
225
+ urlCount = {}
226
+ ScoreArray = []
227
+ urlList = []
228
+ date_from = build_date(year_from, month_from, day_from)
229
+ date_to = build_date(year_to, month_to, day_to)
230
+ sort_date = f"date:r:{date_from}:{date_to}"
231
+ # get list of URLS to check
232
+ urlCount, ScoreArray = google_search(
233
+ plag_option,
234
+ sentences,
235
+ urlCount,
236
+ ScoreArray,
237
+ urlList,
238
+ sort_date,
239
+ domains_to_skip,
240
+ api_key,
241
+ cse_id,
242
+ )
243
+
244
+ # Scrape URLs in list
245
+ formatted_tokens = []
246
+ soups = asyncio.run(parallel_scrap(urlList))
247
+
248
+ # Populate matching scores for scrapped pages
249
+ for i, soup in enumerate(soups):
250
+ print(f"Analyzing {i+1} of {len(soups)} soups........................")
251
+ if soup:
252
+ page_content = soup.text
253
+ for j, sent in enumerate(sentences):
254
+ args_list = (sent, page_content)
255
+ score = matching_score(args_list)
256
+ # score = cos_sim_torch(embed_text(sent), source_embeddings[i])
257
+ ScoreArray[i][j] = score
258
+
259
+ # with concurrent.futures.ProcessPoolExecutor() as executor:
260
+ # results = executor.map(matching_score, args_list)
261
+
262
+ # *****IF THIS IS TO BE USED, PLEASE PROVIDE "preprocess()" FUNCTION IN LINE 248**************
263
+ # source_embeddings = []
264
+ # for i, soup in enumerate(soups):
265
+ # if soup:
266
+ # page_content = soup.text
267
+ # source_embeddings.append(embed_text(page_content))
268
+ # else:
269
+ # source_embeddings.append(None)
270
+
271
+ # def compute_cosine_similarity(args):
272
+ # sent, source_embedding, i, j = args
273
+ # score = cos_sim_torch(embed_text(sent), source_embedding)
274
+ # return i, j, score
275
+
276
+ # def main(soups, sentences):
277
+ # source_embeddings = [preprocess(soup) for soup in soups]
278
+ # ScoreArray = [[0 for _ in sentences] for _ in soups]
279
+ # args_list = []
280
+ # for i, soup in enumerate(soups):
281
+ # if soup:
282
+ # for j, sent in enumerate(sentences):
283
+ # args_list.append((sent, source_embeddings[i], i, j))
284
+ # with concurrent.futures.ProcessPoolExecutor() as executor:
285
+ # results = executor.map(compute_cosine_similarity, args_list)
286
+ # for i, j, score in results:
287
+ # ScoreArray[i][j] = score
288
+ # return ScoreArray
289
+
290
+ # # Populate matching scores for scrapped pages
291
+ # ScoreArray = main(soups, sentences)
292
+ # *******************************************************************************************
293
+
294
+ # Calculate URL of max matching score for each sentence chunk
295
+ sentenceToMaxURL = [-1] * len(sentences)
296
+ for j in range(len(sentences)):
297
+ if j > 0:
298
+ maxScore = ScoreArray[sentenceToMaxURL[j - 1]][j]
299
+ sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
300
+ else:
301
+ maxScore = -1
302
+
303
+ for i in range(len(ScoreArray)):
304
+ margin = (
305
+ 0.1
306
+ if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
307
+ else 0
308
+ )
309
+ if ScoreArray[i][j] - maxScore > margin:
310
+ maxScore = ScoreArray[i][j]
311
+ sentenceToMaxURL[j] = i
312
+
313
+ index = np.unique(sentenceToMaxURL)
314
+
315
+ urlScore = {}
316
+ for url in index:
317
+ s = [
318
+ ScoreArray[url][sen]
319
+ for sen in range(len(sentences))
320
+ if sentenceToMaxURL[sen] == url
321
+ ]
322
+ urlScore[url] = sum(s) / len(s)
323
+
324
+ index_descending = sorted(urlScore, key=urlScore.get, reverse=True)
325
+
326
+ urlMap = {}
327
+ for count, i in enumerate(index_descending):
328
+ urlMap[i] = count + 1
329
+ for i, sent in enumerate(sentences):
330
+ formatted_tokens.append(
331
+ (sent, "[" + str(urlMap[sentenceToMaxURL[i]]) + "]")
332
+ )
333
+ for ind in index_descending:
334
+ formatted_tokens.append(
335
+ (
336
+ urlList[ind]
337
+ + " --- Matching Score: "
338
+ + f"{str(round(urlScore[ind] * 100, 2))}%",
339
+ "[" + str(urlMap[ind]) + "]",
340
+ )
341
+ )
342
+ formatted_tokens.append(("\n", None))
343
+
344
+ return formatted_tokens
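
For context, the n-gram matcher and the bag-of-words cosine can be exercised in isolation; the two sentences below are made-up examples (note that importing plagiarism also loads the MiniLM sentence-transformer at module import time):

# illustrative check of plagiarism.matching_score and plagiarism.cosineSim
from plagiarism import matching_score, cosineSim

source = "the quick brown fox jumps over the lazy dog near the river bank"
candidate = "a quick brown fox jumps over the lazy dog"

# fraction of the candidate's 5-grams that appear verbatim in the source
print(matching_score([candidate, source]))
# bag-of-words cosine similarity between the two strings
print(cosineSim(candidate, source))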
predictors.py ADDED
@@ -0,0 +1,246 @@
1
+ import requests
2
+ import httpx
3
+ import torch
4
+ import re
5
+ from bs4 import BeautifulSoup
6
+ import numpy as np
7
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
8
+ import asyncio
9
+ from evaluate import load
10
+ from datetime import date
11
+ import nltk
12
+ from transformers import GPT2LMHeadModel, GPT2TokenizerFast
13
+ import plotly.graph_objects as go
14
+ import torch.nn.functional as F
15
+ import nltk
16
+ from unidecode import unidecode
17
+ import time
18
+ from scipy.special import softmax
19
+ import yaml
20
+ import os
21
+ from utils import *
22
+ from dotenv import load_dotenv
23
+
24
+ with open("config.yaml", "r") as file:
25
+ params = yaml.safe_load(file)
26
+ nltk.download("punkt")
27
+ nltk.download("stopwords")
28
+ load_dotenv()
29
+ device = "cuda" if torch.cuda.is_available() else "cpu"
30
+ text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
31
+ text_mc_model_path = params["TEXT_MC_MODEL_PATH"]
32
+ text_quillbot_model_path = params["TEXT_QUILLBOT_MODEL_PATH"]
33
+ quillbot_labels = params["QUILLBOT_LABELS"]
34
+ mc_label_map = params["MC_OUTPUT_LABELS"]
35
+ mc_token_size = int(params["MC_TOKEN_SIZE"])
36
+ bc_token_size = int(params["BC_TOKEN_SIZE"])
37
+ text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
38
+ text_bc_model = AutoModelForSequenceClassification.from_pretrained(
39
+ text_bc_model_path
40
+ ).to(device)
41
+ text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
42
+ text_mc_model = AutoModelForSequenceClassification.from_pretrained(
43
+ text_mc_model_path
44
+ ).to(device)
45
+ quillbot_tokenizer = AutoTokenizer.from_pretrained(text_quillbot_model_path)
46
+ quillbot_model = AutoModelForSequenceClassification.from_pretrained(
47
+ text_quillbot_model_path
48
+ ).to(device)
49
+
50
+
51
+ def split_text_allow_complete_sentences_nltk(
52
+ text,
53
+ max_length=256,
54
+ tolerance=30,
55
+ min_last_segment_length=100,
56
+ type_det="bc",
57
+ ):
58
+ sentences = nltk.sent_tokenize(text)
59
+ segments = []
60
+ current_segment = []
61
+ current_length = 0
62
+ if type_det == "bc":
63
+ tokenizer = text_bc_tokenizer
64
+ max_length = bc_token_size
65
+ elif type_det == "mc":
66
+ tokenizer = text_mc_tokenizer
67
+ max_length = mc_token_size
68
+ for sentence in sentences:
69
+ tokens = tokenizer.tokenize(sentence)
70
+ sentence_length = len(tokens)
71
+
72
+ if current_length + sentence_length <= max_length + tolerance - 2:
73
+ current_segment.append(sentence)
74
+ current_length += sentence_length
75
+ else:
76
+ if current_segment:
77
+ encoded_segment = tokenizer.encode(
78
+ " ".join(current_segment),
79
+ add_special_tokens=True,
80
+ max_length=max_length + tolerance,
81
+ truncation=True,
82
+ )
83
+ segments.append((current_segment, len(encoded_segment)))
84
+ current_segment = [sentence]
85
+ current_length = sentence_length
86
+
87
+ if current_segment:
88
+ encoded_segment = tokenizer.encode(
89
+ " ".join(current_segment),
90
+ add_special_tokens=True,
91
+ max_length=max_length + tolerance,
92
+ truncation=True,
93
+ )
94
+ segments.append((current_segment, len(encoded_segment)))
95
+
96
+ final_segments = []
97
+ for i, (seg, length) in enumerate(segments):
98
+ if i == len(segments) - 1:
99
+ if length < min_last_segment_length and len(final_segments) > 0:
100
+ prev_seg, prev_length = final_segments[-1]
101
+ combined_encoded = tokenizer.encode(
102
+ " ".join(prev_seg + seg),
103
+ add_special_tokens=True,
104
+ max_length=max_length + tolerance,
105
+ truncation=True,
106
+ )
107
+ if len(combined_encoded) <= max_length + tolerance:
108
+ final_segments[-1] = (prev_seg + seg, len(combined_encoded))
109
+ else:
110
+ final_segments.append((seg, length))
111
+ else:
112
+ final_segments.append((seg, length))
113
+ else:
114
+ final_segments.append((seg, length))
115
+
116
+ decoded_segments = []
117
+ encoded_segments = []
118
+ for seg, _ in final_segments:
119
+ encoded_segment = tokenizer.encode(
120
+ " ".join(seg),
121
+ add_special_tokens=True,
122
+ max_length=max_length + tolerance,
123
+ truncation=True,
124
+ )
125
+ decoded_segment = tokenizer.decode(encoded_segment)
126
+ decoded_segments.append(decoded_segment)
127
+ return decoded_segments
128
+
129
+
130
+ def predict_quillbot(text):
131
+ with torch.no_grad():
132
+ quillbot_model.eval()
133
+ tokenized_text = quillbot_tokenizer(
134
+ text,
135
+ padding="max_length",
136
+ truncation=True,
137
+ max_length=256,
138
+ return_tensors="pt",
139
+ ).to(device)
140
+ output = quillbot_model(**tokenized_text)
141
+ output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
142
+ q_score = {
143
+ "Humanized": output_norm[1].item(),
144
+ "Original": output_norm[0].item(),
145
+ }
146
+ return q_score
147
+
148
+
149
+ def predict_bc(model, tokenizer, text):
150
+ with torch.no_grad():
151
+ model.eval()
152
+ tokens = text_bc_tokenizer(
153
+ text,
154
+ padding="max_length",
155
+ truncation=True,
156
+ max_length=bc_token_size,
157
+ return_tensors="pt",
158
+ ).to(device)
159
+ output = model(**tokens)
160
+ output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
161
+ return output_norm
162
+
163
+
164
+ def predict_mc(model, tokenizer, text):
165
+ with torch.no_grad():
166
+ model.eval()
167
+ tokens = text_mc_tokenizer(
168
+ text,
169
+ padding="max_length",
170
+ truncation=True,
171
+ return_tensors="pt",
172
+ max_length=mc_token_size,
173
+ ).to(device)
174
+ output = model(**tokens)
175
+ output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
176
+ return output_norm
177
+
178
+
179
+ def predict_mc_scores(input):
180
+ bc_scores = []
181
+ mc_scores = []
182
+
183
+ samples_len_bc = len(
184
+ split_text_allow_complete_sentences_nltk(input, type_det="bc")
185
+ )
186
+ segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
187
+ for i in range(samples_len_bc):
188
+ cleaned_text_bc = remove_special_characters(segments_bc[i])
189
+ bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
190
+ bc_scores.append(bc_score)
191
+ bc_scores_array = np.array(bc_scores)
192
+ average_bc_scores = np.mean(bc_scores_array, axis=0)
193
+ bc_score_list = average_bc_scores.tolist()
194
+ bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}
195
+ segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="mc")
196
+ samples_len_mc = len(
197
+ split_text_allow_complete_sentences_nltk(input, type_det="mc")
198
+ )
199
+ for i in range(samples_len_mc):
200
+ cleaned_text_mc = remove_special_characters(segments_mc[i])
201
+ mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
202
+ mc_scores.append(mc_score)
203
+ mc_scores_array = np.array(mc_scores)
204
+ average_mc_scores = np.mean(mc_scores_array, axis=0)
205
+ mc_score_list = average_mc_scores.tolist()
206
+ mc_score = {}
207
+ for score, label in zip(mc_score_list, mc_label_map):
208
+ mc_score[label.upper()] = score
209
+
210
+ sum_prob = 1 - bc_score["HUMAN"]
211
+ for key, value in mc_score.items():
212
+ mc_score[key] = value * sum_prob
213
+ if sum_prob < 0.01:
214
+ mc_score = {}
215
+
216
+ return mc_score
217
+
218
+
219
+ def predict_bc_scores(input):
220
+ bc_scores = []
221
+ mc_scores = []
222
+ samples_len_bc = len(
223
+ split_text_allow_complete_sentences_nltk(input, type_det="bc")
224
+ )
225
+ segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
226
+ for i in range(samples_len_bc):
227
+ cleaned_text_bc = remove_special_characters(segments_bc[i])
228
+ bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
229
+ bc_scores.append(bc_score)
230
+ bc_scores_array = np.array(bc_scores)
231
+ average_bc_scores = np.mean(bc_scores_array, axis=0)
232
+ bc_score_list = average_bc_scores.tolist()
233
+ bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}
234
+ return bc_score
235
+
236
+
237
+ # def predict_1on1(input):
238
+ # models = ['bard', 'claude', 'gpt4', 'mistral_ai', 'llama2']
239
+ # text = str(row["text"])
240
+ # predictions = {}
241
+ # prediction = predict(text, bard_model, bard_tokenizer) predictions['bard'] = prediction[1]
242
+ # prediction = predict(text, claude_model, claude_tokenizer) predictions['claude'] = prediction[1]
243
+ # prediction = predict(text, gpt4_model, gpt4_tokenizer) predictions['gpt4'] = prediction[1]
244
+ # prediction = predict(text, mistral_ai_model, mistral_ai_tokenizer) predictions['mistral_ai'] = prediction[1]
245
+ # prediction = predict(text, llama2_model, llama2_tokenizer) predictions['llama2'] = prediction[1]
246
+ # max_key = max(predictions, key=predictions.get)
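
A minimal sketch of how the segmenter feeds the classifiers in this module; it assumes config.yaml and the referenced model weights are available, so treat it as illustrative rather than a drop-in test:

# illustrative use of predictors.py (requires config.yaml and model downloads)
from predictors import split_text_allow_complete_sentences_nltk, predict_bc_scores

passage = "..."  # placeholder; any long passage

segments = split_text_allow_complete_sentences_nltk(passage, type_det="bc")
print(len(segments))               # sentence-aligned chunks of roughly BC_TOKEN_SIZE tokens
print(predict_bc_scores(passage))  # {"AI": ..., "HUMAN": ...}, averaged over those chunks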
requirements.txt CHANGED
@@ -6,8 +6,8 @@ BeautifulSoup4
6
  scrapingbee
7
  requests
8
  numpy
9
- torch==1.13.0
10
- transformers==4.25.1
11
  transformers-interpret
12
  textstat
13
  scipy
@@ -22,4 +22,5 @@ plotly
22
  tqdm
23
  pymupdf
24
  sentence-transformers
25
- Unidecode
 
 
6
  scrapingbee
7
  requests
8
  numpy
9
+ torch
10
+ transformers
11
  transformers-interpret
12
  textstat
13
  scipy
 
22
  tqdm
23
  pymupdf
24
  sentence-transformers
25
+ Unidecode
26
+ python-dotenv
utils.py CHANGED
@@ -11,268 +11,354 @@ import asyncio
11
  import nltk
12
  from sentence_transformers import SentenceTransformer, util
13
  import threading
14
 
15
- nltk.download('punkt')
16
 
17
- WORD = re.compile(r"\w+")
18
- model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
 
19
 
20
 
21
- # returns cosine similarity of two vectors
22
- # input: two vectors
23
- # output: integer between 0 and 1.
24
- def get_cosine(vec1, vec2):
25
- intersection = set(vec1.keys()) & set(vec2.keys())
26
 
27
- # calculating numerator
28
- numerator = sum([vec1[x] * vec2[x] for x in intersection])
29
 
30
- # calculating denominator
31
- sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
32
- sum2 = sum([vec2[x] ** 2 for x in vec2.keys()])
33
- denominator = math.sqrt(sum1) * math.sqrt(sum2)
34
 
35
- # checking for divide by zero
36
- if denominator == 0:
37
- return 0.0
38
- else:
39
- return float(numerator) / denominator
40
-
41
-
42
- # converts given text into a vector
43
- def text_to_vector(text):
44
- # uses the Regular expression above and gets all words
45
- words = WORD.findall(text)
46
- # returns a counter of all the words (count of number of occurences)
47
- return Counter(words)
48
-
49
-
50
- # returns cosine similarity of two words
51
- # uses: text_to_vector(text) and get_cosine(v1,v2)
52
- def cosineSim(text1, text2):
53
- vector1 = text_to_vector(text1)
54
- vector2 = text_to_vector(text2)
55
- # print vector1,vector2
56
- cosine = get_cosine(vector1, vector2)
57
- return cosine
58
-
59
- def sentence_similarity(text1, text2):
60
- embedding_1= model.encode(text1, convert_to_tensor=True)
61
- embedding_2 = model.encode(text2, convert_to_tensor=True)
62
-
63
- o = util.pytorch_cos_sim(embedding_1, embedding_2)
64
- return o.item()
65
-
66
- def get_soup_requests(url):
67
- page = requests.get(url)
68
- if page.status_code == 200:
69
- soup = BeautifulSoup(page.content, "html.parser")
70
- return soup
71
- print("HTML soup failed")
72
- return None
73
-
74
-
75
- def get_soup_httpx(url):
76
- client = httpx.Client(timeout=30)
77
- try:
78
- page = client.get(url)
79
- if page.status_code == httpx.codes.OK:
80
- soup = BeautifulSoup(page.content, "html.parser")
81
- return soup
82
- except:
83
- print("HTTPx soup failed")
84
- return None
85
-
86
- def getSentences(text):
87
- from nltk.tokenize import sent_tokenize
88
-
89
- sents = sent_tokenize(text)
90
- two_sents = []
91
- for i in range(len(sents)):
92
- if (i % 2) == 0:
93
- two_sents.append(sents[i])
94
- else:
95
- two_sents[len(two_sents) - 1] += " " + sents[i]
96
- return two_sents
97
-
98
-
99
- def googleSearch(
100
- plag_option,
101
- sentences,
102
- urlCount,
103
- scoreArray,
104
- urlList,
105
- sorted_date,
106
- domains_to_skip,
107
- api_key,
108
- cse_id,
109
- **kwargs,
110
- ):
111
- service = build("customsearch", "v1", developerKey=api_key)
112
- for i, sentence in enumerate(sentences):
113
- results = (
114
- service.cse()
115
- .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
116
- .execute()
117
- )
118
- if "items" in results and len(results["items"]) > 0:
119
- for count, link in enumerate(results["items"]):
120
- # stop after 3 pages
121
- if count >= 3:
122
- break
123
- # skip user selected domains
124
- if any(
125
- ("." + domain) in link["link"]
126
- for domain in domains_to_skip
127
- ):
128
- continue
129
- # clean up snippet of '...'
130
- snippet = link["snippet"]
131
- ind = snippet.find("...")
132
- if ind < 20 and ind > 9:
133
- snippet = snippet[ind + len("... ") :]
134
- ind = snippet.find("...")
135
- if ind > len(snippet) - 5:
136
- snippet = snippet[:ind]
137
-
138
- # update cosine similarity between snippet and given text
139
- url = link["link"]
140
- if url not in urlList:
141
- urlList.append(url)
142
- scoreArray.append([0] * len(sentences))
143
- urlCount[url] = urlCount[url] + 1 if url in urlCount else 1
144
- if plag_option == 'Standard':
145
- scoreArray[urlList.index(url)][i] = cosineSim(
146
- sentence, snippet)
147
- else :
148
- scoreArray[urlList.index(url)][i] = sentence_similarity(
149
- sentence, snippet
150
- )
151
- else:
152
- print("Google Search failed")
153
- return urlCount, scoreArray
154
-
155
-
156
- def getQueries(text, n):
157
- # return n-grams of size n
158
- words = text.split()
159
- return [words[i : i + n] for i in range(len(words) - n + 1)]
160
-
161
-
162
- def print2D(array):
163
- print(np.array(array))
164
-
165
-
166
- def removePunc(text):
167
- res = re.sub(r"[^\w\s]", "", text)
168
- return res
169
-
170
-
171
- async def get_url_data(url, client):
172
- try:
173
- r = await client.get(url)
174
- # print(r.status_code)
175
- if r.status_code == 200:
176
- # print("in")
177
- soup = BeautifulSoup(r.content, "html.parser")
178
- return soup
179
- except Exception:
180
- print("HTTPx parallel soup failed")
181
- return None
182
-
183
-
184
- async def parallel_scrap(urls):
185
- async with httpx.AsyncClient(timeout=30) as client:
186
- tasks = []
187
- for url in urls:
188
- tasks.append(get_url_data(url=url, client=client))
189
- results = await asyncio.gather(*tasks, return_exceptions=True)
190
- return results
191
-
192
-
193
- class TimeoutError(Exception):
194
- pass
195
-
196
-
197
-
198
- def matchingScore(sentence, content):
199
- if sentence in content:
200
- return 1
201
- sentence = removePunc(sentence)
202
- content = removePunc(content)
203
- if sentence in content:
204
- return 1
205
  else:
206
- n = 5
207
- ngrams = getQueries(sentence, n)
208
- if len(ngrams) == 0:
209
- return 0
210
- matched = [x for x in ngrams if " ".join(x) in content]
211
- return len(matched) / len(ngrams)
212
-
213
-
214
- def matchingScoreWithTimeout(sentence, content):
215
- def timeout_handler():
216
- raise TimeoutError("Function timed out")
217
-
218
- timer = threading.Timer(2, timeout_handler) # Set a timer for 2 seconds
219
- timer.start()
220
- try:
221
- score = matchingScore(sentence, content)
222
- timer.cancel() # Cancel the timer if calculation completes before timeout
223
- return score
224
- except TimeoutError:
225
- return 0
226
-
227
-
228
- async def matchingScoreAsync(sentences, content, content_idx, ScoreArray):
229
- content = removePunc(content)
230
- for j, sentence in enumerate(sentences):
231
- sentence = removePunc(sentence)
232
- if sentence in content:
233
- ScoreArray[content_idx][j] = 1
234
- else:
235
- n = 5
236
- ngrams = getQueries(sentence, n)
237
- if len(ngrams) == 0:
238
- return 0
239
- matched = [x for x in ngrams if " ".join(x) in content]
240
- ScoreArray[content_idx][j] = len(matched) / len(ngrams)
241
- print(
242
- f"Analyzed {content_idx+1} of soups (SOUP SUCCEEDED)........................"
243
- )
244
- return ScoreArray
245
-
246
-
247
- async def parallel_analyze(soups, sentences, ScoreArray):
248
- tasks = []
249
- for i, soup in enumerate(soups):
250
- if soup:
251
- page_content = soup.text
252
- tasks.append(
253
- matchingScoreAsync(sentences, page_content, i, ScoreArray)
254
- )
255
- else:
256
- print(
257
- f"Analyzed {i+1} of soups (SOUP FAILED)........................"
258
- )
259
- ScoreArray = await asyncio.gather(*tasks, return_exceptions=True)
260
- return ScoreArray
261
-
262
-
263
- async def parallel_analyze_2(soups, sentences, ScoreArray):
264
- tasks = [[0] * len(ScoreArray[0]) for i in range(len(ScoreArray))]
265
- for i, soup in enumerate(soups):
266
- if soup:
267
- page_content = soup.text
268
- for j, sent in enumerate(sentences):
269
- print(
270
- f"Analyzing {i+1} of {len(soups)} soups with {j+1} of {len(sentences)} sentences........................"
271
- )
272
- tasks[i][j] = matchingScore(sent, page_content)
273
- else:
274
- print(
275
- f"Analyzed {i+1} of soups (SOUP FAILED)........................"
276
- )
277
- ScoreArray = await asyncio.gather(*tasks, return_exceptions=True)
278
- return ScoreArray
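Editor's note: the removed matchingScore/getQueries pair above scores a sentence against scraped page text as the fraction of its word 5-grams that appear verbatim in that text. The following minimal standalone sketch (not part of either version of the file; function name and example strings are illustrative) shows the same idea end to end.

import re

def ngram_overlap_score(sentence: str, content: str, n: int = 5) -> float:
    # Strip punctuation the same way the removed removePunc helper does.
    sentence = re.sub(r"[^\w\s]", "", sentence)
    content = re.sub(r"[^\w\s]", "", content)
    if sentence in content:
        return 1.0
    words = sentence.split()
    ngrams = [words[i : i + n] for i in range(len(words) - n + 1)]
    if not ngrams:
        return 0.0
    matched = [g for g in ngrams if " ".join(g) in content]
    return len(matched) / len(ngrams)

s = "the quick brown fox jumps over the lazy dog"
c = "a story about the quick brown fox jumps over something else"
print(ngram_overlap_score(s, c))  # 2 of the 5 word 5-grams match -> 0.4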
  import nltk
  from sentence_transformers import SentenceTransformer, util
  import threading
+ import torch
+ import re
+ import numpy as np
+ import asyncio
+ from datetime import date
+ import nltk
+ from unidecode import unidecode
+ from scipy.special import softmax
+ from transformers import AutoTokenizer
+ import yaml
+ import fitz
+ import os


+ def remove_accents(input_str):
+     text_no_accents = unidecode(input_str)
+     return text_no_accents


+ def remove_special_characters(text):
+     text = remove_accents(text)
+     pattern = r'[^\w\s\d.,!?\'"()-;]+'
+     text = re.sub(pattern, "", text)
+     return text


+ def remove_special_characters_2(text):
+     pattern = r"[^a-zA-Z0-9 ]+"
+     text = re.sub(pattern, "", text)
+     return text
+
+
+ def update_character_count(text):
+     return f"{len(text)} characters"
+
+
+ nltk.download("punkt")
+
+
+ with open("config.yaml", "r") as file:
+     params = yaml.safe_load(file)
+
+ text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
+
+ text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
+
+
+ def len_validator(text):
+     min_tokens = 200
+     length = len(text_bc_tokenizer.tokenize(text=text, return_tensors="pt"))
+     if length < min_tokens:
+         return f"Warning! Input length is {length}. Please input a text that is longer than {min_tokens} tokens. Recommended length: {min_tokens*2} tokens."
      else:
+         return f"Input length ({length}) is satisfied."
+
+
+ def extract_text_from_pdf(pdf_path):
+     doc = fitz.open(pdf_path)
+     text = ""
+     for page in doc:
+         text += page.get_text()
+     return text
+
+
+ WORD = re.compile(r"\w+")
+ model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+
+
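Editor's note: a hedged sketch (not part of this commit; the function name is illustrative) of how the MiniLM model loaded above is typically combined with sentence_transformers.util into the sentence-level cosine similarity that the commented-out sentence_similarity helper below describes.

from sentence_transformers import SentenceTransformer, util

_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def sentence_similarity_sketch(text1: str, text2: str) -> float:
    # Encode both texts and take the cosine similarity of their embeddings.
    emb1 = _model.encode(text1, convert_to_tensor=True)
    emb2 = _model.encode(text2, convert_to_tensor=True)
    # util.pytorch_cos_sim returns a 1x1 tensor; .item() extracts the float.
    return util.pytorch_cos_sim(emb1, emb2).item()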
+ # returns cosine similarity of two vectors
+ # input: two vectors
+ # output: integer between 0 and 1.
+ # def get_cosine(vec1, vec2):
+ #     intersection = set(vec1.keys()) & set(vec2.keys())
+
+ #     # calculating numerator
+ #     numerator = sum([vec1[x] * vec2[x] for x in intersection])
+
+ #     # calculating denominator
+ #     sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
+ #     sum2 = sum([vec2[x] ** 2 for x in vec2.keys()])
+ #     denominator = math.sqrt(sum1) * math.sqrt(sum2)
+
+ #     # checking for divide by zero
+ #     if denominator == 0:
+ #         return 0.0
+ #     else:
+ #         return float(numerator) / denominator
+
+
+ # # converts given text into a vector
+ # def text_to_vector(text):
+ #     # uses the Regular expression above and gets all words
+ #     words = WORD.findall(text)
+ #     # returns a counter of all the words (count of number of occurences)
+ #     return Counter(words)
+
+
+ # # returns cosine similarity of two words
+ # # uses: text_to_vector(text) and get_cosine(v1,v2)
+ # def cosineSim(text1, text2):
+ #     vector1 = text_to_vector(text1)
+ #     vector2 = text_to_vector(text2)
+ #     # print vector1,vector2
+ #     cosine = get_cosine(vector1, vector2)
+ #     return cosine
+
+
+ # def cos_sim_torch(embedding_1, embedding_2):
+ #     return util.pytorch_cos_sim(embedding_1, embedding_2).item()
+
+
+ # def embed_text(text):
+ #     return model.encode(text, convert_to_tensor=True)
+
+
+ # def sentence_similarity(text1, text2):
+ #     embedding_1 = model.encode(text1, convert_to_tensor=True)
+ #     embedding_2 = model.encode(text2, convert_to_tensor=True)
+
+ #     o = util.pytorch_cos_sim(embedding_1, embedding_2)
+ #     return o.item()
+
+
+ # def get_soup_requests(url):
+ #     page = requests.get(url)
+ #     if page.status_code == 200:
+ #         soup = BeautifulSoup(page.content, "html.parser")
+ #         return soup
+ #     print("HTML soup failed")
+ #     return None
+
+
+ # def get_soup_httpx(url):
+ #     client = httpx.Client(timeout=30)
+ #     try:
+ #         page = client.get(url)
+ #         if page.status_code == httpx.codes.OK:
+ #             soup = BeautifulSoup(page.content, "html.parser")
+ #             return soup
+ #     except:
+ #         print("HTTPx soup failed")
+ #     return None
+
+
+ # def getSentences(text):
+ #     from nltk.tokenize import sent_tokenize
+
+ #     sents = sent_tokenize(text)
+ #     two_sents = []
+ #     for i in range(len(sents)):
+ #         if (i % 2) == 0:
+ #             two_sents.append(sents[i])
+ #         else:
+ #             two_sents[len(two_sents) - 1] += " " + sents[i]
+ #     return two_sents
+
+
+ # def googleSearch(
+ #     plag_option,
+ #     sentences,
+ #     urlCount,
+ #     scoreArray,
+ #     urlList,
+ #     sorted_date,
+ #     domains_to_skip,
+ #     api_key,
+ #     cse_id,
+ #     **kwargs,
+ # ):
+ #     service = build("customsearch", "v1", developerKey=api_key)
+ #     for i, sentence in enumerate(sentences):
+ #         results = (
+ #             service.cse()
+ #             .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
+ #             .execute()
+ #         )
+ #         if "items" in results and len(results["items"]) > 0:
+ #             for count, link in enumerate(results["items"]):
+ #                 # stop after 3 pages
+ #                 if count >= 3:
+ #                     break
+ #                 # skip user selected domains
+ #                 if any(
+ #                     ("." + domain) in link["link"] for domain in domains_to_skip
+ #                 ):
+ #                     continue
+ #                 # clean up snippet of '...'
+ #                 snippet = link["snippet"]
+ #                 ind = snippet.find("...")
+ #                 if ind < 20 and ind > 9:
+ #                     snippet = snippet[ind + len("... ") :]
+ #                 ind = snippet.find("...")
+ #                 if ind > len(snippet) - 5:
+ #                     snippet = snippet[:ind]
+
+ #                 # update cosine similarity between snippet and given text
+ #                 url = link["link"]
+ #                 if url not in urlList:
+ #                     urlList.append(url)
+ #                     scoreArray.append([0] * len(sentences))
+ #                 urlCount[url] = urlCount[url] + 1 if url in urlCount else 1
+ #                 if plag_option == "Standard":
+ #                     scoreArray[urlList.index(url)][i] = cosineSim(
+ #                         sentence, snippet
+ #                     )
+ #                 else:
+ #                     scoreArray[urlList.index(url)][i] = sentence_similarity(
+ #                         sentence, snippet
+ #                     )
+ #         else:
+ #             print("Google Search failed")
+ #     return urlCount, scoreArray
+
+
+ # def getQueries(text, n):
+ #     # return n-grams of size n
+ #     words = text.split()
+ #     return [words[i : i + n] for i in range(len(words) - n + 1)]
+
+
+ # def print2D(array):
+ #     print(np.array(array))
+
+
+ # def removePunc(text):
+ #     res = re.sub(r"[^\w\s]", "", text)
+ #     return res
+
+
+ # async def get_url_data(url, client):
+ #     try:
+ #         r = await client.get(url)
+ #         # print(r.status_code)
+ #         if r.status_code == 200:
+ #             # print("in")
+ #             soup = BeautifulSoup(r.content, "html.parser")
+ #             return soup
+ #     except Exception:
+ #         print("HTTPx parallel soup failed")
+ #         return None
+
+
+ # async def parallel_scrap(urls):
+ #     async with httpx.AsyncClient(timeout=30) as client:
+ #         tasks = []
+ #         for url in urls:
+ #             tasks.append(get_url_data(url=url, client=client))
+ #         results = await asyncio.gather(*tasks, return_exceptions=True)
+ #         return results
+
+
+ # class TimeoutError(Exception):
+ #     pass
+
+
+ # def matchingScore(sentence, content):
+ #     if sentence in content:
+ #         return 1
+ #     sentence = removePunc(sentence)
+ #     content = removePunc(content)
+ #     if sentence in content:
+ #         return 1
+ #     else:
+ #         n = 5
+ #         ngrams = getQueries(sentence, n)
+ #         if len(ngrams) == 0:
+ #             return 0
+ #         matched = [x for x in ngrams if " ".join(x) in content]
+ #         return len(matched) / len(ngrams)
+
+
+ # # def matchingScoreWithTimeout(sentence, content):
+ # #     def timeout_handler():
+ # #         raise TimeoutError("Function timed out")
+
+ # #     timer = threading.Timer(10, timeout_handler)  # Set a timer for 2 seconds
+ # #     timer.start()
+ # #     try:
+ # #         score = sentence_similarity(sentence, content)
+ # #         # score = matchingScore(sentence, content)
+ # #         timer.cancel()  # Cancel the timer if calculation completes before timeout
+ # #         return score
+ # #     except TimeoutError:
+ # #         return 0
+
+
+ # # async def matchingScoreAsync(sentences, content, content_idx, ScoreArray):
+ # #     content = removePunc(content)
+ # #     for j, sentence in enumerate(sentences):
+ # #         sentence = removePunc(sentence)
+ # #         if sentence in content:
+ # #             ScoreArray[content_idx][j] = 1
+ # #         else:
+ # #             n = 5
+ # #             ngrams = getQueries(sentence, n)
+ # #             if len(ngrams) == 0:
+ # #                 return 0
+ # #             matched = [x for x in ngrams if " ".join(x) in content]
+ # #             ScoreArray[content_idx][j] = len(matched) / len(ngrams)
+ # #     print(
+ # #         f"Analyzed {content_idx+1} of soups (SOUP SUCCEEDED)........................"
+ # #     )
+ # #     return ScoreArray
+
+
+ # async def matchingScoreAsync(
+ #     sentences, content, content_idx, ScoreArray, model, util
+ # ):
+ #     content = removePunc(content)
+ #     for j, sentence in enumerate(sentences):
+ #         sentence = removePunc(sentence)
+ #         similarity_score = sentence_similarity(sentence, content, model, util)
+ #         ScoreArray[content_idx][j] = similarity_score
+ #     print(
+ #         f"Analyzed {content_idx+1} of contents (CONTENT ANALYZED)........................"
+ #     )
+ #     return ScoreArray
+
+
+ # async def parallel_analyze(soups, sentences, ScoreArray):
+ #     tasks = []
+ #     for i, soup in enumerate(soups):
+ #         if soup:
+ #             page_content = soup.text
+ #             tasks.append(
+ #                 matchingScoreAsync(sentences, page_content, i, ScoreArray)
+ #             )
+ #         else:
+ #             print(
+ #                 f"Analyzed {i+1} of soups (SOUP FAILED)........................"
+ #             )
+ #     ScoreArray = await asyncio.gather(*tasks, return_exceptions=True)
+ #     return ScoreArray
+
+
+ # async def parallel_analyze_2(soups, sentences, ScoreArray):
+ #     tasks = [[0] * len(ScoreArray[0]) for i in range(len(ScoreArray))]
+ #     for i, soup in enumerate(soups):
+ #         if soup:
+ #             page_content = soup.text
+ #             for j, sent in enumerate(sentences):
+ #                 print(
+ #                     f"Analyzing {i+1} of {len(soups)} soups with {j+1} of {len(sentences)} sentences........................"
+ #                 )
+ #                 tasks[i][j] = sentence_similarity(sent, page_content)
+ #         else:
+ #             print(
+ #                 f"Analyzed {i+1} of soups (SOUP FAILED)........................"
+ #             )
+ #     ScoreArray = await asyncio.gather(*tasks, return_exceptions=True)
+ #     return ScoreArray
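Editor's note: a hypothetical end-to-end usage of the helpers added in this file (extract_text_from_pdf, remove_special_characters, update_character_count, len_validator); "sample.pdf" is a placeholder path and the snippet is not part of the commit.

if __name__ == "__main__":
    raw = extract_text_from_pdf("sample.pdf")  # placeholder path
    cleaned = remove_special_characters(raw)
    print(update_character_count(cleaned))     # e.g. "12345 characters"
    print(len_validator(cleaned))              # warns if under 200 tokens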