Commit 9f01824 (committed by minko186)
Parents (2): 4eae412 45d10c4

Merge branch 'minko'
__pycache__/analysis.cpython-311.pyc ADDED
Binary file (4.75 kB)

__pycache__/app.cpython-311.pyc ADDED
Binary file (10.9 kB)

__pycache__/explainability.cpython-311.pyc ADDED
Binary file (7.89 kB)

__pycache__/plagiarism.cpython-311.pyc ADDED
Binary file (14.1 kB)

__pycache__/predictors.cpython-311.pyc ADDED
Binary file (12 kB)

__pycache__/utils.cpython-311.pyc ADDED
Binary file (3.76 kB)
 
analysis.py ADDED
@@ -0,0 +1,98 @@
+ import requests
+ import httpx
+ import torch
+ import re
+ from bs4 import BeautifulSoup
+ import numpy as np
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ import asyncio
+ from scipy.special import softmax
+ from evaluate import load
+ from datetime import date
+ import nltk
+ import fitz
+ from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+ import nltk, spacy, subprocess, torch
+ import plotly.graph_objects as go
+ import torch.nn.functional as F
+ import nltk
+ from unidecode import unidecode
+ import time
+ import yaml
+ import nltk
+ import os
+ from explainability import *
+ from dotenv import load_dotenv
+ import subprocess
+
+ nltk.download("punkt")
+ nltk.download("stopwords")
+ load_dotenv()
+ with open("config.yaml", "r") as file:
+     params = yaml.safe_load(file)
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ readability_model_id = params["READABILITY_MODEL_ID"]
+ gpt2_model = GPT2LMHeadModel.from_pretrained(readability_model_id).to(device)
+ gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(readability_model_id)
+
+ command = ["python", "-m", "spacy", "download", "en_core_web_sm"]
+ subprocess.run(command)
+ nlp = spacy.load("en_core_web_sm")
+
+
+ def depth_analysis(input_text):
+     processed_words = preprocess_text1(input_text)
+     ttr_value = vocabulary_richness_ttr(processed_words)
+     gunning_fog = calculate_gunning_fog(input_text)
+     gunning_fog_norm = normalize(gunning_fog, min_value=0, max_value=20)
+     words, sentences = preprocess_text2(input_text)
+     average_sentence_length = calculate_average_sentence_length(sentences)
+     average_word_length = calculate_average_word_length(words)
+     average_sentence_length_norm = normalize(
+         average_sentence_length, min_value=0, max_value=40
+     )
+     average_word_length_norm = normalize(
+         average_word_length, min_value=0, max_value=8
+     )
+     average_tree_depth = calculate_syntactic_tree_depth(nlp, input_text)
+     average_tree_depth_norm = normalize(
+         average_tree_depth, min_value=0, max_value=10
+     )
+     perplexity = calculate_perplexity(
+         input_text, gpt2_model, gpt2_tokenizer, device
+     )
+     perplexity_norm = normalize(perplexity, min_value=0, max_value=30)
+
+     features = {
+         "readability": gunning_fog_norm,
+         "syntactic tree depth": average_tree_depth_norm,
+         "vocabulary richness": ttr_value,
+         "perplexity": perplexity_norm,
+         "average sentence length": average_sentence_length_norm,
+         "average word length": average_word_length_norm,
+     }
+     fig = go.Figure()
+     fig.add_trace(
+         go.Scatterpolar(
+             r=list(features.values()),
+             theta=list(features.keys()),
+             fill="toself",
+             name="Radar Plot",
+         )
+     )
+     fig.update_layout(
+         polar=dict(
+             radialaxis=dict(
+                 visible=True,
+                 range=[0, 100],
+             )
+         ),
+         showlegend=False,
+         margin=dict(
+             l=10,
+             r=20,
+             b=10,
+             t=10,
+         ),
+     )
+     return fig
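
For context on how the new module is meant to be called: depth_analysis() computes six writing metrics, scales each onto the 0-100 range used by the radial axis via normalize(), and returns a Plotly radar chart. A minimal usage sketch follows; the sample text and output filename are invented for illustration, and importing the module triggers the spaCy download and GPT-2 load configured at module level.

    # Hypothetical caller for analysis.depth_analysis (not part of the commit).
    from analysis import depth_analysis

    sample = (
        "The quick brown fox jumps over the lazy dog. "
        "Readability and perplexity need a few sentences to be meaningful."
    )
    fig = depth_analysis(sample)             # plotly.graph_objects.Figure
    fig.write_html("writing_analysis.html")  # or fig.show() in a notebook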
app.py CHANGED
@@ -1,405 +1,24 @@
1
- from utils import cosineSim, googleSearch, getSentences, parallel_scrap, matchingScore
2
  import gradio as gr
3
- from urllib.request import urlopen, Request
4
- from googleapiclient.discovery import build
5
- import requests
6
- import httpx
7
- import torch
8
- import re
9
- from bs4 import BeautifulSoup
10
  import numpy as np
11
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
12
- import asyncio
13
- from scipy.special import softmax
14
- from evaluate import load
15
  from datetime import date
16
- import nltk
17
- import fitz
18
- from transformers import GPT2LMHeadModel, GPT2TokenizerFast
19
- import nltk, spacy, subprocess, torch
20
- import plotly.graph_objects as go
21
- import torch.nn.functional as F
22
- import nltk
23
- from unidecode import unidecode
24
- import time
25
- from utils import cos_sim_torch, embed_text
26
- import multiprocessing
27
- from functools import partial
28
- import concurrent.futures
29
-
30
- nltk.download('punkt')
31
-
32
- from writing_analysis import (
33
- normalize,
34
- preprocess_text1,
35
- preprocess_text2,
36
- vocabulary_richness_ttr,
37
- calculate_gunning_fog,
38
- calculate_average_sentence_length,
39
- calculate_average_word_length,
40
- calculate_syntactic_tree_depth,
41
- calculate_perplexity,
42
-
43
- )
44
 
45
  np.set_printoptions(suppress=True)
46
 
47
 
48
- def plagiarism_check(
49
- plag_option,
50
- input,
51
- year_from,
52
- month_from,
53
- day_from,
54
- year_to,
55
- month_to,
56
- day_to,
57
- domains_to_skip,
58
- ):
59
- api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
60
- api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
61
- api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
62
- # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
63
- api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
64
-
65
- cse_id = "851813e81162b4ed4"
66
-
67
- time1 = time.perf_counter()
68
- start = time.perf_counter()
69
- sentences = getSentences(input)
70
- urlCount = {}
71
- ScoreArray = []
72
- urlList = []
73
-
74
- date_from = build_date(year_from, month_from, day_from)
75
- date_to = build_date(year_to, month_to, day_to)
76
- sort_date = f"date:r:{date_from}:{date_to}"
77
-
78
- # get list of URLS to check
79
- urlCount, ScoreArray = googleSearch(
80
- plag_option,
81
- sentences,
82
- urlCount,
83
- ScoreArray,
84
- urlList,
85
- sort_date,
86
- domains_to_skip,
87
- api_key,
88
- cse_id,
89
- )
90
- print(f"Time for google search: {time.perf_counter()-time1}")
91
- time1 = time.perf_counter()
92
-
93
- print("Number of URLs: ", len(urlCount))
94
- print(urlList)
95
-
96
- # Scrape URLs in list
97
- formatted_tokens = []
98
- soups = asyncio.run(parallel_scrap(urlList))
99
-
100
- print(f"Time for scraping: {time.perf_counter()-time1}")
101
- time1 = time.perf_counter()
102
- print(len(soups))
103
- print(
104
- "Successful scraping: "
105
- + str(len([x for x in soups if x is not None]))
106
- + "out of "
107
- + str(len(urlList))
108
- )
109
-
110
- source_embeddings = []
111
- for i, soup in enumerate(soups):
112
- if soup:
113
- page_content = soup.text
114
- source_embeddings.append(embed_text(page_content))
115
- else:
116
- source_embeddings.append(None)
117
-
118
- # Populate matching scores for scrapped pages
119
- for i, soup in enumerate(soups):
120
- print(f"Analyzing {i+1} of {len(soups)} soups........................")
121
- if soup:
122
- page_content = soup.text
123
- for j, sent in enumerate(sentences):
124
- score = matchingScore(sent, page_content)
125
- score = matchingScore(sent, page_content)
126
- # score = cos_sim_torch(embed_text(sent), source_embeddings[i])
127
- ScoreArray[i][j] = score
128
-
129
-
130
- def compute_cosine_similarity(args):
131
- sent, source_embedding, i, j = args
132
- score = cos_sim_torch(embed_text(sent), source_embedding)
133
- return i, j, score
134
-
135
- def main(soups, sentences):
136
- source_embeddings = [preprocess(soup) for soup in soups]
137
- ScoreArray = [[0 for _ in sentences] for _ in soups]
138
- args_list = []
139
- for i, soup in enumerate(soups):
140
- if soup:
141
- for j, sent in enumerate(sentences):
142
- args_list.append((sent, source_embeddings[i], i, j))
143
- with concurrent.futures.ProcessPoolExecutor() as executor:
144
- results = executor.map(compute_cosine_similarity, args_list)
145
- for i, j, score in results:
146
- ScoreArray[i][j] = score
147
- return ScoreArray
148
-
149
- ScoreArray = main(soups, sentences)
150
-
151
-
152
-
153
- print(f"Time for matching score: {time.perf_counter()-time1}")
154
- time1 = time.perf_counter()
155
-
156
- # ScoreArray = asyncio.run(parallel_analyze_2(soups, sentences, ScoreArray))
157
- # print("New Score Array:\n")
158
- # print2D(ScoreArray)
159
-
160
- # Gradio formatting section
161
- sentencePlag = [False] * len(sentences)
162
- sentenceToMaxURL = [-1] * len(sentences)
163
- for j in range(len(sentences)):
164
- if j > 0:
165
- maxScore = ScoreArray[sentenceToMaxURL[j - 1]][j]
166
- sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
167
- else:
168
- maxScore = -1
169
- for i in range(len(ScoreArray)):
170
- margin = (
171
- 0.1
172
- if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
173
- else 0
174
- )
175
- if ScoreArray[i][j] - maxScore > margin:
176
- maxScore = ScoreArray[i][j]
177
- sentenceToMaxURL[j] = i
178
- if maxScore > 0.5:
179
- sentencePlag[j] = True
180
-
181
- if (
182
- (len(sentences) > 1)
183
- and (sentenceToMaxURL[1] != sentenceToMaxURL[0])
184
- and (
185
- ScoreArray[sentenceToMaxURL[0]][0]
186
- - ScoreArray[sentenceToMaxURL[1]][0]
187
- < 0.1
188
- )
189
- ):
190
- sentenceToMaxURL[0] = sentenceToMaxURL[1]
191
-
192
- index = np.unique(sentenceToMaxURL)
193
-
194
- urlScore = {}
195
- for url in index:
196
- s = [
197
- ScoreArray[url][sen]
198
- for sen in range(len(sentences))
199
- if sentenceToMaxURL[sen] == url
200
- ]
201
- urlScore[url] = sum(s) / len(s)
202
-
203
- index_descending = sorted(urlScore, key=urlScore.get, reverse=True)
204
-
205
- urlMap = {}
206
- for count, i in enumerate(index_descending):
207
- urlMap[i] = count + 1
208
- for i, sent in enumerate(sentences):
209
- formatted_tokens.append(
210
- (sent, "[" + str(urlMap[sentenceToMaxURL[i]]) + "]")
211
  )
212
 
213
- formatted_tokens.append(("\n", None))
214
- formatted_tokens.append(("\n", None))
215
- formatted_tokens.append(("\n", None))
216
-
217
- print(formatted_tokens)
218
- print(index_descending)
219
-
220
- for ind in index_descending:
221
- formatted_tokens.append(
222
- (
223
- urlList[ind] + " --- Matching Score: " + f"{str(round(urlScore[ind] * 100, 2))}%",
224
- "[" + str(urlMap[ind]) + "]",
225
- )
226
- )
227
- formatted_tokens.append(("\n", None))
228
-
229
- print(f"Formatted Tokens: {formatted_tokens}")
230
-
231
- print(f"Time for plagiarism check: {time.perf_counter()-start}")
232
-
233
- return formatted_tokens
234
-
235
-
236
- """
237
- AI DETECTION SECTION
238
- """
239
- device = "cuda" if torch.cuda.is_available() else "cpu"
240
-
241
- text_bc_model_path = "polygraf-ai/text-detect-bc-v11-4m"
242
- text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
243
- text_bc_model = AutoModelForSequenceClassification.from_pretrained(text_bc_model_path).to(device)
244
-
245
- text_mc_model_path = "polygraf-ai/ai-text-detection-mc-robert-open-ai-detector-v4"
246
- text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
247
- text_mc_model = AutoModelForSequenceClassification.from_pretrained(text_mc_model_path).to(device)
248
-
249
- quillbot_labels = ["Original", "QuillBot"]
250
- quillbot_tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
251
- quillbot_model = AutoModelForSequenceClassification.from_pretrained("polygraf-ai/quillbot-detector-28k").to(device)
252
-
253
- def remove_accents(input_str):
254
- text_no_accents = unidecode(input_str)
255
- return text_no_accents
256
-
257
- def remove_special_characters(text):
258
- text = remove_accents(text)
259
- pattern = r'[^\w\s\d.,!?\'"()-;]+'
260
- text = re.sub(pattern, '', text)
261
- return text
262
-
263
- def remove_special_characters_2(text):
264
- pattern = r'[^a-zA-Z0-9 ]+'
265
- text = re.sub(pattern, '', text)
266
- return text
267
-
268
- def update_character_count(text):
269
- return f"{len(text)} characters"
270
-
271
-
272
- def split_text_allow_complete_sentences_nltk(text, max_length=256, tolerance=30, min_last_segment_length=100, type_det='bc'):
273
- sentences = nltk.sent_tokenize(text)
274
- segments = []
275
- current_segment = []
276
- current_length = 0
277
-
278
- if type_det == 'bc':
279
- tokenizer = text_bc_tokenizer
280
- max_length = 333
281
-
282
- elif type_det == 'mc':
283
- tokenizer = text_mc_tokenizer
284
- max_length = 256
285
-
286
- for sentence in sentences:
287
- tokens = tokenizer.tokenize(sentence)
288
- sentence_length = len(tokens)
289
-
290
- if current_length + sentence_length <= max_length + tolerance - 2:
291
- current_segment.append(sentence)
292
- current_length += sentence_length
293
- else:
294
- if current_segment:
295
- encoded_segment = tokenizer.encode(' '.join(current_segment), add_special_tokens=True, max_length=max_length+tolerance, truncation=True)
296
- segments.append((current_segment, len(encoded_segment)))
297
- current_segment = [sentence]
298
- current_length = sentence_length
299
-
300
- if current_segment:
301
- encoded_segment = tokenizer.encode(' '.join(current_segment), add_special_tokens=True, max_length=max_length+tolerance, truncation=True)
302
- segments.append((current_segment, len(encoded_segment)))
303
-
304
- final_segments = []
305
- for i, (seg, length) in enumerate(segments):
306
- if i == len(segments) - 1:
307
- if length < min_last_segment_length and len(final_segments) > 0:
308
- prev_seg, prev_length = final_segments[-1]
309
- combined_encoded = tokenizer.encode(' '.join(prev_seg + seg), add_special_tokens=True, max_length=max_length+tolerance, truncation=True)
310
- if len(combined_encoded) <= max_length + tolerance:
311
- final_segments[-1] = (prev_seg + seg, len(combined_encoded))
312
- else:
313
- final_segments.append((seg, length))
314
- else:
315
- final_segments.append((seg, length))
316
- else:
317
- final_segments.append((seg, length))
318
-
319
- decoded_segments = []
320
- encoded_segments = []
321
- for seg, _ in final_segments:
322
- encoded_segment = tokenizer.encode(' '.join(seg), add_special_tokens=True, max_length=max_length+tolerance, truncation=True)
323
- decoded_segment = tokenizer.decode(encoded_segment)
324
- decoded_segments.append(decoded_segment)
325
- return decoded_segments
326
-
327
- def predict_quillbot(text):
328
- with torch.no_grad():
329
- quillbot_model.eval()
330
- tokenized_text = quillbot_tokenizer(text, padding="max_length", truncation=True, max_length=256, return_tensors="pt").to(device)
331
- output = quillbot_model(**tokenized_text)
332
- output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
333
- q_score = {"QuillBot": output_norm[1].item(), "Original": output_norm[0].item()}
334
- return q_score
335
-
336
- def predict_bc(model, tokenizer, text):
337
- with torch.no_grad():
338
- model.eval()
339
- tokens = text_bc_tokenizer(
340
- text, padding='max_length', truncation=True, max_length=333, return_tensors="pt"
341
- ).to(device)
342
- output = model(**tokens)
343
- output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
344
- print("BC Score: ", output_norm)
345
- return output_norm
346
-
347
- def predict_mc(model, tokenizer, text):
348
- with torch.no_grad():
349
- model.eval()
350
- tokens = text_mc_tokenizer(
351
- text, padding='max_length', truncation=True, return_tensors="pt", max_length=256
352
- ).to(device)
353
- output = model(**tokens)
354
- output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
355
- print("MC Score: ", output_norm)
356
- return output_norm
357
-
358
- def ai_generated_test(ai_option, input):
359
-
360
- bc_scores = []
361
- mc_scores = []
362
- samples_len_bc = len(split_text_allow_complete_sentences_nltk(input, type_det = 'bc'))
363
- samples_len_mc = len(split_text_allow_complete_sentences_nltk(input, type_det = 'mc'))
364
- segments_bc = split_text_allow_complete_sentences_nltk(input, type_det = 'bc')
365
- segments_mc = split_text_allow_complete_sentences_nltk(input, type_det = 'bc')
366
-
367
- for i in range(samples_len_bc):
368
- cleaned_text_bc = remove_special_characters(segments_bc[i])
369
- bc_score = predict_bc(text_bc_model, text_bc_tokenizer,cleaned_text_bc )
370
- bc_scores.append(bc_score)
371
-
372
- for i in range(samples_len_mc):
373
- cleaned_text_mc = remove_special_characters(segments_mc[i])
374
- mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
375
- mc_scores.append(mc_score)
376
-
377
- bc_scores_array = np.array(bc_scores)
378
- mc_scores_array = np.array(mc_scores)
379
- average_bc_scores = np.mean(bc_scores_array, axis=0)
380
- average_mc_scores = np.mean(mc_scores_array, axis=0)
381
- bc_score_list = average_bc_scores.tolist()
382
- mc_score_list = average_mc_scores.tolist()
383
-
384
- bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}
385
- mc_score = {}
386
- label_map = ["OpenAI GPT", "Mistral", "CLAUDE", "Gemini", "LLAMA 2"]
387
-
388
- for score, label in zip(mc_score_list, label_map):
389
- mc_score[label.upper()] = score
390
-
391
- sum_prob = 1 - bc_score["HUMAN"]
392
- for key, value in mc_score.items():
393
- mc_score[key] = value * sum_prob
394
-
395
- if ai_option == "Human vs AI":
396
- mc_score = {}
397
-
398
- if sum_prob < 0.01 :
399
- mc_score = {}
400
- return bc_score, mc_score
401
- else:
402
- return bc_score, mc_score
403
 
404
  # COMBINED
405
  def main(
@@ -428,117 +47,18 @@ def main(
428
  domains_to_skip,
429
  )
430
  depth_analysis_plot = depth_analysis(input)
431
- bc_score, mc_score = ai_generated_test(ai_option,input)
 
432
  quilscore = predict_quillbot(input)
433
-
434
- return (
435
- bc_score,
436
- mc_score,
437
- formatted_tokens,
438
- depth_analysis_plot,
439
- quilscore
440
- )
441
 
442
-
443
- def build_date(year, month, day):
444
- return f"{year}{months[month]}{day}"
445
-
446
- def len_validator(text):
447
- min_tokens = 200
448
- lengt = len(text_bc_tokenizer.tokenize(text = text, return_tensors="pt"))
449
- if lengt < min_tokens:
450
- return f"Warning! Input length is {lengt}. Please input a text that is greater than {min_tokens} tokens long. Recommended length {min_tokens*2} tokens."
451
- else :
452
- return f"Input length ({lengt}) is satisified."
453
-
454
- def extract_text_from_pdf(pdf_path):
455
- doc = fitz.open(pdf_path)
456
- text = ""
457
- for page in doc:
458
- text += page.get_text()
459
- return text
460
-
461
-
462
- # DEPTH ANALYSIS
463
- print("loading depth analysis")
464
- nltk.download('stopwords')
465
- nltk.download('punkt')
466
- command = ['python3', '-m', 'spacy', 'download', 'en_core_web_sm']
467
- # Execute the command
468
- subprocess.run(command)
469
- nlp = spacy.load("en_core_web_sm")
470
-
471
- # for perplexity
472
- model_id = "gpt2"
473
- gpt2_model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
474
- gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
475
-
476
- def depth_analysis(input_text):
477
-
478
- # vocanulary richness
479
- processed_words = preprocess_text1(input_text)
480
- ttr_value = vocabulary_richness_ttr(processed_words)
481
-
482
- # readability
483
- gunning_fog = calculate_gunning_fog(input_text)
484
- gunning_fog_norm = normalize(gunning_fog, min_value=0, max_value=20)
485
-
486
- # average sentence length and average word length
487
- words, sentences = preprocess_text2(input_text)
488
- average_sentence_length = calculate_average_sentence_length(sentences)
489
- average_word_length = calculate_average_word_length(words)
490
- average_sentence_length_norm = normalize(average_sentence_length, min_value=0, max_value=40)
491
- average_word_length_norm = normalize(average_word_length, min_value=0, max_value=8)
492
-
493
- # syntactic_tree_depth
494
- average_tree_depth = calculate_syntactic_tree_depth(nlp, input_text)
495
- average_tree_depth_norm = normalize(average_tree_depth, min_value=0, max_value=10)
496
-
497
- # perplexity
498
- perplexity = calculate_perplexity(input_text, gpt2_model, gpt2_tokenizer, device)
499
- perplexity_norm = normalize(perplexity, min_value=0, max_value=30)
500
-
501
- features = {
502
- "readability": gunning_fog_norm,
503
- "syntactic tree depth": average_tree_depth_norm,
504
- "vocabulary richness": ttr_value,
505
- "perplexity": perplexity_norm,
506
- "average sentence length": average_sentence_length_norm,
507
- "average word length": average_word_length_norm,
508
- }
509
-
510
- print(features)
511
-
512
- fig = go.Figure()
513
-
514
- fig.add_trace(go.Scatterpolar(
515
- r=list(features.values()),
516
- theta=list(features.keys()),
517
- fill='toself',
518
- name='Radar Plot'
519
- ))
520
-
521
- fig.update_layout(
522
- polar=dict(
523
- radialaxis=dict(
524
- visible=True,
525
- range=[0, 100],
526
- )),
527
- showlegend=False,
528
- # autosize=False,
529
- # width=600,
530
- # height=600,
531
- margin=dict(
532
- l=10,
533
- r=20,
534
- b=10,
535
- t=10,
536
- # pad=100
537
- ),
538
  )
539
 
540
- return fig
541
-
542
 
543
  # START OF GRADIO
544
 
@@ -575,16 +95,23 @@ with gr.Blocks() as demo:
575
  with gr.Row():
576
  input_text = gr.Textbox(label="Input text", lines=6, placeholder="")
577
  file_input = gr.File(label="Upload PDF")
578
- file_input.change(fn=extract_text_from_pdf, inputs=file_input, outputs=input_text)
 
 
579
 
580
- char_count = gr.Textbox(label="Minumum Character Limit Check")
581
  input_text.change(fn=len_validator, inputs=input_text, outputs=char_count)
582
 
583
  with gr.Row():
584
  with gr.Column():
585
- ai_option = gr.Radio(["Human vs AI", "Human vs AI Source Models"], label="Choose an option please.")
 
 
 
586
  with gr.Column():
587
- plag_option = gr.Radio(["Standard", "Advanced"], label="Choose an option please.")
 
 
588
 
589
  with gr.Row():
590
  with gr.Column():
@@ -594,7 +121,7 @@ with gr.Blocks() as demo:
594
  only_plagiarism_btn = gr.Button("Source Check")
595
 
596
  with gr.Row():
597
- quillbot_check = gr.Button("Humanized Text Check (Quillbot)")
598
 
599
  with gr.Row():
600
  depth_analysis_btn = gr.Button("Detailed Writing Analysis")
@@ -607,14 +134,14 @@ with gr.Blocks() as demo:
607
  ## Output
608
  """
609
  )
610
-
611
  # models = gr.Dropdown(
612
- # model_list,
613
- # value=model_list,
614
- # multiselect=True,
615
- # label="Models to test against",
616
- # )
617
-
618
  with gr.Row():
619
  with gr.Column():
620
  bcLabel = gr.Label(label="Source")
@@ -666,9 +193,7 @@ with gr.Blocks() as demo:
666
 
667
  with gr.Row():
668
  with gr.Column():
669
- writing_analysis_plot = gr.Plot(
670
- label="Writing Analysis Plot"
671
- )
672
 
673
  full_check_btn.click(
674
  fn=main,
@@ -690,7 +215,7 @@ with gr.Blocks() as demo:
690
  mcLabel,
691
  sentenceBreakdown,
692
  writing_analysis_plot,
693
- QLabel
694
  ],
695
  api_name="main",
696
  )
@@ -740,5 +265,5 @@ with gr.Blocks() as demo:
740
 
741
  date_from = ""
742
  date_to = ""
743
-
744
- demo.launch(share=True, server_name="0.0.0.0", auth=("polygraf-admin", "test@aisd"))
 
 
1
  import gradio as gr
2
  import numpy as np
3
  from datetime import date
4
+ from predictors import predict_bc_scores, predict_mc_scores
5
+ from analysis import depth_analysis
6
+ from predictors import predict_quillbot
7
+ from plagiarism import plagiarism_check, build_date
8
+ from utils import extract_text_from_pdf, len_validator
9
 
10
  np.set_printoptions(suppress=True)
11
 
12
 
13
+ def ai_generated_test(option, input):
14
+ if option == "Human vs AI":
15
+ return predict_bc_scores(input), None
16
+ else:
17
+ return (
18
+ predict_bc_scores(input),
19
+ predict_mc_scores(input),
 
 
20
  )
21
 
22
 
23
  # COMBINED
24
  def main(
 
47
  domains_to_skip,
48
  )
49
  depth_analysis_plot = depth_analysis(input)
50
+ bc_score = predict_bc_scores(input)
51
+ mc_score = predict_mc_scores(input)
52
  quilscore = predict_quillbot(input)
53
 
54
+ return (
55
+ bc_score,
56
+ mc_score,
57
+ formatted_tokens,
58
+ depth_analysis_plot,
59
+ quilscore,
60
  )
61
 
 
 
62
 
63
  # START OF GRADIO
64
 
 
95
  with gr.Row():
96
  input_text = gr.Textbox(label="Input text", lines=6, placeholder="")
97
  file_input = gr.File(label="Upload PDF")
98
+ file_input.change(
99
+ fn=extract_text_from_pdf, inputs=file_input, outputs=input_text
100
+ )
101
 
102
+ char_count = gr.Textbox(label="Minimum Character Limit Check")
103
  input_text.change(fn=len_validator, inputs=input_text, outputs=char_count)
104
 
105
  with gr.Row():
106
  with gr.Column():
107
+ ai_option = gr.Radio(
108
+ ["Human vs AI", "Human vs AI Source Models"],
109
+ label="Choose an option please.",
110
+ )
111
  with gr.Column():
112
+ plag_option = gr.Radio(
113
+ ["Standard", "Advanced"], label="Choose an option please."
114
+ )
115
 
116
  with gr.Row():
117
  with gr.Column():
 
121
  only_plagiarism_btn = gr.Button("Source Check")
122
 
123
  with gr.Row():
124
+ quillbot_check = gr.Button("Humanized Text Check")
125
 
126
  with gr.Row():
127
  depth_analysis_btn = gr.Button("Detailed Writing Analysis")
 
134
  ## Output
135
  """
136
  )
137
+
138
  # models = gr.Dropdown(
139
+ # model_list,
140
+ # value=model_list,
141
+ # multiselect=True,
142
+ # label="Models to test against",
143
+ # )
144
+
145
  with gr.Row():
146
  with gr.Column():
147
  bcLabel = gr.Label(label="Source")
 
193
 
194
  with gr.Row():
195
  with gr.Column():
196
+ writing_analysis_plot = gr.Plot(label="Writing Analysis Plot")
 
 
197
 
198
  full_check_btn.click(
199
  fn=main,
 
215
  mcLabel,
216
  sentenceBreakdown,
217
  writing_analysis_plot,
218
+ QLabel,
219
  ],
220
  api_name="main",
221
  )
 
265
 
266
  date_from = ""
267
  date_to = ""
268
+
269
+ demo.launch(share=True, auth=("polygraf-admin", "test@aisd"))
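
The refactor above strips app.py down to UI wiring: AI-detection scores now come from predictors, the radar plot from analysis, and source matching from plagiarism. A small sketch of the new ai_generated_test routing, with stub scorers standing in for the real transformer models (the stub names and numbers are invented):

    # Stubs in place of predict_bc_scores / predict_mc_scores, which load large models.
    def stub_bc_scores(text):
        return {"AI": 0.2, "HUMAN": 0.8}

    def stub_mc_scores(text):
        return {"OPENAI GPT": 0.12, "MISTRAL": 0.05, "CLAUDE": 0.03}

    def ai_generated_test(option, text, bc=stub_bc_scores, mc=stub_mc_scores):
        if option == "Human vs AI":
            return bc(text), None      # binary check only
        return bc(text), mc(text)      # binary check plus source-model breakdown

    print(ai_generated_test("Human vs AI", "some input text"))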
explainability.py ADDED
@@ -0,0 +1,119 @@
+ import re, textstat
+ from nltk import FreqDist
+ from nltk.corpus import stopwords
+ from nltk.tokenize import word_tokenize, sent_tokenize
+ import torch
+ import nltk
+ from tqdm import tqdm
+
+ nltk.download("punkt")
+
+
+ def normalize(value, min_value, max_value):
+     normalized_value = ((value - min_value) * 100) / (max_value - min_value)
+     return max(0, min(100, normalized_value))
+
+
+ def preprocess_text1(text):
+     text = text.lower()
+     text = re.sub(r"[^\w\s]", "", text)  # remove punctuation
+     stop_words = set(stopwords.words("english"))  # remove stopwords
+     words = [word for word in text.split() if word not in stop_words]
+     words = [word for word in words if not word.isdigit()]  # remove numbers
+     return words
+
+
+ def vocabulary_richness_ttr(words):
+     unique_words = set(words)
+     ttr = len(unique_words) / len(words) * 100
+     return ttr
+
+
+ def calculate_gunning_fog(text):
+     """range 0-20"""
+     gunning_fog = textstat.gunning_fog(text)
+     return gunning_fog
+
+
+ def calculate_automated_readability_index(text):
+     """range 1-20"""
+     ari = textstat.automated_readability_index(text)
+     return ari
+
+
+ def calculate_flesch_reading_ease(text):
+     """range 0-100"""
+     fre = textstat.flesch_reading_ease(text)
+     return fre
+
+
+ def preprocess_text2(text):
+     sentences = sent_tokenize(text)
+     words = [
+         word.lower()
+         for sent in sentences
+         for word in word_tokenize(sent)
+         if word.isalnum()
+     ]
+     stop_words = set(stopwords.words("english"))
+     words = [word for word in words if word not in stop_words]
+     return words, sentences
+
+
+ def calculate_average_sentence_length(sentences):
+     """range 0-40 or 50 based on the histogram"""
+     total_words = sum(len(word_tokenize(sent)) for sent in sentences)
+     average_sentence_length = total_words / (len(sentences) + 0.0000001)
+     return average_sentence_length
+
+
+ def calculate_average_word_length(words):
+     """range 0-8 based on the histogram"""
+     total_characters = sum(len(word) for word in words)
+     average_word_length = total_characters / (len(words) + 0.0000001)
+     return average_word_length
+
+
+ def calculate_max_depth(sent):
+     return max(len(list(token.ancestors)) for token in sent)
+
+
+ def calculate_syntactic_tree_depth(nlp, text):
+     """0-10 based on the histogram"""
+     doc = nlp(text)
+     sentence_depths = [calculate_max_depth(sent) for sent in doc.sents]
+     average_depth = (
+         sum(sentence_depths) / len(sentence_depths) if sentence_depths else 0
+     )
+     return average_depth
+
+
+ def calculate_perplexity(text, model, tokenizer, device, stride=512):
+     """range 0-30 based on the histogram"""
+     encodings = tokenizer(text, return_tensors="pt")
+     max_length = model.config.n_positions
+     seq_len = encodings.input_ids.size(1)
+
+     nlls = []
+     prev_end_loc = 0
+     for begin_loc in tqdm(range(0, seq_len, stride)):
+         end_loc = min(begin_loc + max_length, seq_len)
+         trg_len = (
+             end_loc - prev_end_loc
+         )  # may be different from stride on last loop
+         input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
+         target_ids = input_ids.clone()
+         target_ids[:, :-trg_len] = -100
+
+         with torch.no_grad():
+             outputs = model(input_ids, labels=target_ids)
+             neg_log_likelihood = outputs.loss
+
+         nlls.append(neg_log_likelihood)
+
+         prev_end_loc = end_loc
+         if end_loc == seq_len:
+             break
+
+     ppl = torch.exp(torch.stack(nlls).mean())
+     return ppl.item()
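
calculate_perplexity() scores text with a sliding window (default stride of 512 tokens), accumulates the negative log-likelihood of each window, and exponentiates the mean. A rough usage sketch, assuming the plain "gpt2" checkpoint; the app itself reads the readability model id from config.yaml.

    # Assumed standalone usage of explainability.calculate_perplexity.
    import torch
    from transformers import GPT2LMHeadModel, GPT2TokenizerFast
    from explainability import calculate_perplexity

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

    ppl = calculate_perplexity("A short passage to score.", model, tokenizer, device)
    print(f"perplexity: {ppl:.2f}")  # lower values mean the text is more predictable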
plagiarism.py ADDED
@@ -0,0 +1,344 @@
1
+ import time
2
+ from nltk.tokenize import sent_tokenize
3
+ from googleapiclient.discovery import build
4
+ from collections import Counter
5
+ import re, math
6
+ from sentence_transformers import SentenceTransformer, util
7
+ import asyncio
8
+ import httpx
9
+ from bs4 import BeautifulSoup
10
+ import numpy as np
11
+ import concurrent
12
+
13
+
14
+ WORD = re.compile(r"\w+")
15
+ model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
16
+
17
+
18
+ # returns cosine similarity of two vectors
19
+ # input: two vectors
20
+ # output: integer between 0 and 1.
21
+ def get_cosine(vec1, vec2):
22
+ intersection = set(vec1.keys()) & set(vec2.keys())
23
+
24
+ # calculating numerator
25
+ numerator = sum([vec1[x] * vec2[x] for x in intersection])
26
+
27
+ # calculating denominator
28
+ sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
29
+ sum2 = sum([vec2[x] ** 2 for x in vec2.keys()])
30
+ denominator = math.sqrt(sum1) * math.sqrt(sum2)
31
+
32
+ # checking for divide by zero
33
+ if denominator == 0:
34
+ return 0.0
35
+ else:
36
+ return float(numerator) / denominator
37
+
38
+
39
+ # converts given text into a vector
40
+ def text_to_vector(text):
41
+ # uses the Regular expression above and gets all words
42
+ words = WORD.findall(text)
43
+ # returns a counter of all the words (count of number of occurences)
44
+ return Counter(words)
45
+
46
+
47
+ # returns cosine similarity of two words
48
+ # uses: text_to_vector(text) and get_cosine(v1,v2)
49
+ def cosineSim(text1, text2):
50
+ vector1 = text_to_vector(text1)
51
+ vector2 = text_to_vector(text2)
52
+ # print vector1,vector2
53
+ cosine = get_cosine(vector1, vector2)
54
+ return cosine
55
+
56
+
57
+ def cos_sim_torch(embedding_1, embedding_2):
58
+ return util.pytorch_cos_sim(embedding_1, embedding_2).item()
59
+
60
+
61
+ def embed_text(text):
62
+ return model.encode(text, convert_to_tensor=True)
63
+
64
+
65
+ def sentence_similarity(text1, text2):
66
+ embedding_1 = model.encode(text1, convert_to_tensor=True)
67
+ embedding_2 = model.encode(text2, convert_to_tensor=True)
68
+
69
+ o = util.pytorch_cos_sim(embedding_1, embedding_2)
70
+ return o.item()
71
+
72
+
73
+ def google_search(
74
+ plag_option,
75
+ sentences,
76
+ urlCount,
77
+ scoreArray,
78
+ urlList,
79
+ sorted_date,
80
+ domains_to_skip,
81
+ api_key,
82
+ cse_id,
83
+ **kwargs,
84
+ ):
85
+ service = build("customsearch", "v1", developerKey=api_key)
86
+ for i, sentence in enumerate(sentences):
87
+ results = (
88
+ service.cse()
89
+ .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
90
+ .execute()
91
+ )
92
+ if "items" in results and len(results["items"]) > 0:
93
+ for count, link in enumerate(results["items"]):
94
+ # stop after 3 pages
95
+ if count >= 3:
96
+ break
97
+ # skip user selected domains
98
+ if any(
99
+ ("." + domain) in link["link"] for domain in domains_to_skip
100
+ ):
101
+ continue
102
+ # clean up snippet of '...'
103
+ snippet = link["snippet"]
104
+ ind = snippet.find("...")
105
+ if ind < 20 and ind > 9:
106
+ snippet = snippet[ind + len("... ") :]
107
+ ind = snippet.find("...")
108
+ if ind > len(snippet) - 5:
109
+ snippet = snippet[:ind]
110
+
111
+ # update cosine similarity between snippet and given text
112
+ url = link["link"]
113
+ if url not in urlList:
114
+ urlList.append(url)
115
+ scoreArray.append([0] * len(sentences))
116
+ urlCount[url] = urlCount[url] + 1 if url in urlCount else 1
117
+ if plag_option == "Standard":
118
+ scoreArray[urlList.index(url)][i] = cosineSim(
119
+ sentence, snippet
120
+ )
121
+ else:
122
+ scoreArray[urlList.index(url)][i] = sentence_similarity(
123
+ sentence, snippet
124
+ )
125
+ return urlCount, scoreArray
126
+
127
+
128
+ def split_sentence_blocks(text):
129
+
130
+ sents = sent_tokenize(text)
131
+ two_sents = []
132
+ for i in range(len(sents)):
133
+ if (i % 4) == 0:
134
+ two_sents.append(sents[i])
135
+ else:
136
+ two_sents[len(two_sents) - 1] += " " + sents[i]
137
+ return two_sents
138
+
139
+
140
+ months = {
141
+ "January": "01",
142
+ "February": "02",
143
+ "March": "03",
144
+ "April": "04",
145
+ "May": "05",
146
+ "June": "06",
147
+ "July": "07",
148
+ "August": "08",
149
+ "September": "09",
150
+ "October": "10",
151
+ "November": "11",
152
+ "December": "12",
153
+ }
154
+
155
+
156
+ def build_date(year=2024, month="March", day=1):
157
+ return f"{year}{months[month]}{day}"
158
+
159
+
160
+ async def get_url_data(url, client):
161
+ try:
162
+ r = await client.get(url)
163
+ # print(r.status_code)
164
+ if r.status_code == 200:
165
+ # print("in")
166
+ soup = BeautifulSoup(r.content, "html.parser")
167
+ return soup
168
+ except Exception:
169
+ return None
170
+
171
+
172
+ def remove_punc(text):
173
+ res = re.sub(r"[^\w\s]", "", text)
174
+ return res
175
+
176
+
177
+ def split_ngrams(text, n):
178
+ # return n-grams of size n
179
+ words = text.split()
180
+ return [words[i : i + n] for i in range(len(words) - n + 1)]
181
+
182
+
183
+ async def parallel_scrap(urls):
184
+ async with httpx.AsyncClient(timeout=30) as client:
185
+ tasks = []
186
+ for url in urls:
187
+ tasks.append(get_url_data(url=url, client=client))
188
+ results = await asyncio.gather(*tasks, return_exceptions=True)
189
+ return results
190
+
191
+
192
+ def matching_score(args_list):
193
+ sentence = remove_punc(args_list[0])
194
+ content = remove_punc(args_list[1])
195
+ if sentence in content:
196
+ return 1
197
+ else:
198
+ n = 5
199
+ ngrams = split_ngrams(sentence, n)
200
+ if len(ngrams) == 0:
201
+ return 0
202
+ matched = [x for x in ngrams if " ".join(x) in content]
203
+ return len(matched) / len(ngrams)
204
+
205
+
206
+ def plagiarism_check(
207
+ plag_option,
208
+ input,
209
+ year_from,
210
+ month_from,
211
+ day_from,
212
+ year_to,
213
+ month_to,
214
+ day_to,
215
+ domains_to_skip,
216
+ ):
217
+ api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
218
+ api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
219
+ api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
220
+ # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
221
+ api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
222
+ cse_id = "851813e81162b4ed4"
223
+
224
+ sentences = split_sentence_blocks(input)
225
+ urlCount = {}
226
+ ScoreArray = []
227
+ urlList = []
228
+ date_from = build_date(year_from, month_from, day_from)
229
+ date_to = build_date(year_to, month_to, day_to)
230
+ sort_date = f"date:r:{date_from}:{date_to}"
231
+ # get list of URLS to check
232
+ urlCount, ScoreArray = google_search(
233
+ plag_option,
234
+ sentences,
235
+ urlCount,
236
+ ScoreArray,
237
+ urlList,
238
+ sort_date,
239
+ domains_to_skip,
240
+ api_key,
241
+ cse_id,
242
+ )
243
+
244
+ # Scrape URLs in list
245
+ formatted_tokens = []
246
+ soups = asyncio.run(parallel_scrap(urlList))
247
+
248
+ # Populate matching scores for scrapped pages
249
+ for i, soup in enumerate(soups):
250
+ print(f"Analyzing {i+1} of {len(soups)} soups........................")
251
+ if soup:
252
+ page_content = soup.text
253
+ for j, sent in enumerate(sentences):
254
+ args_list = (sent, page_content)
255
+ score = matching_score(args_list)
256
+ # score = cos_sim_torch(embed_text(sent), source_embeddings[i])
257
+ ScoreArray[i][j] = score
258
+
259
+ # with concurrent.futures.ProcessPoolExecutor() as executor:
260
+ # results = executor.map(matching_score, args_list)
261
+
262
+ # *****IF THIS IS TO BE USED, PLEASE PROVIDE "preprocess()" FUNCTION IN LINE 248**************
263
+ # source_embeddings = []
264
+ # for i, soup in enumerate(soups):
265
+ # if soup:
266
+ # page_content = soup.text
267
+ # source_embeddings.append(embed_text(page_content))
268
+ # else:
269
+ # source_embeddings.append(None)
270
+
271
+ # def compute_cosine_similarity(args):
272
+ # sent, source_embedding, i, j = args
273
+ # score = cos_sim_torch(embed_text(sent), source_embedding)
274
+ # return i, j, score
275
+
276
+ # def main(soups, sentences):
277
+ # source_embeddings = [preprocess(soup) for soup in soups]
278
+ # ScoreArray = [[0 for _ in sentences] for _ in soups]
279
+ # args_list = []
280
+ # for i, soup in enumerate(soups):
281
+ # if soup:
282
+ # for j, sent in enumerate(sentences):
283
+ # args_list.append((sent, source_embeddings[i], i, j))
284
+ # with concurrent.futures.ProcessPoolExecutor() as executor:
285
+ # results = executor.map(compute_cosine_similarity, args_list)
286
+ # for i, j, score in results:
287
+ # ScoreArray[i][j] = score
288
+ # return ScoreArray
289
+
290
+ # # Populate matching scores for scrapped pages
291
+ # ScoreArray = main(soups, sentences)
292
+ # *******************************************************************************************
293
+
294
+ # Calculate URL of max matching score for each sentence chunk
295
+ sentenceToMaxURL = [-1] * len(sentences)
296
+ for j in range(len(sentences)):
297
+ if j > 0:
298
+ maxScore = ScoreArray[sentenceToMaxURL[j - 1]][j]
299
+ sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
300
+ else:
301
+ maxScore = -1
302
+
303
+ for i in range(len(ScoreArray)):
304
+ margin = (
305
+ 0.1
306
+ if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
307
+ else 0
308
+ )
309
+ if ScoreArray[i][j] - maxScore > margin:
310
+ maxScore = ScoreArray[i][j]
311
+ sentenceToMaxURL[j] = i
312
+
313
+ index = np.unique(sentenceToMaxURL)
314
+
315
+ urlScore = {}
316
+ for url in index:
317
+ s = [
318
+ ScoreArray[url][sen]
319
+ for sen in range(len(sentences))
320
+ if sentenceToMaxURL[sen] == url
321
+ ]
322
+ urlScore[url] = sum(s) / len(s)
323
+
324
+ index_descending = sorted(urlScore, key=urlScore.get, reverse=True)
325
+
326
+ urlMap = {}
327
+ for count, i in enumerate(index_descending):
328
+ urlMap[i] = count + 1
329
+ for i, sent in enumerate(sentences):
330
+ formatted_tokens.append(
331
+ (sent, "[" + str(urlMap[sentenceToMaxURL[i]]) + "]")
332
+ )
333
+ for ind in index_descending:
334
+ formatted_tokens.append(
335
+ (
336
+ urlList[ind]
337
+ + " --- Matching Score: "
338
+ + f"{str(round(urlScore[ind] * 100, 2))}%",
339
+ "[" + str(urlMap[ind]) + "]",
340
+ )
341
+ )
342
+ formatted_tokens.append(("\n", None))
343
+
344
+ return formatted_tokens
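
matching_score() above strips punctuation, returns 1 when the sentence block appears verbatim in the scraped page, and otherwise returns the fraction of the block's 5-grams found in the page text. A self-contained illustration of that overlap calculation, with invented example strings:

    # Standalone rewrite of the n-gram overlap used by matching_score.
    import re

    def ngram_overlap(sentence, content, n=5):
        def strip_punc(t):
            return re.sub(r"[^\w\s]", "", t)

        sentence, content = strip_punc(sentence), strip_punc(content)
        if sentence in content:
            return 1.0
        words = sentence.split()
        ngrams = [words[i:i + n] for i in range(len(words) - n + 1)]
        if not ngrams:
            return 0.0
        matched = [g for g in ngrams if " ".join(g) in content]
        return len(matched) / len(ngrams)

    print(ngram_overlap(
        "large language models can paraphrase a source sentence closely",
        "it is well known that large language models can paraphrase a source text",
    ))  # 3 of 5 five-grams match -> 0.6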
predictors.py ADDED
@@ -0,0 +1,246 @@
1
+ import requests
2
+ import httpx
3
+ import torch
4
+ import re
5
+ from bs4 import BeautifulSoup
6
+ import numpy as np
7
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
8
+ import asyncio
9
+ from evaluate import load
10
+ from datetime import date
11
+ import nltk
12
+ from transformers import GPT2LMHeadModel, GPT2TokenizerFast
13
+ import plotly.graph_objects as go
14
+ import torch.nn.functional as F
15
+ import nltk
16
+ from unidecode import unidecode
17
+ import time
18
+ from scipy.special import softmax
19
+ import yaml
20
+ import os
21
+ from utils import *
22
+ from dotenv import load_dotenv
23
+
24
+ with open("config.yaml", "r") as file:
25
+ params = yaml.safe_load(file)
26
+ nltk.download("punkt")
27
+ nltk.download("stopwords")
28
+ load_dotenv()
29
+ device = "cuda" if torch.cuda.is_available() else "cpu"
30
+ text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
31
+ text_mc_model_path = params["TEXT_MC_MODEL_PATH"]
32
+ text_quillbot_model_path = params["TEXT_QUILLBOT_MODEL_PATH"]
33
+ quillbot_labels = params["QUILLBOT_LABELS"]
34
+ mc_label_map = params["MC_OUTPUT_LABELS"]
35
+ mc_token_size = int(params["MC_TOKEN_SIZE"])
36
+ bc_token_size = int(params["BC_TOKEN_SIZE"])
37
+ text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
38
+ text_bc_model = AutoModelForSequenceClassification.from_pretrained(
39
+ text_bc_model_path
40
+ ).to(device)
41
+ text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
42
+ text_mc_model = AutoModelForSequenceClassification.from_pretrained(
43
+ text_mc_model_path
44
+ ).to(device)
45
+ quillbot_tokenizer = AutoTokenizer.from_pretrained(text_quillbot_model_path)
46
+ quillbot_model = AutoModelForSequenceClassification.from_pretrained(
47
+ text_quillbot_model_path
48
+ ).to(device)
49
+
50
+
51
+ def split_text_allow_complete_sentences_nltk(
52
+ text,
53
+ max_length=256,
54
+ tolerance=30,
55
+ min_last_segment_length=100,
56
+ type_det="bc",
57
+ ):
58
+ sentences = nltk.sent_tokenize(text)
59
+ segments = []
60
+ current_segment = []
61
+ current_length = 0
62
+ if type_det == "bc":
63
+ tokenizer = text_bc_tokenizer
64
+ max_length = bc_token_size
65
+ elif type_det == "mc":
66
+ tokenizer = text_mc_tokenizer
67
+ max_length = mc_token_size
68
+ for sentence in sentences:
69
+ tokens = tokenizer.tokenize(sentence)
70
+ sentence_length = len(tokens)
71
+
72
+ if current_length + sentence_length <= max_length + tolerance - 2:
73
+ current_segment.append(sentence)
74
+ current_length += sentence_length
75
+ else:
76
+ if current_segment:
77
+ encoded_segment = tokenizer.encode(
78
+ " ".join(current_segment),
79
+ add_special_tokens=True,
80
+ max_length=max_length + tolerance,
81
+ truncation=True,
82
+ )
83
+ segments.append((current_segment, len(encoded_segment)))
84
+ current_segment = [sentence]
85
+ current_length = sentence_length
86
+
87
+ if current_segment:
88
+ encoded_segment = tokenizer.encode(
89
+ " ".join(current_segment),
90
+ add_special_tokens=True,
91
+ max_length=max_length + tolerance,
92
+ truncation=True,
93
+ )
94
+ segments.append((current_segment, len(encoded_segment)))
95
+
96
+ final_segments = []
97
+ for i, (seg, length) in enumerate(segments):
98
+ if i == len(segments) - 1:
99
+ if length < min_last_segment_length and len(final_segments) > 0:
100
+ prev_seg, prev_length = final_segments[-1]
101
+ combined_encoded = tokenizer.encode(
102
+ " ".join(prev_seg + seg),
103
+ add_special_tokens=True,
104
+ max_length=max_length + tolerance,
105
+ truncation=True,
106
+ )
107
+ if len(combined_encoded) <= max_length + tolerance:
108
+ final_segments[-1] = (prev_seg + seg, len(combined_encoded))
109
+ else:
110
+ final_segments.append((seg, length))
111
+ else:
112
+ final_segments.append((seg, length))
113
+ else:
114
+ final_segments.append((seg, length))
115
+
116
+ decoded_segments = []
117
+ encoded_segments = []
118
+ for seg, _ in final_segments:
119
+ encoded_segment = tokenizer.encode(
120
+ " ".join(seg),
121
+ add_special_tokens=True,
122
+ max_length=max_length + tolerance,
123
+ truncation=True,
124
+ )
125
+ decoded_segment = tokenizer.decode(encoded_segment)
126
+ decoded_segments.append(decoded_segment)
127
+ return decoded_segments
128
+
129
+
130
+ def predict_quillbot(text):
131
+ with torch.no_grad():
132
+ quillbot_model.eval()
133
+ tokenized_text = quillbot_tokenizer(
134
+ text,
135
+ padding="max_length",
136
+ truncation=True,
137
+ max_length=256,
138
+ return_tensors="pt",
139
+ ).to(device)
140
+ output = quillbot_model(**tokenized_text)
141
+ output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
142
+ q_score = {
143
+ "Humanized": output_norm[1].item(),
144
+ "Original": output_norm[0].item(),
145
+ }
146
+ return q_score
147
+
148
+
149
+ def predict_bc(model, tokenizer, text):
150
+ with torch.no_grad():
151
+ model.eval()
152
+ tokens = text_bc_tokenizer(
153
+ text,
154
+ padding="max_length",
155
+ truncation=True,
156
+ max_length=bc_token_size,
157
+ return_tensors="pt",
158
+ ).to(device)
159
+ output = model(**tokens)
160
+ output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
161
+ return output_norm
162
+
163
+
164
+ def predict_mc(model, tokenizer, text):
165
+ with torch.no_grad():
166
+ model.eval()
167
+ tokens = text_mc_tokenizer(
168
+ text,
169
+ padding="max_length",
170
+ truncation=True,
171
+ return_tensors="pt",
172
+ max_length=mc_token_size,
173
+ ).to(device)
174
+ output = model(**tokens)
175
+ output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
176
+ return output_norm
177
+
178
+
179
+ def predict_mc_scores(input):
180
+ bc_scores = []
181
+ mc_scores = []
182
+
183
+ samples_len_bc = len(
184
+ split_text_allow_complete_sentences_nltk(input, type_det="bc")
185
+ )
186
+ segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
187
+ for i in range(samples_len_bc):
188
+ cleaned_text_bc = remove_special_characters(segments_bc[i])
189
+ bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
190
+ bc_scores.append(bc_score)
191
+ bc_scores_array = np.array(bc_scores)
192
+ average_bc_scores = np.mean(bc_scores_array, axis=0)
193
+ bc_score_list = average_bc_scores.tolist()
194
+ bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}
195
+ segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="mc")
196
+ samples_len_mc = len(
197
+ split_text_allow_complete_sentences_nltk(input, type_det="mc")
198
+ )
199
+ for i in range(samples_len_mc):
200
+ cleaned_text_mc = remove_special_characters(segments_mc[i])
201
+ mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
202
+ mc_scores.append(mc_score)
203
+ mc_scores_array = np.array(mc_scores)
204
+ average_mc_scores = np.mean(mc_scores_array, axis=0)
205
+ mc_score_list = average_mc_scores.tolist()
206
+ mc_score = {}
207
+ for score, label in zip(mc_score_list, mc_label_map):
208
+ mc_score[label.upper()] = score
209
+
210
+ sum_prob = 1 - bc_score["HUMAN"]
211
+ for key, value in mc_score.items():
212
+ mc_score[key] = value * sum_prob
213
+ if sum_prob < 0.01:
214
+ mc_score = {}
215
+
216
+ return mc_score
217
+
218
+
219
+ def predict_bc_scores(input):
220
+ bc_scores = []
221
+ mc_scores = []
222
+ samples_len_bc = len(
223
+ split_text_allow_complete_sentences_nltk(input, type_det="bc")
224
+ )
225
+ segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
226
+ for i in range(samples_len_bc):
227
+ cleaned_text_bc = remove_special_characters(segments_bc[i])
228
+ bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
229
+ bc_scores.append(bc_score)
230
+ bc_scores_array = np.array(bc_scores)
231
+ average_bc_scores = np.mean(bc_scores_array, axis=0)
232
+ bc_score_list = average_bc_scores.tolist()
233
+ bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}
234
+ return bc_score
235
+
236
+
237
+ # def predict_1on1(input):
238
+ # models = ['bard', 'claude', 'gpt4', 'mistral_ai', 'llama2']
239
+ # text = str(row["text"])
240
+ # predictions = {}
241
+ # prediction = predict(text, bard_model, bard_tokenizer) predictions['bard'] = prediction[1]
242
+ # prediction = predict(text, claude_model, claude_tokenizer) predictions['claude'] = prediction[1]
243
+ # prediction = predict(text, gpt4_model, gpt4_tokenizer) predictions['gpt4'] = prediction[1]
244
+ # prediction = predict(text, mistral_ai_model, mistral_ai_tokenizer) predictions['mistral_ai'] = prediction[1]
245
+ # prediction = predict(text, llama2_model, llama2_tokenizer) predictions['llama2'] = prediction[1]
246
+ # max_key = max(predictions, key=predictions.get)
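
Both predictors average the per-segment softmax outputs; the source-model breakdown then rescales each model probability by the overall AI probability (1 - P(HUMAN)) and is dropped entirely when that probability falls below 0.01. A small numeric sketch mirroring predict_bc_scores / predict_mc_scores, with invented numbers:

    # Invented per-segment probabilities in the same [HUMAN, AI] order used above.
    import numpy as np

    segment_probs = np.array([
        [0.90, 0.10],
        [0.70, 0.30],
        [0.80, 0.20],
    ])
    avg = segment_probs.mean(axis=0)
    bc_score = {"AI": avg[1], "HUMAN": avg[0]}    # {'AI': 0.2, 'HUMAN': 0.8}

    mc_raw = {"OPENAI GPT": 0.6, "LLAMA 2": 0.4}  # averaged source-model softmax
    sum_prob = 1 - bc_score["HUMAN"]              # overall AI probability
    # Rescale, or return an empty breakdown when the AI probability is negligible.
    mc_score = {k: v * sum_prob for k, v in mc_raw.items()} if sum_prob >= 0.01 else {}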
requirements.txt CHANGED
@@ -6,8 +6,8 @@ BeautifulSoup4
6
  scrapingbee
7
  requests
8
  numpy
9
- torch==1.13.0
10
- transformers==4.25.1
11
  transformers-interpret
12
  textstat
13
  scipy
 
6
  scrapingbee
7
  requests
8
  numpy
9
+ torch
10
+ transformers
11
  transformers-interpret
12
  textstat
13
  scipy
utils.py CHANGED
@@ -11,284 +11,354 @@ import asyncio
11
  import nltk
12
  from sentence_transformers import SentenceTransformer, util
13
  import threading
 
14
 
15
- nltk.download('punkt')
16
 
17
  WORD = re.compile(r"\w+")
18
- model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
19
 
20
 
21
  # returns cosine similarity of two vectors
22
  # input: two vectors
23
  # output: integer between 0 and 1.
24
- def get_cosine(vec1, vec2):
25
- intersection = set(vec1.keys()) & set(vec2.keys())
26
 
27
- # calculating numerator
28
- numerator = sum([vec1[x] * vec2[x] for x in intersection])
29
 
30
- # calculating denominator
31
- sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
32
- sum2 = sum([vec2[x] ** 2 for x in vec2.keys()])
33
- denominator = math.sqrt(sum1) * math.sqrt(sum2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
- # checking for divide by zero
36
- if denominator == 0:
37
- return 0.0
38
- else:
39
- return float(numerator) / denominator
40
-
41
-
42
- # converts given text into a vector
43
- def text_to_vector(text):
44
- # uses the Regular expression above and gets all words
45
- words = WORD.findall(text)
46
- # returns a counter of all the words (count of number of occurences)
47
- return Counter(words)
48
-
49
-
50
- # returns cosine similarity of two words
51
- # uses: text_to_vector(text) and get_cosine(v1,v2)
52
- def cosineSim(text1, text2):
53
- vector1 = text_to_vector(text1)
54
- vector2 = text_to_vector(text2)
55
- # print vector1,vector2
56
- cosine = get_cosine(vector1, vector2)
57
- return cosine
58
-
59
- def cos_sim_torch(embedding_1, embedding_2):
60
- return util.pytorch_cos_sim(embedding_1, embedding_2).item()
61
-
62
- def embed_text(text):
63
- return model.encode(text, convert_to_tensor=True)
64
-
65
- def sentence_similarity(text1, text2):
66
- embedding_1= model.encode(text1, convert_to_tensor=True)
67
- embedding_2 = model.encode(text2, convert_to_tensor=True)
68
-
69
- o = util.pytorch_cos_sim(embedding_1, embedding_2)
70
- return o.item()
71
-
72
- def get_soup_requests(url):
73
- page = requests.get(url)
74
- if page.status_code == 200:
75
- soup = BeautifulSoup(page.content, "html.parser")
76
- return soup
77
- print("HTML soup failed")
78
- return None
79
-
80
-
81
- def get_soup_httpx(url):
82
- client = httpx.Client(timeout=30)
83
- try:
84
- page = client.get(url)
85
- if page.status_code == httpx.codes.OK:
86
- soup = BeautifulSoup(page.content, "html.parser")
87
- return soup
88
- except:
89
- print("HTTPx soup failed")
90
- return None
91
-
92
- def getSentences(text):
93
- from nltk.tokenize import sent_tokenize
94
-
95
- sents = sent_tokenize(text)
96
- two_sents = []
97
- for i in range(len(sents)):
98
- if (i % 2) == 0:
99
- two_sents.append(sents[i])
100
- else:
101
- two_sents[len(two_sents) - 1] += " " + sents[i]
102
- return two_sents
103
-
104
-
105
- def googleSearch(
106
- plag_option,
107
- sentences,
108
- urlCount,
109
- scoreArray,
110
- urlList,
111
- sorted_date,
112
- domains_to_skip,
113
- api_key,
114
- cse_id,
115
- **kwargs,
116
- ):
117
- service = build("customsearch", "v1", developerKey=api_key)
118
- for i, sentence in enumerate(sentences):
119
- results = (
120
- service.cse()
121
- .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
122
- .execute()
123
- )
124
- if "items" in results and len(results["items"]) > 0:
125
- for count, link in enumerate(results["items"]):
126
- # stop after 3 pages
127
- if count >= 3:
128
- break
129
- # skip user selected domains
130
- if any(
131
- ("." + domain) in link["link"]
132
- for domain in domains_to_skip
133
- ):
134
- continue
135
- # clean up snippet of '...'
136
- snippet = link["snippet"]
137
- ind = snippet.find("...")
138
- if ind < 20 and ind > 9:
139
- snippet = snippet[ind + len("... ") :]
140
- ind = snippet.find("...")
141
- if ind > len(snippet) - 5:
142
- snippet = snippet[:ind]
143
-
144
- # update cosine similarity between snippet and given text
145
- url = link["link"]
146
- if url not in urlList:
147
- urlList.append(url)
148
- scoreArray.append([0] * len(sentences))
149
- urlCount[url] = urlCount[url] + 1 if url in urlCount else 1
150
- if plag_option == 'Standard':
151
- scoreArray[urlList.index(url)][i] = cosineSim(
152
- sentence, snippet)
153
- else :
154
- scoreArray[urlList.index(url)][i] = sentence_similarity(
155
- sentence, snippet
156
- )
157
- else:
158
- print("Google Search failed")
159
- return urlCount, scoreArray
160
-
161
-
162
- def getQueries(text, n):
163
- # return n-grams of size n
164
- words = text.split()
165
- return [words[i : i + n] for i in range(len(words) - n + 1)]
166
-
167
-
168
- def print2D(array):
169
- print(np.array(array))
170
-
171
-
172
- def removePunc(text):
173
- res = re.sub(r"[^\w\s]", "", text)
174
- return res
175
-
176
-
177
- async def get_url_data(url, client):
178
- try:
179
- r = await client.get(url)
180
- # print(r.status_code)
181
- if r.status_code == 200:
182
- # print("in")
183
- soup = BeautifulSoup(r.content, "html.parser")
184
- return soup
185
- except Exception:
186
- print("HTTPx parallel soup failed")
187
- return None
188
-
189
-
190
- async def parallel_scrap(urls):
191
- async with httpx.AsyncClient(timeout=30) as client:
192
- tasks = []
193
- for url in urls:
194
- tasks.append(get_url_data(url=url, client=client))
195
- results = await asyncio.gather(*tasks, return_exceptions=True)
196
- return results
197
-
198
-
199
- class TimeoutError(Exception):
200
- pass
201
-
202
-
203
-
204
- def matchingScore(sentence, content):
205
- if sentence in content:
206
- return 1
207
- sentence = removePunc(sentence)
208
- content = removePunc(content)
209
- if sentence in content:
210
- return 1
211
- else:
212
- n = 5
213
- ngrams = getQueries(sentence, n)
214
- if len(ngrams) == 0:
215
- return 0
216
- matched = [x for x in ngrams if " ".join(x) in content]
217
- return len(matched) / len(ngrams)
218
219
 
220
- # def matchingScoreWithTimeout(sentence, content):
221
- # def timeout_handler():
222
- # raise TimeoutError("Function timed out")
223
 
224
- # timer = threading.Timer(10, timeout_handler) # Set a timer for 2 seconds
225
- # timer.start()
226
  # try:
227
- # score = sentence_similarity(sentence, content)
228
- # # score = matchingScore(sentence, content)
229
- # timer.cancel() # Cancel the timer if calculation completes before timeout
230
- # return score
231
- # except TimeoutError:
232
- # return 0
233
 
234
 
235
- # async def matchingScoreAsync(sentences, content, content_idx, ScoreArray):
236
  # content = removePunc(content)
237
  # for j, sentence in enumerate(sentences):
238
  # sentence = removePunc(sentence)
239
- # if sentence in content:
240
- # ScoreArray[content_idx][j] = 1
241
- # else:
242
- # n = 5
243
- # ngrams = getQueries(sentence, n)
244
- # if len(ngrams) == 0:
245
- # return 0
246
- # matched = [x for x in ngrams if " ".join(x) in content]
247
- # ScoreArray[content_idx][j] = len(matched) / len(ngrams)
248
  # print(
249
- # f"Analyzed {content_idx+1} of soups (SOUP SUCCEEDED)........................"
250
  # )
251
  # return ScoreArray
252
 
253
- async def matchingScoreAsync(sentences, content, content_idx, ScoreArray, model, util):
254
- content = removePunc(content)
255
- for j, sentence in enumerate(sentences):
256
- sentence = removePunc(sentence)
257
- similarity_score = sentence_similarity(sentence, content, model, util)
258
- ScoreArray[content_idx][j] = similarity_score
259
- print(f"Analyzed {content_idx+1} of contents (CONTENT ANALYZED)........................")
260
- return ScoreArray
261
-
262
-
263
- async def parallel_analyze(soups, sentences, ScoreArray):
264
- tasks = []
265
- for i, soup in enumerate(soups):
266
- if soup:
267
- page_content = soup.text
268
- tasks.append(
269
- matchingScoreAsync(sentences, page_content, i, ScoreArray)
270
- )
271
- else:
272
- print(
273
- f"Analyzed {i+1} of soups (SOUP FAILED)........................"
274
- )
275
- ScoreArray = await asyncio.gather(*tasks, return_exceptions=True)
276
- return ScoreArray
277
-
278
-
279
- async def parallel_analyze_2(soups, sentences, ScoreArray):
280
- tasks = [[0] * len(ScoreArray[0]) for i in range(len(ScoreArray))]
281
- for i, soup in enumerate(soups):
282
- if soup:
283
- page_content = soup.text
284
- for j, sent in enumerate(sentences):
285
- print(
286
- f"Analyzing {i+1} of {len(soups)} soups with {j+1} of {len(sentences)} sentences........................"
287
- )
288
- tasks[i][j] = sentence_similarity(sent, page_content)
289
- else:
290
- print(
291
- f"Analyzed {i+1} of soups (SOUP FAILED)........................"
292
- )
293
- ScoreArray = await asyncio.gather(*tasks, return_exceptions=True)
294
- return ScoreArray
 
11
  import nltk
12
  from sentence_transformers import SentenceTransformer, util
13
  import threading
14
+ import torch
15
+ import re
16
+ import numpy as np
17
+ import asyncio
18
+ from datetime import date
19
+ import nltk
20
+ from unidecode import unidecode
21
+ from scipy.special import softmax
22
+ from transformers import AutoTokenizer
23
+ import yaml
24
+ import fitz
25
+ import os
26
+
27
+
28
+ def remove_accents(input_str):
29
+ text_no_accents = unidecode(input_str)
30
+ return text_no_accents
31
+
32
+
33
+ def remove_special_characters(text):
34
+ text = remove_accents(text)
35
+ pattern = r'[^\w\s\d.,!?\'"()-;]+'
36
+ text = re.sub(pattern, "", text)
37
+ return text
38
+
39
+
40
+ def remove_special_characters_2(text):
41
+ pattern = r"[^a-zA-Z0-9 ]+"
42
+ text = re.sub(pattern, "", text)
43
+ return text
44
+
45
+
46
+ def update_character_count(text):
47
+ return f"{len(text)} characters"
48
+
49
+
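Illustrative sketch (not part of the committed change) showing how the cleanup helpers above might be used; the sample string is invented and the exact transliteration depends on unidecode:

    # Hypothetical usage of the text-cleanup helpers defined above.
    sample = "Café – naïve… 100% «quoted»"
    print(remove_accents(sample))               # accents and typographic marks transliterated to ASCII
    print(remove_special_characters(sample))    # drops characters outside the allowed punctuation set
    print(remove_special_characters_2(sample))  # keeps only letters, digits, and spaces
    print(update_character_count(sample))       # "<N> characters" string for the UI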
50
+ nltk.download("punkt")
51
+
52
+
53
+ with open("config.yaml", "r") as file:
54
+ params = yaml.safe_load(file)
55
+
56
+ text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
57
+
58
+ text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
59
+
60
+
61
+ def len_validator(text):
62
+ min_tokens = 200
63
+ lengt = len(text_bc_tokenizer.tokenize(text=text, return_tensors="pt"))
64
+ if lengt < min_tokens:
65
+ return f"Warning! Input length is {lengt} tokens. Please input a text longer than {min_tokens} tokens. Recommended length: {min_tokens*2} tokens."
66
+ else:
67
+ return f"Input length ({lengt} tokens) is satisfied."
68
+
69
+
70
+ def extract_text_from_pdf(pdf_path):
71
+ doc = fitz.open(pdf_path)
72
+ text = ""
73
+ for page in doc:
74
+ text += page.get_text()
75
+ return text
76
 
 
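Illustrative sketch (not part of the committed change) chaining extract_text_from_pdf and len_validator above; "paper.pdf" is a hypothetical path:

    # Hypothetical end-to-end check: PDF -> plain text -> token-length validation.
    pdf_text = extract_text_from_pdf("paper.pdf")  # PyMuPDF (fitz) text, page by page
    print(update_character_count(pdf_text))        # character count shown in the UI
    print(len_validator(pdf_text))                 # warns when the input is under 200 tokens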
77
 
78
  WORD = re.compile(r"\w+")
79
+ model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
80
 
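Illustrative sketch (not part of the committed change) of the embedding-based similarity that the commented-out sentence_similarity below computes with this MiniLM model; the example sentences are invented:

    # Encode both texts and take the cosine similarity of the two embeddings
    # (a float roughly in [-1, 1], close to 1 for near-paraphrases).
    emb_1 = model.encode("The cat sat on the mat.", convert_to_tensor=True)
    emb_2 = model.encode("A cat is sitting on a mat.", convert_to_tensor=True)
    print(util.pytorch_cos_sim(emb_1, emb_2).item())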
81
 
82
  # returns cosine similarity of two vectors
83
  # input: two vectors
84
  # output: integer between 0 and 1.
85
+ # def get_cosine(vec1, vec2):
86
+ # intersection = set(vec1.keys()) & set(vec2.keys())
87
 
88
+ # # calculating numerator
89
+ # numerator = sum([vec1[x] * vec2[x] for x in intersection])
90
 
91
+ # # calculating denominator
92
+ # sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
93
+ # sum2 = sum([vec2[x] ** 2 for x in vec2.keys()])
94
+ # denominator = math.sqrt(sum1) * math.sqrt(sum2)
95
+
96
+ # # checking for divide by zero
97
+ # if denominator == 0:
98
+ # return 0.0
99
+ # else:
100
+ # return float(numerator) / denominator
101
+
102
+
103
+ # # converts given text into a vector
104
+ # def text_to_vector(text):
105
+ # # uses the Regular expression above and gets all words
106
+ # words = WORD.findall(text)
107
+ # # returns a counter of all the words (count of number of occurences)
108
+ # return Counter(words)
109
 
 
 
110
 
111
+ # # returns cosine similarity of two words
112
+ # # uses: text_to_vector(text) and get_cosine(v1,v2)
113
+ # def cosineSim(text1, text2):
114
+ # vector1 = text_to_vector(text1)
115
+ # vector2 = text_to_vector(text2)
116
+ # # print vector1,vector2
117
+ # cosine = get_cosine(vector1, vector2)
118
+ # return cosine
119
 
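For contrast, a tiny worked example (not part of the committed change) of the bag-of-words cosine similarity implemented by the commented-out text_to_vector/get_cosine/cosineSim helpers above:

    # Count-vector cosine for "a b b" vs. "b c".
    from collections import Counter
    import math

    vec1, vec2 = Counter("a b b".split()), Counter("b c".split())
    shared = set(vec1) & set(vec2)                      # {"b"}
    numerator = sum(vec1[w] * vec2[w] for w in shared)  # 2
    denominator = math.sqrt(sum(v * v for v in vec1.values())) * math.sqrt(
        sum(v * v for v in vec2.values())
    )                                                   # sqrt(5) * sqrt(2)
    print(numerator / denominator)                      # ~0.632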
120
 
121
+ # def cos_sim_torch(embedding_1, embedding_2):
122
+ # return util.pytorch_cos_sim(embedding_1, embedding_2).item()
123
+
124
+
125
+ # def embed_text(text):
126
+ # return model.encode(text, convert_to_tensor=True)
127
+
128
+
129
+ # def sentence_similarity(text1, text2):
130
+ # embedding_1 = model.encode(text1, convert_to_tensor=True)
131
+ # embedding_2 = model.encode(text2, convert_to_tensor=True)
132
+
133
+ # o = util.pytorch_cos_sim(embedding_1, embedding_2)
134
+ # return o.item()
135
+
136
+
137
+ # def get_soup_requests(url):
138
+ # page = requests.get(url)
139
+ # if page.status_code == 200:
140
+ # soup = BeautifulSoup(page.content, "html.parser")
141
+ # return soup
142
+ # print("HTML soup failed")
143
+ # return None
144
+
145
+
146
+ # def get_soup_httpx(url):
147
+ # client = httpx.Client(timeout=30)
148
  # try:
149
+ # page = client.get(url)
150
+ # if page.status_code == httpx.codes.OK:
151
+ # soup = BeautifulSoup(page.content, "html.parser")
152
+ # return soup
153
+ # except:
154
+ # print("HTTPx soup failed")
155
+ # return None
156
+
157
+
158
+ # def getSentences(text):
159
+ # from nltk.tokenize import sent_tokenize
160
+
161
+ # sents = sent_tokenize(text)
162
+ # two_sents = []
163
+ # for i in range(len(sents)):
164
+ # if (i % 2) == 0:
165
+ # two_sents.append(sents[i])
166
+ # else:
167
+ # two_sents[len(two_sents) - 1] += " " + sents[i]
168
+ # return two_sents
169
+
170
+
171
+ # def googleSearch(
172
+ # plag_option,
173
+ # sentences,
174
+ # urlCount,
175
+ # scoreArray,
176
+ # urlList,
177
+ # sorted_date,
178
+ # domains_to_skip,
179
+ # api_key,
180
+ # cse_id,
181
+ # **kwargs,
182
+ # ):
183
+ # service = build("customsearch", "v1", developerKey=api_key)
184
+ # for i, sentence in enumerate(sentences):
185
+ # results = (
186
+ # service.cse()
187
+ # .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
188
+ # .execute()
189
+ # )
190
+ # if "items" in results and len(results["items"]) > 0:
191
+ # for count, link in enumerate(results["items"]):
192
+ # # stop after 3 pages
193
+ # if count >= 3:
194
+ # break
195
+ # # skip user selected domains
196
+ # if any(
197
+ # ("." + domain) in link["link"] for domain in domains_to_skip
198
+ # ):
199
+ # continue
200
+ # # clean up snippet of '...'
201
+ # snippet = link["snippet"]
202
+ # ind = snippet.find("...")
203
+ # if ind < 20 and ind > 9:
204
+ # snippet = snippet[ind + len("... ") :]
205
+ # ind = snippet.find("...")
206
+ # if ind > len(snippet) - 5:
207
+ # snippet = snippet[:ind]
208
+
209
+ # # update cosine similarity between snippet and given text
210
+ # url = link["link"]
211
+ # if url not in urlList:
212
+ # urlList.append(url)
213
+ # scoreArray.append([0] * len(sentences))
214
+ # urlCount[url] = urlCount[url] + 1 if url in urlCount else 1
215
+ # if plag_option == "Standard":
216
+ # scoreArray[urlList.index(url)][i] = cosineSim(
217
+ # sentence, snippet
218
+ # )
219
+ # else:
220
+ # scoreArray[urlList.index(url)][i] = sentence_similarity(
221
+ # sentence, snippet
222
+ # )
223
+ # else:
224
+ # print("Google Search failed")
225
+ # return urlCount, scoreArray
226
+
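Illustrative sketch (not part of the committed change) of the Custom Search call pattern used by the commented-out googleSearch above; the API key, CSE id, and query are placeholders, and the googleapiclient import is assumed to live elsewhere in this module:

    # Query the Google Custom Search JSON API for one sentence and read back links and snippets.
    from googleapiclient.discovery import build

    service = build("customsearch", "v1", developerKey="YOUR_API_KEY")
    results = service.cse().list(q="sentence to check", cx="YOUR_CSE_ID").execute()
    for item in results.get("items", [])[:3]:  # the original stops after three results per sentence
        print(item["link"], "-", item["snippet"])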
227
+
228
+ # def getQueries(text, n):
229
+ # # return n-grams of size n
230
+ # words = text.split()
231
+ # return [words[i : i + n] for i in range(len(words) - n + 1)]
232
+
233
 
234
+ # def print2D(array):
235
+ # print(np.array(array))
236
 
237
+
238
+ # def removePunc(text):
239
+ # res = re.sub(r"[^\w\s]", "", text)
240
+ # return res
241
+
242
+
243
+ # async def get_url_data(url, client):
244
+ # try:
245
+ # r = await client.get(url)
246
+ # # print(r.status_code)
247
+ # if r.status_code == 200:
248
+ # # print("in")
249
+ # soup = BeautifulSoup(r.content, "html.parser")
250
+ # return soup
251
+ # except Exception:
252
+ # print("HTTPx parallel soup failed")
253
+ # return None
254
+
255
+
256
+ # async def parallel_scrap(urls):
257
+ # async with httpx.AsyncClient(timeout=30) as client:
258
+ # tasks = []
259
+ # for url in urls:
260
+ # tasks.append(get_url_data(url=url, client=client))
261
+ # results = await asyncio.gather(*tasks, return_exceptions=True)
262
+ # return results
263
+
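Illustrative sketch (not part of the committed change) of the asyncio + httpx fan-out that parallel_scrap above performs; the URLs are placeholders:

    # Fetch several pages concurrently; failed fetches come back as None.
    import asyncio
    import httpx
    from bs4 import BeautifulSoup

    async def fetch_all(urls):
        async with httpx.AsyncClient(timeout=30) as client:
            responses = await asyncio.gather(
                *(client.get(u) for u in urls), return_exceptions=True
            )
        return [
            BeautifulSoup(r.content, "html.parser")
            if not isinstance(r, Exception) and r.status_code == 200
            else None
            for r in responses
        ]

    soups = asyncio.run(fetch_all(["https://example.com", "https://example.org"]))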
264
+
265
+ # class TimeoutError(Exception):
266
+ # pass
267
+
268
+
269
+ # def matchingScore(sentence, content):
270
+ # if sentence in content:
271
+ # return 1
272
+ # sentence = removePunc(sentence)
273
+ # content = removePunc(content)
274
+ # if sentence in content:
275
+ # return 1
276
+ # else:
277
+ # n = 5
278
+ # ngrams = getQueries(sentence, n)
279
+ # if len(ngrams) == 0:
280
+ # return 0
281
+ # matched = [x for x in ngrams if " ".join(x) in content]
282
+ # return len(matched) / len(ngrams)
283
+
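Illustrative sketch (not part of the committed change) of the 5-gram overlap score that the commented-out matchingScore above computes, written as a standalone function:

    # Fraction of the sentence's 5-grams that appear verbatim in the page content.
    def ngram_overlap(sentence, content, n=5):
        words = sentence.split()
        ngrams = [" ".join(words[i : i + n]) for i in range(len(words) - n + 1)]
        if not ngrams:
            return 0.0
        return sum(gram in content for gram in ngrams) / len(ngrams)

    print(ngram_overlap(
        "the quick brown fox jumps over the lazy dog",
        "a quick brown fox jumps over a lazy dog",
    ))  # 0.2 -- exactly one of the five 5-grams matches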
284
+
285
+ # # def matchingScoreWithTimeout(sentence, content):
286
+ # # def timeout_handler():
287
+ # # raise TimeoutError("Function timed out")
288
+
289
+ # # timer = threading.Timer(10, timeout_handler) # Set a timer for 2 seconds
290
+ # # timer.start()
291
+ # # try:
292
+ # # score = sentence_similarity(sentence, content)
293
+ # # # score = matchingScore(sentence, content)
294
+ # # timer.cancel() # Cancel the timer if calculation completes before timeout
295
+ # # return score
296
+ # # except TimeoutError:
297
+ # # return 0
298
+
299
+
300
+ # # async def matchingScoreAsync(sentences, content, content_idx, ScoreArray):
301
+ # # content = removePunc(content)
302
+ # # for j, sentence in enumerate(sentences):
303
+ # # sentence = removePunc(sentence)
304
+ # # if sentence in content:
305
+ # # ScoreArray[content_idx][j] = 1
306
+ # # else:
307
+ # # n = 5
308
+ # # ngrams = getQueries(sentence, n)
309
+ # # if len(ngrams) == 0:
310
+ # # return 0
311
+ # # matched = [x for x in ngrams if " ".join(x) in content]
312
+ # # ScoreArray[content_idx][j] = len(matched) / len(ngrams)
313
+ # # print(
314
+ # # f"Analyzed {content_idx+1} of soups (SOUP SUCCEEDED)........................"
315
+ # # )
316
+ # # return ScoreArray
317
+
318
+
319
+ # async def matchingScoreAsync(
320
+ # sentences, content, content_idx, ScoreArray, model, util
321
+ # ):
322
  # content = removePunc(content)
323
  # for j, sentence in enumerate(sentences):
324
  # sentence = removePunc(sentence)
325
+ # similarity_score = sentence_similarity(sentence, content, model, util)
326
+ # ScoreArray[content_idx][j] = similarity_score
327
  # print(
328
+ # f"Analyzed {content_idx+1} of contents (CONTENT ANALYZED)........................"
329
  # )
330
  # return ScoreArray
331
 
332
+
333
+ # async def parallel_analyze(soups, sentences, ScoreArray):
334
+ # tasks = []
335
+ # for i, soup in enumerate(soups):
336
+ # if soup:
337
+ # page_content = soup.text
338
+ # tasks.append(
339
+ # matchingScoreAsync(sentences, page_content, i, ScoreArray)
340
+ # )
341
+ # else:
342
+ # print(
343
+ # f"Analyzed {i+1} of soups (SOUP FAILED)........................"
344
+ # )
345
+ # ScoreArray = await asyncio.gather(*tasks, return_exceptions=True)
346
+ # return ScoreArray
347
+
348
+
349
+ # async def parallel_analyze_2(soups, sentences, ScoreArray):
350
+ # tasks = [[0] * len(ScoreArray[0]) for i in range(len(ScoreArray))]
351
+ # for i, soup in enumerate(soups):
352
+ # if soup:
353
+ # page_content = soup.text
354
+ # for j, sent in enumerate(sentences):
355
+ # print(
356
+ # f"Analyzing {i+1} of {len(soups)} soups with {j+1} of {len(sentences)} sentences........................"
357
+ # )
358
+ # tasks[i][j] = sentence_similarity(sent, page_content)
359
+ # else:
360
+ # print(
361
+ # f"Analyzed {i+1} of soups (SOUP FAILED)........................"
362
+ # )
363
+ # ScoreArray = await asyncio.gather(*tasks, return_exceptions=True)
364
+ # return ScoreArray
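Illustrative sketch (not part of the committed change) of how these commented-out pieces would combine if re-enabled: scrape the candidate URLs in parallel, then fill a page-by-sentence score matrix; sentence_similarity stands in for whichever scoring function is active:

    # Hypothetical pipeline: soups from parallel_scrap -> ScoreArray[page_idx][sentence_idx].
    async def build_score_matrix(urls, sentences):
        soups = await parallel_scrap(urls)  # list of BeautifulSoup objects or None
        score_array = [[0.0] * len(sentences) for _ in urls]
        for i, soup in enumerate(soups):
            if not soup or isinstance(soup, Exception):
                continue  # failed fetches keep zero scores
            content = removePunc(soup.text)
            for j, sent in enumerate(sentences):
                score_array[i][j] = sentence_similarity(removePunc(sent), content)
        return score_array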