minko186 committed on
Commit
fe15d80
·
1 Parent(s): f6a92d7

fix changes in plagiarism check

Browse files
Files changed (2) hide show
  1. app.py +210 -121
  2. plagiarism.py +0 -0
app.py CHANGED
@@ -1,4 +1,10 @@
1
- from utils import cosineSim, googleSearch, getSentences, parallel_scrap, matchingScore
 
 
 
 
 
 
2
  import gradio as gr
3
  from urllib.request import urlopen, Request
4
  from googleapiclient.discovery import build
@@ -14,7 +20,7 @@ from scipy.special import softmax
14
  from evaluate import load
15
  from datetime import date
16
  import nltk
17
- import fitz
18
  from transformers import GPT2LMHeadModel, GPT2TokenizerFast
19
  import nltk, spacy, subprocess, torch
20
  import plotly.graph_objects as go
@@ -27,20 +33,19 @@ import multiprocessing
27
  from functools import partial
28
  import concurrent.futures
29
 
30
- nltk.download('punkt')
31
 
32
  from writing_analysis import (
33
  normalize,
34
  preprocess_text1,
35
- preprocess_text2,
36
  vocabulary_richness_ttr,
37
  calculate_gunning_fog,
38
  calculate_average_sentence_length,
39
  calculate_average_word_length,
40
  calculate_syntactic_tree_depth,
41
  calculate_perplexity,
42
-
43
- )
44
 
45
  np.set_printoptions(suppress=True)
46
 
@@ -89,7 +94,7 @@ def plagiarism_check(
89
  )
90
  print(f"Time for google search: {time.perf_counter()-time1}")
91
  time1 = time.perf_counter()
92
-
93
  print("Number of URLs: ", len(urlCount))
94
  print(urlList)
95
 
@@ -113,8 +118,8 @@ def plagiarism_check(
113
  page_content = soup.text
114
  source_embeddings.append(embed_text(page_content))
115
  else:
116
- source_embeddings.append(None)
117
-
118
  # Populate matching scores for scraped pages
119
  # for i, soup in enumerate(soups):
120
  # print(f"Analyzing {i+1} of {len(soups)} soups........................")
@@ -126,30 +131,27 @@ def plagiarism_check(
126
  # score = cos_sim_torch(embed_text(sent), source_embeddings[i])
127
  # ScoreArray[i][j] = score
128
 
129
-
130
- def compute_cosine_similarity(args):
131
- sent, source_embedding, i, j = args
132
- score = cos_sim_torch(embed_text(sent), source_embedding)
133
- return i, j, score
134
-
135
- def main(soups, sentences):
136
- source_embeddings = [preprocess(soup) for soup in soups]
137
- ScoreArray = [[0 for _ in sentences] for _ in soups]
138
- args_list = []
139
- for i, soup in enumerate(soups):
140
- if soup:
141
- for j, sent in enumerate(sentences):
142
- args_list.append((sent, source_embeddings[i], i, j))
143
- with concurrent.futures.ProcessPoolExecutor() as executor:
144
- results = executor.map(compute_cosine_similarity, args_list)
145
- for i, j, score in results:
146
- ScoreArray[i][j] = score
147
- return ScoreArray
148
 
149
  ScoreArray = main(soups, sentences)
150
 
151
-
152
-
153
  print(f"Time for matching score: {time.perf_counter()-time1}")
154
  time1 = time.perf_counter()
155
 
@@ -177,7 +179,7 @@ def main(soups, sentences):
177
  sentenceToMaxURL[j] = i
178
  if maxScore > 0.5:
179
  sentencePlag[j] = True
180
-
181
  if (
182
  (len(sentences) > 1)
183
  and (sentenceToMaxURL[1] != sentenceToMaxURL[0])
@@ -216,11 +218,13 @@ def main(soups, sentences):
216
 
217
  print(formatted_tokens)
218
  print(index_descending)
219
-
220
  for ind in index_descending:
221
  formatted_tokens.append(
222
  (
223
- urlList[ind] + " --- Matching Score: " + f"{str(round(urlScore[ind] * 100, 2))}%",
 
 
224
  "[" + str(urlMap[ind]) + "]",
225
  )
226
  )
@@ -232,7 +236,7 @@ def main(soups, sentences):
232
 
233
  return formatted_tokens
234
 
235
-
236
  """
237
  AI DETECTION SECTION
238
  """
@@ -240,73 +244,106 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
240
 
241
  text_bc_model_path = "polygraf-ai/text-detect-bc-v11-4m"
242
  text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
243
- text_bc_model = AutoModelForSequenceClassification.from_pretrained(text_bc_model_path).to(device)
 
 
244
 
245
- text_mc_model_path = "polygraf-ai/ai-text-detection-mc-robert-open-ai-detector-v4"
 
 
246
  text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
247
- text_mc_model = AutoModelForSequenceClassification.from_pretrained(text_mc_model_path).to(device)
 
 
248
 
249
  quillbot_labels = ["Original", "QuillBot"]
250
  quillbot_tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
251
- quillbot_model = AutoModelForSequenceClassification.from_pretrained("polygraf-ai/quillbot-detector-28k").to(device)
 
 
 
252
 
253
  def remove_accents(input_str):
254
  text_no_accents = unidecode(input_str)
255
  return text_no_accents
256
 
 
257
  def remove_special_characters(text):
258
  text = remove_accents(text)
259
  pattern = r'[^\w\s\d.,!?\'"()-;]+'
260
- text = re.sub(pattern, '', text)
261
  return text
262
 
 
263
  def remove_special_characters_2(text):
264
- pattern = r'[^a-zA-Z0-9 ]+'
265
- text = re.sub(pattern, '', text)
266
  return text
267
 
 
268
  def update_character_count(text):
269
  return f"{len(text)} characters"
270
 
271
 
272
- def split_text_allow_complete_sentences_nltk(text, max_length=256, tolerance=30, min_last_segment_length=100, type_det='bc'):
 
 
 
 
 
 
273
  sentences = nltk.sent_tokenize(text)
274
  segments = []
275
  current_segment = []
276
- current_length = 0
277
-
278
- if type_det == 'bc':
279
  tokenizer = text_bc_tokenizer
280
  max_length = 333
281
-
282
- elif type_det == 'mc':
283
  tokenizer = text_mc_tokenizer
284
  max_length = 256
285
-
286
  for sentence in sentences:
287
  tokens = tokenizer.tokenize(sentence)
288
  sentence_length = len(tokens)
289
-
290
- if current_length + sentence_length <= max_length + tolerance - 2:
291
  current_segment.append(sentence)
292
  current_length += sentence_length
293
  else:
294
  if current_segment:
295
- encoded_segment = tokenizer.encode(' '.join(current_segment), add_special_tokens=True, max_length=max_length+tolerance, truncation=True)
 
 
 
 
 
296
  segments.append((current_segment, len(encoded_segment)))
297
  current_segment = [sentence]
298
  current_length = sentence_length
299
-
300
  if current_segment:
301
- encoded_segment = tokenizer.encode(' '.join(current_segment), add_special_tokens=True, max_length=max_length+tolerance, truncation=True)
 
 
 
 
 
302
  segments.append((current_segment, len(encoded_segment)))
303
 
304
  final_segments = []
305
  for i, (seg, length) in enumerate(segments):
306
- if i == len(segments) - 1:
307
  if length < min_last_segment_length and len(final_segments) > 0:
308
  prev_seg, prev_length = final_segments[-1]
309
- combined_encoded = tokenizer.encode(' '.join(prev_seg + seg), add_special_tokens=True, max_length=max_length+tolerance, truncation=True)
 
 
 
 
 
310
  if len(combined_encoded) <= max_length + tolerance:
311
  final_segments[-1] = (prev_seg + seg, len(combined_encoded))
312
  else:
@@ -319,56 +356,86 @@ def split_text_allow_complete_sentences_nltk(text, max_length=256, tolerance=30,
319
  decoded_segments = []
320
  encoded_segments = []
321
  for seg, _ in final_segments:
322
- encoded_segment = tokenizer.encode(' '.join(seg), add_special_tokens=True, max_length=max_length+tolerance, truncation=True)
 
 
 
 
 
323
  decoded_segment = tokenizer.decode(encoded_segment)
324
  decoded_segments.append(decoded_segment)
325
  return decoded_segments
326
 
 
327
  def predict_quillbot(text):
328
  with torch.no_grad():
329
  quillbot_model.eval()
330
- tokenized_text = quillbot_tokenizer(text, padding="max_length", truncation=True, max_length=256, return_tensors="pt").to(device)
 
 
 
 
 
 
331
  output = quillbot_model(**tokenized_text)
332
  output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
333
- q_score = {"QuillBot": output_norm[1].item(), "Original": output_norm[0].item()}
 
 
 
334
  return q_score
335
 
 
336
  def predict_bc(model, tokenizer, text):
337
  with torch.no_grad():
338
  model.eval()
339
  tokens = text_bc_tokenizer(
340
- text, padding='max_length', truncation=True, max_length=333, return_tensors="pt"
 
 
 
 
341
  ).to(device)
342
  output = model(**tokens)
343
  output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
344
  print("BC Score: ", output_norm)
345
  return output_norm
346
 
 
347
  def predict_mc(model, tokenizer, text):
348
  with torch.no_grad():
349
  model.eval()
350
  tokens = text_mc_tokenizer(
351
- text, padding='max_length', truncation=True, return_tensors="pt", max_length=256
 
 
 
 
352
  ).to(device)
353
  output = model(**tokens)
354
  output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
355
  print("MC Score: ", output_norm)
356
  return output_norm
357
 
 
358
  def ai_generated_test(ai_option, input):
359
-
360
  bc_scores = []
361
  mc_scores = []
362
- samples_len_bc = len(split_text_allow_complete_sentences_nltk(input, type_det = 'bc'))
363
- samples_len_mc = len(split_text_allow_complete_sentences_nltk(input, type_det = 'mc'))
364
- segments_bc = split_text_allow_complete_sentences_nltk(input, type_det = 'bc')
365
- segments_mc = split_text_allow_complete_sentences_nltk(input, type_det = 'bc')
366
-
 
 
 
 
367
  for i in range(samples_len_bc):
368
  cleaned_text_bc = remove_special_characters(segments_bc[i])
369
- bc_score = predict_bc(text_bc_model, text_bc_tokenizer,cleaned_text_bc )
370
  bc_scores.append(bc_score)
371
-
372
  for i in range(samples_len_mc):
373
  cleaned_text_mc = remove_special_characters(segments_mc[i])
374
  mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
@@ -380,27 +447,28 @@ def ai_generated_test(ai_option, input):
380
  average_mc_scores = np.mean(mc_scores_array, axis=0)
381
  bc_score_list = average_bc_scores.tolist()
382
  mc_score_list = average_mc_scores.tolist()
383
-
384
  bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}
385
  mc_score = {}
386
  label_map = ["OpenAI GPT", "Mistral", "CLAUDE", "Gemini", "LLAMA 2"]
387
-
388
  for score, label in zip(mc_score_list, label_map):
389
  mc_score[label.upper()] = score
390
-
391
  sum_prob = 1 - bc_score["HUMAN"]
392
  for key, value in mc_score.items():
393
  mc_score[key] = value * sum_prob
394
-
395
  if ai_option == "Human vs AI":
396
  mc_score = {}
397
 
398
- if sum_prob < 0.01 :
399
  mc_score = {}
400
  return bc_score, mc_score
401
  else:
402
  return bc_score, mc_score
403
 
 
404
  # COMBINED
405
  def main(
406
  ai_option,
@@ -428,28 +496,30 @@ def main(
428
  domains_to_skip,
429
  )
430
  depth_analysis_plot = depth_analysis(input)
431
- bc_score, mc_score = ai_generated_test(ai_option,input)
432
  quilscore = predict_quillbot(input)
433
-
434
  return (
435
- bc_score,
436
- mc_score,
437
- formatted_tokens,
438
- depth_analysis_plot,
439
- quilscore
440
- )
441
 
442
 
443
  def build_date(year, month, day):
444
  return f"{year}{months[month]}{day}"
445
 
 
446
  def len_validator(text):
447
- min_tokens = 200
448
- lengt = len(text_bc_tokenizer.tokenize(text = text, return_tensors="pt"))
449
- if lengt < min_tokens:
450
- return f"Warning! Input length is {lengt}. Please input a text that is greater than {min_tokens} tokens long. Recommended length {min_tokens*2} tokens."
451
- else :
452
- return f"Input length ({lengt}) is satisified."
 
453
 
454
  def extract_text_from_pdf(pdf_path):
455
  doc = fitz.open(pdf_path)
@@ -461,9 +531,9 @@ def extract_text_from_pdf(pdf_path):
461
 
462
  # DEPTH ANALYSIS
463
  print("loading depth analysis")
464
- nltk.download('stopwords')
465
- nltk.download('punkt')
466
- command = ['python3', '-m', 'spacy', 'download', 'en_core_web_sm']
467
  # Execute the command
468
  subprocess.run(command)
469
  nlp = spacy.load("en_core_web_sm")
@@ -473,6 +543,7 @@ model_id = "gpt2"
473
  gpt2_model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
474
  gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
475
 
 
476
  def depth_analysis(input_text):
477
 
478
  # vocabulary richness
@@ -482,48 +553,59 @@ def depth_analysis(input_text):
482
  # readability
483
  gunning_fog = calculate_gunning_fog(input_text)
484
  gunning_fog_norm = normalize(gunning_fog, min_value=0, max_value=20)
485
-
486
  # average sentence length and average word length
487
  words, sentences = preprocess_text2(input_text)
488
  average_sentence_length = calculate_average_sentence_length(sentences)
489
  average_word_length = calculate_average_word_length(words)
490
- average_sentence_length_norm = normalize(average_sentence_length, min_value=0, max_value=40)
491
- average_word_length_norm = normalize(average_word_length, min_value=0, max_value=8)
 
 
 
 
492
 
493
  # syntactic_tree_depth
494
  average_tree_depth = calculate_syntactic_tree_depth(nlp, input_text)
495
- average_tree_depth_norm = normalize(average_tree_depth, min_value=0, max_value=10)
 
 
496
 
497
  # perplexity
498
- perplexity = calculate_perplexity(input_text, gpt2_model, gpt2_tokenizer, device)
 
 
499
  perplexity_norm = normalize(perplexity, min_value=0, max_value=30)
500
 
501
  features = {
502
- "readability": gunning_fog_norm,
503
  "syntactic tree depth": average_tree_depth_norm,
504
  "vocabulary richness": ttr_value,
505
  "perplexity": perplexity_norm,
506
  "average sentence length": average_sentence_length_norm,
507
- "average word length": average_word_length_norm,
508
  }
509
 
510
  print(features)
511
 
512
  fig = go.Figure()
513
 
514
- fig.add_trace(go.Scatterpolar(
515
- r=list(features.values()),
516
- theta=list(features.keys()),
517
- fill='toself',
518
- name='Radar Plot'
519
- ))
 
 
520
 
521
  fig.update_layout(
522
  polar=dict(
523
  radialaxis=dict(
524
  visible=True,
525
  range=[0, 100],
526
- )),
 
527
  showlegend=False,
528
  # autosize=False,
529
  # width=600,
@@ -575,16 +657,23 @@ with gr.Blocks() as demo:
575
  with gr.Row():
576
  input_text = gr.Textbox(label="Input text", lines=6, placeholder="")
577
  file_input = gr.File(label="Upload PDF")
578
- file_input.change(fn=extract_text_from_pdf, inputs=file_input, outputs=input_text)
 
 
579
 
580
- char_count = gr.Textbox(label="Minumum Character Limit Check")
581
  input_text.change(fn=len_validator, inputs=input_text, outputs=char_count)
582
 
583
  with gr.Row():
584
  with gr.Column():
585
- ai_option = gr.Radio(["Human vs AI", "Human vs AI Source Models"], label="Choose an option please.")
 
 
 
586
  with gr.Column():
587
- plag_option = gr.Radio(["Standard", "Advanced"], label="Choose an option please.")
 
 
588
 
589
  with gr.Row():
590
  with gr.Column():
@@ -607,14 +696,14 @@ with gr.Blocks() as demo:
607
  ## Output
608
  """
609
  )
610
-
611
  # models = gr.Dropdown(
612
- # model_list,
613
- # value=model_list,
614
- # multiselect=True,
615
- # label="Models to test against",
616
- # )
617
-
618
  with gr.Row():
619
  with gr.Column():
620
  bcLabel = gr.Label(label="Source")
@@ -666,9 +755,7 @@ with gr.Blocks() as demo:
666
 
667
  with gr.Row():
668
  with gr.Column():
669
- writing_analysis_plot = gr.Plot(
670
- label="Writing Analysis Plot"
671
- )
672
 
673
  full_check_btn.click(
674
  fn=main,
@@ -690,7 +777,7 @@ with gr.Blocks() as demo:
690
  mcLabel,
691
  sentenceBreakdown,
692
  writing_analysis_plot,
693
- QLabel
694
  ],
695
  api_name="main",
696
  )
@@ -740,5 +827,7 @@ with gr.Blocks() as demo:
740
 
741
  date_from = ""
742
  date_to = ""
743
-
744
- demo.launch(share=True, server_name="0.0.0.0", auth=("polygraf-admin", "test@aisd"))
 
 
 
1
+ from utils import (
2
+ cosineSim,
3
+ googleSearch,
4
+ getSentences,
5
+ parallel_scrap,
6
+ matchingScore,
7
+ )
8
  import gradio as gr
9
  from urllib.request import urlopen, Request
10
  from googleapiclient.discovery import build
 
20
  from evaluate import load
21
  from datetime import date
22
  import nltk
23
+ import fitz
24
  from transformers import GPT2LMHeadModel, GPT2TokenizerFast
25
  import nltk, spacy, subprocess, torch
26
  import plotly.graph_objects as go
 
33
  from functools import partial
34
  import concurrent.futures
35
 
36
+ nltk.download("punkt")
37
 
38
  from writing_analysis import (
39
  normalize,
40
  preprocess_text1,
41
+ preprocess_text2,
42
  vocabulary_richness_ttr,
43
  calculate_gunning_fog,
44
  calculate_average_sentence_length,
45
  calculate_average_word_length,
46
  calculate_syntactic_tree_depth,
47
  calculate_perplexity,
48
+ )
 
49
 
50
  np.set_printoptions(suppress=True)
51
 
 
94
  )
95
  print(f"Time for google search: {time.perf_counter()-time1}")
96
  time1 = time.perf_counter()
97
+
98
  print("Number of URLs: ", len(urlCount))
99
  print(urlList)
100
 
 
118
  page_content = soup.text
119
  source_embeddings.append(embed_text(page_content))
120
  else:
121
+ source_embeddings.append(None)
122
+
123
  # Populate matching scores for scraped pages
124
  # for i, soup in enumerate(soups):
125
  # print(f"Analyzing {i+1} of {len(soups)} soups........................")
 
131
  # score = cos_sim_torch(embed_text(sent), source_embeddings[i])
132
  # ScoreArray[i][j] = score
133
 
134
+ def compute_cosine_similarity(args):
135
+ sent, source_embedding, i, j = args
136
+ score = cos_sim_torch(embed_text(sent), source_embedding)
137
+ return i, j, score
138
+
139
+ def main(soups, sentences):
140
+ source_embeddings = [preprocess(soup) for soup in soups]
141
+ ScoreArray = [[0 for _ in sentences] for _ in soups]
142
+ args_list = []
143
+ for i, soup in enumerate(soups):
144
+ if soup:
145
+ for j, sent in enumerate(sentences):
146
+ args_list.append((sent, source_embeddings[i], i, j))
147
+ with concurrent.futures.ProcessPoolExecutor() as executor:
148
+ results = executor.map(compute_cosine_similarity, args_list)
149
+ for i, j, score in results:
150
+ ScoreArray[i][j] = score
151
+ return ScoreArray
 
152
 
153
  ScoreArray = main(soups, sentences)
154
 
 
 
155
  print(f"Time for matching score: {time.perf_counter()-time1}")
156
  time1 = time.perf_counter()
157
 
 
179
  sentenceToMaxURL[j] = i
180
  if maxScore > 0.5:
181
  sentencePlag[j] = True
182
+
183
  if (
184
  (len(sentences) > 1)
185
  and (sentenceToMaxURL[1] != sentenceToMaxURL[0])
 
218
 
219
  print(formatted_tokens)
220
  print(index_descending)
221
+
222
  for ind in index_descending:
223
  formatted_tokens.append(
224
  (
225
+ urlList[ind]
226
+ + " --- Matching Score: "
227
+ + f"{str(round(urlScore[ind] * 100, 2))}%",
228
  "[" + str(urlMap[ind]) + "]",
229
  )
230
  )
 
236
 
237
  return formatted_tokens
238
 
239
+
240
  """
241
  AI DETECTION SECTION
242
  """
 
244
 
245
  text_bc_model_path = "polygraf-ai/text-detect-bc-v11-4m"
246
  text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
247
+ text_bc_model = AutoModelForSequenceClassification.from_pretrained(
248
+ text_bc_model_path
249
+ ).to(device)
250
 
251
+ text_mc_model_path = (
252
+ "polygraf-ai/ai-text-detection-mc-robert-open-ai-detector-v4"
253
+ )
254
  text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
255
+ text_mc_model = AutoModelForSequenceClassification.from_pretrained(
256
+ text_mc_model_path
257
+ ).to(device)
258
 
259
  quillbot_labels = ["Original", "QuillBot"]
260
  quillbot_tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
261
+ quillbot_model = AutoModelForSequenceClassification.from_pretrained(
262
+ "polygraf-ai/quillbot-detector-28k"
263
+ ).to(device)
264
+
265
 
266
  def remove_accents(input_str):
267
  text_no_accents = unidecode(input_str)
268
  return text_no_accents
269
 
270
+
271
  def remove_special_characters(text):
272
  text = remove_accents(text)
273
  pattern = r'[^\w\s\d.,!?\'"()-;]+'
274
+ text = re.sub(pattern, "", text)
275
  return text
276
 
277
+
278
  def remove_special_characters_2(text):
279
+ pattern = r"[^a-zA-Z0-9 ]+"
280
+ text = re.sub(pattern, "", text)
281
  return text
282
 
283
+
284
  def update_character_count(text):
285
  return f"{len(text)} characters"
286
 
287
 
288
+ def split_text_allow_complete_sentences_nltk(
289
+ text,
290
+ max_length=256,
291
+ tolerance=30,
292
+ min_last_segment_length=100,
293
+ type_det="bc",
294
+ ):
295
  sentences = nltk.sent_tokenize(text)
296
  segments = []
297
  current_segment = []
298
+ current_length = 0
299
+
300
+ if type_det == "bc":
301
  tokenizer = text_bc_tokenizer
302
  max_length = 333
303
+
304
+ elif type_det == "mc":
305
  tokenizer = text_mc_tokenizer
306
  max_length = 256
307
+
308
  for sentence in sentences:
309
  tokens = tokenizer.tokenize(sentence)
310
  sentence_length = len(tokens)
311
+
312
+ if current_length + sentence_length <= max_length + tolerance - 2:
313
  current_segment.append(sentence)
314
  current_length += sentence_length
315
  else:
316
  if current_segment:
317
+ encoded_segment = tokenizer.encode(
318
+ " ".join(current_segment),
319
+ add_special_tokens=True,
320
+ max_length=max_length + tolerance,
321
+ truncation=True,
322
+ )
323
  segments.append((current_segment, len(encoded_segment)))
324
  current_segment = [sentence]
325
  current_length = sentence_length
326
+
327
  if current_segment:
328
+ encoded_segment = tokenizer.encode(
329
+ " ".join(current_segment),
330
+ add_special_tokens=True,
331
+ max_length=max_length + tolerance,
332
+ truncation=True,
333
+ )
334
  segments.append((current_segment, len(encoded_segment)))
335
 
336
  final_segments = []
337
  for i, (seg, length) in enumerate(segments):
338
+ if i == len(segments) - 1:
339
  if length < min_last_segment_length and len(final_segments) > 0:
340
  prev_seg, prev_length = final_segments[-1]
341
+ combined_encoded = tokenizer.encode(
342
+ " ".join(prev_seg + seg),
343
+ add_special_tokens=True,
344
+ max_length=max_length + tolerance,
345
+ truncation=True,
346
+ )
347
  if len(combined_encoded) <= max_length + tolerance:
348
  final_segments[-1] = (prev_seg + seg, len(combined_encoded))
349
  else:
 
356
  decoded_segments = []
357
  encoded_segments = []
358
  for seg, _ in final_segments:
359
+ encoded_segment = tokenizer.encode(
360
+ " ".join(seg),
361
+ add_special_tokens=True,
362
+ max_length=max_length + tolerance,
363
+ truncation=True,
364
+ )
365
  decoded_segment = tokenizer.decode(encoded_segment)
366
  decoded_segments.append(decoded_segment)
367
  return decoded_segments
368
 
369
+
370
  def predict_quillbot(text):
371
  with torch.no_grad():
372
  quillbot_model.eval()
373
+ tokenized_text = quillbot_tokenizer(
374
+ text,
375
+ padding="max_length",
376
+ truncation=True,
377
+ max_length=256,
378
+ return_tensors="pt",
379
+ ).to(device)
380
  output = quillbot_model(**tokenized_text)
381
  output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
382
+ q_score = {
383
+ "QuillBot": output_norm[1].item(),
384
+ "Original": output_norm[0].item(),
385
+ }
386
  return q_score
387
 
388
+
389
  def predict_bc(model, tokenizer, text):
390
  with torch.no_grad():
391
  model.eval()
392
  tokens = text_bc_tokenizer(
393
+ text,
394
+ padding="max_length",
395
+ truncation=True,
396
+ max_length=333,
397
+ return_tensors="pt",
398
  ).to(device)
399
  output = model(**tokens)
400
  output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
401
  print("BC Score: ", output_norm)
402
  return output_norm
403
 
404
+
405
  def predict_mc(model, tokenizer, text):
406
  with torch.no_grad():
407
  model.eval()
408
  tokens = text_mc_tokenizer(
409
+ text,
410
+ padding="max_length",
411
+ truncation=True,
412
+ return_tensors="pt",
413
+ max_length=256,
414
  ).to(device)
415
  output = model(**tokens)
416
  output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
417
  print("MC Score: ", output_norm)
418
  return output_norm
419
 
420
+
421
  def ai_generated_test(ai_option, input):
422
+
423
  bc_scores = []
424
  mc_scores = []
425
+ samples_len_bc = len(
426
+ split_text_allow_complete_sentences_nltk(input, type_det="bc")
427
+ )
428
+ samples_len_mc = len(
429
+ split_text_allow_complete_sentences_nltk(input, type_det="mc")
430
+ )
431
+ segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
432
+ segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
433
+
434
  for i in range(samples_len_bc):
435
  cleaned_text_bc = remove_special_characters(segments_bc[i])
436
+ bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
437
  bc_scores.append(bc_score)
438
+
439
  for i in range(samples_len_mc):
440
  cleaned_text_mc = remove_special_characters(segments_mc[i])
441
  mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
 
447
  average_mc_scores = np.mean(mc_scores_array, axis=0)
448
  bc_score_list = average_bc_scores.tolist()
449
  mc_score_list = average_mc_scores.tolist()
450
+
451
  bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}
452
  mc_score = {}
453
  label_map = ["OpenAI GPT", "Mistral", "CLAUDE", "Gemini", "LLAMA 2"]
454
+
455
  for score, label in zip(mc_score_list, label_map):
456
  mc_score[label.upper()] = score
457
+
458
  sum_prob = 1 - bc_score["HUMAN"]
459
  for key, value in mc_score.items():
460
  mc_score[key] = value * sum_prob
461
+
462
  if ai_option == "Human vs AI":
463
  mc_score = {}
464
 
465
+ if sum_prob < 0.01:
466
  mc_score = {}
467
  return bc_score, mc_score
468
  else:
469
  return bc_score, mc_score
470
 
471
+
472
  # COMBINED
473
  def main(
474
  ai_option,
 
496
  domains_to_skip,
497
  )
498
  depth_analysis_plot = depth_analysis(input)
499
+ bc_score, mc_score = ai_generated_test(ai_option, input)
500
  quilscore = predict_quillbot(input)
501
+
502
  return (
503
+ bc_score,
504
+ mc_score,
505
+ formatted_tokens,
506
+ depth_analysis_plot,
507
+ quilscore,
508
+ )
509
 
510
 
511
  def build_date(year, month, day):
512
  return f"{year}{months[month]}{day}"
513
 
514
+
515
  def len_validator(text):
516
+ min_tokens = 200
517
+ lengt = len(text_bc_tokenizer.tokenize(text=text, return_tensors="pt"))
518
+ if lengt < min_tokens:
519
+ return f"Warning! Input length is {lengt}. Please input a text that is greater than {min_tokens} tokens long. Recommended length {min_tokens*2} tokens."
520
+ else:
521
+ return f"Input length ({lengt}) is satisified."
522
+
523
 
524
  def extract_text_from_pdf(pdf_path):
525
  doc = fitz.open(pdf_path)
 
531
 
532
  # DEPTH ANALYSIS
533
  print("loading depth analysis")
534
+ nltk.download("stopwords")
535
+ nltk.download("punkt")
536
+ command = ["python3", "-m", "spacy", "download", "en_core_web_sm"]
537
  # Execute the command
538
  subprocess.run(command)
539
  nlp = spacy.load("en_core_web_sm")
 
543
  gpt2_model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
544
  gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
545
 
546
+
547
  def depth_analysis(input_text):
548
 
549
  # vocabulary richness
 
553
  # readability
554
  gunning_fog = calculate_gunning_fog(input_text)
555
  gunning_fog_norm = normalize(gunning_fog, min_value=0, max_value=20)
556
+
557
  # average sentence length and average word length
558
  words, sentences = preprocess_text2(input_text)
559
  average_sentence_length = calculate_average_sentence_length(sentences)
560
  average_word_length = calculate_average_word_length(words)
561
+ average_sentence_length_norm = normalize(
562
+ average_sentence_length, min_value=0, max_value=40
563
+ )
564
+ average_word_length_norm = normalize(
565
+ average_word_length, min_value=0, max_value=8
566
+ )
567
 
568
  # syntactic_tree_depth
569
  average_tree_depth = calculate_syntactic_tree_depth(nlp, input_text)
570
+ average_tree_depth_norm = normalize(
571
+ average_tree_depth, min_value=0, max_value=10
572
+ )
573
 
574
  # perplexity
575
+ perplexity = calculate_perplexity(
576
+ input_text, gpt2_model, gpt2_tokenizer, device
577
+ )
578
  perplexity_norm = normalize(perplexity, min_value=0, max_value=30)
579
 
580
  features = {
581
+ "readability": gunning_fog_norm,
582
  "syntactic tree depth": average_tree_depth_norm,
583
  "vocabulary richness": ttr_value,
584
  "perplexity": perplexity_norm,
585
  "average sentence length": average_sentence_length_norm,
586
+ "average word length": average_word_length_norm,
587
  }
588
 
589
  print(features)
590
 
591
  fig = go.Figure()
592
 
593
+ fig.add_trace(
594
+ go.Scatterpolar(
595
+ r=list(features.values()),
596
+ theta=list(features.keys()),
597
+ fill="toself",
598
+ name="Radar Plot",
599
+ )
600
+ )
601
 
602
  fig.update_layout(
603
  polar=dict(
604
  radialaxis=dict(
605
  visible=True,
606
  range=[0, 100],
607
+ )
608
+ ),
609
  showlegend=False,
610
  # autosize=False,
611
  # width=600,
 
657
  with gr.Row():
658
  input_text = gr.Textbox(label="Input text", lines=6, placeholder="")
659
  file_input = gr.File(label="Upload PDF")
660
+ file_input.change(
661
+ fn=extract_text_from_pdf, inputs=file_input, outputs=input_text
662
+ )
663
 
664
+ char_count = gr.Textbox(label="Minumum Character Limit Check")
665
  input_text.change(fn=len_validator, inputs=input_text, outputs=char_count)
666
 
667
  with gr.Row():
668
  with gr.Column():
669
+ ai_option = gr.Radio(
670
+ ["Human vs AI", "Human vs AI Source Models"],
671
+ label="Choose an option please.",
672
+ )
673
  with gr.Column():
674
+ plag_option = gr.Radio(
675
+ ["Standard", "Advanced"], label="Choose an option please."
676
+ )
677
 
678
  with gr.Row():
679
  with gr.Column():
 
696
  ## Output
697
  """
698
  )
699
+
700
  # models = gr.Dropdown(
701
+ # model_list,
702
+ # value=model_list,
703
+ # multiselect=True,
704
+ # label="Models to test against",
705
+ # )
706
+
707
  with gr.Row():
708
  with gr.Column():
709
  bcLabel = gr.Label(label="Source")
 
755
 
756
  with gr.Row():
757
  with gr.Column():
758
+ writing_analysis_plot = gr.Plot(label="Writing Analysis Plot")
 
 
759
 
760
  full_check_btn.click(
761
  fn=main,
 
777
  mcLabel,
778
  sentenceBreakdown,
779
  writing_analysis_plot,
780
+ QLabel,
781
  ],
782
  api_name="main",
783
  )
 
827
 
828
  date_from = ""
829
  date_to = ""
830
+
831
+ demo.launch(
832
+ share=True, server_name="0.0.0.0", auth=("polygraf-admin", "test@aisd")
833
+ )
plagiarism.py ADDED
File without changes