Commit 6af6f76 · Parent(s): d7f8207
aliasgerovs committed

Update app.py

Files changed (1): app.py (+78 −10)
app.py CHANGED
@@ -17,6 +17,10 @@ import fitz
 from transformers import GPT2LMHeadModel, GPT2TokenizerFast
 import nltk, spacy, subprocess, torch
 import plotly.graph_objects as go
+import nltk
+
+nltk.download('punkt')
+
 from writing_analysis import (
     normalize,
     preprocess_text1,
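The added nltk.download('punkt') runs unconditionally every time app.py starts. A guarded variant (a sketch, not part of this commit) skips the network call when the punkt data is already installed:

    import nltk

    # Download the punkt sentence tokenizer only when it is missing;
    # nltk.data.find raises LookupError if the resource is not installed.
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')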
@@ -190,6 +194,55 @@ def remove_special_characters(text):
 def update_character_count(text):
     return f"{len(text)} characters"
 
+
+def split_text_allow_complete_sentences_nltk(text, max_length=256, tolerance=10, min_last_segment_length=120):
+    sentences = nltk.sent_tokenize(text)
+    segments = []
+    current_segment = []
+    current_length = 0
+
+    for sentence in sentences:
+        tokens = tokenizer.tokenize(sentence)
+        sentence_length = len(tokens)
+
+        if current_length + sentence_length <= max_length + tolerance - 2:
+            current_segment.append(sentence)
+            current_length += sentence_length
+        else:
+            if current_segment:
+                encoded_segment = tokenizer.encode(' '.join(current_segment), add_special_tokens=True, max_length=max_length+tolerance, truncation=True)
+                segments.append((current_segment, len(encoded_segment)))
+            current_segment = [sentence]
+            current_length = sentence_length
+
+    if current_segment:
+        encoded_segment = tokenizer.encode(' '.join(current_segment), add_special_tokens=True, max_length=max_length+tolerance, truncation=True)
+        segments.append((current_segment, len(encoded_segment)))
+
+    final_segments = []
+    for i, (seg, length) in enumerate(segments):
+        if i == len(segments) - 1:
+            if length < min_last_segment_length and len(final_segments) > 0:
+                prev_seg, prev_length = final_segments[-1]
+                combined_encoded = tokenizer.encode(' '.join(prev_seg + seg), add_special_tokens=True, max_length=max_length+tolerance, truncation=True)
+                if len(combined_encoded) <= max_length + tolerance:
+                    final_segments[-1] = (prev_seg + seg, len(combined_encoded))
+                else:
+                    final_segments.append((seg, length))
+            else:
+                final_segments.append((seg, length))
+        else:
+            final_segments.append((seg, length))
+
+    decoded_segments = []
+    encoded_segments = []
+    for seg, _ in final_segments:
+        encoded_segment = tokenizer.encode(' '.join(seg), add_special_tokens=True, max_length=max_length+tolerance, truncation=True)
+        # decoded_segment = tokenizer.decode(encoded_segment)
+        encoded_segments.append(encoded_segment)
+    return encoded_segments
+
+
 def predict_bc(model, tokenizer, text):
     tokens = tokenizer(
         text, padding='max_length', truncation=True, max_length=256, return_tensors="pt"
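The new function packs whole sentences into chunks of at most max_length + tolerance tokens, merges an undersized final chunk (under min_last_segment_length tokens) into its predecessor when the combination still fits, and returns the chunks as lists of token ids. It reads a module-level tokenizer that this diff never shows being assigned. A usage sketch, assuming a GPT-2 tokenizer stands in for it:

    import nltk
    from transformers import GPT2TokenizerFast

    nltk.download('punkt')

    # Assumption: the module-level `tokenizer` consumed inside
    # split_text_allow_complete_sentences_nltk; the commit does not
    # show where it is actually defined.
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

    text = "This is a sentence. " * 400
    encoded_segments = split_text_allow_complete_sentences_nltk(text)

    # Each element is a list of token ids no longer than max_length + tolerance.
    print([len(ids) for ids in encoded_segments])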
@@ -198,8 +251,7 @@ def predict_bc(model, tokenizer, text):
     output = model(tokens)
     output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
     print("BC Score: ", output_norm)
-    bc_score = {"AI": output_norm[1].item(), "HUMAN": output_norm[0].item()}
-    return bc_score
+    return output_norm
 
 def predict_mc(model, tokenizer, text):
     tokens = tokenizer(
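With this change predict_bc (and predict_mc, in the hunk below) returns the raw softmax vector instead of a labelled dict, so the per-chunk vectors can be averaged before the label mapping happens in ai_generated_test. Assuming softmax here is scipy.special.softmax, which matches the call signature, the contract the caller relies on looks like:

    import numpy as np
    from scipy.special import softmax

    # Dummy logits standing in for model(tokens).logits on one input.
    logits = np.array([[0.3, 1.7]])
    output_norm = softmax(logits, 1)[0]

    # Two probabilities summing to 1; the caller treats index 0 as
    # HUMAN and index 1 as AI.
    print(output_norm)        # [0.19781611 0.80218389]
    print(output_norm.sum())  # 1.0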
@@ -208,17 +260,33 @@ def predict_mc(model, tokenizer, text):
     output = model(tokens)
     output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
     print("MC Score: ", output_norm)
-    mc_score = {}
-    label_map = ["GPT 3.5", "GPT 4", "CLAUDE", "BARD", "LLAMA 2"]
-    for score, label in zip(output_norm, label_map):
-        mc_score[label.upper()] = score.item()
-    return mc_score
+    return output_norm
 
 def ai_generated_test(ai_option, input):
-
+
+    bc_scores = []
+    mc_scores = []
     cleaned_text = remove_special_characters(input)
-    bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text)
-    mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text)
+    samples_len = len(split_text_allow_complete_sentences_nltk(input))
+
+    for i in range(samples_len):
+        bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text)
+        mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text)
+        bc_scores.append(bc_score)
+        mc_scores.append(mc_score)
+
+    bc_scores_array = np.array(bc_scores)
+    mc_scores_array = np.array(mc_scores)
+    average_bc_scores = np.mean(bc_scores_array, axis=0)
+    average_mc_scores = np.mean(mc_scores_array, axis=0)
+    bc_score_list = average_bc_scores.tolist()
+    mc_score_list = average_mc_scores.tolist()
+
+    bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}
+    mc_score = {}
+    label_map = ["GPT 3.5", "GPT 4", "CLAUDE", "BARD", "LLAMA 2"]
+    for score, label in zip(mc_score_list, label_map):
+        mc_score[label.upper()] = score
 
     sum_prob = 1 - bc_score["HUMAN"]
     for key, value in mc_score.items():
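Note that every iteration of the new loop scores the same cleaned_text, so the averaged vectors equal a single prediction and the chunker is consulted only for its length. If per-chunk scoring was the intent, a sketch of it (the decode step and the choice of text_bc_tokenizer are assumptions, not part of the commit) would be:

    import numpy as np

    # Hypothetical per-segment scoring: decode each encoded chunk back
    # to text, score it with both classifiers, then average as above.
    segments = split_text_allow_complete_sentences_nltk(cleaned_text)

    bc_scores, mc_scores = [], []
    for ids in segments:
        # Assumption: text_bc_tokenizer matches the tokenizer the chunker used.
        chunk_text = text_bc_tokenizer.decode(ids, skip_special_tokens=True)
        bc_scores.append(predict_bc(text_bc_model, text_bc_tokenizer, chunk_text))
        mc_scores.append(predict_mc(text_mc_model, text_mc_tokenizer, chunk_text))

    average_bc_scores = np.mean(np.array(bc_scores), axis=0)
    average_mc_scores = np.mean(np.array(mc_scores), axis=0)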
 