aliasgerovs committed on
Commit
1ed67b2
·
2 Parent(s): 4215662 b472976

Merge branch 'demo'

Browse files
Files changed (8) hide show
  1. .gitignore +1 -1
  2. app.py +14 -10
  3. highlighter.py +1 -1
  4. isotonic_regression_model.joblib +0 -0
  5. plagiarism.py +2 -0
  6. predictors.py +66 -5
  7. requirements.txt +3 -0
  8. utils.py +20 -5
.gitignore CHANGED
@@ -1,3 +1,3 @@
1
  __pycache__/
2
- copy_ch/
3
  copy_check/
 
1
  __pycache__/
2
+ venv/
3
  copy_check/
app.py CHANGED
@@ -1,7 +1,8 @@
1
  import gradio as gr
2
  import numpy as np
3
  from datetime import date
4
- from predictors import predict_bc_scores, predict_mc_scores, predict_1on1_scores
 
5
  from analysis import depth_analysis
6
  from predictors import predict_quillbot
7
  from plagiarism import plagiarism_check, build_date, html_highlight
@@ -29,7 +30,7 @@ def ai_generated_test(option, input, models):
29
  if option == "Human vs AI":
30
  return predict_bc_scores(input), None
31
  elif option == "Human vs AI Source Models":
32
- return predict_bc_scores(input), predict_1on1_scores(input, models)
33
  return None, None
34
 
35
 
@@ -74,7 +75,7 @@ def main(
74
  )
75
  depth_analysis_plot = depth_analysis(input)
76
  bc_score = predict_bc_scores(input)
77
- mc_score = predict_1on1_scores(input, models)
78
  quilscore = predict_quillbot(input)
79
 
80
  return (
@@ -88,7 +89,7 @@ def main(
88
 
89
  # START OF GRADIO
90
 
91
- title = "Copyright Checker"
92
  months = {
93
  "January": "01",
94
  "February": "02",
@@ -114,7 +115,7 @@ with gr.Blocks() as demo:
114
  domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]
115
  gr.Markdown(
116
  """
117
- # Copyright Checker
118
  """
119
  )
120
  with gr.Row():
@@ -127,6 +128,12 @@ with gr.Blocks() as demo:
127
  char_count = gr.Textbox(label="Minumum Character Limit Check")
128
  input_text.change(fn=len_validator, inputs=input_text, outputs=char_count)
129
 
 
 
 
 
 
 
130
  with gr.Row():
131
  models = gr.Dropdown(
132
  model_list,
@@ -382,8 +389,5 @@ with gr.Blocks() as demo:
382
  date_from = ""
383
  date_to = ""
384
 
385
-
386
- if __name__ == "__main__":
387
- demo.launch(
388
- share=True, server_name="0.0.0.0", auth=("polygraf-admin", "test@aisd")
389
- )
 
1
  import gradio as gr
2
  import numpy as np
3
  from datetime import date
4
+ from predictors import predict_bc_scores, predict_mc_scores
5
+ from predictors import update, correct_text, split_text
6
  from analysis import depth_analysis
7
  from predictors import predict_quillbot
8
  from plagiarism import plagiarism_check, build_date, html_highlight
 
30
  if option == "Human vs AI":
31
  return predict_bc_scores(input), None
32
  elif option == "Human vs AI Source Models":
33
+ return predict_bc_scores(input), predict_mc_scores(input, models)
34
  return None, None
35
 
36
 
 
75
  )
76
  depth_analysis_plot = depth_analysis(input)
77
  bc_score = predict_bc_scores(input)
78
+ mc_score = predict_mc_scores(input, models)
79
  quilscore = predict_quillbot(input)
80
 
81
  return (
 
89
 
90
  # START OF GRADIO
91
 
92
+ title = "AI Detection and Source Analysis"
93
  months = {
94
  "January": "01",
95
  "February": "02",
 
115
  domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]
116
  gr.Markdown(
117
  """
118
+ # AI Detection and Source Analysis
119
  """
120
  )
121
  with gr.Row():
 
128
  char_count = gr.Textbox(label="Minumum Character Limit Check")
129
  input_text.change(fn=len_validator, inputs=input_text, outputs=char_count)
130
 
131
+ with gr.Row():
132
+ btn = gr.Button("Bias Buster")
133
+ out = gr.Textbox(label="Bias Corrected Full Input", interactive=False)
134
+ corrections_output = gr.Textbox(label="Bias Corrections", interactive=False)
135
+ btn.click(fn=update, inputs=input_text, outputs=[out, corrections_output])
136
+
137
  with gr.Row():
138
  models = gr.Dropdown(
139
  model_list,
 
389
  date_from = ""
390
  date_to = ""
391
 
392
+ if __name__ == "__main__":
393
+ demo.launch(share=True, server_name="0.0.0.0", server_port = 80, auth=("polygraf-admin", "test@aisd"))
 
 
 
highlighter.py CHANGED
@@ -14,7 +14,7 @@ def explainer(text, model_type):
14
  sentences = [sent for sent in sent_tokenize(text)]
15
  num_sentences = len(sentences)
16
  exp = explainer_.explain_instance(
17
- text, predictor_wrapper, num_features=num_sentences, num_samples=500
18
  )
19
  weights_mapping = exp.as_map()[1]
20
  sentences_weights = {sentence: 0 for sentence in sentences}
 
14
  sentences = [sent for sent in sent_tokenize(text)]
15
  num_sentences = len(sentences)
16
  exp = explainer_.explain_instance(
17
+ text, predictor_wrapper, num_features=num_sentences, num_samples=2000
18
  )
19
  weights_mapping = exp.as_map()[1]
20
  sentences_weights = {sentence: 0 for sentence in sentences}
isotonic_regression_model.joblib CHANGED
Binary files a/isotonic_regression_model.joblib and b/isotonic_regression_model.joblib differ
 
plagiarism.py CHANGED
@@ -224,6 +224,8 @@ def plagiarism_check(
224
  domains_to_skip,
225
  source_block_size,
226
  ):
 
 
227
  # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
228
  # api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
229
  # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
 
224
  domains_to_skip,
225
  source_block_size,
226
  ):
227
+ api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
228
+ api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
229
  # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
230
  # api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
231
  # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
predictors.py CHANGED
@@ -8,12 +8,23 @@ from scipy.special import softmax
8
  import yaml
9
  from utils import *
10
  import joblib
 
 
 
 
 
 
 
 
 
 
11
 
12
  with open("config.yaml", "r") as file:
13
  params = yaml.safe_load(file)
14
  nltk.download("punkt")
15
  nltk.download("stopwords")
16
- device = "cuda" if torch.cuda.is_available() else "cpu"
 
17
  text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
18
  text_mc_model_path = params["TEXT_MC_MODEL_PATH"]
19
  text_quillbot_model_path = params["TEXT_QUILLBOT_MODEL_PATH"]
@@ -23,6 +34,8 @@ mc_label_map = params["MC_OUTPUT_LABELS"]
23
  text_1on1_label_map = params["1ON1_OUTPUT_LABELS"]
24
  mc_token_size = int(params["MC_TOKEN_SIZE"])
25
  bc_token_size = int(params["BC_TOKEN_SIZE"])
 
 
26
  text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
27
  text_bc_model = AutoModelForSequenceClassification.from_pretrained(
28
  text_bc_model_path
@@ -43,24 +56,71 @@ for model_name, model in zip(mc_label_map, text_1on1_models):
43
  AutoModelForSequenceClassification.from_pretrained(model).to(device)
44
  )
45
 
 
 
46
  # proxy models for explainability
47
  mini_bc_model_name = "polygraf-ai/bc-model-bert-mini"
48
  bc_tokenizer_mini = AutoTokenizer.from_pretrained(mini_bc_model_name)
49
  bc_model_mini = AutoModelForSequenceClassification.from_pretrained(
50
  mini_bc_model_name
51
- ).to(device)
52
  mini_humanizer_model_name = "polygraf-ai/quillbot-detector-bert-mini-9K"
53
  humanizer_tokenizer_mini = AutoTokenizer.from_pretrained(
54
  mini_humanizer_model_name
55
  )
56
  humanizer_model_mini = AutoModelForSequenceClassification.from_pretrained(
57
  mini_humanizer_model_name
58
- ).to(device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
  # model score calibration
61
  iso_reg = joblib.load("isotonic_regression_model.joblib")
62
 
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  def split_text_allow_complete_sentences_nltk(
65
  text,
66
  max_length=256,
@@ -181,7 +241,7 @@ def predict_for_explainanility(text, model_type=None):
181
  padding="max_length",
182
  truncation=True,
183
  max_length=max_length,
184
- ).to(device)
185
  outputs = model(**tokenized_text)
186
  tensor_logits = outputs[0]
187
  probas = F.softmax(tensor_logits).detach().cpu().numpy()
@@ -279,6 +339,7 @@ def predict_bc_scores(input):
279
  human_score = 1 - ai_score
280
  bc_score = {"AI": ai_score, "HUMAN": human_score}
281
  print(f"Calibration BC scores: AI: {ai_score}, HUMAN: {human_score}")
 
282
  return bc_score
283
 
284
 
@@ -313,7 +374,7 @@ def predict_1on1_single(input, model):
313
  return predictions
314
 
315
 
316
- def predict_1on1_scores(input, models):
317
 
318
  if len(models) == 0:
319
  return {}
 
8
  import yaml
9
  from utils import *
10
  import joblib
11
+ from optimum.bettertransformer import BetterTransformer
12
+ import gc
13
+ from cleantext import clean
14
+ import gradio as gr
15
+ from tqdm.auto import tqdm
16
+ from transformers import pipeline
17
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
18
+ import nltk
19
+ from nltk.tokenize import sent_tokenize
20
+ from optimum.pipelines import pipeline
21
 
22
  with open("config.yaml", "r") as file:
23
  params = yaml.safe_load(file)
24
  nltk.download("punkt")
25
  nltk.download("stopwords")
26
+ device_needed = "cuda" if torch.cuda.is_available() else "cpu"
27
+ device = 'cpu'
28
  text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
29
  text_mc_model_path = params["TEXT_MC_MODEL_PATH"]
30
  text_quillbot_model_path = params["TEXT_QUILLBOT_MODEL_PATH"]
 
34
  text_1on1_label_map = params["1ON1_OUTPUT_LABELS"]
35
  mc_token_size = int(params["MC_TOKEN_SIZE"])
36
  bc_token_size = int(params["BC_TOKEN_SIZE"])
37
+ bias_checker_model_name = params['BIAS_CHECKER_MODEL_PATH']
38
+ bias_corrector_model_name = params['BIAS_CORRECTOR_MODEL_PATH']
39
  text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
40
  text_bc_model = AutoModelForSequenceClassification.from_pretrained(
41
  text_bc_model_path
 
56
  AutoModelForSequenceClassification.from_pretrained(model).to(device)
57
  )
58
 
59
+
60
+
61
  # proxy models for explainability
62
  mini_bc_model_name = "polygraf-ai/bc-model-bert-mini"
63
  bc_tokenizer_mini = AutoTokenizer.from_pretrained(mini_bc_model_name)
64
  bc_model_mini = AutoModelForSequenceClassification.from_pretrained(
65
  mini_bc_model_name
66
+ ).to(device_needed)
67
  mini_humanizer_model_name = "polygraf-ai/quillbot-detector-bert-mini-9K"
68
  humanizer_tokenizer_mini = AutoTokenizer.from_pretrained(
69
  mini_humanizer_model_name
70
  )
71
  humanizer_model_mini = AutoModelForSequenceClassification.from_pretrained(
72
  mini_humanizer_model_name
73
+ ).to(device_needed)
74
+
75
+ bc_model_mini = BetterTransformer.transform(bc_model_mini)
76
+ humanizer_model_mini = BetterTransformer.transform(humanizer_model_mini)
77
+ text_bc_model = BetterTransformer.transform(text_bc_model)
78
+ text_mc_model = BetterTransformer.transform(text_mc_model)
79
+ quillbot_model = BetterTransformer.transform(quillbot_model)
80
+
81
+ bias_model_checker = AutoModelForSequenceClassification.from_pretrained(bias_checker_model_name)
82
+ tokenizer = AutoTokenizer.from_pretrained(bias_checker_model_name)
83
+ bias_model_checker = BetterTransformer.transform(bias_model_checker, keep_original_model=False)
84
+ bias_checker = pipeline(
85
+ "text-classification",
86
+ model=bias_checker_model_name,
87
+ tokenizer=bias_checker_model_name,
88
+ )
89
+ gc.collect()
90
+ bias_corrector = pipeline( "text2text-generation", model=bias_corrector_model_name, accelerator="ort")
91
 
92
  # model score calibration
93
  iso_reg = joblib.load("isotonic_regression_model.joblib")
94
 
95
 
96
def split_text(text: str) -> list:
    """Split *text* into sentences and wrap each one in its own batch.

    Args:
        text: The raw text to segment.

    Returns:
        A list of single-element lists, one per sentence produced by
        ``sent_tokenize``, in their original order.
    """
    batches = []
    for sentence in sent_tokenize(text):
        # Each batch holds exactly one sentence; callers join a batch back
        # into a string before processing it.
        batches.append([sentence])
    return batches
99
+
100
def correct_text(
    text: str,
    bias_checker,
    bias_corrector,
    separator: str = " ",
    *,
    threshold: float = 0.9,
) -> tuple:
    """Run the bias checker over each sentence and rewrite the flagged ones.

    The text is split into one-sentence batches with ``split_text``. Every
    sentence is classified by ``bias_checker``; a sentence is left untouched
    only when the top prediction is ``"LABEL_1"`` with a score of at least
    ``threshold``. Any other sentence is passed through ``bias_corrector``
    and replaced by its generated text.

    Args:
        text: The full input text.
        bias_checker: Callable returning a list whose first item is a mapping
            with ``"label"`` and ``"score"`` keys.
        bias_corrector: Callable returning a list whose first item is a
            mapping with a ``"generated_text"`` key.
        separator: String used to join the processed sentences.
        threshold: Minimum ``"LABEL_1"`` score for a sentence to be kept
            as-is. Defaults to 0.9, the previously hard-coded value.

    Returns:
        A ``(corrected_text, corrections)`` tuple: the re-joined text and a
        list of ``(original_sentence, corrected_sentence)`` pairs for every
        sentence that was rewritten.
    """
    # NOTE(review): "LABEL_1" is presumably the checker's "unbiased" class;
    # confirm against the model card of the configured bias checker.
    sentence_batches = split_text(text)
    corrected_parts = []
    corrections = []
    for batch in tqdm(sentence_batches, total=len(sentence_batches), desc="correcting text.."):
        raw_text = " ".join(batch)
        top = bias_checker(raw_text)[0]
        # Equivalent to the former
        # `label != L1 or (label == L1 and score < 0.9)` check.
        needs_correction = top["label"] != "LABEL_1" or top["score"] < threshold
        if not needs_correction:
            corrected_parts.append(raw_text)
            continue
        corrected_version = bias_corrector(raw_text)[0]["generated_text"]
        corrected_parts.append(corrected_version)
        corrections.append((raw_text, corrected_version))
    return separator.join(corrected_parts), corrections
116
+
117
def update(text: str):
    """Clean *text*, run bias correction on it and format the results.

    The input is normalised with ``clean`` (case is preserved), then handed
    to ``correct_text`` together with the module-level ``bias_checker`` and
    ``bias_corrector`` pipelines.

    Returns:
        A ``(corrected_text, corrections_display)`` pair: the corrected text
        and a human-readable listing of each original/corrected sentence
        pair, with entries separated by blank lines.
    """
    cleaned = clean(text, lower=False)
    corrected_text, corrections = correct_text(cleaned, bias_checker, bias_corrector)
    display_entries = []
    for orig, corr in corrections:
        display_entries.append(f"Original: {orig}\nCorrected: {corr}")
    corrections_display = "\n\n".join(display_entries)
    return corrected_text, corrections_display
122
+
123
+
124
  def split_text_allow_complete_sentences_nltk(
125
  text,
126
  max_length=256,
 
241
  padding="max_length",
242
  truncation=True,
243
  max_length=max_length,
244
+ ).to(device_needed)
245
  outputs = model(**tokenized_text)
246
  tensor_logits = outputs[0]
247
  probas = F.softmax(tensor_logits).detach().cpu().numpy()
 
339
  human_score = 1 - ai_score
340
  bc_score = {"AI": ai_score, "HUMAN": human_score}
341
  print(f"Calibration BC scores: AI: {ai_score}, HUMAN: {human_score}")
342
+ print(f"Input Text: {cleaned_text_bc}")
343
  return bc_score
344
 
345
 
 
374
  return predictions
375
 
376
 
377
+ def predict_mc_scores(input, models):
378
 
379
  if len(models) == 0:
380
  return {}
requirements.txt CHANGED
@@ -26,6 +26,9 @@ Unidecode
26
  python-dotenv
27
  lime
28
  joblib
 
 
 
29
  emoji==1.6.1
30
  matplotlib
31
  seaborn
 
26
  python-dotenv
27
  lime
28
  joblib
29
+ optimum
30
+ clean-text
31
+ optimum[onnxruntime]
32
  emoji==1.6.1
33
  matplotlib
34
  seaborn
utils.py CHANGED
@@ -14,13 +14,28 @@ def remove_accents(input_str):
14
 
15
 
16
  def remove_special_characters(text):
17
- text = text.replace("<s>", "").replace("</s>", "")
18
- text = remove_accents(text)
19
- pattern = r'[^\w\s\d.,!?\'"()-;]+'
20
- text = re.sub(pattern, "", text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  return text
22
 
23
-
24
  def remove_special_characters_2(text):
25
  pattern = r"[^a-zA-Z0-9 ]+"
26
  text = re.sub(pattern, "", text)
 
14
 
15
 
16
  def remove_special_characters(text):
17
+ text = re.sub(r'https?://\S+|www\.\S+', '', text)
18
+ emoji_pattern = re.compile("["
19
+ u"\U0001F600-\U0001F64F" # emoticons
20
+ u"\U0001F300-\U0001F5FF" # symbols & pictographs
21
+ u"\U0001F680-\U0001F6FF" # transport & map symbols
22
+ u"\U0001F700-\U0001F77F" # alchemical symbols
23
+ u"\U0001F780-\U0001F7FF" # Geometric Shapes Extended
24
+ u"\U0001F800-\U0001F8FF" # Supplemental Arrows-C
25
+ u"\U0001F900-\U0001F9FF" # Supplemental Symbols and Pictographs
26
+ u"\U0001FA00-\U0001FA6F" # Chess Symbols
27
+ u"\U0001FA70-\U0001FAFF" # Symbols and Pictographs Extended-A
28
+ u"\U00002702-\U000027B0" # Dingbats
29
+ u"\U000024C2-\U0001F251"
30
+ "]+", flags=re.UNICODE)
31
+ text = emoji_pattern.sub('', text)
32
+ text = re.sub(r'#\w+', '', text)
33
+ text = re.sub(r'[^\w\s\d.,!?\'"()-;]', '', text)
34
+ text = re.sub(r'\s+([.,!?;])', r'\1', text)
35
+ text = re.sub(r'([.,!?;])(\S)', r'\1 \2', text)
36
+ text = re.sub(r'\s+', ' ', text).strip()
37
  return text
38
 
 
39
  def remove_special_characters_2(text):
40
  pattern = r"[^a-zA-Z0-9 ]+"
41
  text = re.sub(pattern, "", text)