Spaces:

polygraf-ai
/

copyright_checker

Sleeping

App Files Files Community

aliasgerovs committed on May 10, 2024

Commit

1ed67b2

2 Parent(s): 4215662 b472976

Merge branch 'demo'

Browse files

Files changed (8) hide show

.gitignore +1 -1
app.py +14 -10
highlighter.py +1 -1
isotonic_regression_model.joblib +0 -0
plagiarism.py +2 -0
predictors.py +66 -5
requirements.txt +3 -0
utils.py +20 -5

.gitignore CHANGED Viewed

@@ -1,3 +1,3 @@
 __pycache__/
-copy_ch/
 copy_check/

 __pycache__/
+venv/
 copy_check/

app.py CHANGED Viewed

@@ -1,7 +1,8 @@
 import gradio as gr
 import numpy as np
 from datetime import date
-from predictors import predict_bc_scores, predict_mc_scores, predict_1on1_scores
 from analysis import depth_analysis
 from predictors import predict_quillbot
 from plagiarism import plagiarism_check, build_date, html_highlight
@@ -29,7 +30,7 @@ def ai_generated_test(option, input, models):
     if option == "Human vs AI":
         return predict_bc_scores(input), None
     elif option == "Human vs AI Source Models":
-        return predict_bc_scores(input), predict_1on1_scores(input, models)
     return None, None
@@ -74,7 +75,7 @@ def main(
     )
     depth_analysis_plot = depth_analysis(input)
     bc_score = predict_bc_scores(input)
-    mc_score = predict_1on1_scores(input, models)
     quilscore = predict_quillbot(input)
     return (
@@ -88,7 +89,7 @@ def main(
 # START OF GRADIO
-title = "Copyright Checker"
 months = {
     "January": "01",
     "February": "02",
@@ -114,7 +115,7 @@ with gr.Blocks() as demo:
     domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]
     gr.Markdown(
         """
-    # Copyright Checker
     """
     )
     with gr.Row():
@@ -127,6 +128,12 @@ with gr.Blocks() as demo:
     char_count = gr.Textbox(label="Minumum Character Limit Check")
     input_text.change(fn=len_validator, inputs=input_text, outputs=char_count)
     with gr.Row():
         models = gr.Dropdown(
             model_list,
@@ -382,8 +389,5 @@ with gr.Blocks() as demo:
     date_from = ""
     date_to = ""
-if __name__ == "__main__":
-    demo.launch(
-        share=True, server_name="0.0.0.0", auth=("polygraf-admin", "test@aisd")
-    )

 import gradio as gr
 import numpy as np
 from datetime import date
+from predictors import predict_bc_scores, predict_mc_scores
+from predictors import update, correct_text, split_text
 from analysis import depth_analysis
 from predictors import predict_quillbot
 from plagiarism import plagiarism_check, build_date, html_highlight
     if option == "Human vs AI":
         return predict_bc_scores(input), None
     elif option == "Human vs AI Source Models":
+        return predict_bc_scores(input), predict_mc_scores(input, models)
     return None, None
     )
     depth_analysis_plot = depth_analysis(input)
     bc_score = predict_bc_scores(input)
+    mc_score = predict_mc_scores(input, models)
     quilscore = predict_quillbot(input)
     return (
 # START OF GRADIO
+title = "AI Detection and Source Analysis"
 months = {
     "January": "01",
     "February": "02",
     domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]
     gr.Markdown(
         """
+    # AI Detection and Source Analysis
     """
     )
     with gr.Row():
     char_count = gr.Textbox(label="Minumum Character Limit Check")
     input_text.change(fn=len_validator, inputs=input_text, outputs=char_count)
+    with gr.Row():
+        btn = gr.Button("Bias Buster")
+        out = gr.Textbox(label="Bias Corrected Full Input", interactive=False)
+        corrections_output = gr.Textbox(label="Bias Corrections", interactive=False)
+        btn.click(fn=update, inputs=input_text, outputs=[out, corrections_output])
     with gr.Row():
         models = gr.Dropdown(
             model_list,
     date_from = ""
     date_to = ""
+if __name__ == "__main__":
+    demo.launch(share=True, server_name="0.0.0.0", server_port = 80, auth=("polygraf-admin", "test@aisd"))

highlighter.py CHANGED Viewed

@@ -14,7 +14,7 @@ def explainer(text, model_type):
     sentences = [sent for sent in sent_tokenize(text)]
     num_sentences = len(sentences)
     exp = explainer_.explain_instance(
-        text, predictor_wrapper, num_features=num_sentences, num_samples=500
     )
     weights_mapping = exp.as_map()[1]
     sentences_weights = {sentence: 0 for sentence in sentences}

     sentences = [sent for sent in sent_tokenize(text)]
     num_sentences = len(sentences)
     exp = explainer_.explain_instance(
+        text, predictor_wrapper, num_features=num_sentences, num_samples=2000
     )
     weights_mapping = exp.as_map()[1]
     sentences_weights = {sentence: 0 for sentence in sentences}

isotonic_regression_model.joblib CHANGED Viewed

Binary files a/isotonic_regression_model.joblib and b/isotonic_regression_model.joblib differ

plagiarism.py CHANGED Viewed

@@ -224,6 +224,8 @@ def plagiarism_check(
     domains_to_skip,
     source_block_size,
 ):
     # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
     # api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
     # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"

     domains_to_skip,
     source_block_size,
 ):
+    api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
+    api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
     # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
     # api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
     # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"

predictors.py CHANGED Viewed

@@ -8,12 +8,23 @@ from scipy.special import softmax
 import yaml
 from utils import *
 import joblib
 with open("config.yaml", "r") as file:
     params = yaml.safe_load(file)
 nltk.download("punkt")
 nltk.download("stopwords")
-device = "cuda" if torch.cuda.is_available() else "cpu"
 text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
 text_mc_model_path = params["TEXT_MC_MODEL_PATH"]
 text_quillbot_model_path = params["TEXT_QUILLBOT_MODEL_PATH"]
@@ -23,6 +34,8 @@ mc_label_map = params["MC_OUTPUT_LABELS"]
 text_1on1_label_map = params["1ON1_OUTPUT_LABELS"]
 mc_token_size = int(params["MC_TOKEN_SIZE"])
 bc_token_size = int(params["BC_TOKEN_SIZE"])
 text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
 text_bc_model = AutoModelForSequenceClassification.from_pretrained(
     text_bc_model_path
@@ -43,24 +56,71 @@ for model_name, model in zip(mc_label_map, text_1on1_models):
         AutoModelForSequenceClassification.from_pretrained(model).to(device)
     )
 # proxy models for explainability
 mini_bc_model_name = "polygraf-ai/bc-model-bert-mini"
 bc_tokenizer_mini = AutoTokenizer.from_pretrained(mini_bc_model_name)
 bc_model_mini = AutoModelForSequenceClassification.from_pretrained(
     mini_bc_model_name
-).to(device)
 mini_humanizer_model_name = "polygraf-ai/quillbot-detector-bert-mini-9K"
 humanizer_tokenizer_mini = AutoTokenizer.from_pretrained(
     mini_humanizer_model_name
 )
 humanizer_model_mini = AutoModelForSequenceClassification.from_pretrained(
     mini_humanizer_model_name
-).to(device)
 # model score calibration
 iso_reg = joblib.load("isotonic_regression_model.joblib")
 def split_text_allow_complete_sentences_nltk(
     text,
     max_length=256,
@@ -181,7 +241,7 @@ def predict_for_explainanility(text, model_type=None):
             padding="max_length",
             truncation=True,
             max_length=max_length,
-        ).to(device)
         outputs = model(**tokenized_text)
         tensor_logits = outputs[0]
         probas = F.softmax(tensor_logits).detach().cpu().numpy()
@@ -279,6 +339,7 @@ def predict_bc_scores(input):
     human_score = 1 - ai_score
     bc_score = {"AI": ai_score, "HUMAN": human_score}
     print(f"Calibration BC scores: AI: {ai_score}, HUMAN: {human_score}")
     return bc_score
@@ -313,7 +374,7 @@ def predict_1on1_single(input, model):
     return predictions
-def predict_1on1_scores(input, models):
     if len(models) == 0:
         return {}

 import yaml
 from utils import *
 import joblib
+from optimum.bettertransformer import BetterTransformer
+import gc
+from cleantext import clean
+import gradio as gr
+from tqdm.auto import tqdm
+from transformers import pipeline
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+import nltk
+from nltk.tokenize import sent_tokenize
+from optimum.pipelines import pipeline
 with open("config.yaml", "r") as file:
     params = yaml.safe_load(file)
 nltk.download("punkt")
 nltk.download("stopwords")
+device_needed = "cuda" if torch.cuda.is_available() else "cpu"
+device = 'cpu'
 text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
 text_mc_model_path = params["TEXT_MC_MODEL_PATH"]
 text_quillbot_model_path = params["TEXT_QUILLBOT_MODEL_PATH"]
 text_1on1_label_map = params["1ON1_OUTPUT_LABELS"]
 mc_token_size = int(params["MC_TOKEN_SIZE"])
 bc_token_size = int(params["BC_TOKEN_SIZE"])
+bias_checker_model_name = params['BIAS_CHECKER_MODEL_PATH']
+bias_corrector_model_name = params['BIAS_CORRECTOR_MODEL_PATH']
 text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
 text_bc_model = AutoModelForSequenceClassification.from_pretrained(
     text_bc_model_path
         AutoModelForSequenceClassification.from_pretrained(model).to(device)
     )
 # proxy models for explainability
 mini_bc_model_name = "polygraf-ai/bc-model-bert-mini"
 bc_tokenizer_mini = AutoTokenizer.from_pretrained(mini_bc_model_name)
 bc_model_mini = AutoModelForSequenceClassification.from_pretrained(
     mini_bc_model_name
+).to(device_needed)
 mini_humanizer_model_name = "polygraf-ai/quillbot-detector-bert-mini-9K"
 humanizer_tokenizer_mini = AutoTokenizer.from_pretrained(
     mini_humanizer_model_name
 )
 humanizer_model_mini = AutoModelForSequenceClassification.from_pretrained(
     mini_humanizer_model_name
+).to(device_needed)
+bc_model_mini = BetterTransformer.transform(bc_model_mini)
+humanizer_model_mini = BetterTransformer.transform(humanizer_model_mini)
+text_bc_model = BetterTransformer.transform(text_bc_model)
+text_mc_model = BetterTransformer.transform(text_mc_model)
+quillbot_model = BetterTransformer.transform(quillbot_model)
+bias_model_checker = AutoModelForSequenceClassification.from_pretrained(bias_checker_model_name)
+tokenizer = AutoTokenizer.from_pretrained(bias_checker_model_name)
+bias_model_checker = BetterTransformer.transform(bias_model_checker, keep_original_model=False)
+bias_checker = pipeline(
+    "text-classification",
+    model=bias_checker_model_name,
+    tokenizer=bias_checker_model_name,
+)
+gc.collect()
+bias_corrector = pipeline( "text2text-generation", model=bias_corrector_model_name, accelerator="ort")
 # model score calibration
 iso_reg = joblib.load("isotonic_regression_model.joblib")
def split_text(text: str) -> list:
    """Split *text* into single-sentence batches.

    Sentences are detected with ``sent_tokenize``; each one is wrapped in
    its own one-element list, so the result is a list of lists of strings.
    """
    batches = []
    for sentence in sent_tokenize(text):
        batches.append([sentence])
    return batches
def correct_text(
    text: str,
    bias_checker,
    bias_corrector,
    separator: str = " ",
    *,
    threshold: float = 0.9,
) -> tuple:
    """Rewrite the sentences of *text* that the checker does not confidently pass.

    The text is split with ``split_text`` and every batch is classified by
    ``bias_checker``. A batch is kept verbatim only when the top prediction
    is ``"LABEL_1"`` with a score of at least ``threshold``; otherwise it is
    sent through ``bias_corrector`` and the generated text is used instead.

    Args:
        text: Input text to process.
        bias_checker: Callable taking a string and returning a sequence whose
            first item is a mapping with ``"label"`` and ``"score"`` keys.
        bias_corrector: Callable taking a string and returning a sequence
            whose first item is a mapping with a ``"generated_text"`` key.
        separator: String used to join the processed batches back together.
        threshold: Minimum ``"LABEL_1"`` score for a batch to be left as is.

    Returns:
        A ``(corrected_text, corrections)`` tuple, where ``corrections`` is a
        list of ``(original, corrected)`` pairs for every rewritten batch.
    """
    # NOTE(review): "LABEL_1" is presumably the "unbiased" class of the
    # checker model -- confirm against the model card.
    sentence_batches = split_text(text)
    corrected_parts = []
    corrections = []
    for batch in tqdm(sentence_batches, total=len(sentence_batches), desc="correcting text.."):
        raw_text = " ".join(batch)
        top = bias_checker(raw_text)[0]
        # Equivalent to the former `A or (not A and B)` test, simplified to `A or B`.
        needs_correction = top["label"] != "LABEL_1" or top["score"] < threshold
        if not needs_correction:
            corrected_parts.append(raw_text)
            continue
        corrected_version = bias_corrector(raw_text)[0]["generated_text"]
        corrected_parts.append(corrected_version)
        corrections.append((raw_text, corrected_version))
    return separator.join(corrected_parts), corrections
def update(text: str):
    """Clean *text*, run bias correction on it, and format the results.

    Returns a pair: the full corrected text, and a display string listing
    each rewritten sentence as an ``Original:`` / ``Corrected:`` entry, with
    entries separated by a blank line.
    """
    cleaned = clean(text, lower=False)
    fixed_text, pairs = correct_text(cleaned, bias_checker, bias_corrector)
    entries = [
        f"Original: {orig}\nCorrected: {corr}"
        for orig, corr in pairs
    ]
    return fixed_text, "\n\n".join(entries)
 def split_text_allow_complete_sentences_nltk(
     text,
     max_length=256,
             padding="max_length",
             truncation=True,
             max_length=max_length,
+        ).to(device_needed)
         outputs = model(**tokenized_text)
         tensor_logits = outputs[0]
         probas = F.softmax(tensor_logits).detach().cpu().numpy()
     human_score = 1 - ai_score
     bc_score = {"AI": ai_score, "HUMAN": human_score}
     print(f"Calibration BC scores: AI: {ai_score}, HUMAN: {human_score}")
+    print(f"Input Text: {cleaned_text_bc}")
     return bc_score
     return predictions
+def predict_mc_scores(input, models):
     if len(models) == 0:
         return {}

requirements.txt CHANGED Viewed

@@ -26,6 +26,9 @@ Unidecode
 python-dotenv
 lime
 joblib
 emoji==1.6.1
 matplotlib
 seaborn

 python-dotenv
 lime
 joblib
+optimum
+clean-text
+optimum[onnxruntime]
 emoji==1.6.1
 matplotlib
 seaborn

utils.py CHANGED Viewed

@@ -14,13 +14,28 @@ def remove_accents(input_str):
 def remove_special_characters(text):
-    text = text.replace("<s>", "").replace("</s>", "")
-    text = remove_accents(text)
-    pattern = r'[^\w\s\d.,!?\'"()-;]+'
-    text = re.sub(pattern, "", text)
     return text
 def remove_special_characters_2(text):
     pattern = r"[^a-zA-Z0-9 ]+"
     text = re.sub(pattern, "", text)

 def remove_special_characters(text):
+    text = re.sub(r'https?://\S+|www\.\S+', '', text)
+    emoji_pattern = re.compile("["
+        u"\U0001F600-\U0001F64F"  # emoticons
+        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
+        u"\U0001F680-\U0001F6FF"  # transport & map symbols
+        u"\U0001F700-\U0001F77F"  # alchemical symbols
+        u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
+        u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
+        u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
+        u"\U0001FA00-\U0001FA6F"  # Chess Symbols
+        u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
+        u"\U00002702-\U000027B0"  # Dingbats
+        u"\U000024C2-\U0001F251"
+        "]+", flags=re.UNICODE)
+    text = emoji_pattern.sub('', text)
+    text = re.sub(r'#\w+', '', text)
+    text = re.sub(r'[^\w\s\d.,!?\'"()-;]', '', text)
+    text = re.sub(r'\s+([.,!?;])', r'\1', text)
+    text = re.sub(r'([.,!?;])(\S)', r'\1 \2', text)
+    text = re.sub(r'\s+', ' ', text).strip()
     return text
 def remove_special_characters_2(text):
     pattern = r"[^a-zA-Z0-9 ]+"
     text = re.sub(pattern, "", text)