Spaces:
Sleeping
Sleeping
import torch | |
import numpy as np | |
from transformers import AutoTokenizer, AutoModelForSequenceClassification | |
import nltk | |
import torch.nn.functional as F | |
import nltk | |
from scipy.special import softmax | |
import yaml | |
from utils import * | |
import joblib | |
from optimum.bettertransformer import BetterTransformer | |
import gc | |
from cleantext import clean | |
import gradio as gr | |
from tqdm.auto import tqdm | |
from transformers import pipeline | |
from transformers import AutoModelForSequenceClassification, AutoTokenizer | |
import nltk | |
from nltk.tokenize import sent_tokenize | |
from optimum.pipelines import pipeline | |
# ---------------------------------------------------------------------------
# Runtime configuration
# ---------------------------------------------------------------------------
# Everything below is read from config.yaml, which must sit in the working
# directory the app is started from.
with open("config.yaml", "r") as file:
    params = yaml.safe_load(file)

# NLTK resources fetched at start-up (sent_tokenize relies on "punkt").
nltk.download("punkt")
nltk.download("stopwords")

# Two names for the same device string are kept because the functions below
# reference both of them.
device_needed = "cuda" if torch.cuda.is_available() else "cpu"
device = device_needed

# Model locations.
text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
text_mc_model_path = params["TEXT_MC_MODEL_PATH"]
text_quillbot_model_path = params["TEXT_QUILLBOT_MODEL_PATH"]

# Label sets for the classifier outputs.
quillbot_labels = params["QUILLBOT_LABELS"]
mc_label_map = params["MC_OUTPUT_LABELS"]

# Maximum token counts used when chunking and tokenizing text.
mc_token_size = int(params["MC_TOKEN_SIZE"])
bc_token_size = int(params["BC_TOKEN_SIZE"])

# Bias checker / corrector model identifiers and the Hugging Face token
# passed to from_pretrained() below.
bias_checker_model_name = params['BIAS_CHECKER_MODEL_PATH']
bias_corrector_model_name = params['BIAS_CORRECTOR_MODEL_PATH']
access_token = params['HF_TOKEN']
def _load_sequence_classifier(model_path, target_device):
    """Load a tokenizer, then its classification model moved to *target_device*."""
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_path, token=access_token)
    loaded_model = AutoModelForSequenceClassification.from_pretrained(
        model_path, token=access_token
    ).to(target_device)
    return loaded_tokenizer, loaded_model


# Main detectors; paths come from config.yaml.
text_bc_tokenizer, text_bc_model = _load_sequence_classifier(
    text_bc_model_path, device
)
text_mc_tokenizer, text_mc_model = _load_sequence_classifier(
    text_mc_model_path, device
)
quillbot_tokenizer, quillbot_model = _load_sequence_classifier(
    text_quillbot_model_path, device
)

# proxy models for explainability
mini_bc_model_name = "polygraf-ai/bc-model"
bc_tokenizer_mini, bc_model_mini = _load_sequence_classifier(
    mini_bc_model_name, device_needed
)
mini_humanizer_model_name = "polygraf-ai/humanizer-model"
humanizer_tokenizer_mini, humanizer_model_mini = _load_sequence_classifier(
    mini_humanizer_model_name, device_needed
)

# Convert every classifier with BetterTransformer (same order as loading the
# proxies first, then the main detectors).
bc_model_mini = BetterTransformer.transform(bc_model_mini)
humanizer_model_mini = BetterTransformer.transform(humanizer_model_mini)
text_bc_model = BetterTransformer.transform(text_bc_model)
text_mc_model = BetterTransformer.transform(text_mc_model)
quillbot_model = BetterTransformer.transform(quillbot_model)
# ---------------------------------------------------------------------------
# Bias checker / corrector
# ---------------------------------------------------------------------------
# NOTE(review): `bias_model_checker` and `tokenizer` are loaded (and the model
# transformed) here, but the pipeline below is given the model *name* for both
# its model and tokenizer arguments, so these two objects are not what the
# pipeline uses. Confirm whether they are still needed.
bias_model_checker = AutoModelForSequenceClassification.from_pretrained(
    bias_checker_model_name
)
tokenizer = AutoTokenizer.from_pretrained(bias_checker_model_name)
bias_model_checker = BetterTransformer.transform(
    bias_model_checker, keep_original_model=False
)

# `pipeline` is the last one imported at the top of the file, i.e. the one
# from optimum.pipelines.
bias_checker = pipeline(
    "text-classification",
    model=bias_checker_model_name,
    tokenizer=bias_checker_model_name,
)
gc.collect()
bias_corrector = pipeline(
    "text2text-generation",
    model=bias_corrector_model_name,
    accelerator="ort",
)

# model score calibration
iso_reg = joblib.load("isotonic_regression_model.joblib")
def split_text(text: str) -> list:
    """Split *text* into sentences, each wrapped in its own one-element list.

    Sentence boundaries come from ``sent_tokenize``; the result is a list of
    single-sentence "batches" such as ``[["First."], ["Second."]]``.
    """
    batches = []
    for sentence in sent_tokenize(text):
        batches.append([sentence])
    return batches
def correct_text(text: str, bias_checker, bias_corrector, separator: str = " ") -> tuple:
    """Run the bias checker over each sentence and rewrite the flagged ones.

    Args:
        text: Text to process; it is split into sentences with ``split_text``.
        bias_checker: Callable returning a list whose first item is a dict
            with ``"label"`` and ``"score"`` keys.
        bias_corrector: Callable returning a list whose first item is a dict
            with a ``"generated_text"`` key.
        separator: String placed between the output sentences.

    Returns:
        A ``(corrected_text, corrections)`` tuple where ``corrections`` is a
        list of ``(original_sentence, corrected_sentence)`` pairs for the
        sentences that were rewritten.
    """
    sentence_batches = split_text(text)
    output_sentences = []
    corrections = []

    progress = tqdm(
        sentence_batches, total=len(sentence_batches), desc="correcting text.."
    )
    for batch in progress:
        raw_text = " ".join(batch)
        top_result = bias_checker(raw_text)[0]

        # A sentence is left untouched only when the checker says LABEL_1
        # and its score is not below 0.9; everything else is rewritten.
        keep_as_is = top_result["label"] == "LABEL_1" and not (
            top_result["score"] < 0.9
        )
        if keep_as_is:
            output_sentences.append(raw_text)
            continue

        corrected_version = bias_corrector(raw_text)[0]["generated_text"]
        output_sentences.append(corrected_version)
        corrections.append((raw_text, corrected_version))

    return separator.join(output_sentences), corrections
def update(text: str):
    """Return the bias-corrected version of *text*.

    The text is cleaned with ``clean(..., lower=False)`` and passed through
    ``correct_text``.

    Previously only the rewritten sentences were returned, concatenated with
    no separator, so every sentence the checker left untouched was silently
    dropped and the remaining ones were glued together. The full corrected
    text (untouched and rewritten sentences, in order) is returned instead.

    Args:
        text: Raw input text.

    Returns:
        The cleaned text unchanged when no sentence was rewritten, otherwise
        the full corrected text produced by ``correct_text``.
    """
    text = clean(text, lower=False)
    corrected_text, corrections = correct_text(text, bias_checker, bias_corrector)
    if not corrections:
        # Nothing was rewritten: hand back the cleaned input as before.
        return text
    return corrected_text
def update_main(text: str):
    """Bias-correct *text* and build a human-readable list of the changes.

    Returns:
        A ``(corrected_text, corrections_display)`` tuple. The second item
        lists each rewritten sentence as an ``Original:`` / ``Corrected:``
        pair, with pairs separated by a blank line; it is an empty string
        when nothing was rewritten.
    """
    cleaned = clean(text, lower=False)
    corrected_text, corrections = correct_text(cleaned, bias_checker, bias_corrector)

    entries = []
    for orig, corr in corrections:
        entries.append(f"Original: {orig}\nCorrected: {corr}")
    corrections_display = "\n\n".join(entries)

    return corrected_text, corrections_display
def split_text(text: str) -> list:
    """Return the sentences of *text*, one single-item list per sentence.

    NOTE: a function with this same name and body is also defined earlier in
    this file; this later definition is the one bound to the name once the
    module has finished loading.
    """
    return [[single_sentence] for single_sentence in sent_tokenize(text)]
def get_token_length(tokenizer, sentence):
    """Return how many tokens *tokenizer* produces for *sentence*.

    Args:
        tokenizer: Any object exposing a ``tokenize(text)`` method that
            returns a sized collection of tokens.
        sentence: Text to tokenize.
    """
    tokens = tokenizer.tokenize(sentence)
    return len(tokens)
def split_text_allow_complete_sentences_nltk(text, type_det="bc"):
    """Split *text* into chunks of whole sentences that fit a token budget.

    Sentences are packed greedily: a chunk is closed as soon as adding the
    next sentence would push it past the budget. A single sentence that is
    longer than the budget still becomes its own chunk (truncation is left to
    the caller's tokenizer call).

    Args:
        text: Text to split.
        type_det: Which detector the chunks are for. ``"bc"`` and ``"mc"``
            use their tokenizer with ``bc_token_size`` / ``mc_token_size``;
            ``"quillbot"`` uses the quillbot tokenizer with a budget of 256.

    Returns:
        A list of strings, each the sentences of one chunk joined by a space.
        Empty when *text* contains no sentences.

    Raises:
        ValueError: If *type_det* is not one of the supported values.
    """
    if type_det == "bc":
        tokenizer, max_tokens = text_bc_tokenizer, bc_token_size
    elif type_det == "mc":
        tokenizer, max_tokens = text_mc_tokenizer, mc_token_size
    elif type_det == "quillbot":
        tokenizer, max_tokens = quillbot_tokenizer, 256
    else:
        # Previously this fell through and failed later on an unbound name.
        raise ValueError(f"Invalid detector type: {type_det!r}")

    # --- Pass 1: greedy packing of sentences into (sentences, length) pairs.
    chunks = []
    current_chunk = []
    current_length = 0
    for sentence in sent_tokenize(text):
        sentence_length = get_token_length(tokenizer, sentence)
        # Only flush a non-empty chunk. Without the `current_chunk` guard an
        # over-long first sentence flushed an empty chunk, which surfaced as
        # an "" segment in the result and was then scored like real text.
        if current_chunk and current_length + sentence_length > max_tokens:
            chunks.append((current_chunk, current_length))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += sentence_length
    if current_chunk:
        chunks.append((current_chunk, current_length))

    # --- Pass 2: merge a chunk smaller than half the budget with its
    # successor when the pair still fits. A merged pair is emitted as-is and
    # not reconsidered. Walking by index avoids repeated pop(0)/insert(0).
    # NOTE(review): because pass 1 only closes a chunk when the next sentence
    # would not fit, a chunk plus its successor appears to always exceed the
    # budget, so this merge may never trigger - kept to preserve behaviour.
    adjusted_chunks = []
    index = 0
    while index < len(chunks):
        sentences, length = chunks[index]
        has_next = index + 1 < len(chunks)
        if has_next and length < max_tokens / 2:
            next_sentences, next_length = chunks[index + 1]
            if length + next_length <= max_tokens:
                adjusted_chunks.append(sentences + next_sentences)
                index += 2
                continue
        adjusted_chunks.append(sentences)
        index += 1

    return [" ".join(sentences) for sentences in adjusted_chunks]
def predict_quillbot(text, bias_buster_selected):
    """Score *text* with the quillbot classifier.

    Args:
        text: Text to classify.
        bias_buster_selected: When truthy, *text* is first passed through
            ``update`` (bias correction) before being scored.

    Returns:
        A dict with keys ``"Humanized"`` (class index 1) and ``"Original"``
        (class index 0) holding the softmax probabilities as Python floats.
    """
    if bias_buster_selected:
        text = update(text)

    with torch.no_grad():
        quillbot_model.eval()
        encoded = quillbot_tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=256,
            return_tensors="pt",
        )
        encoded = encoded.to(device)
        logits = quillbot_model(**encoded).logits
        # Softmax over the class axis, then take the single input's row.
        probabilities = softmax(logits.detach().cpu().numpy(), 1)[0]

    humanized_probability = probabilities[1].item()
    original_probability = probabilities[0].item()
    return {
        "Humanized": humanized_probability,
        "Original": original_probability,
    }
def predict_for_explainanility(text, model_type=None):
    """Return class probabilities from one of the small proxy models.

    Args:
        text: An iterable of strings (it is iterated item by item when
            cleaning is enabled and passed to the tokenizer as a batch).
        model_type: ``"quillbot"`` selects the humanizer proxy with a 256
            token limit and no cleaning; ``"bc"`` selects the BC proxy with
            ``bc_token_size`` and runs ``remove_special_characters`` on every
            item first.

    Returns:
        A NumPy array of softmax probabilities, one row per input text.

    Raises:
        ValueError: If *model_type* is neither ``"quillbot"`` nor ``"bc"``.
    """
    if model_type == "quillbot":
        cleaning = False
        max_length = 256
        model = humanizer_model_mini
        tokenizer = humanizer_tokenizer_mini
    elif model_type == "bc":
        cleaning = True
        max_length = bc_token_size
        model = bc_model_mini
        tokenizer = bc_tokenizer_mini
    else:
        raise ValueError("Invalid model type")

    with torch.no_grad():
        if cleaning:
            text = [remove_special_characters(t) for t in text]
        tokenized_text = tokenizer(
            text,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=max_length,
        ).to(device_needed)
        outputs = model(**tokenized_text)
        tensor_logits = outputs[0]
        # Name the class axis explicitly: calling softmax without `dim` is
        # deprecated and relies on an implicit choice. dim=1 is what the
        # implicit rule picks for 2-D logits and matches the axis used by the
        # other predict_* functions in this file.
        probas = F.softmax(tensor_logits, dim=1).detach().cpu().numpy()
    return probas
def predict_bc(model, tokenizer, text):
    """Return the softmax class probabilities of *model* for *text*.

    Args:
        model: Sequence-classification model; it is switched to eval mode.
        tokenizer: Tokenizer used to encode *text*. It was previously
            ignored in favour of the global ``text_bc_tokenizer``, so passing
            a different tokenizer had no effect.
        text: Text to classify; padded/truncated to ``bc_token_size`` tokens.

    Returns:
        A 1-D NumPy array with one probability per class.
    """
    with torch.no_grad():
        model.eval()
        tokens = tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=bc_token_size,
            return_tensors="pt",
        ).to(device)
        output = model(**tokens)
        # Softmax over the class axis; [0] selects the single input's row.
        output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
    return output_norm
def predict_mc(model, tokenizer, text):
    """Return the softmax class probabilities of *model* for *text*.

    Args:
        model: Sequence-classification model; it is switched to eval mode.
        tokenizer: Tokenizer used to encode *text*. It was previously
            ignored in favour of the global ``text_mc_tokenizer``, so passing
            a different tokenizer had no effect.
        text: Text to classify; padded/truncated to ``mc_token_size`` tokens.

    Returns:
        A 1-D NumPy array with one probability per class.
    """
    with torch.no_grad():
        model.eval()
        tokens = tokenizer(
            text,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            max_length=mc_token_size,
        ).to(device)
        output = model(**tokens)
        # Softmax over the class axis; [0] selects the single input's row.
        output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
    return output_norm
def predict_bc_scores(input):
    """Compute the calibrated AI-vs-human score for *input*.

    The text is split into chunks for the BC model, each chunk is cleaned
    with ``remove_special_characters`` and scored with ``predict_bc``, and the
    per-chunk probabilities are averaged. The averaged probability at index 1
    (reported as "AI") is then passed through the isotonic-regression
    calibrator.

    Args:
        input: Text to score. (The name shadows the builtin but is kept so
            existing keyword callers keep working.)

    Returns:
        ``{"AI": ai_score, "HUMAN": 1 - ai_score}``.

    Raises:
        ValueError: If *input* yields no segments (e.g. empty text). This
            previously failed later with an unrelated ``TypeError``.
    """
    # Split once: the original split (and tokenized) the text twice just to
    # obtain the segment count.
    segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
    if not segments_bc:
        raise ValueError("Cannot compute BC scores: input contains no text segments")

    bc_scores = []
    for segment in segments_bc:
        cleaned_text_bc = remove_special_characters(segment)
        bc_scores.append(
            predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
        )

    # Average the per-segment probability vectors class by class.
    average_bc_scores = np.mean(np.array(bc_scores), axis=0)
    bc_score_list = average_bc_scores.tolist()
    print(
        f"Original BC scores: AI: {bc_score_list[1]}, HUMAN: {bc_score_list[0]}"
    )

    # isotonic regression calibration
    ai_score = iso_reg.predict([bc_score_list[1]])[0]
    human_score = 1 - ai_score
    bc_score = {"AI": ai_score, "HUMAN": human_score}
    print(f"Calibration BC scores: AI: {ai_score}, HUMAN: {human_score}")
    # Only the last cleaned segment is echoed, as before.
    print(f"Input Text: {cleaned_text_bc}")
    return bc_score
def predict_mc_scores(input):
    """Compute per-label MC scores for *input*, scaled by the AI probability.

    Steps:
      1. Score the text with the BC model (chunk, clean, predict, average)
         and calibrate the averaged index-1 probability with the isotonic
         regressor to get the AI / HUMAN split.
      2. Score the text with the MC model the same way and map the averaged
         probabilities onto the upper-cased labels from ``mc_label_map``.
      3. Multiply every MC score by ``1 - HUMAN`` so the labels share the
         calibrated AI probability mass.

    Args:
        input: Text to score. (The name shadows the builtin but is kept so
            existing keyword callers keep working.)

    Returns:
        A dict mapping upper-cased label to scaled score, or an empty dict
        when the AI probability mass is below 0.01.

    Raises:
        ValueError: If *input* yields no segments (e.g. empty text). This
            previously failed later with an unrelated ``TypeError``.
    """
    # BC SCORE
    # Each split is computed once; the original ran every split twice just to
    # obtain the segment count.
    segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
    if not segments_bc:
        raise ValueError("Cannot compute MC scores: input contains no text segments")

    bc_scores = []
    for segment in segments_bc:
        cleaned_text_bc = remove_special_characters(segment)
        bc_scores.append(
            predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
        )
    average_bc_scores = np.mean(np.array(bc_scores), axis=0)
    bc_score_list = average_bc_scores.tolist()
    print(
        f"Original BC scores: AI: {bc_score_list[1]}, HUMAN: {bc_score_list[0]}"
    )

    # isotonic regression calibration
    ai_score = iso_reg.predict([bc_score_list[1]])[0]
    human_score = 1 - ai_score
    bc_score = {"AI": ai_score, "HUMAN": human_score}
    print(f"Calibration BC scores: AI: {ai_score}, HUMAN: {human_score}")

    # MC SCORE
    segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="mc")
    if not segments_mc:
        raise ValueError("Cannot compute MC scores: input contains no text segments")

    mc_scores = []
    for segment in segments_mc:
        cleaned_text_mc = remove_special_characters(segment)
        mc_scores.append(
            predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
        )
    average_mc_scores = np.mean(np.array(mc_scores), axis=0)
    mc_score_list = average_mc_scores.tolist()

    # Scale every label's probability by the calibrated AI mass.
    sum_prob = 1 - bc_score["HUMAN"]
    mc_score = {
        label.upper(): score * sum_prob
        for score, label in zip(mc_score_list, mc_label_map)
    }
    print("MC Score:", mc_score)

    # With (almost) no AI mass there is nothing meaningful to attribute.
    if sum_prob < 0.01:
        mc_score = {}
    return mc_score