|
from transformers import AutoTokenizer |
|
from transformers import AutoModelForSequenceClassification |
|
import torch |
|
from torch.nn import functional as F |
|
import numpy as np |
|
import re |
|
|
|
# Tokenizer paired with the fine-tuned classifier checkpoint loaded below.
tokenizer = AutoTokenizer.from_pretrained("gpt3_finetuned_model/checkpoint-30048")

# Separate GPT-2 tokenizer used only for per-sentence token counts in
# calculate_burstiness, not for classification.
tokenizer_v2 = AutoTokenizer.from_pretrained("gpt2-large")




# Binary sequence-classification head (labels presumably NEGATIVE=human /
# POSITIVE=AI — see the id2label checks below).  NOTE(review): hard-codes
# "cuda", so this module fails at import time on CPU-only machines.
model = AutoModelForSequenceClassification.from_pretrained("gpt3_finetuned_model/checkpoint-30048").to("cuda")
|
|
|
|
|
|
|
def logit2prob(logit):
    '''
    Convert a raw logit (scalar or numpy array) to a sigmoid probability,
    rounded to 3 decimal places.

    The logit is clipped to [-700, 700] before exponentiation so that
    np.exp cannot overflow and emit a RuntimeWarning for extreme logits;
    the rounded result is unchanged (saturates at 0.0 / 1.0).
    '''
    safe_logit = np.clip(logit, -700.0, 700.0)
    prob = 1.0 / (1.0 + np.exp(-safe_logit))
    return np.round(prob, 3)
|
|
|
def split_sentence(sentence: str):
    '''
    Split a text into sentence-like parts on '. ', '.' or ':'.

    Newlines are replaced with a space before splitting.  (Replacing them
    with '' — as the previous version did — glued the last word of one
    line to the first word of the next, corrupting token counts.)

    Note: parts may contain leading/trailing spaces and a trailing
    separator yields a final empty string; callers filter empties.
    '''
    sentence = sentence.replace('\n', ' ')

    # '. ' must come first in the alternation so it wins over plain '.'.
    separators = ['. ', '.', ':']
    pattern = '|'.join(map(re.escape, separators))

    parts = re.split(pattern, sentence)

    return parts
|
|
|
|
|
|
|
def predict(sentence: str):
    '''
    Classify a text with the fine-tuned model.

    Returns (probability_human, probability_AI, label) where label is
    "Human Written" or "AI written".  Requires the module-level
    tokenizer/model and a CUDA device.
    '''
    encoded = tokenizer(sentence, return_tensors="pt").to("cuda")
    with torch.no_grad():
        logits = model(**encoded).logits

    print("logits: ", logits)
    top_class = logits.argmax().item()

    # Softmax over the two classes, rounded for display/return.
    probs = np.round(
        F.softmax(logits.to("cpu"), dim=-1).numpy()[0],
        3)
    print("P(Human): ", probs[0])
    print("P(AI): ", probs[1])

    raw_label = model.config.id2label[top_class]
    label = "Human Written" if raw_label == 'NEGATIVE' else 'AI written'
    print("Label: ", label)
    print(raw_label)

    return probs[0], probs[1], label
|
|
|
|
|
|
|
|
|
|
|
|
|
def calculate_burstiness(sentence: str):
    '''
    Measure "burstiness": the variation in sentence length (in GPT-2
    tokens) across the sentences of a text.

    Returns (variance, average_length).  Returns (0.0, 0.0) when the
    text contains no non-empty sentences — previously this produced
    nan plus a RuntimeWarning from np.var/np.average on an empty array.
    '''
    lengths = []
    for part in split_sentence(sentence):
        # split_sentence may yield empty strings (e.g. trailing '.').
        if not part:
            continue
        encoded = tokenizer_v2(part, return_tensors="pt")
        lengths.append(encoded.input_ids.size(1))

    if not lengths:
        print("arr= []")
        print('variance: 0.0')
        print('std: 0.0')
        print('average length: 0.0')
        return 0.0, 0.0

    # Build the array once instead of three times.
    lengths_arr = np.array(lengths)
    variance = np.var(lengths_arr)
    std_deviation = np.std(lengths_arr)
    avg_length = np.average(lengths_arr)

    print(f"arr= {lengths}")
    print(f'variance: {variance}')
    print(f'std: {std_deviation}')
    print(f'average length: {avg_length}')

    return variance, avg_length
|
|
|
|
|
|
|
|
|
|
|
def complete_sentence_analysis(sentence: str):
    '''
    Run the full detector pipeline on a text.

    Returns a dictionary
    {
        p_human   : probability that the text is written by a human
        p_ai      : probability that the text is written by AI
        label     : label {ai/human}
        variance  : variance in the length of the sentences
        avg_length: average tokens per sentence
    }
    '''
    keys = ("p_human", "p_ai", "label", "variance", "avg_length")
    values = (*predict(sentence), *calculate_burstiness(sentence))
    return dict(zip(keys, values))
|
|
|
|
|
|
|
|
|
|
|
def get_top_labels(keyword: str):
    '''
    Score a text against both classes independently.

    Returns a list of (label, sigmoid_probability) pairs, sorted by
    descending score.  Requires the module-level tokenizer/model and a
    CUDA device.
    '''
    encoded = tokenizer(keyword, return_tensors="pt").to("cuda")
    with torch.no_grad():
        logits = model(**encoded).logits

    # Per-class sigmoid of the raw logits (not a softmax over classes).
    scores = logit2prob(logits.cpu().numpy()[0])

    score_list = [
        (
            "Human Written" if model.config.id2label[idx] == 'NEGATIVE' else 'AI written',
            scores[idx],
        )
        for idx in range(2)
    ]

    return sorted(score_list, key=lambda pair: pair[1], reverse=True)[:5]