Spaces:
Sleeping
Sleeping
| from transformers import pipeline | |
| from _google_search_engine_testing_share import find_by_relative_search | |
| import math | |
# Path to the proofread XSum dataset annotated with best-similarity scores.
PROOFREAD_FILE = "data/1_proofread/xsum/gpt-4o-mini_with_best_similarity.csv"
# Placeholder for a word-frequency table; not initialized in this module.
WORD_FREQUENCY = None
# Default HuggingFace classifier used to separate human vs machine text.
DEFAULT_MODEL = "Hello-SimpleAI/chatgpt-detector-roberta"
"""
data/MAGE/xsum_human.csv = {'HUMAN': 64, 'MACHINE': 36} correction = 20 => 84%
data/MAGE/xsum_machine_topical_gpt-3.5-trubo.csv = {'HUMAN': 3, 'MACHINE': 97} => correction = 3 => 94%
original acc = (64+97)/ 200 = 80.5%
improve = (84 + 94) / 200 = 89%
different = 8.5%
https://huggingface.co/datasets/RealTimeData/bbc_news_alltime = {'HUMAN': 82, 'MACHINE': 18} => corrected 16 => 98%
"""
# Maps each model name to the label string that model emits for human text.
MODEL_HUMAN_MATCHING = dict()
MODEL_HUMAN_MATCHING[DEFAULT_MODEL] = "Human"
# Canonical prediction labels used throughout this module.
HUMAN = "HUMAN"
MACHINE = "MACHINE"
UNKNOWN = "UNKNOWN"
# NOTE(review): "PARAPHASE" looks like a misspelling of "PARAPHRASE", but the
# value is a runtime string other code may compare against — do not rename
# without auditing all consumers.
PARAPHASE = "PARAPHASE"
NON_PARAPHASE = "NON_PARAPHASE"
def detect_by_huggingface_model(input_text, model = DEFAULT_MODEL, max_length=512):
    """
    Classify *input_text* as human- or machine-written.

    Args:
        input_text: text to classify.
        model: HuggingFace model id used for both model and tokenizer.
        max_length: token truncation limit passed to the pipeline.

    Returns:
        (label, confidence) where label is HUMAN or MACHINE and
        confidence is a float score from the classifier.
    """
    # Cache one pipeline per model name: building a pipeline loads the full
    # model, which is far too expensive to repeat on every call.
    cache = getattr(detect_by_huggingface_model, "_pipe_cache", None)
    if cache is None:
        cache = detect_by_huggingface_model._pipe_cache = {}
    if model not in cache:
        # Bug fix: the original hard-coded max_length=512 here, silently
        # ignoring the max_length parameter. Note the first call's
        # max_length is baked into the cached pipeline.
        cache[model] = pipeline(
            "text-classification",
            model=model,
            tokenizer=model,
            max_length=max_length,
            truncation=True,
            device_map="auto",
        )
    result = cache[model](input_text)[0]
    confidence_score = result['score']
    # Each model reports its "human" class under its own label string.
    if result['label'] == MODEL_HUMAN_MATCHING[model]:
        return HUMAN, confidence_score
    else:
        return MACHINE, confidence_score
def check_human(data, min_ratio = 0.7):
    """
    Decide whether the text is human-written from sentence-level matches.

    Args:
        data: iterable of items
            (input_sentence, source_sentence, similarity, is_paraphrase).
        min_ratio: minimum fraction of input sentences that must appear
            verbatim inside their source sentence.

    Returns:
        True if at least ceil(len(data) * min_ratio) input sentences are
        substrings of their source sentence. Note: empty *data* yields
        True (0 >= 0), preserving the original behavior.
    """
    total_sentence = len(data)
    min_matching = int(math.ceil(total_sentence * min_ratio))
    # Count verbatim containment; similarity and paraphrase flags are unused.
    count = sum(
        1
        for input_sentence, source_sentence, _similarity, _is_paraphrase in data
        if input_sentence in source_sentence
    )
    return count >= min_matching
def abstract_detect_generated_text(input_text):
    """
    Detect the source of *input_text* using a search engine plus a SOTA model.

    Returns:
        search_engine_prediction: HUMAN / MACHINE / UNKNOWN.
        SOTA_prediction: HUMAN / MACHINE from the HuggingFace classifier.
        SOTA_confidence: float confidence of the SOTA prediction.
        found_url: best-matching website URL (None when UNKNOWN).
        sentence_pairs: list of [input_sentence, source_sentence,
            PARAPHASE/NON_PARAPHASE] pairs ([] if none found).
    """
    is_support_opposite = False
    is_paraphrase, found_url, data = find_by_relative_search(input_text, is_support_opposite)
    SOTA_prediction, SOTA_confidence = detect_by_huggingface_model(input_text)
    # Search-engine verdict: UNKNOWN when no paraphrase source was found,
    # otherwise HUMAN/MACHINE based on verbatim sentence matching.
    if not is_paraphrase:
        search_engine_prediction = UNKNOWN
    elif check_human(data):
        search_engine_prediction = HUMAN
    else:
        search_engine_prediction = MACHINE
    # Bug fix: the original loop reused the name `is_paraphrase` for the
    # per-pair flag, clobbering the overall flag returned by
    # find_by_relative_search; use a distinct name for the pair-level flag.
    sentence_pairs = [
        [input_sentence, source_sentence,
         PARAPHASE if pair_is_paraphrase else NON_PARAPHASE]
        for input_sentence, source_sentence, _similarity, pair_is_paraphrase in data
    ]
    return search_engine_prediction, SOTA_prediction, SOTA_confidence, found_url, sentence_pairs
# Script entry point: intentionally a no-op; this module is meant to be
# imported for its detection helpers.
if __name__ == "__main__":
    pass