Spaces:
Sleeping
Sleeping
| from transformers import pipeline | |
| from _google_search_engine_testing_share import find_by_relative_search | |
| import math | |
# Path to the proofread XSum dataset annotated with best-similarity scores.
PROOFREAD_FILE = "data/1_proofread/xsum/gpt-4o-mini_with_best_similarity.csv"
# Placeholder for a word-frequency table; not initialized in this module.
WORD_FREQUENCY = None
# Default HuggingFace classifier used to separate human vs machine text.
DEFAULT_MODEL = "Hello-SimpleAI/chatgpt-detector-roberta"
"""
data/MAGE/xsum_human.csv = {'HUMAN': 64, 'MACHINE': 36} correction = 20 => 84%
data/MAGE/xsum_machine_topical_gpt-3.5-trubo.csv = {'HUMAN': 3, 'MACHINE': 97} => correction = 3 => 94%
original acc = (64+97)/ 200 = 80.5%
improve = (84 + 94) / 200 = 89%
different = 8.5%
https://huggingface.co/datasets/RealTimeData/bbc_news_alltime = {'HUMAN': 82, 'MACHINE': 18} => corrected 16 => 98%
"""
# Maps each model name to the label string that model emits for human text.
MODEL_HUMAN_MATCHING = dict()
MODEL_HUMAN_MATCHING[DEFAULT_MODEL] = "Human"
# Canonical prediction labels used throughout this module.
HUMAN = "HUMAN"
MACHINE = "MACHINE"
UNKNOWN = "UNKNOWN"
# NOTE(review): "PARAPHASE" looks like a misspelling of "PARAPHRASE", but the
# value is a runtime string other code may compare against — do not rename
# without auditing all consumers.
PARAPHASE = "PARAPHASE"
NON_PARAPHASE = "NON_PARAPHASE"
def detect_by_huggingface_model(input_text, model = DEFAULT_MODEL, max_length=512):
    """
    Classify *input_text* as human- or machine-written.

    Args:
        input_text: text to classify.
        model: HuggingFace model id used for both model and tokenizer.
        max_length: token truncation limit passed to the pipeline.

    Returns:
        (label, confidence) where label is HUMAN or MACHINE and
        confidence is a float score from the classifier.
    """
    # Cache one pipeline per model name: building a pipeline loads the full
    # model, which is far too expensive to repeat on every call.
    cache = getattr(detect_by_huggingface_model, "_pipe_cache", None)
    if cache is None:
        cache = detect_by_huggingface_model._pipe_cache = {}
    if model not in cache:
        # Bug fix: the original hard-coded max_length=512 here, silently
        # ignoring the max_length parameter. Note the first call's
        # max_length is baked into the cached pipeline.
        cache[model] = pipeline(
            "text-classification",
            model=model,
            tokenizer=model,
            max_length=max_length,
            truncation=True,
            device_map="auto",
        )
    result = cache[model](input_text)[0]
    confidence_score = result['score']
    # Each model reports its "human" class under its own label string.
    if result['label'] == MODEL_HUMAN_MATCHING[model]:
        return HUMAN, confidence_score
    else:
        return MACHINE, confidence_score
def check_human(data, min_ratio = 0.7):
    """
    Decide whether the text is human-written from sentence-level matches.

    Args:
        data: iterable of items
            (input_sentence, source_sentence, similarity, is_paraphrase).
        min_ratio: minimum fraction of input sentences that must appear
            verbatim inside their source sentence.

    Returns:
        True if at least ceil(len(data) * min_ratio) input sentences are
        substrings of their source sentence. Note: empty *data* yields
        True (0 >= 0), preserving the original behavior.
    """
    total_sentence = len(data)
    min_matching = int(math.ceil(total_sentence * min_ratio))
    # Count verbatim containment; similarity and paraphrase flags are unused.
    count = sum(
        1
        for input_sentence, source_sentence, _similarity, _is_paraphrase in data
        if input_sentence in source_sentence
    )
    return count >= min_matching
def abstract_detect_generated_text(input_text):
    """
    Detect the source of *input_text* using a search engine plus a SOTA model.

    Returns:
        search_engine_prediction: HUMAN / MACHINE / UNKNOWN.
        SOTA_prediction: HUMAN / MACHINE from the HuggingFace classifier.
        SOTA_confidence: float confidence of the SOTA prediction.
        found_url: best-matching website URL (None when UNKNOWN).
        sentence_pairs: list of [input_sentence, source_sentence,
            PARAPHASE/NON_PARAPHASE] pairs ([] if none found).
    """
    is_support_opposite = False
    is_paraphrase, found_url, data = find_by_relative_search(input_text, is_support_opposite)
    SOTA_prediction, SOTA_confidence = detect_by_huggingface_model(input_text)
    # Search-engine verdict: UNKNOWN when no paraphrase source was found,
    # otherwise HUMAN/MACHINE based on verbatim sentence matching.
    if not is_paraphrase:
        search_engine_prediction = UNKNOWN
    elif check_human(data):
        search_engine_prediction = HUMAN
    else:
        search_engine_prediction = MACHINE
    # Bug fix: the original loop reused the name `is_paraphrase` for the
    # per-pair flag, clobbering the overall flag returned by
    # find_by_relative_search; use a distinct name for the pair-level flag.
    sentence_pairs = [
        [input_sentence, source_sentence,
         PARAPHASE if pair_is_paraphrase else NON_PARAPHASE]
        for input_sentence, source_sentence, _similarity, pair_is_paraphrase in data
    ]
    return search_engine_prediction, SOTA_prediction, SOTA_confidence, found_url, sentence_pairs
# Script entry point: intentionally a no-op; this module is meant to be
# imported for its detection helpers.
if __name__ == "__main__":
    pass