from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import pandas as pd
import gradio as gr
from transformers import pipeline
from gradio.themes.utils.colors import red, green
import requests
import json
import os
from dotenv import load_dotenv
import time

# Load environment variables
load_dotenv()

# Initialize the NLP pipeline (sentence splitting and tokenization only)
nlp = English()
nlp.add_pipe("sentencizer")
tokenizer = nlp.tokenizer

# Initialize the text classification pipeline
detector = pipeline(task='text-classification', model='SJTU-CL/RoBERTa-large-ArguGPT-sent')

# Groq API configuration
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise ValueError("Please set your GROQ_API_KEY in the .env file")

GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
GROQ_MODEL = "llama3-70b-8192"
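# Note: Groq's hosted model lineup rotates over time; if "llama3-70b-8192" is
# ever decommissioned, swap in any chat model currently listed by the API.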

# Define color map for highlighted text
color_map = {
    '0%': green.c400,
    '10%': green.c300,
    '20%': green.c200,
    '30%': green.c100,
    '40%': green.c50,
    '50%': red.c50,
    '60%': red.c100,
    '70%': red.c200,
    '80%': red.c300,
    '90%': red.c400,
    '100%': red.c500,
}
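# The percentage buckets double as the legend labels for gr.HighlightedText:
# green shades mark spans scored as likely human, red shades as increasingly
# likely machine-generated.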


def is_stopword(word):
    """Check if a word is a stop word or very short."""
    return word.lower() in STOP_WORDS or len(word) <= 2
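# e.g. is_stopword("the") -> True, is_stopword("ox") -> True (too short),
# is_stopword("ephemeral") -> False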


def get_synonyms(word):
    """Get simple, human-readable synonyms using the Groq API."""
    if is_stopword(word):
        return [word]  # Don't fetch synonyms for stop words

    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json"
    }
    prompt = f"""Provide a list of exactly 5 simple synonyms for '{word}'.
Return ONLY a JSON array of synonyms without any additional text.
Example: ["use", "employ", "apply", "make use of", "take"]"""
    data = {
        "model": GROQ_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.3,
        "max_tokens": 100,
        "response_format": {"type": "json_object"}
    }
    try:
        response = requests.post(GROQ_API_URL, headers=headers, json=data, timeout=30)
        response.raise_for_status()
        result = response.json()
        content = json.loads(result['choices'][0]['message']['content'])
        # The model may return a bare array or wrap it in an object
        if isinstance(content, list):
            return content[:5]
        if isinstance(content, dict):
            for key in ('synonyms', 'words', 'alternatives'):
                if key in content and isinstance(content[key], list):
                    return content[key][:5]
        return [word]  # Fallback if parsing fails
    except Exception as e:
        print(f"Error getting synonyms: {e}")
        return [word]  # Fall back to the original word if the API call fails
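# Note: Groq's OpenAI-compatible JSON mode only guarantees syntactically valid
# JSON, not a particular shape, so the branches above defensively handle both
# array- and object-shaped replies.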


def identify_problem_words(text):
    """Use the Groq API to flag uncommon, difficult, or AI-sounding words."""
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json"
    }
    prompt = f"""Analyze this text and return ONLY a JSON list of words that are:
1. Uncommon (not in everyday vocabulary)
2. Difficult (complex or technical)
3. Likely AI-generated (overly formal, verbose, or unnatural)
Exclude all stop words (a, an, the, and, but, etc.) and very short words (1-2 letters).
Return format: {{"words": ["word1", "word2", ...]}}
Text: {text}"""
    data = {
        "model": GROQ_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.2,
        "max_tokens": 200,
        "response_format": {"type": "json_object"}
    }
    try:
        response = requests.post(GROQ_API_URL, headers=headers, json=data, timeout=30)
        response.raise_for_status()
        result = response.json()
        content = json.loads(result['choices'][0]['message']['content'])
        if isinstance(content, dict) and 'words' in content:
            # Filter out any stop words that slipped through, and lowercase so
            # the membership checks in predict_word are case-insensitive
            return {w.lower() for w in content['words'] if not is_stopword(w)}
        return set()
    except Exception as e:
        print(f"Error identifying problem words: {e}")
        return set()


def predict_word(word, problem_words):
    """Predict AI probability for a single word if it was flagged as a problem word."""
    if len(word) <= 3 or word.lower() not in problem_words or is_stopword(word):
        return 0.0
    try:
        return predict_one_sent(word)
    except Exception:
        return 0.0


def predict_doc(doc):
    start_time = time.time()

    # First identify problem words using Groq
    problem_words = identify_problem_words(doc)
    print(f"Identified problem words: {problem_words}")

    sents = [s.text for s in nlp(doc).sents]
    data = {'sentence': [], 'label': [], 'score': []}
    sent_res = []
    word_highlights = []

    for sent in sents:
        sent_prob = predict_one_sent(sent)

        # Word-level analysis, restricted to the flagged problem words
        tokens = [token.text for token in tokenizer(sent)]
        word_probs = [predict_word(token, problem_words) for token in tokens]
        for word, prob in zip(tokens, word_probs):
            if prob >= 0.2:  # Only highlight words with >=20% AI probability
                if prob < 0.3: label = '20%'
                elif prob < 0.4: label = '30%'
                elif prob < 0.5: label = '40%'
                elif prob < 0.6: label = '50%'
                elif prob < 0.7: label = '60%'
                elif prob < 0.8: label = '70%'
                elif prob < 0.9: label = '80%'
                elif prob < 1: label = '90%'
                else: label = '100%'
                word_highlights.append((word, label))
            else:
                word_highlights.append((word, None))

        data['sentence'].append(sent)
        data['score'].append(round(sent_prob, 4))
        data['label'].append('Human' if sent_prob <= 0.5 else 'Machine')

        if sent_prob < 0.1: label = '0%'
        elif sent_prob < 0.2: label = '10%'
        elif sent_prob < 0.3: label = '20%'
        elif sent_prob < 0.4: label = '30%'
        elif sent_prob < 0.5: label = '40%'
        elif sent_prob < 0.6: label = '50%'
        elif sent_prob < 0.7: label = '60%'
        elif sent_prob < 0.8: label = '70%'
        elif sent_prob < 0.9: label = '80%'
        elif sent_prob < 1: label = '90%'
        else: label = '100%'
        sent_res.append((sent, label))

    df = pd.DataFrame(data)
    csv_path = 'result.csv'
    df.to_csv(csv_path, index=False)
    print(f"Analysis took {time.time() - start_time:.2f} seconds")

    overall_score = df.score.mean()
    overall_label = 'Human' if overall_score <= 0.5 else 'Machine'
    sum_str = (f'The essay was probably written by a {overall_label.lower()}. '
               f'The probability of it being AI-generated is {overall_score:.2f}.')
    return sum_str, sent_res, df, csv_path, word_highlights
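# Note: the tuple returned above must stay in sync with the outputs= lists
# wired to predict_doc in the Blocks layout below.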


def predict_one_sent(sent):
    """Return the detector's probability that `sent` is machine-generated."""
    res = detector(sent)[0]
    org_label, prob = res['label'], res['score']
    # The ArguGPT checkpoint emits LABEL_0 for human text, so flip the score
    # to always report P(machine)
    if org_label == 'LABEL_0':
        prob = 1 - prob
    return prob


def update_text(text, selected_word, replacement, word_highlights):
    new_text = text.replace(selected_word, replacement, 1)
    # Update word_highlights with the new word (assumed to now read as human-written)
    updated_highlights = []
    replaced = False
    for word, label in word_highlights:
        if word == selected_word and not replaced:
            updated_highlights.append((replacement, '0%'))
            replaced = True
        else:
            updated_highlights.append((word, label))
    return new_text, updated_highlights
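# Caveat: str.replace(..., 1) swaps the first occurrence in the raw text, which
# may not be the exact instance the user clicked if the word appears more than
# once.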


def process_word_highlights(highlights):
    """Pass-through so the word-highlight state can be rendered as HighlightedText."""
    return highlights


# Custom CSS for a modern look
custom_css = """
.gradio-container {
    font-family: 'Arial', sans-serif;
}
.gradio-header {
    background-color: #4CAF50;
    color: white;
    padding: 10px;
    text-align: center;
    border-radius: 8px;
    margin-bottom: 20px;
}
.gradio-button {
    background-color: #4CAF50;
    color: white;
    border: none;
    padding: 10px 20px;
    text-align: center;
    text-decoration: none;
    display: inline-block;
    font-size: 16px;
    margin: 4px 2px;
    cursor: pointer;
    border-radius: 5px;
    transition: background-color 0.3s;
}
.gradio-button:hover {
    background-color: #45a049;
}
.highlighted-word {
    cursor: pointer;
    padding: 2px 4px;
    border-radius: 3px;
    transition: all 0.2s;
}
.highlighted-word:hover {
    text-decoration: underline;
    background-color: #f0f0f0;
    transform: scale(1.05);
}
.replacement-row {
    border: 1px solid #ddd;
    padding: 15px;
    border-radius: 8px;
    margin-top: 10px;
    background-color: #f9f9f9;
}
"""

with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("""## AI vs Human Essay Detector""")
    gr.Markdown("""Identify and replace uncommon, difficult, and AI-generated words in your text.""")

    word_highlights = gr.State([])
    selected_word = gr.State("")

    with gr.Row():
        with gr.Column():
            text_in = gr.Textbox(
                lines=10,
                label='Essay Input',
                placeholder="Paste your essay here...",
                elem_classes=["text-input"]
            )
            btn = gr.Button('Analyze Text', variant="primary")
        with gr.Column():
            sent_res = gr.HighlightedText(
                label='Sentence-level Analysis',
                color_map=color_map,
                show_legend=True
            )
            word_res = gr.HighlightedText(
                label='Word-level Analysis (Click words to replace)',
                color_map=color_map,
                show_legend=True
            )

    with gr.Row():
        summary = gr.Textbox(label='Overall Analysis', interactive=False)
        csv_f = gr.File(label='Download Detailed Analysis')

    with gr.Row():
        tab = gr.Dataframe(
            label='Detailed Sentence Analysis',
            wrap=True
        )

    with gr.Column(visible=False) as replacement_row:
        gr.Markdown("### Replace Word")
        with gr.Row():
            replacement_dropdown = gr.Dropdown(
                label="Select replacement",
                interactive=True,
                allow_custom_value=True
            )
        with gr.Row():
            replace_btn = gr.Button("Replace", variant="primary")
            cancel_btn = gr.Button("Cancel")

    def on_word_select(evt: gr.SelectData):
        if evt.value:
            synonyms = get_synonyms(evt.value)
            return (
                evt.value,
                gr.Dropdown(choices=synonyms, value=evt.value),
                gr.Column(visible=True)
            )
        return None, None, gr.Column(visible=False)

    word_res.select(
        fn=on_word_select,
        outputs=[selected_word, replacement_dropdown, replacement_row]
    )

    replace_btn.click(
        fn=update_text,
        inputs=[text_in, selected_word, replacement_dropdown, word_highlights],
        outputs=[text_in, word_highlights]
    ).then(
        fn=lambda: gr.Column(visible=False),
        outputs=replacement_row
    ).then(
        fn=predict_doc,
        inputs=text_in,
        outputs=[summary, sent_res, tab, csv_f, word_highlights]
    ).then(
        fn=process_word_highlights,
        inputs=word_highlights,
        outputs=word_res
    )

    cancel_btn.click(
        fn=lambda: gr.Column(visible=False),
        outputs=replacement_row
    )

    btn.click(
        fn=predict_doc,
        inputs=text_in,
        outputs=[summary, sent_res, tab, csv_f, word_highlights]
    ).then(
        fn=process_word_highlights,
        inputs=word_highlights,
        outputs=word_res
    )

if __name__ == "__main__":
    demo.launch()