# AI_to_Humanizer/app.py
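"""Gradio app that scores how likely an essay is AI-generated, highlights
suspect sentences and words with a RoBERTa detector, and uses the Groq
chat-completions API to flag problem words and suggest replacements."""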
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import pandas as pd
import gradio as gr
from transformers import pipeline
from gradio.themes.utils.colors import red, green
import requests
import json
import os
from dotenv import load_dotenv
import time
# Load environment variables
load_dotenv()
# Initialize the NLP pipeline
nlp = English()
nlp.add_pipe("sentencizer")
tokenizer = nlp.tokenizer
# Initialize the text classification pipeline
detector = pipeline(task='text-classification', model='SJTU-CL/RoBERTa-large-ArguGPT-sent')
# Groq API configuration
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise ValueError("Please set your GROQ_API_KEY in the .env file")
GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
GROQ_MODEL = "llama3-70b-8192"
# Define color map for highlighted text
color_map = {
    '0%': green.c400,
    '10%': green.c300,
    '20%': green.c200,
    '30%': green.c100,
    '40%': green.c50,
    '50%': red.c50,
    '60%': red.c100,
    '70%': red.c200,
    '80%': red.c300,
    '90%': red.c400,
    '100%': red.c500,
}
def is_stopword(word):
    """Check if a word is a stop word or very short."""
    return word.lower() in STOP_WORDS or len(word) <= 2
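# e.g. is_stopword("the") -> True; is_stopword("utilize") -> False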
def get_synonyms(word):
    """Get simple, human-readable synonyms using the Groq API."""
    if is_stopword(word):
        return [word]  # Don't fetch synonyms for stop words

    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json",
    }
    # JSON mode returns an object, so ask for {"synonyms": [...]} rather than a bare array
    prompt = f"""Provide a list of exactly 5 simple synonyms for '{word}'.
Return ONLY a JSON object of the form {{"synonyms": [...]}} without any additional text.
Example: {{"synonyms": ["use", "employ", "apply", "make use of", "take"]}}"""
    data = {
        "model": GROQ_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.3,
        "max_tokens": 100,
        "response_format": {"type": "json_object"},
    }
    try:
        response = requests.post(GROQ_API_URL, headers=headers, json=data, timeout=30)
        response.raise_for_status()
        result = response.json()
        content = json.loads(result['choices'][0]['message']['content'])
        # Accept either a bare list or an object keyed by a likely field name
        if isinstance(content, list):
            return content[:5]
        if isinstance(content, dict):
            for key in ('synonyms', 'words', 'alternatives'):
                if key in content and isinstance(content[key], list):
                    return content[key][:5]
        return [word]  # Fallback if parsing fails
    except Exception as e:
        print(f"Error getting synonyms: {e}")
        return [word]  # Fallback to the original word if the API call fails
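# Illustrative only (actual output varies by model run):
#   get_synonyms("utilize") -> ["use", "employ", "apply", "make use of", "take"]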
def identify_problem_words(text):
    """Use the Groq API to flag uncommon, difficult, or AI-sounding words."""
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json",
    }
    prompt = f"""Analyze this text and return ONLY a JSON list of words that are:
1. Uncommon (not in everyday vocabulary)
2. Difficult (complex or technical)
3. Likely AI-generated (overly formal, verbose, or unnatural)
Exclude all stop words (a, an, the, and, but, etc.) and very short words (1-2 letters).
Return format: {{"words": ["word1", "word2", ...]}}
Text: {text}"""
    data = {
        "model": GROQ_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.2,
        "max_tokens": 200,
        "response_format": {"type": "json_object"},
    }
    try:
        response = requests.post(GROQ_API_URL, headers=headers, json=data, timeout=30)
        response.raise_for_status()
        result = response.json()
        content = json.loads(result['choices'][0]['message']['content'])
        if isinstance(content, dict) and 'words' in content:
            # Lowercase for the case-insensitive lookup in predict_word, and
            # filter out any stop words that slipped through
            return {word.lower() for word in content['words'] if not is_stopword(word)}
        return set()
    except Exception as e:
        print(f"Error identifying problem words: {e}")
        return set()
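# Hypothetical example: for "We endeavour to operationalize synergies" the API
# might return {"endeavour", "operationalize", "synergies"} (lowercased here).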
def predict_word(word, problem_words):
    """Predict AI probability for a single word, but only if it was flagged."""
    if len(word) <= 3 or word.lower() not in problem_words or is_stopword(word):
        return 0.0
    try:
        return predict_one_sent(word)
    except Exception:
        return 0.0
def prob_to_label(prob):
    """Bucket a probability into the 10%-wide keys used by color_map."""
    return f"{min(int(prob * 10), 10) * 10}%"

def predict_doc(doc):
    start_time = time.time()
    # First identify problem words using Groq
    problem_words = identify_problem_words(doc)
    print(f"Identified problem words: {problem_words}")

    sents = [s.text for s in nlp(doc).sents]
    data = {'sentence': [], 'label': [], 'score': []}
    sent_res = []
    word_highlights = []

    for sent in sents:
        sent_prob = predict_one_sent(sent)

        # Word-level analysis: only flagged problem words get a nonzero score
        tokens = [token.text for token in tokenizer(sent)]
        word_probs = [predict_word(token, problem_words) for token in tokens]
        for word, prob in zip(tokens, word_probs):
            if prob >= 0.2:  # Only highlight words with >=20% AI probability
                word_highlights.append((word, prob_to_label(prob)))
            else:
                word_highlights.append((word, None))

        data['sentence'].append(sent)
        data['score'].append(round(sent_prob, 4))
        data['label'].append('Human' if sent_prob <= 0.5 else 'Machine')
        sent_res.append((sent, prob_to_label(sent_prob)))

    df = pd.DataFrame(data)
    csv_path = 'result.csv'
    df.to_csv(csv_path, index=False)
    print(f"Analysis took {time.time() - start_time:.2f} seconds")

    overall_score = df.score.mean()
    overall_label = 'a human' if overall_score <= 0.5 else 'a machine'
    sum_str = (f'The essay was probably written by {overall_label}. '
               f'The estimated probability of AI generation is {overall_score:.2f}.')
    return sum_str, sent_res, df, csv_path, word_highlights
def predict_one_sent(sent):
    """Return the probability that a piece of text is machine-generated."""
    res = detector(sent)[0]
    org_label, prob = res['label'], res['score']
    # LABEL_0 is the human class, so flip its score to get P(machine)
    if org_label == 'LABEL_0':
        prob = 1 - prob
    return prob
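# Note: the transformers text-classification pipeline returns a list like
# [{'label': 'LABEL_1', 'score': 0.98}], so detector(sent)[0] is a dict.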
def update_text(text, selected_word, replacement, word_highlights):
    # Replace only the first occurrence; if the word appears several times,
    # the earliest match is the one that gets swapped
    new_text = text.replace(selected_word, replacement, 1)
    # Update word_highlights, treating the replacement as human-written ('0%')
    updated_highlights = []
    replaced = False
    for word, label in word_highlights:
        if word == selected_word and not replaced:
            updated_highlights.append((replacement, '0%'))
            replaced = True
        else:
            updated_highlights.append((word, label))
    return new_text, updated_highlights
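# Example: update_text("We utilize tools", "utilize", "use", [("utilize", "80%")])
#   -> ("We use tools", [("use", "0%")])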
def process_word_highlights(highlights):
    """Pass-through so the highlight state can feed gr.HighlightedText."""
    return highlights
# Custom CSS for a modern look
custom_css = """
.gradio-container {
    font-family: 'Arial', sans-serif;
}
.gradio-header {
    background-color: #4CAF50;
    color: white;
    padding: 10px;
    text-align: center;
    border-radius: 8px;
    margin-bottom: 20px;
}
.gradio-button {
    background-color: #4CAF50;
    color: white;
    border: none;
    padding: 10px 20px;
    text-align: center;
    text-decoration: none;
    display: inline-block;
    font-size: 16px;
    margin: 4px 2px;
    cursor: pointer;
    border-radius: 5px;
    transition: background-color 0.3s;
}
.gradio-button:hover {
    background-color: #45a049;
}
.highlighted-word {
    cursor: pointer;
    padding: 2px 4px;
    border-radius: 3px;
    transition: all 0.2s;
}
.highlighted-word:hover {
    text-decoration: underline;
    background-color: #f0f0f0;
    transform: scale(1.05);
}
.replacement-row {
    border: 1px solid #ddd;
    padding: 15px;
    border-radius: 8px;
    margin-top: 10px;
    background-color: #f9f9f9;
}
"""
with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("""## AI vs Human Essay Detector""")
    gr.Markdown("""Identify and replace uncommon, difficult, and AI-generated words in your text.""")

    word_highlights = gr.State([])
    selected_word = gr.State("")

    with gr.Row():
        with gr.Column():
            text_in = gr.Textbox(
                lines=10,
                label='Essay Input',
                placeholder="Paste your essay here...",
                elem_classes=["text-input"]
            )
            btn = gr.Button('Analyze Text', variant="primary")
        with gr.Column():
            sent_res = gr.HighlightedText(
                label='Sentence-level Analysis',
                color_map=color_map,
                show_legend=True
            )
            word_res = gr.HighlightedText(
                label='Word-level Analysis (Click words to replace)',
                color_map=color_map,
                show_legend=True
            )

    with gr.Row():
        summary = gr.Textbox(label='Overall Analysis', interactive=False)
        csv_f = gr.File(label='Download Detailed Analysis')

    with gr.Row():
        tab = gr.Dataframe(
            label='Detailed Sentence Analysis',
            wrap=True
        )
    with gr.Column(visible=False) as replacement_row:
        gr.Markdown("### Replace Word")
        with gr.Row():
            replacement_dropdown = gr.Dropdown(
                label="Select replacement",
                interactive=True,
                allow_custom_value=True
            )
        with gr.Row():
            replace_btn = gr.Button("Replace", variant="primary")
            cancel_btn = gr.Button("Cancel")
    def on_word_select(evt: gr.SelectData):
        """On click of a highlighted word, fetch synonyms and show the replace panel."""
        if evt.value:
            synonyms = get_synonyms(evt.value)
            return (
                evt.value,
                gr.Dropdown(choices=synonyms, value=evt.value),
                gr.Column(visible=True)
            )
        return None, None, gr.Column(visible=False)

    word_res.select(
        fn=on_word_select,
        outputs=[selected_word, replacement_dropdown, replacement_row]
    )
    replace_btn.click(
        fn=update_text,
        inputs=[text_in, selected_word, replacement_dropdown, word_highlights],
        outputs=[text_in, word_highlights]
    ).then(
        fn=lambda: gr.Column(visible=False),
        outputs=replacement_row
    ).then(
        fn=predict_doc,
        inputs=text_in,
        outputs=[summary, sent_res, tab, csv_f, word_highlights]
    ).then(
        fn=process_word_highlights,
        inputs=word_highlights,
        outputs=word_res
    )

    cancel_btn.click(
        fn=lambda: gr.Column(visible=False),
        outputs=replacement_row
    )

    btn.click(
        fn=predict_doc,
        inputs=text_in,
        outputs=[summary, sent_res, tab, csv_f, word_highlights]
    ).then(
        fn=process_word_highlights,
        inputs=word_highlights,
        outputs=word_res
    )
if __name__ == "__main__":
    demo.launch()