# AI_Detect / app.py
# Author: Hellowish
# Last change: "Update app.py" — commit 98cf186 (verified)
import gradio as gr
import joblib
import numpy as np
import re
# Load the pre-trained detector (a joblib-serialized .pkl model, e.g. scikit-learn).
# NOTE(review): path is relative — assumes ai_detector_model.pkl sits next to app.py; confirm the path is correct.
model = joblib.load("ai_detector_model.pkl")
# Lightweight sentence splitter (avoids an NLTK dependency).
def simple_sent_tokenize(text):
    """Split *text* into sentences on whitespace that follows '.', '!' or '?'.

    The terminal punctuation stays attached to each sentence; empty
    fragments are dropped.
    """
    pieces = re.split(r'(?<=[.!?])\s+', text.strip())
    # filter(None, ...) discards empty strings, same as `if s` in a comprehension.
    return list(filter(None, pieces))
# Lightweight word tokenizer (avoids an NLTK dependency).
def simple_word_tokenize(text):
    """Return the lowercase alphanumeric word tokens found in *text*."""
    word_pattern = re.compile(r'\b\w+\b')
    return word_pattern.findall(text.lower())
def extract_features(text):
    """Compute the 14 stylometric features the detector model was trained on.

    Returns a tuple ``(X, features)`` where ``X`` is a (1, 14) numpy array
    (dict insertion order — must stay in sync with training) and
    ``features`` is the same values keyed by name.
    """
    sentences = simple_sent_tokenize(text)
    # Keep purely alphabetic tokens only (numbers are excluded from word stats).
    tokens = [t for t in simple_word_tokenize(text) if t.isalpha()]

    # Clamped denominators so empty input never divides by zero.
    n_sent = max(len(sentences), 1)
    n_tok = max(len(tokens), 1)
    n_chars = max(len(text), 1)

    per_sentence_tokens = [len(simple_word_tokenize(s)) for s in sentences]
    lowered = text.lower()
    # NOTE(review): markers are counted as raw substrings, so e.g. 'thus'
    # also matches inside 'enthusiast'. Left as-is to match the distribution
    # the .pkl model was trained on — confirm against the training pipeline.
    ai_markers = ['furthermore', 'moreover', 'additionally', 'consequently', 'therefore', 'thus', 'hence']
    human_markers = ['i think', 'i believe', 'personally', 'maybe', 'probably', 'actually', 'really']

    features = {
        'text_length': len(text),
        'word_count': len(tokens),
        'sentence_count': len(sentences),
        'avg_sentence_length': len(tokens) / n_sent,
        'avg_word_length': np.mean([len(t) for t in tokens]) if tokens else 0,
        'vocabulary_richness': len(set(tokens)) / n_tok,
        'sentence_length_variance': np.var(per_sentence_tokens) if per_sentence_tokens else 0,
        # Punctuation densities are scaled to "per 1000 characters".
        'comma_density': text.count(',') / n_chars * 1000,
        'period_density': text.count('.') / n_chars * 1000,
        'exclamation_density': text.count('!') / n_chars * 1000,
        'question_density': text.count('?') / n_chars * 1000,
        'complex_word_ratio': sum(1 for t in tokens if len(t) > 6) / n_tok,
        'ai_marker_count': sum(lowered.count(m) for m in ai_markers),
        'human_marker_count': sum(lowered.count(m) for m in human_markers),
    }
    return np.array(list(features.values())).reshape(1, -1), features
def predict(text):
    """Classify *text* as AI-generated or human-written.

    Returns a human-readable result string with the predicted label,
    the AI probability and a short list of supporting observations.
    Any runtime failure is caught and reported as a message rather
    than raised (Gradio callback — must not crash the UI).
    """
    if not text.strip():
        return "請輸入文字內容。"
    try:
        X, feats = extract_features(text)
        # Prefer a calibrated probability; fall back to the raw label
        # (0/1) for models without predict_proba.
        if hasattr(model, "predict_proba"):
            prob = model.predict_proba(X)[0][1]
        else:
            prob = model.predict(X)[0]
        label = "AI 生成" if prob > 0.5 else "人類撰寫"

        # Collect heuristic explanations; conditions are checked in the
        # same order as the original rule list.
        checks = (
            (feats['vocabulary_richness'] < 0.3, "詞彙多樣性較低"),
            (feats['sentence_length_variance'] < 10, "句子長度平均,像 AI"),
            (feats['ai_marker_count'] > feats['human_marker_count'], "包含常見 AI 連接詞"),
            (feats['human_marker_count'] > feats['ai_marker_count'], "包含主觀語氣詞"),
        )
        reasons = [msg for hit, msg in checks if hit]
        if not reasons:
            reasons = ["整體語言特徵與模型預測一致"]
        return f"預測結果:{label}\nAI 機率:{prob:.2%}\n判斷依據:{', '.join(reasons)}"
    except Exception as e:
        return f"預測時出現錯誤: {str(e)}"
# Gradio UI: one text box in, one text box out.
input_box = gr.Textbox(label="請輸入文章內容", lines=15, max_lines=50, placeholder="在此輸入文章…")
output_box = gr.Textbox(label="預測結果", lines=15, max_lines=30, placeholder="結果會顯示在這裡…")
demo = gr.Interface(
    fn=predict,
    inputs=input_box,
    outputs=output_box,
    title="AI / Human 判斷器",
    description="上傳的模型為 .pkl 格式,根據語言特徵分析並判斷文本來源",
)
demo.launch()
# NOTE(review): the block below is a DISABLED legacy implementation (TensorFlow
# .h5 model + TextVectorization + scaler) kept alive as a module-level string
# literal. It is dead code — consider deleting it and relying on version
# control history instead. Content preserved byte-for-byte.
'''import gradio as gr
import tensorflow as tf
import pickle
# ---------------- 載入模型 ----------------
try:
model = tf.keras.models.load_model("AIDetect.h5")
print("✅ 模型載入成功")
except Exception as e:
print("❌ 模型載入失敗:", e)
model = None
# ---------------- 載入詞彙 ----------------
try:
with open("vocab.pkl", "rb") as f:
vocab = pickle.load(f)
vectorized_layer = tf.keras.layers.TextVectorization(
max_tokens=len(vocab)+1, output_sequence_length=50
)
vectorized_layer.set_vocabulary(vocab)
print("✅ 詞彙載入成功")
except Exception as e:
print("❌ 詞彙載入失敗:", e)
vectorized_layer = None
# ---------------- 載入 scaler ----------------
try:
with open("scaler.pkl", "rb") as f:
scaler = pickle.load(f)
print("✅ Scaler 載入成功")
except Exception as e:
print("❌ Scaler 載入失敗:", e)
scaler = None
# ---------------- 特徵計算 ----------------
def compute_features(text):
if isinstance(text, tf.Tensor):
text = text.numpy().decode('utf-8') if text.dtype == tf.string else str(text.numpy())
elif isinstance(text, bytes):
text = text.decode('utf-8')
else:
text = str(text)
words = text.split()
word_count = len(words)
unique_words = len(set(words))
unique_word_ratio = unique_words / (word_count + 1e-6)
repeat_rate = 1 - unique_word_ratio
punctuation_count = sum(1 for c in text if c in ".,!?;:")
punctuation_ratio = punctuation_count / (len(text) + 1e-6)
avg_word_length = sum(len(w) for w in words) / (word_count if word_count else 1)
return [[word_count, unique_word_ratio, repeat_rate, punctuation_ratio, avg_word_length]]
# ---------------- 使用 scaler ----------------
def transform_features(feat):
if scaler is None:
return feat # 如果 scaler 沒載入,就直接回傳原始特徵
return scaler.transform(feat).tolist() # 轉成 list,避免使用 NumPy
# ---------------- 生成解釋 ----------------
def explain_prediction(text):
if model is None or vectorized_layer is None:
return "❌ 模型或詞彙尚未載入,無法預測"
try:
# ---------------- 特徵計算 ----------------
feat_raw = compute_features(text)
feat = transform_features(feat_raw)
# ---------------- 文字向量化 ----------------
seq = vectorized_layer([text])
seq = tf.keras.utils.pad_sequences(seq, maxlen=50, padding='pre')
# 轉成 TensorFlow tensor
seq = tf.convert_to_tensor(seq)
feat = tf.convert_to_tensor(feat, dtype=tf.float32)
# ---------------- 預測 ----------------
pred_prob = model([seq, feat], training=False).numpy()[0][0]
label = "AI 生成" if pred_prob >= 0.5 else "人類撰寫"
prob = pred_prob * 100
# ---------------- 判斷依據 ----------------
reasons = []
if feat_raw[0][0] > 100: reasons.append("句子長度偏長")
if feat_raw[0][2] > 0.3: reasons.append("重複率高")
if feat_raw[0][1] < 0.2: reasons.append("詞彙多樣性低")
if feat_raw[0][3] < 0.01: reasons.append("標點符號少")
if feat_raw[0][4] > 6: reasons.append("平均詞長偏長")
if not reasons: reasons.append("句子長度與用詞平均")
explanation = ";".join(reasons)
return f"預測結果:{label}\nAI 機率:{prob:.2f}%\n判斷依據:{explanation}"
except Exception as e:
return f"❌ 預測時發生錯誤: {e}"
# ---------------- Gradio 介面 ----------------
iface = gr.Interface(
fn=explain_prediction,
inputs=gr.Textbox(label="請輸入文章內容", lines=15, max_lines=50, placeholder="在此輸入文章…"),
outputs=gr.Textbox(label="預測結果", lines=15, max_lines=30, placeholder="結果會顯示在這裡…"),
title="AI vs Human 文本判斷",
description="輸入文章,模型會判斷是 AI 或人類撰寫,並給出機率與判斷依據"
)
iface.launch()'''