# AI_Detect / app.py
# Author: Hellowish
# Last change: "Update app.py" — commit 98cf186 (verified)
import gradio as gr
import joblib
import numpy as np
import re
# Load the pre-trained detector (a joblib-serialized .pkl model, e.g. scikit-learn).
# NOTE(review): path is relative — assumes ai_detector_model.pkl sits next to app.py; confirm the path is correct.
model = joblib.load("ai_detector_model.pkl")
# Lightweight sentence splitter (avoids an NLTK dependency).
def simple_sent_tokenize(text):
    """Split *text* into sentences on whitespace that follows '.', '!' or '?'.

    The terminal punctuation stays attached to each sentence; empty
    fragments are dropped.
    """
    pieces = re.split(r'(?<=[.!?])\s+', text.strip())
    # filter(None, ...) discards empty strings, same as `if s` in a comprehension.
    return list(filter(None, pieces))
# Lightweight word tokenizer (avoids an NLTK dependency).
def simple_word_tokenize(text):
    """Return the lowercase alphanumeric word tokens found in *text*."""
    word_pattern = re.compile(r'\b\w+\b')
    return word_pattern.findall(text.lower())
def extract_features(text):
    """Compute the 14 stylometric features the detector model was trained on.

    Returns a tuple ``(X, features)`` where ``X`` is a (1, 14) numpy array
    (dict insertion order — must stay in sync with training) and
    ``features`` is the same values keyed by name.
    """
    sentences = simple_sent_tokenize(text)
    # Keep purely alphabetic tokens only (numbers are excluded from word stats).
    tokens = [t for t in simple_word_tokenize(text) if t.isalpha()]

    # Clamped denominators so empty input never divides by zero.
    n_sent = max(len(sentences), 1)
    n_tok = max(len(tokens), 1)
    n_chars = max(len(text), 1)

    per_sentence_tokens = [len(simple_word_tokenize(s)) for s in sentences]
    lowered = text.lower()
    # NOTE(review): markers are counted as raw substrings, so e.g. 'thus'
    # also matches inside 'enthusiast'. Left as-is to match the distribution
    # the .pkl model was trained on — confirm against the training pipeline.
    ai_markers = ['furthermore', 'moreover', 'additionally', 'consequently', 'therefore', 'thus', 'hence']
    human_markers = ['i think', 'i believe', 'personally', 'maybe', 'probably', 'actually', 'really']

    features = {
        'text_length': len(text),
        'word_count': len(tokens),
        'sentence_count': len(sentences),
        'avg_sentence_length': len(tokens) / n_sent,
        'avg_word_length': np.mean([len(t) for t in tokens]) if tokens else 0,
        'vocabulary_richness': len(set(tokens)) / n_tok,
        'sentence_length_variance': np.var(per_sentence_tokens) if per_sentence_tokens else 0,
        # Punctuation densities are scaled to "per 1000 characters".
        'comma_density': text.count(',') / n_chars * 1000,
        'period_density': text.count('.') / n_chars * 1000,
        'exclamation_density': text.count('!') / n_chars * 1000,
        'question_density': text.count('?') / n_chars * 1000,
        'complex_word_ratio': sum(1 for t in tokens if len(t) > 6) / n_tok,
        'ai_marker_count': sum(lowered.count(m) for m in ai_markers),
        'human_marker_count': sum(lowered.count(m) for m in human_markers),
    }
    return np.array(list(features.values())).reshape(1, -1), features
def predict(text):
    """Classify *text* as AI-generated or human-written.

    Returns a human-readable result string with the predicted label,
    the AI probability and a short list of supporting observations.
    Any runtime failure is caught and reported as a message rather
    than raised (Gradio callback — must not crash the UI).
    """
    if not text.strip():
        return "請輸入文字內容。"
    try:
        X, feats = extract_features(text)
        # Prefer a calibrated probability; fall back to the raw label
        # (0/1) for models without predict_proba.
        if hasattr(model, "predict_proba"):
            prob = model.predict_proba(X)[0][1]
        else:
            prob = model.predict(X)[0]
        label = "AI 生成" if prob > 0.5 else "人類撰寫"

        # Collect heuristic explanations; conditions are checked in the
        # same order as the original rule list.
        checks = (
            (feats['vocabulary_richness'] < 0.3, "詞彙多樣性較低"),
            (feats['sentence_length_variance'] < 10, "句子長度平均,像 AI"),
            (feats['ai_marker_count'] > feats['human_marker_count'], "包含常見 AI 連接詞"),
            (feats['human_marker_count'] > feats['ai_marker_count'], "包含主觀語氣詞"),
        )
        reasons = [msg for hit, msg in checks if hit]
        if not reasons:
            reasons = ["整體語言特徵與模型預測一致"]
        return f"預測結果:{label}\nAI 機率:{prob:.2%}\n判斷依據:{', '.join(reasons)}"
    except Exception as e:
        return f"預測時出現錯誤: {str(e)}"
# Gradio UI: one text box in, one text box out.
input_box = gr.Textbox(label="請輸入文章內容", lines=15, max_lines=50, placeholder="在此輸入文章…")
output_box = gr.Textbox(label="預測結果", lines=15, max_lines=30, placeholder="結果會顯示在這裡…")
demo = gr.Interface(
    fn=predict,
    inputs=input_box,
    outputs=output_box,
    title="AI / Human 判斷器",
    description="上傳的模型為 .pkl 格式,根據語言特徵分析並判斷文本來源",
)
demo.launch()
# NOTE(review): the block below is a DISABLED legacy implementation (TensorFlow
# .h5 model + TextVectorization + scaler) kept alive as a module-level string
# literal. It is dead code — consider deleting it and relying on version
# control history instead. Content preserved byte-for-byte.
'''import gradio as gr
import tensorflow as tf
import pickle
# ---------------- 載入模型 ----------------
try:
model = tf.keras.models.load_model("AIDetect.h5")
print("✅ 模型載入成功")
except Exception as e:
print("❌ 模型載入失敗:", e)
model = None
# ---------------- 載入詞彙 ----------------
try:
with open("vocab.pkl", "rb") as f:
vocab = pickle.load(f)
vectorized_layer = tf.keras.layers.TextVectorization(
max_tokens=len(vocab)+1, output_sequence_length=50
)
vectorized_layer.set_vocabulary(vocab)
print("✅ 詞彙載入成功")
except Exception as e:
print("❌ 詞彙載入失敗:", e)
vectorized_layer = None
# ---------------- 載入 scaler ----------------
try:
with open("scaler.pkl", "rb") as f:
scaler = pickle.load(f)
print("✅ Scaler 載入成功")
except Exception as e:
print("❌ Scaler 載入失敗:", e)
scaler = None
# ---------------- 特徵計算 ----------------
def compute_features(text):
if isinstance(text, tf.Tensor):
text = text.numpy().decode('utf-8') if text.dtype == tf.string else str(text.numpy())
elif isinstance(text, bytes):
text = text.decode('utf-8')
else:
text = str(text)
words = text.split()
word_count = len(words)
unique_words = len(set(words))
unique_word_ratio = unique_words / (word_count + 1e-6)
repeat_rate = 1 - unique_word_ratio
punctuation_count = sum(1 for c in text if c in ".,!?;:")
punctuation_ratio = punctuation_count / (len(text) + 1e-6)
avg_word_length = sum(len(w) for w in words) / (word_count if word_count else 1)
return [[word_count, unique_word_ratio, repeat_rate, punctuation_ratio, avg_word_length]]
# ---------------- 使用 scaler ----------------
def transform_features(feat):
if scaler is None:
return feat # 如果 scaler 沒載入,就直接回傳原始特徵
return scaler.transform(feat).tolist() # 轉成 list,避免使用 NumPy
# ---------------- 生成解釋 ----------------
def explain_prediction(text):
if model is None or vectorized_layer is None:
return "❌ 模型或詞彙尚未載入,無法預測"
try:
# ---------------- 特徵計算 ----------------
feat_raw = compute_features(text)
feat = transform_features(feat_raw)
# ---------------- 文字向量化 ----------------
seq = vectorized_layer([text])
seq = tf.keras.utils.pad_sequences(seq, maxlen=50, padding='pre')
# 轉成 TensorFlow tensor
seq = tf.convert_to_tensor(seq)
feat = tf.convert_to_tensor(feat, dtype=tf.float32)
# ---------------- 預測 ----------------
pred_prob = model([seq, feat], training=False).numpy()[0][0]
label = "AI 生成" if pred_prob >= 0.5 else "人類撰寫"
prob = pred_prob * 100
# ---------------- 判斷依據 ----------------
reasons = []
if feat_raw[0][0] > 100: reasons.append("句子長度偏長")
if feat_raw[0][2] > 0.3: reasons.append("重複率高")
if feat_raw[0][1] < 0.2: reasons.append("詞彙多樣性低")
if feat_raw[0][3] < 0.01: reasons.append("標點符號少")
if feat_raw[0][4] > 6: reasons.append("平均詞長偏長")
if not reasons: reasons.append("句子長度與用詞平均")
explanation = ";".join(reasons)
return f"預測結果:{label}\nAI 機率:{prob:.2f}%\n判斷依據:{explanation}"
except Exception as e:
return f"❌ 預測時發生錯誤: {e}"
# ---------------- Gradio 介面 ----------------
iface = gr.Interface(
fn=explain_prediction,
inputs=gr.Textbox(label="請輸入文章內容", lines=15, max_lines=50, placeholder="在此輸入文章…"),
outputs=gr.Textbox(label="預測結果", lines=15, max_lines=30, placeholder="結果會顯示在這裡…"),
title="AI vs Human 文本判斷",
description="輸入文章,模型會判斷是 AI 或人類撰寫,並給出機率與判斷依據"
)
iface.launch()'''