import gradio as gr
import numpy as np
import librosa, joblib, os
from sklearn.ensemble import RandomForestClassifier
from transformers import pipeline

# β€”β€” 1) Train / load the voice emotion model β€”β€”
BASE_VOICE_PATH = r"C:\ζƒ…η·’"
VOICE_MODEL_FILE = "voice_model.joblib"
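# Assumed training-data layout: BASE_VOICE_PATH contains one sub-folder per emotion
# label (angry / happy / sad / fear / surprise), each holding that label's .wav clips.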

def train_voice_model():
    labels = ["angry","happy","sad","fear","surprise"]
    X, y = [], []
    for lbl in labels:
        folder = os.path.join(BASE_VOICE_PATH, lbl)
        if not os.path.isdir(folder):
            raise FileNotFoundError(f"Folder not found: {folder}")
        for fname in os.listdir(folder):
            if fname.lower().endswith(".wav"):
                path = os.path.join(folder, fname)
                audio, sr = librosa.load(path, sr=None)
                mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
                mfcc_mean = np.mean(mfccs.T, axis=0)
                X.append(mfcc_mean)
                y.append(lbl)
    if not X:
        raise RuntimeError(f"No .wav files found under {BASE_VOICE_PATH}; cannot train the voice model")
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X, y)
    joblib.dump(clf, VOICE_MODEL_FILE)
    return clf

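# Train once and cache the classifier; delete voice_model.joblib to force retraining
# after the training data changes.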
if os.path.exists(VOICE_MODEL_FILE):
    voice_clf = joblib.load(VOICE_MODEL_FILE)
else:
    voice_clf = train_voice_model()

def analyze_audio(path):
    audio, sr = librosa.load(path, sr=None)
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    mfcc_mean = np.mean(mfccs.T, axis=0).reshape(1, -1)
    return voice_clf.predict(mfcc_mean)[0]

# β€”β€” 2) Chinese text emotion analysis β€”β€”
def analyze_text(text):
    # Keyword lists stay in Chinese because the function matches Chinese input text
    if any(w in text for w in ["ι–‹εΏƒ","快樂"]): return "happy"     # "happy", "joyful"
    if any(w in text for w in ["η”Ÿζ°£","憀怒"]): return "angry"     # "angry", "furious"
    if any(w in text for w in ["ε‚·εΏƒ","ι›£ιŽ","ε“­"]): return "sad"   # "sad", "upset", "cry"
    if any(w in text for w in ["驚","意倖"]): return "surprise"    # "startled", "unexpected"
    if any(w in text for w in ["怕","恐懼"]): return "fear"        # "afraid", "fear"
    return "neutral"

# β€”β€” 3) Real-time facial emotion analysis (switched to the Hugging Face ferplus model) β€”β€”
face_classifier = pipeline(
    "image-classification",
    model="nateraw/ferplus",
    device=-1  # CPU
)
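# Note (assumption): the label strings returned by the FER+ image model (e.g. "happiness",
# "anger") may not match the voice/text labels ("happy", "angry"); if the modalities are
# ever compared downstream, a small mapping dict would be needed. Verify the exact label
# names against the model's config before relying on them.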

def analyze_face(img):
    # img: PIL image (the webcam component below is configured with type="pil")
    result = face_classifier(img, top_k=1)[0]
    return result["label"]

# β€”β€” 4) Build the multi-tab Gradio interface β€”β€”
with gr.Blocks() as demo:
    gr.Markdown("# Multimodal Emotion Analysis Demo")
    with gr.Tab("πŸ“ Text"):
        txt = gr.Textbox(placeholder="Enter Chinese text…")
        btn_txt = gr.Button("Analyze text")
        out_txt = gr.Textbox()
        btn_txt.click(analyze_text, inputs=txt, outputs=out_txt)

    with gr.Tab("🎀 θͺžιŸ³"):
        aud = gr.Audio(type="filepath")
        btn_aud = gr.Button("Analyze voice")
        out_aud = gr.Textbox()
        btn_aud.click(analyze_audio, inputs=aud, outputs=out_aud)

    with gr.Tab("πŸ“· 臉部"):
        img_cam = gr.Image(source="webcam")
        btn_img = gr.Button("εˆ†ζžθ‘¨ζƒ…")
        out_img = gr.Textbox()
        btn_img.click(analyze_face, inputs=img_cam, outputs=out_img)

demo.launch()
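# Optional: demo.launch(share=True) exposes a temporary public URL, and
# demo.launch(server_name="0.0.0.0") makes the app reachable from other machines.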