Update AI_Model_architecture.py
AI_Model_architecture.py  CHANGED  (+212 -212)

@@ -1,212 +1,212 @@
"""Pipeline overview
Load data → split data → encode → build Dataset / DataLoader
        ↓
Build the model (BERT + LSTM + CNN)
        ↓
BERT output [batch, seq_len, 768]
        ↓
BiLSTM [batch, seq_len, hidden_dim*2]
        ↓
CNN block (Conv1D + Dropout + GlobalMaxPooling1D)
        ↓
Linear classifier (outputs the scam probability)
        ↓
Train the model (epochs)
        ↓
Evaluate the model (Accuracy / F1 / Precision / Recall)
        ↓
Save the model (.pth)

"""

# Import required libraries
import torch                      # core PyTorch module
import torch.nn as nn             # neural-network layers (e.g. LSTM, Linear)
import torch.nn.functional as F   # functional ops such as F.relu(), F.cross_entropy(); stateless, no learnable weights
import numpy as np
import pandas as pd
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:16"  # make CUDA allocate smaller memory blocks, which helps avoid OOM errors
import re

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset  # Dataset / DataLoader classes
from transformers import BertTokenizer
from transformers import BertModel
# BertTokenizer converts a sentence into BERT-style token IDs, e.g. [CLS] 今天 天氣 不錯 [SEP] → [101, 1234, 5678, ...]
# BertForSequenceClassification is a complete Hugging Face BERT model with a classification Linear head,
# usable directly for classification tasks (e.g. scam vs. normal).


data_file = "NorANDScamInfo_data1.csv"
# Add normal-message datasets here
normal_files = [data_file]

# Add scam-message datasets here
scam_files = [data_file]

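# Assumed CSV layout (not stated in this commit): each file needs a "message" column with the raw
# text and a "label" column already filled with 0/1 (taken here to mean normal/scam).
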
# Data preprocessing
class BertPreprocessor:
    def __init__(self, tokenizer_name="ckiplab/bert-base-chinese", max_len=128):
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name)
        self.max_len = max_len

    def load_and_clean(self, filepath):
        # Load a CSV file and clean the "message" column.
        df = pd.read_csv(filepath)
        df = df.dropna().drop_duplicates().reset_index(drop=True)
        # Text cleaning: remove whitespace, keep Chinese characters, alphanumerics, and basic punctuation
        df["message"] = df["message"].astype(str)
        df["message"] = df["message"].apply(lambda text: re.sub(r"\s+", "", text))
        df["message"] = df["message"].apply(lambda text: re.sub(r"[^\u4e00-\u9fffA-Za-z0-9。,!?]", "", text))
        return df[["message", "label"]]  # keep only the required columns

    def encode(self, messages):
        # Encode the messages into BERT model inputs with the Hugging Face tokenizer.
        return self.tokenizer(
            list(messages),
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=self.max_len
        )
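# Usage sketch (illustrative only; the sample message below is made up):
#   processor = BertPreprocessor()
#   enc = processor.encode(["您好,這是一則測試訊息"])
#   enc["input_ids"].shape       -> torch.Size([1, 128])
#   enc["attention_mask"].shape  -> torch.Size([1, 128])
#   enc["token_type_ids"].shape  -> torch.Size([1, 128])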
# Automated data preprocessing
def build_bert_inputs(normal_files, scam_files):
    # Assign labels to the normal and scam data, clean and encode everything in one pass,
    # and return model-ready input tensors plus labels.
    processor = BertPreprocessor()
    dfs = []
    # Combine the normal and scam file lists
    all_files = normal_files + scam_files

    for filepath in all_files:
        df = processor.load_and_clean(filepath)
        dfs.append(df)

    # Merge all data. During cleaning, dropna() removes rows with missing values,
    # drop_duplicates() removes duplicate rows, filter() / df[...] do conditional filtering,
    # and concat() merges multiple DataFrames.
    # None of these operations renumbers the index, so it ends up out of order;
    # ignore_index=True renumbers the merged DataFrame (common when concatenating several sources).
    all_df = pd.concat(dfs, ignore_index=True)
    # Build the train/val split
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        all_df["message"], all_df["label"],
        stratify=all_df["label"],
        test_size=0.2,
        random_state=25,
        shuffle=True
    )

    # Encode with the BERT tokenizer
    train_inputs = processor.encode(train_texts)
    val_inputs = processor.encode(val_texts)

    return train_inputs, train_labels, val_inputs, val_labels, processor

# AUTO: files added to the individual lists are picked up automatically by build_bert_inputs
normal_files_labels = [normal for normal in normal_files]
scam_files_labels = [scam for scam in scam_files]

#print(bert_inputs.keys())

# PyTorch Dataset definition
class ScamDataset(Dataset):
    def __init__(self, inputs, labels):
        self.input_ids = inputs["input_ids"]            # input_ids: token IDs of the sentence
        self.attention_mask = inputs["attention_mask"]  # attention_mask: attention mask (0 = padding)
        self.token_type_ids = inputs["token_type_ids"]  # token_type_ids: segment IDs
        # torch.tensor(x, dtype=...) is the standard way to turn data (a list, NumPy array,
        # pandas Series, ...) into a tensor; labels.values converts the Series to a NumPy array.
        # dtype float32 is used for regression or BCELoss binary classification;
        # long is used for multi-class classification with CrossEntropyLoss.
        self.labels = torch.tensor(labels.values, dtype=torch.float32)

    def __len__(self):  # tells PyTorch how many samples the Dataset holds
        return len(self.labels)  # used by len(dataset) or for i in range(len(dataset))

    def __getitem__(self, idx):  # returns sample idx (fetched one by one during training)
        return {  # the DataLoader calls this repeatedly to assemble a batch
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "token_type_ids": self.token_type_ids[idx],
            "labels": self.labels[idx]
        }

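# Each item is a dict of tensors: input_ids / attention_mask / token_type_ids of shape [128]
# (max_len above) plus a scalar float label.
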
# Handles both scam and normal data without duplicating the cleaning and tokenization code
train_inputs, train_labels, val_inputs, val_labels, processor = build_bert_inputs(normal_files, scam_files)

train_dataset = ScamDataset(train_inputs, train_labels)
val_dataset = ScamDataset(val_inputs, val_labels)

train_loader = DataLoader(train_dataset, batch_size=8)
val_loader = DataLoader(val_dataset, batch_size=8)

# Model
class BertLSTM_CNN_Classifier(nn.Module):
    def __init__(self, hidden_dim=128, num_layers=1, dropout=0.3):
        super(BertLSTM_CNN_Classifier, self).__init__()
        self.bert = BertModel.from_pretrained("ckiplab/bert-base-chinese")  # pretrained Chinese BERT (ckiplab)
        # The LSTM sits on top of BERT's token outputs (input size 768)
        self.LSTM = nn.LSTM(input_size=768,  # feed BERT's token sequence to a bidirectional LSTM for sequence modelling
                            hidden_size=hidden_dim,
                            num_layers=num_layers,
                            batch_first=True,
                            bidirectional=True)
        # CNN block applied to the LSTM output
        self.conv1 = nn.Conv1d(in_channels=hidden_dim*2,
                               out_channels=128,
                               kernel_size=3,
                               padding=1)
        self.dropout = nn.Dropout(dropout)
        self.global_maxpool = nn.AdaptiveMaxPool1d(1)  # equivalent to Keras GlobalMaxPooling1D

        self.classifier = nn.Linear(128, 1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)
        hidden_states = outputs.last_hidden_state  # [batch, seq_len, 768]

        LSTM_out, _ = self.LSTM(hidden_states)     # [batch, seq_len, hidden_dim*2]
        LSTM_out = LSTM_out.transpose(1, 2)        # [batch, hidden_dim*2, seq_len]

        x = self.conv1(LSTM_out)                   # [batch, 128, seq_len]
        x = self.dropout(x)
        x = self.global_maxpool(x).squeeze(2)      # [batch, 128]

        logits = self.classifier(x)
        return torch.sigmoid(logits).view(-1)      # [batch] probabilities for BCELoss


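# Quick shape check (a sketch; batch size 2 chosen arbitrarily):
#   m = BertLSTM_CNN_Classifier()
#   ids = torch.zeros(2, 128, dtype=torch.long)
#   m(ids, torch.ones(2, 128, dtype=torch.long), torch.zeros(2, 128, dtype=torch.long)).shape
#   -> torch.Size([2])
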
# Select the GPU device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Cap the number of CPU threads (depends on your CPU)
torch.set_num_threads(8)  # ideally set to the number of physical cores on your machine
# Initialise the model
model = BertLSTM_CNN_Classifier().to(device)
# Define the optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.BCELoss()

# Training loop

if __name__ == "__main__":
    if os.path.exists("model.pth"):
        print("✅ Found model.pth, loading the model and skipping training")
        model.load_state_dict(torch.load("model.pth", map_location=device))
    else:
        print("🚀 model.pth not found, starting training...")
        num_epochs = 10
        for epoch in range(num_epochs):
            model.train()
            total_loss = 0.0
            for batch in train_loader:
                optimizer.zero_grad()
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                token_type_ids = batch["token_type_ids"].to(device)
                labels = batch["labels"].to(device)

                outputs = model(input_ids, attention_mask, token_type_ids)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
            print(f"[Epoch {epoch+1}] Training Loss: {total_loss:.4f}")
        torch.save(model.state_dict(), "model.pth")  # save the model weights
        print("✅ Model training finished and saved as model.pth")

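    # Evaluation sketch (an assumption, not part of this commit): the flow chart and the sklearn
    # metric imports above imply a validation pass, which is not implemented here.
    # A minimal version could look like this, using a 0.5 probability threshold:
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            token_type_ids = batch["token_type_ids"].to(device)
            probs = model(input_ids, attention_mask, token_type_ids)
            all_preds.extend((probs >= 0.5).long().cpu().tolist())
            all_labels.extend(batch["labels"].long().tolist())
    print(f"Accuracy : {accuracy_score(all_labels, all_preds):.4f}")
    print(f"Precision: {precision_score(all_labels, all_preds):.4f}")
    print(f"Recall   : {recall_score(all_labels, all_preds):.4f}")
    print(f"F1       : {f1_score(all_labels, all_preds):.4f}")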