Pedrinho-Dev01 committed on
Commit
3d7c8ba
·
1 Parent(s): 13a9adc

Updated Host

Browse files
Dockerfile CHANGED
@@ -1,4 +1,4 @@
1
- FROM python:3.11-slim
2
 
3
  WORKDIR /app
4
 
@@ -7,6 +7,8 @@ RUN pip install --no-cache-dir -r requirements.txt
7
 
8
  COPY . .
9
 
 
 
10
  EXPOSE 7860
11
 
12
- CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
+ FROM python:3.12-slim
2
 
3
  WORKDIR /app
4
 
 
7
 
8
  COPY . .
9
 
10
+ ENV HF_HOME=/tmp/huggingface
11
+
12
  EXPOSE 7860
13
 
14
+ CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "7860"]
api.py CHANGED
@@ -5,13 +5,10 @@ Run with: uvicorn api:app --reload
5
  """
6
 
7
  import json
8
- import os
9
- from pathlib import Path
10
  from typing import Optional
11
 
12
  import email
13
  from email import policy as email_policy
14
- import numpy as np
15
  import torch
16
  from fastapi import FastAPI, HTTPException, UploadFile, File
17
  from fastapi.middleware.cors import CORSMiddleware
@@ -24,11 +21,8 @@ from transformers import (
24
 
25
  # ── Config ────────────────────────────────────────────────────────────────────
26
 
27
- BASE_DIR = Path(__file__).parent
28
- MODELS_DIR = BASE_DIR / "models"
29
-
30
- ROBERTA_DIR = MODELS_DIR / "roberta_large_final"
31
- ELECTRA_DIR = MODELS_DIR / "electra_large_final"
32
 
33
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
34
 
@@ -45,7 +39,7 @@ app = FastAPI(
45
 
46
  app.add_middleware(
47
  CORSMiddleware,
48
- allow_origins=["https://pedrinho-dev01.github.io/gone-phishing/"],
49
  allow_methods=["*"],
50
  allow_headers=["*"],
51
  )
@@ -54,17 +48,20 @@ app.add_middleware(
54
  # ── Model loading ─────────────────────────────────────────────────────────────
55
 
56
  class ModelBundle:
57
- def __init__(self, model_dir: Path, model_class, tokenizer_class=None):
58
- self.model_dir = model_dir
59
- self.tokenizer = AutoTokenizer.from_pretrained(str(model_dir))
60
- self.model = model_class.from_pretrained(str(model_dir))
61
  self.model.to(DEVICE)
62
  self.model.eval()
63
 
64
- threshold_path = model_dir / "threshold_config.json"
 
 
65
  with open(threshold_path) as f:
66
  cfg = json.load(f)
67
  self.threshold: float = cfg["recommended_threshold"]
 
68
 
69
  @torch.no_grad()
70
  def predict_proba(self, text: str) -> float:
@@ -89,11 +86,9 @@ electra_bundle: Optional[ModelBundle] = None
89
  @app.on_event("startup")
90
  def load_models():
91
  global roberta_bundle, electra_bundle
92
- print("Loading RoBERTa …")
93
- roberta_bundle = ModelBundle(ROBERTA_DIR, RobertaForSequenceClassification)
94
- print("Loading ELECTRA ")
95
- electra_bundle = ModelBundle(ELECTRA_DIR, ElectraForSequenceClassification)
96
- print(f"Models loaded on {DEVICE}.")
97
 
98
 
99
  # ── Schemas ───────────────────────────────────────────────────────────────────
@@ -217,7 +212,7 @@ def extract_text_from_eml(raw_bytes: bytes) -> str:
217
  if subject:
218
  parts.append(f"Subject: {subject}")
219
 
220
- # From / To for extra signal
221
  from_addr = msg.get("from", "")
222
  if from_addr:
223
  parts.append(f"From: {from_addr}")
@@ -233,7 +228,6 @@ def extract_text_from_eml(raw_bytes: bytes) -> str:
233
  # Fallback to HTML only if no plain text found
234
  import html as html_lib
235
  raw_html = part.get_content()
236
- # Very light strip — remove tags
237
  import re
238
  text = re.sub(r"<[^>]+>", " ", raw_html)
239
  text = html_lib.unescape(text)
@@ -267,5 +261,4 @@ async def predict_eml(file: UploadFile = File(...)):
267
  print(analyzed_text)
268
  print("=== [END EMAIL CONTENT] ===\n")
269
 
270
- # Reuse the existing ensemble prediction logic
271
  return predict(PredictRequest(text=analyzed_text, model="ensemble"))
 
5
  """
6
 
7
  import json
 
 
8
  from typing import Optional
9
 
10
  import email
11
  from email import policy as email_policy
 
12
  import torch
13
  from fastapi import FastAPI, HTTPException, UploadFile, File
14
  from fastapi.middleware.cors import CORSMiddleware
 
21
 
22
  # ── Config ────────────────────────────────────────────────────────────────────
23
 
24
+ ROBERTA_REPO = "Dpedrinho01/trained_roberta_large"
25
+ ELECTRA_REPO = "Dpedrinho01/trained_electra_large"
 
 
 
26
 
27
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
28
 
 
39
 
40
  app.add_middleware(
41
  CORSMiddleware,
42
+ allow_origins=["*"],
43
  allow_methods=["*"],
44
  allow_headers=["*"],
45
  )
 
48
  # ── Model loading ─────────────────────────────────────────────────────────────
49
 
50
  class ModelBundle:
51
+ def __init__(self, repo_id: str, model_class):
52
+ print(f"Loading {repo_id} …")
53
+ self.tokenizer = AutoTokenizer.from_pretrained(repo_id)
54
+ self.model = model_class.from_pretrained(repo_id)
55
  self.model.to(DEVICE)
56
  self.model.eval()
57
 
58
+ # Load threshold from the repo's threshold_config.json
59
+ from huggingface_hub import hf_hub_download
60
+ threshold_path = hf_hub_download(repo_id=repo_id, filename="threshold_config.json")
61
  with open(threshold_path) as f:
62
  cfg = json.load(f)
63
  self.threshold: float = cfg["recommended_threshold"]
64
+ print(f" ✓ {repo_id} loaded (threshold={self.threshold}, device={DEVICE})")
65
 
66
  @torch.no_grad()
67
  def predict_proba(self, text: str) -> float:
 
86
  @app.on_event("startup")
87
  def load_models():
88
  global roberta_bundle, electra_bundle
89
+ roberta_bundle = ModelBundle(ROBERTA_REPO, RobertaForSequenceClassification)
90
+ electra_bundle = ModelBundle(ELECTRA_REPO, ElectraForSequenceClassification)
91
+ print(f"All models ready on {DEVICE}.")
 
 
92
 
93
 
94
  # ── Schemas ───────────────────────────────────────────────────────────────────
 
212
  if subject:
213
  parts.append(f"Subject: {subject}")
214
 
215
+ # From for extra signal
216
  from_addr = msg.get("from", "")
217
  if from_addr:
218
  parts.append(f"From: {from_addr}")
 
228
  # Fallback to HTML only if no plain text found
229
  import html as html_lib
230
  raw_html = part.get_content()
 
231
  import re
232
  text = re.sub(r"<[^>]+>", " ", raw_html)
233
  text = html_lib.unescape(text)
 
261
  print(analyzed_text)
262
  print("=== [END EMAIL CONTENT] ===\n")
263
 
 
264
  return predict(PredictRequest(text=analyzed_text, model="ensemble"))
models/electra_large_final/.gitattributes DELETED
@@ -1 +0,0 @@
1
- model.safetensors filter=lfs diff=lfs merge=lfs -text
 
 
models/electra_large_final/config.json DELETED
@@ -1,34 +0,0 @@
1
- {
2
- "add_cross_attention": false,
3
- "architectures": [
4
- "ElectraForSequenceClassification"
5
- ],
6
- "attention_probs_dropout_prob": 0.1,
7
- "bos_token_id": null,
8
- "classifier_dropout": null,
9
- "dtype": "float32",
10
- "embedding_size": 1024,
11
- "eos_token_id": null,
12
- "hidden_act": "gelu",
13
- "hidden_dropout_prob": 0.1,
14
- "hidden_size": 1024,
15
- "initializer_range": 0.02,
16
- "intermediate_size": 4096,
17
- "is_decoder": false,
18
- "layer_norm_eps": 1e-12,
19
- "max_position_embeddings": 512,
20
- "model_type": "electra",
21
- "num_attention_heads": 16,
22
- "num_hidden_layers": 24,
23
- "pad_token_id": 0,
24
- "position_embedding_type": "absolute",
25
- "summary_activation": "gelu",
26
- "summary_last_dropout": 0.1,
27
- "summary_type": "first",
28
- "summary_use_proj": true,
29
- "tie_word_embeddings": true,
30
- "transformers_version": "5.3.0",
31
- "type_vocab_size": 2,
32
- "use_cache": false,
33
- "vocab_size": 30522
34
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/electra_large_final/threshold_config.json DELETED
@@ -1,15 +0,0 @@
1
- {
2
- "recommended_threshold": 0.35,
3
- "standard_metrics": {
4
- "accuracy": 0.9256,
5
- "f1": 0.9051987767584098,
6
- "precision": 0.9230769230769231,
7
- "recall": 0.888
8
- },
9
- "custom_metrics": {
10
- "accuracy": 0.9256,
11
- "f1": 0.9055837563451776,
12
- "precision": 0.9195876288659793,
13
- "recall": 0.892
14
- }
15
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/electra_large_final/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
models/electra_large_final/tokenizer_config.json DELETED
@@ -1,14 +0,0 @@
1
- {
2
- "backend": "tokenizers",
3
- "cls_token": "[CLS]",
4
- "do_lower_case": true,
5
- "is_local": false,
6
- "mask_token": "[MASK]",
7
- "model_max_length": 512,
8
- "pad_token": "[PAD]",
9
- "sep_token": "[SEP]",
10
- "strip_accents": null,
11
- "tokenize_chinese_chars": true,
12
- "tokenizer_class": "BertTokenizer",
13
- "unk_token": "[UNK]"
14
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/electra_large_final/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3e251fe80c570139a5ddea6518864f1ccf76ef6536208c2d234507ba2c06c2b9
3
- size 4856
 
 
 
 
models/roberta_large_final/.gitattributes DELETED
@@ -1 +0,0 @@
1
- model.safetensors filter=lfs diff=lfs merge=lfs -text
 
 
models/roberta_large_final/config.json DELETED
@@ -1,28 +0,0 @@
1
- {
2
- "add_cross_attention": false,
3
- "architectures": [
4
- "RobertaForSequenceClassification"
5
- ],
6
- "attention_probs_dropout_prob": 0.1,
7
- "bos_token_id": 0,
8
- "classifier_dropout": null,
9
- "dtype": "float32",
10
- "eos_token_id": 2,
11
- "hidden_act": "gelu",
12
- "hidden_dropout_prob": 0.1,
13
- "hidden_size": 1024,
14
- "initializer_range": 0.02,
15
- "intermediate_size": 4096,
16
- "is_decoder": false,
17
- "layer_norm_eps": 1e-05,
18
- "max_position_embeddings": 514,
19
- "model_type": "roberta",
20
- "num_attention_heads": 16,
21
- "num_hidden_layers": 24,
22
- "pad_token_id": 1,
23
- "tie_word_embeddings": true,
24
- "transformers_version": "5.3.0",
25
- "type_vocab_size": 1,
26
- "use_cache": false,
27
- "vocab_size": 50265
28
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/roberta_large_final/threshold_config.json DELETED
@@ -1,15 +0,0 @@
1
- {
2
- "recommended_threshold": 0.35,
3
- "standard_metrics": {
4
- "accuracy": 0.9352,
5
- "f1": 0.916923076923077,
6
- "precision": 0.9410526315789474,
7
- "recall": 0.894
8
- },
9
- "custom_metrics": {
10
- "accuracy": 0.9336,
11
- "f1": 0.9150460593654043,
12
- "precision": 0.9371069182389937,
13
- "recall": 0.894
14
- }
15
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/roberta_large_final/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
models/roberta_large_final/tokenizer_config.json DELETED
@@ -1,16 +0,0 @@
1
- {
2
- "add_prefix_space": false,
3
- "backend": "tokenizers",
4
- "bos_token": "<s>",
5
- "cls_token": "<s>",
6
- "eos_token": "</s>",
7
- "errors": "replace",
8
- "is_local": false,
9
- "mask_token": "<mask>",
10
- "model_max_length": 512,
11
- "pad_token": "<pad>",
12
- "sep_token": "</s>",
13
- "tokenizer_class": "RobertaTokenizer",
14
- "trim_offsets": true,
15
- "unk_token": "<unk>"
16
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/roberta_large_final/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:cf7746da523087b4c98b10face3adad900b52a4c3ab325a7207442bec1e9eddb
3
- size 4856