MAS-AI-0000 committed on
Commit 13464bf · verified · 1 Parent(s): 1da3a1c

Update textPreprocess.py

Files changed (1)
  1. textPreprocess.py +123 -123
textPreprocess.py CHANGED
@@ -1,123 +1,123 @@
- import torch
- from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
- import os
-
- # ── 1) Configuration ────────────────────────────────────────────────────────────
- BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
- MODEL_DIR = os.path.join(BASE_DIR, "Lib/Models/Text") # Update this path to your model location
- MAX_LEN = 512
-
- # ── 2) Load model & tokenizer ──────────────────────────────────────────────────
- device = "cuda" if torch.cuda.is_available() else "cpu"
- print(f"Text prediction device: {device}")
-
- # Global variables for model and tokenizer
- tokenizer = None
- model = None
- ID2LABEL = {0: "human", 1: "ai"}
-
- try:
-     # Config carries id2label/label2id if you saved them
-     config = AutoConfig.from_pretrained(MODEL_DIR)
-
-     # Loads tokenizer.json + special_tokens_map.json automatically
-     tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)
-
-     # Loads model.safetensors automatically (no extra flags needed)
-     model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR, config=config)
-     model.eval().to(device)
-
-     # Update label mapping from config if available
-     ID2LABEL = model.config.id2label if getattr(model.config, "id2label", None) else {0: "human", 1: "ai"}
-
-     print(f"Text classification model loaded successfully")
-     print("Labels:", ID2LABEL)
- except Exception as e:
-     print(f"Error loading text model: {e}")
-     print("Text prediction will return fallback responses")
-
- # ── 3) Inference function ──────────────────────────────────────────────────────
- @torch.inference_mode()
- def predict_text(text: str, max_length: int = None):
-     """
-     Predict whether the given text is human-written or AI-generated.
-
-     Args:
-         text (str): The text to classify
-         max_length (int): Maximum sequence length for tokenization (defaults to MAX_LEN)
-
-     Returns:
-         dict: Contains predicted_class and confidence
-     """
-     if model is None or tokenizer is None:
-         return {"predicted_class": "Human", "confidence": 0}
-
-     if max_length is None:
-         max_length = MAX_LEN
-
-     try:
-         # Tokenize input
-         enc = tokenizer(
-             text,
-             return_tensors="pt",
-             truncation=True,
-             max_length=max_length,
-         )
-         enc = {k: v.to(device) for k, v in enc.items()}
-
-         # Get predictions
-         logits = model(**enc).logits
-         probs = torch.softmax(logits, dim=-1).squeeze(0).detach().cpu().numpy()
-         pred_id = int(probs.argmax(-1))
-
-         # Get label (capitalize first letter for consistency)
-         label = ID2LABEL.get(pred_id, str(pred_id))
-         label = label.capitalize() # "human" -> "Human", "ai" -> "Ai"
-
-         return {
-             "predicted_class": label,
-             "confidence": float(probs[pred_id])
-         }
-     except Exception as e:
-         print(f"Error during text prediction: {e}")
-         return {"predicted_class": "Human", "confidence": 0}
-
- # ── 4) Batch prediction (optional, for future use) ─────────────────────────────
- @torch.inference_mode()
- def predict_batch(texts, batch_size=16):
-     """
-     Predict multiple texts in batches.
-
-     Args:
-         texts (list): List of text strings to classify
-         batch_size (int): Batch size for processing
-
-     Returns:
-         list: List of prediction dictionaries
-     """
-     if model is None or tokenizer is None:
-         return [{"predicted_class": "Human", "confidence": 0} for _ in texts]
-
-     results = []
-     for i in range(0, len(texts), batch_size):
-         chunk = texts[i:i+batch_size]
-         enc = tokenizer(
-             chunk,
-             return_tensors="pt",
-             truncation=True,
-             max_length=MAX_LEN,
-             padding=True,
-         )
-         enc = {k: v.to(device) for k, v in enc.items()}
-         logits = model(**enc).logits
-         probs = torch.softmax(logits, dim=-1).detach().cpu().numpy()
-         ids = probs.argmax(-1)
-
-         for t, pid, p in zip(chunk, ids, probs):
-             label = ID2LABEL.get(int(pid), str(int(pid))).capitalize()
-             results.append({
-                 "text": t,
-                 "predicted_class": label,
-                 "confidence": float(p[int(pid)])
-             })
-     return results
+ import torch
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
+ import os
+
+ # ── 1) Configuration ────────────────────────────────────────────────────────────
+ BASE_DIR = "MAS-AI-0000/Authentica"
+ MODEL_DIR = os.path.join(BASE_DIR, "Lib/Models/Text") # Update this path to your model location
+ MAX_LEN = 512
+
+ # ── 2) Load model & tokenizer ──────────────────────────────────────────────────
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ print(f"Text prediction device: {device}")
+
+ # Global variables for model and tokenizer
+ tokenizer = None
+ model = None
+ ID2LABEL = {0: "human", 1: "ai"}
+
+ try:
+     # Config carries id2label/label2id if you saved them
+     config = AutoConfig.from_pretrained(MODEL_DIR)
+
+     # Loads tokenizer.json + special_tokens_map.json automatically
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)
+
+     # Loads model.safetensors automatically (no extra flags needed)
+     model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR, config=config)
+     model.eval().to(device)
+
+     # Update label mapping from config if available
+     ID2LABEL = model.config.id2label if getattr(model.config, "id2label", None) else {0: "human", 1: "ai"}
+
+     print(f"Text classification model loaded successfully")
+     print("Labels:", ID2LABEL)
+ except Exception as e:
+     print(f"Error loading text model: {e}")
+     print("Text prediction will return fallback responses")
+
+ # ── 3) Inference function ──────────────────────────────────────────────────────
+ @torch.inference_mode()
+ def predict_text(text: str, max_length: int = None):
+     """
+     Predict whether the given text is human-written or AI-generated.
+
+     Args:
+         text (str): The text to classify
+         max_length (int): Maximum sequence length for tokenization (defaults to MAX_LEN)
+
+     Returns:
+         dict: Contains predicted_class and confidence
+     """
+     if model is None or tokenizer is None:
+         return {"predicted_class": "Human", "confidence": 0}
+
+     if max_length is None:
+         max_length = MAX_LEN
+
+     try:
+         # Tokenize input
+         enc = tokenizer(
+             text,
+             return_tensors="pt",
+             truncation=True,
+             max_length=max_length,
+         )
+         enc = {k: v.to(device) for k, v in enc.items()}
+
+         # Get predictions
+         logits = model(**enc).logits
+         probs = torch.softmax(logits, dim=-1).squeeze(0).detach().cpu().numpy()
+         pred_id = int(probs.argmax(-1))
+
+         # Get label (capitalize first letter for consistency)
+         label = ID2LABEL.get(pred_id, str(pred_id))
+         label = label.capitalize() # "human" -> "Human", "ai" -> "Ai"
+
+         return {
+             "predicted_class": label,
+             "confidence": float(probs[pred_id])
+         }
+     except Exception as e:
+         print(f"Error during text prediction: {e}")
+         return {"predicted_class": "Human", "confidence": 0}
+
+ # ── 4) Batch prediction (optional, for future use) ─────────────────────────────
+ @torch.inference_mode()
+ def predict_batch(texts, batch_size=16):
+     """
+     Predict multiple texts in batches.
+
+     Args:
+         texts (list): List of text strings to classify
+         batch_size (int): Batch size for processing
+
+     Returns:
+         list: List of prediction dictionaries
+     """
+     if model is None or tokenizer is None:
+         return [{"predicted_class": "Human", "confidence": 0} for _ in texts]
+
+     results = []
+     for i in range(0, len(texts), batch_size):
+         chunk = texts[i:i+batch_size]
+         enc = tokenizer(
+             chunk,
+             return_tensors="pt",
+             truncation=True,
+             max_length=MAX_LEN,
+             padding=True,
+         )
+         enc = {k: v.to(device) for k, v in enc.items()}
+         logits = model(**enc).logits
+         probs = torch.softmax(logits, dim=-1).detach().cpu().numpy()
+         ids = probs.argmax(-1)
+
+         for t, pid, p in zip(chunk, ids, probs):
+             label = ID2LABEL.get(int(pid), str(int(pid))).capitalize()
+             results.append({
+                 "text": t,
+                 "predicted_class": label,
+                 "confidence": float(p[int(pid)])
+             })
+     return results
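
A minimal usage sketch (hypothetical, not part of the commit): it assumes textPreprocess.py imported cleanly and the model files resolved, and simply exercises the two public functions defined above. The file name and sample strings are invented for illustration.

# usage_example.py -- illustrative only
from textPreprocess import predict_text, predict_batch

# Single-text prediction returns a dict with label and softmax confidence
single = predict_text("The quick brown fox jumps over the lazy dog.")
print(single)  # e.g. {"predicted_class": "Human", "confidence": 0.97}

# Batch prediction pads and processes texts in chunks
batch = predict_batch(
    ["First sample to classify.", "Second sample to classify."],
    batch_size=8,
)
for result in batch:
    print(result["predicted_class"], round(result["confidence"], 3))

One caveat on the changed line: after this commit, os.path.join makes MODEL_DIR the string "MAS-AI-0000/Authentica/Lib/Models/Text". If the intent is to pull the weights from the Hugging Face Hub, from_pretrained expects a plain namespace/name repo id, and files in a repo subfolder are usually addressed with the subfolder keyword instead, e.g. AutoTokenizer.from_pretrained("MAS-AI-0000/Authentica", subfolder="Lib/Models/Text").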