import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
import re


class TextIntentModel(nn.Module):
    """
    Transformer-based model for text intent and sentiment analysis.
    A frozen BERT encoder feeds lightweight heads for conversational
    intent detection, sentiment, and prediction confidence.
    """
    def __init__(self, num_intents=5, pretrained=True):
        super().__init__()
        self.num_intents = num_intents

        # Load pretrained encoder weights, or a randomly initialized encoder
        # with the same architecture. The tokenizer vocabulary is identical
        # in both cases, so it is loaded once below.
        if pretrained:
            self.bert = BertModel.from_pretrained('bert-base-uncased')
        else:
            from transformers import BertConfig
            self.bert = BertModel(BertConfig())
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        # Freeze the encoder so only the task heads receive gradients.
        for param in self.bert.parameters():
            param.requires_grad = False

        hidden_size = self.bert.config.hidden_size

        # Intent head: pooled BERT embedding -> num_intents logits.
        self.intent_classifier = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_intents)
        )

        # Sentiment head: 7 output classes.
        self.sentiment_head = nn.Sequential(
            nn.Linear(hidden_size, 128),
            nn.ReLU(),
            nn.Linear(128, 7)
        )

        # Confidence head: a single sigmoid score in (0, 1).
        self.confidence_head = nn.Sequential(
            nn.Linear(hidden_size, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, input_ids, attention_mask):
        """
        input_ids: tokenized text (B, seq_len)
        attention_mask: attention mask (B, seq_len)
        Returns: intent_logits, sentiment_logits, confidence
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output

        intent_logits = self.intent_classifier(pooled_output)
        sentiment_logits = self.sentiment_head(pooled_output)
        confidence = self.confidence_head(pooled_output)

        # squeeze(-1), not squeeze(): a bare squeeze would also drop a
        # batch dimension of size 1.
        return intent_logits, sentiment_logits, confidence.squeeze(-1)
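
    # A minimal training-loss sketch for the three heads, added here for
    # illustration; it is not part of the original code. It assumes integer
    # class labels for intent and sentiment, a float target in [0, 1] for
    # confidence, and an unweighted sum of the three terms.
    def compute_loss(self, intent_logits, sentiment_logits, confidence,
                     intent_labels, sentiment_labels, confidence_targets):
        intent_loss = nn.functional.cross_entropy(intent_logits, intent_labels)
        sentiment_loss = nn.functional.cross_entropy(sentiment_logits, sentiment_labels)
        confidence_loss = nn.functional.binary_cross_entropy(confidence, confidence_targets)
        return intent_loss + sentiment_loss + confidence_loss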

    def preprocess_text(self, text):
        """
        Clean and tokenize a single text input; returns 1-D tensors.
        """
        text = self.clean_text(text)

        # Pad or truncate to a fixed length of 128 tokens.
        encoding = self.tokenizer(
            text,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Drop the batch dimension added by return_tensors='pt'.
        return encoding['input_ids'].squeeze(0), encoding['attention_mask'].squeeze(0)

    def clean_text(self, text):
        """
        Clean and normalize text: keep word characters, whitespace, basic
        punctuation, and apostrophes; collapse whitespace; lowercase.
        """
        # Keep apostrophes so contractions like "i'm" survive cleaning.
        text = re.sub(r"[^\w\s.,!?']", '', text)
        # Collapse runs of whitespace to single spaces.
        text = ' '.join(text.split())
        return text.lower()

    def detect_hesitation_phrases(self, text):
        """
        Detect phrases indicating hesitation or confusion.
        Returns a score in [0, 1]; e.g. two distinct keyword hits -> 0.4.
        """
        hesitation_keywords = [
            'um', 'uh', 'like', 'you know', 'sort of', 'kind of',
            'i think', 'maybe', 'perhaps', "i'm not sure"
        ]

        text_lower = text.lower()
        # Match on word boundaries so 'um' does not fire inside 'summer'.
        hesitation_score = sum(
            1 for keyword in hesitation_keywords
            if re.search(r'\b' + re.escape(keyword) + r'\b', text_lower)
        )

        # Saturate at five distinct keywords.
        return min(hesitation_score / 5.0, 1.0)

    def extract_intent_features(self, text):
        """
        Extract intent-related features from a single text string.
        Call model.eval() beforehand so dropout is disabled.
        """
        with torch.no_grad():
            input_ids, attention_mask = self.preprocess_text(text)
            # Add a batch dimension for the single example.
            if input_ids.dim() == 1:
                input_ids = input_ids.unsqueeze(0)
                attention_mask = attention_mask.unsqueeze(0)

            intent_logits, sentiment_logits, confidence = self(input_ids, attention_mask)

        return {
            'intent_logits': intent_logits,
            'sentiment_logits': sentiment_logits,
            'confidence': confidence,
            'hesitation_score': self.detect_hesitation_phrases(text)
        }
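

# A minimal usage sketch, added for illustration; the example sentence and
# the argmax decoding are assumptions, not part of the original module.
if __name__ == '__main__':
    model = TextIntentModel(num_intents=5)
    model.eval()  # disable dropout for inference

    features = model.extract_intent_features("um, I think maybe I want to cancel my order")
    intent_idx = features['intent_logits'].argmax(dim=-1).item()
    sentiment_idx = features['sentiment_logits'].argmax(dim=-1).item()
    print(f"intent={intent_idx} sentiment={sentiment_idx} "
          f"confidence={features['confidence'].item():.2f} "
          f"hesitation={features['hesitation_score']:.2f}")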