# FraleyLabAttachmentBot / transcriptanalysis.py
# AjithKSenthil's picture
# started coding the transcript analysis and survey prediction code
# 379ec43
# raw
# history blame contribute delete
# No virus
# 4.11 kB
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
# ---------------------------------------------------------------------------
# Data loading, splitting and tokenization
# ---------------------------------------------------------------------------
# Each chat transcript is paired, by position, with one numerical survey
# response.
# NOTE(review): the trailing `...` entries are literal Ellipsis objects —
# presumably placeholders to be replaced with real data before running.
chat_transcripts = ["chat transcript 1", "chat transcript 2", ...]
survey_responses = [3.5, 4.2, ...]  # Numerical survey responses

# Two-stage split with a fixed seed: hold out 30% of the samples first, then
# cut that hold-out in half to obtain the validation and test sets.
(
    train_texts,
    temp_texts,
    train_labels,
    temp_labels,
) = train_test_split(
    chat_transcripts,
    survey_responses,
    test_size=0.3,
    random_state=42,
)
(
    val_texts,
    test_texts,
    val_labels,
    test_labels,
) = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    random_state=42,
)

# Tokenize the three splits, in train / validation / test order, with the
# pretrained BERT tokenizer (truncation and padding enabled for each call).
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_encodings, val_encodings, test_encodings = (
    tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (train_texts, val_texts, test_texts)
)
class SurveyDataset(Dataset):
    """Dataset that pairs tokenizer encodings with survey-response labels.

    Args:
        encodings: Mapping from field name to an indexable collection holding
            one entry per sample (it is iterated with ``.items()``).
        labels: Indexable sequence of numeric targets; its length defines the
            size of the dataset.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        Every encoding field is converted to a tensor, and the target is
        stored under the ``'labels'`` key as a float tensor.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample
# Wrap each tokenized split in a Dataset so a DataLoader can batch it.
train_dataset = SurveyDataset(train_encodings, train_labels)
val_dataset = SurveyDataset(val_encodings, val_labels)
test_dataset = SurveyDataset(test_encodings, test_labels)

# Fine-tune the BERT model
# num_labels=1 requests a single-output head; the Transformers documentation
# states that a regression (mean-squared-error) loss is used in that case.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1).to(device)

# Only the training data is shuffled; validation and test order stay fixed.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Use the PyTorch AdamW implementation: the AdamW class shipped with
# transformers is deprecated in favour of torch.optim.AdamW. eps and
# weight_decay are pinned to the deprecated implementation's defaults so the
# optimisation hyper-parameters stay the same.
optim = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 3
# The scheduler is stepped once per batch, hence epochs * batches-per-epoch.
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_linear_schedule_with_warmup(optim, num_warmup_steps=0, num_training_steps=num_training_steps)

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Add a trailing dimension so the labels line up with the (batch, 1) logits.
        labels = batch["labels"].unsqueeze(1).to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optim.step()
        lr_scheduler.step()

    # ---- validation pass ----
    # The validation split was previously built but never consumed; report its
    # loss after every epoch so over-fitting is visible during fine-tuning.
    model.eval()
    val_loss_total = 0.0
    val_sample_count = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].unsqueeze(1).to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            # Weight each batch loss by its size so a short final batch does
            # not skew the epoch average.
            batch_size = labels.size(0)
            val_loss_total += outputs.loss.item() * batch_size
            val_sample_count += batch_size
    if val_sample_count:
        print(f"Epoch {epoch + 1}/{num_epochs} - validation loss: {val_loss_total / val_sample_count:.4f}")
# Evaluate the model
# Collect one prediction per test sample, in loader order, then score the
# predictions against the held-out labels.
model.eval()
preds = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Squeeze only the trailing dimension. A bare squeeze() collapses a
        # one-sample batch to a 0-d tensor, whose tolist() is a plain float
        # that list.extend() cannot consume.
        preds.extend(logits.squeeze(-1).tolist())

mse = mean_squared_error(test_labels, preds)
r2 = r2_score(test_labels, preds)
print("Mean Squared Error:", mse)
print("R-squared Score:", r2)
def predict_survey_response(chat_transcript, model, tokenizer, device):
    """Predict the survey response for a single chat transcript.

    Args:
        chat_transcript: Text of the transcript to score.
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes a ``logits`` tensor. It is switched to eval mode.
        tokenizer: Callable that turns the transcript into a mapping with
            ``"input_ids"`` and ``"attention_mask"`` tensors.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        The model's output as a Python scalar.
    """
    # Tokenize straight into PyTorch tensors and place them on the target device.
    encoded = tokenizer(chat_transcript, truncation=True, padding=True, return_tensors="pt")
    ids = encoded["input_ids"].to(device)
    mask = encoded["attention_mask"].to(device)

    # Inference only: eval mode, and no gradient tracking for the forward pass.
    model.eval()
    with torch.no_grad():
        raw_score = model(ids, attention_mask=mask).logits

    # Drop the size-1 dimensions and unwrap the single value.
    return raw_score.squeeze().item()
# Example usage: score one unseen transcript with the fine-tuned model and
# print the predicted value.
new_chat_transcript = "A new chat transcript"
predicted_response = predict_survey_response(
    chat_transcript=new_chat_transcript,
    model=model,
    tokenizer=tokenizer,
    device=device,
)
print("Predicted survey response:", predicted_response)