noequal committed on
Commit 84fe4f3
1 Parent(s): 9128ec6

Update app.py

Files changed (1)
  1. app.py +49 -83
app.py CHANGED
@@ -1,101 +1,67 @@
 import streamlit as st
-import torch
-from torch.utils.data import Dataset, random_split
-from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
 
-# Generate sample clinical text and labels
-sample_data = [
-    ("Had successful surgery today. Feeling relieved.", "surgery"),
-    ("Started new medication for pain management.", "non-surgery"),
-    ("Scheduled for surgery next week. Nervous but hopeful.", "surgery"),
-    ("Attended a seminar on non-surgical treatments.", "non-surgery"),
-]
 
-# Map labels to integers
-label_mapping = {"surgery": 1, "non-surgery": 0}
-train_texts, train_labels = zip(*sample_data)
-train_labels = [label_mapping[label] for label in train_labels]
 
-# Logging and Outputs
-st.write("Sample data:")
-for text, label in zip(train_texts, train_labels):
-    st.write(f"Text: {text}\nLabel: {label}\n")
 
-# Load pre-trained model and tokenizer
-model_name = "distilbert-base-uncased"  # You can use any suitable classification model
-model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
-tokenizer = AutoTokenizer.from_pretrained(model_name)
 
-# Create PyTorch Dataset object
-class ClinicalDataset(Dataset):
-    def __init__(self, texts, labels, tokenizer, max_seq_length):
-        self.texts = texts
-        self.labels = labels
-        self.tokenizer = tokenizer
-        self.max_seq_length = max_seq_length
-
-    def __len__(self):
-        return len(self.texts)
-
-    def __getitem__(self, idx):
-        text = self.texts[idx]
-        label = self.labels[idx]
-
-        encoding = self.tokenizer(
-            text,
-            return_tensors="pt",
-            padding='max_length',  # Pad sequences to the maximum sequence length
-            truncation=True,
-            max_length=self.max_seq_length
-        )
-
-        return {
-            "input_ids": encoding["input_ids"].squeeze(),
-            "attention_mask": encoding["attention_mask"].squeeze(),
-            "labels": torch.tensor(label)
-        }
-
-
-# Data Collator
-data_collator = DataCollatorForLanguageModeling(
-    tokenizer=tokenizer,
-    mlm_probability=0.15
 )
 
-seq_length = 128
-dataset = ClinicalDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer, max_seq_length=seq_length)
 
-# Split dataset into training and validation sets
-train_size = int(0.8 * len(dataset))
-val_size = len(dataset) - train_size
-train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
 
-# Fine-tune pre-trained model on clinical dataset
 training_args = TrainingArguments(
-    output_dir='./results',          # output directory
-    num_train_epochs=3,              # total number of training epochs
-    per_device_train_batch_size=16,  # batch size per device during training
-    per_device_eval_batch_size=64,   # batch size for evaluation
-    warmup_steps=500,                # number of warmup steps for learning rate scheduler
-    weight_decay=0.01,               # strength of weight decay
-    logging_dir='./logs',            # directory for storing logs
-    logging_steps=10,)
 
 trainer = Trainer(
     model=model,
     args=training_args,
-    train_dataset=train_dataset,
-    eval_dataset=val_dataset,
-    data_collator=data_collator,
 )
 
-st.write("Training started...")
-trainer.train()
-st.write("Training completed.")
-
-# Logging Training Output
-st.write("Training logs:")
-with open('./logs/train.log', 'r') as log_file:
-    st.code(log_file.read())
 import streamlit as st
+import numpy as np
+import evaluate
+from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
 
+# Load tokenizer and model
+tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
+model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
 
+# Define label mappings
+id2label = {0: "SURGERY", 1: "NON-SURGERY"}
+label2id = {"SURGERY": 0, "NON-SURGERY": 1}
 
+# Load evaluation metric
+accuracy = evaluate.load("accuracy")
 
+# Define preprocessing function
+def preprocess_function(examples):
+    return tokenizer(examples, truncation=True, padding=True)
 
+# Load model for sequence classification
+model = AutoModelForSequenceClassification.from_pretrained(
+    "emilyalsentzer/Bio_ClinicalBERT", num_labels=2, id2label=id2label, label2id=label2id
 )
 
+# Define compute_metrics function
+def compute_metrics(eval_pred):
+    predictions, labels = eval_pred
+    predictions = np.argmax(predictions, axis=1)
+    return accuracy.compute(predictions=predictions, references=labels)
 
+# Define data collator
+data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
 
+# Define training arguments
 training_args = TrainingArguments(
+    output_dir="my_awesome_model",
+    learning_rate=2e-5,
+    per_device_train_batch_size=16,
+    per_device_eval_batch_size=16,
+    num_train_epochs=2,
+    weight_decay=0.01,
+    evaluation_strategy="epoch",
+    save_strategy="epoch",
+    load_best_model_at_end=True,
+    push_to_hub=True,
+)
 
+# Initialize trainer
 trainer = Trainer(
     model=model,
     args=training_args,
+    tokenizer=tokenizer,
+    data_collator=data_collator,
+    compute_metrics=compute_metrics,
 )
 
+# Streamlit UI
+st.title("Clinical Text Classification")
+text = st.text_area("Enter clinical text:", "")
 
+if st.button("Classify"):
+    # Tokenize user input and predict
+    tokenized_text = preprocess_function(text)
+    result = trainer.predict(tokenized_text)
+    prediction = np.argmax(result.predictions, axis=1)[0]
+    st.write("Predicted Label:", id2label[prediction])
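
For context, a minimal sketch of how the classification step in the new app.py can be expressed as a direct forward pass through the sequence-classification model, since Trainer.predict is normally called with a Dataset rather than a raw tokenizer encoding. It assumes the same Bio_ClinicalBERT checkpoint and label mapping as above and that the classification head has already been fine-tuned (e.g., via trainer.train()); classify_text is an illustrative helper, not part of this commit.

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Illustrative sketch only: same checkpoint and label mapping as app.py above.
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModelForSequenceClassification.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT",
    num_labels=2,
    id2label={0: "SURGERY", 1: "NON-SURGERY"},
    label2id={"SURGERY": 0, "NON-SURGERY": 1},
)

def classify_text(text: str) -> str:
    # Tokenize a single string and run the model without tracking gradients.
    inputs = tokenizer(text, truncation=True, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    # argmax over the two class logits gives the predicted label id.
    return model.config.id2label[int(logits.argmax(dim=-1))]

print(classify_text("Scheduled for surgery next week. Nervous but hopeful."))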