# medicalsymptoms / textclassification_patient_symptoms_and_diseases.py
# tarak00003's picture
# Upload 3 files
# f75a3f7
# -*- coding: utf-8 -*-
"""TextClassification Patient Symptoms and Diseases.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1bdFtsrVyYwbXH_jH6yJR3QEC6UQiguUK
AIMERS
"""
import pandas as pd
import re
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
# Read the raw dataset and discard every row that contains a missing value,
# so the parsing steps further down only ever see complete records.
data = pd.read_csv('symptomssingle.csv').dropna()
# Define a function to separate symptoms and diseases from the text
def separate_symptoms_and_diseases(text):
    """Split one raw record into its symptom list and its disease name.

    Every ``{"symptoms":"..."}`` fragment found in *text* contributes one
    symptom. The disease name is whatever text remains once those fragments
    (and a comma directly following a fragment) are removed, with every
    ``],`` sequence deleted and surrounding whitespace trimmed.

    Returns a ``(symptoms, disease)`` tuple: a list of strings and a string.
    """
    symptom_names = [
        found.group(1)
        for found in re.finditer(r'{"symptoms":"(.*?)"}', text)
    ]
    remainder = re.sub(r'(?:{"symptoms":".*?"},?)+', '', text)
    # Remove '],' from the disease name
    disease_name = remainder.strip().replace('],', '').strip()
    return symptom_names, disease_name
# Parse every raw record into a (symptoms, disease) pair, held temporarily
# in a helper column.
data['symptoms_and_diseases'] = data['data'].apply(separate_symptoms_and_diseases)
# Expand the pairs into two real columns; reusing the frame's index keeps
# each pair aligned with the row it was parsed from.
_parsed_pairs = data['symptoms_and_diseases'].tolist()
data[['symptoms', 'disease']] = pd.DataFrame(_parsed_pairs, index=data.index)
# The raw text and the helper column are no longer needed.
data = data.drop(columns=['data', 'symptoms_and_diseases'])
# Load the 'en_core_web_sm' spaCy pipeline used by the preprocessing step
nlp = spacy.load('en_core_web_sm')
# Preprocessing function
def preprocess(symptoms):
    """Normalise a collection of symptom phrases into one string.

    Each phrase is run through the module-level spaCy pipeline ``nlp``. Stop
    words and tokens that are not purely alphabetic are dropped, and the
    remaining tokens are replaced by their lower-cased lemmas.

    Args:
        symptoms: An iterable of symptom strings. A single bare string is
            also accepted and treated as one symptom.

    Returns:
        The cleaned phrases joined by single spaces, in input order. A
        phrase whose tokens are all filtered out contributes an empty
        string; an empty iterable yields ``''``.
    """
    # Iterating a bare string would feed it to spaCy one character at a
    # time, so wrap it to process it as a single phrase.
    if isinstance(symptoms, str):
        symptoms = [symptoms]
    cleaned_phrases = [
        ' '.join(
            token.lemma_.lower()
            for token in nlp(phrase)
            if token.is_alpha and not token.is_stop
        )
        for phrase in symptoms
    ]
    return ' '.join(cleaned_phrases)
# Turn every row's symptom list into one cleaned text string
data['symptoms_preprocessed'] = data['symptoms'].apply(preprocess)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['symptoms_preprocessed'],
    data['disease'],
    test_size=0.2,
    random_state=42,
)

# Text-classification pipeline: TF-IDF over unigrams and bigrams, feeding a
# logistic-regression classifier.
pipeline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
        ('classifier', LogisticRegression(solver='liblinear', C=10)),
    ]
)

# Fit on the training split, then predict the held-out split
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Report overall accuracy and the per-class metrics
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
# NOTE: joblib must already be installed (e.g. run `pip install joblib` in a
# shell). The notebook-only `!pip install joblib` shell escape is not valid
# syntax in a plain Python file, so it is not executed here.
import joblib

# Single source of truth for where the trained pipeline is persisted
MODEL_PATH = 'DiseasePredictionBasedonSymptoms.joblib'

# Save the trained model
joblib.dump(pipeline, MODEL_PATH)

# Load the saved model
# joblib files are pickle-based: only load files from a trusted source.
loaded_pipeline = joblib.load(MODEL_PATH)

# Make predictions using the loaded model (example)
sample_symptom = "Skin Rash"
# preprocess expects a collection of symptom phrases, hence the list
processed_symptom = preprocess([sample_symptom])
# predict takes an iterable of documents and returns one label per document
prediction = loaded_pipeline.predict([processed_symptom])
print("Predicted disease:", prediction[0])