|
|
|
"""TextClassification Patient Symptoms and Diseases.ipynb |
|
|
|
Automatically generated by Colaboratory. |
|
|
|
Original file is located at |
|
https://colab.research.google.com/drive/1bdFtsrVyYwbXH_jH6yJR3QEC6UQiguUK |
|
|
|
AIMERS |
|
""" |
|
|
|
import pandas as pd |
|
import re |
|
import spacy |
|
from sklearn.model_selection import train_test_split |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from sklearn.pipeline import Pipeline |
|
from sklearn.metrics import accuracy_score, classification_report |
|
from sklearn.linear_model import LogisticRegression |
|
|
|
|
|
# Load the raw records from the CSV file and discard every row that has a
# missing value in any column, so later string processing never sees NaN.
data = pd.read_csv('symptomssingle.csv').dropna()
|
|
|
|
|
# One symptom object; the capture group is the symptom text.
_SYMPTOM_PATTERN = re.compile(r'{"symptoms":"(.*?)"}')
# A run of consecutive symptom objects, including the commas between them.
_SYMPTOM_RUN_PATTERN = re.compile(r'(?:{"symptoms":".*?"},?)+')


def separate_symptoms_and_diseases(text):
    """Split one raw record into its symptom list and its disease label.

    Every ``{"symptoms":"..."}`` object found in ``text`` contributes one
    entry to the symptom list.  Whatever remains after those objects and the
    array punctuation around them are removed is returned as the disease.

    NOTE(review): this assumes records look like
    ``[{"symptoms":"a"},{"symptoms":"b"}],Disease``; the CSV layout is not
    visible from this file, so confirm against the data.

    Args:
        text: Raw record string.

    Returns:
        A ``(symptoms, disease)`` tuple where ``symptoms`` is a list of
        strings in order of appearance and ``disease`` is a stripped string.
    """
    symptoms = _SYMPTOM_PATTERN.findall(text)

    remainder = _SYMPTOM_RUN_PATTERN.sub('', text).strip()
    disease = remainder.replace('],', '').strip()

    # Removing the symptom objects and the closing "]," still leaves the
    # opening "[" of the array in front of the label (e.g. "[Flu"); drop it
    # so the label is the bare disease name.
    disease = disease.lstrip('[').strip()

    return symptoms, disease
|
|
|
|
|
# Parse every raw record into a (symptoms, disease) pair, spread the pairs
# over two new columns, and drop the raw column once it has been consumed.
parsed_records = data['data'].apply(separate_symptoms_and_diseases)
data[['symptoms', 'disease']] = pd.DataFrame(
    parsed_records.tolist(),
    index=data.index,
)
data = data.drop(columns=['data'])

# spaCy pipeline used by preprocess() for tokenisation and lemmatisation.
nlp = spacy.load('en_core_web_sm')
|
|
|
|
|
def preprocess(symptoms):
    """Normalise symptom phrases into one space-separated string of lemmas.

    Each phrase is run through the module-level spaCy pipeline ``nlp``.
    Stop words and tokens that are not purely alphabetic are discarded, and
    the lemma of every remaining token is lower-cased.

    Args:
        symptoms: Iterable of symptom phrases (strings).

    Returns:
        The cleaned phrases joined with single spaces, in input order.
    """

    def _clean(phrase):
        # Keep only alphabetic, non-stop-word tokens, reduced to their lemma.
        kept = [
            tok.lemma_.lower()
            for tok in nlp(phrase)
            if tok.is_alpha and not tok.is_stop
        ]
        return ' '.join(kept)

    return ' '.join(_clean(phrase) for phrase in symptoms)
|
|
|
|
|
# Collapse each row's symptom list into a single cleaned text string.
data['symptoms_preprocessed'] = data['symptoms'].apply(preprocess)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
features = data['symptoms_preprocessed']
labels = data['disease']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)
|
|
|
|
|
# TF-IDF over unigrams and bigrams feeding a logistic-regression classifier.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
classifier = LogisticRegression(solver='liblinear', C=10)
pipeline = Pipeline(steps=[
    ('tfidf', vectorizer),
    ('classifier', classifier),
])

# Fit on the training split, then predict labels for the held-out split.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Report overall accuracy and the per-class metrics.
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
|
|
|
# NOTE: the notebook cell "!pip install joblib" was removed. "!" shell
# escapes are IPython-only syntax and raise SyntaxError when this file is run
# as a plain Python script. joblib is installed as a dependency of
# scikit-learn, so no separate install step is needed.
import joblib

# Single source of truth for the saved model's location, shared by the dump
# and the load below so the two cannot drift apart.
MODEL_PATH = 'DiseasePredictionBasedonSymptoms.joblib'

# Persist the fitted pipeline (vectoriser and classifier together).
joblib.dump(pipeline, MODEL_PATH)

# Reload the pipeline from disk to check that the saved artifact is usable.
# NOTE(review): joblib.load unpickles the file; only load trusted artifacts.
loaded_pipeline = joblib.load(MODEL_PATH)

# Run one example through the same preprocessing used for training and
# predict its disease with the reloaded pipeline.
sample_symptom = "Skin Rash"
processed_symptom = preprocess([sample_symptom])
prediction = loaded_pipeline.predict([processed_symptom])

print("Predicted disease:", prediction[0])
|
|
|
|