# -*- coding: utf-8 -*-
"""TextClassification Patient Symptoms and Diseases.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1bdFtsrVyYwbXH_jH6yJR3QEC6UQiguUK

AIMERS

Trains a TF-IDF + logistic-regression text classifier that predicts a
disease label from free-text symptoms, evaluates it on a held-out split,
saves the fitted pipeline with joblib and runs one sample prediction with
the reloaded model.

Run as a script to execute the whole workflow.  The heavy dependencies
(spaCy, scikit-learn, joblib) are imported inside the functions that use
them, so the pure text helpers can be imported without the full ML stack.
"""

import functools
import re
from typing import Iterable, List, Tuple

import pandas as pd

# Input CSV; it must provide a ``data`` column holding the raw text.
DATA_PATH = 'symptomssingle.csv'
# Destination of the persisted, fitted pipeline.
MODEL_PATH = 'DiseasePredictionBasedonSymptoms.joblib'
# Name of the spaCy model used for lemmatisation and stop-word detection.
SPACY_MODEL = 'en_core_web_sm'

# One ``{"symptoms":"..."}`` entry; group 1 captures the symptom text.
_SYMPTOM_RE = re.compile(r'\{"symptoms":"(.*?)"\}')
# A run of such entries, each optionally followed by a comma.
_SYMPTOM_RUN_RE = re.compile(r'(?:\{"symptoms":".*?"\},?)+')


@functools.lru_cache(maxsize=1)
def _get_nlp():
    """Load the spaCy model on first use and reuse it on later calls."""
    import spacy

    return spacy.load(SPACY_MODEL)


def separate_symptoms_and_diseases(text: str) -> Tuple[List[str], str]:
    """Split one raw record into its symptom list and disease name.

    Args:
        text: Raw record containing zero or more ``{"symptoms":"..."}``
            entries plus the disease name.

    Returns:
        A ``(symptoms, disease)`` tuple: the captured symptom strings in
        order of appearance, and the text that remains once the symptom
        entries and any ``],`` sequences are removed, stripped of
        surrounding whitespace.
    """
    symptoms = _SYMPTOM_RE.findall(text)
    disease = _SYMPTOM_RUN_RE.sub('', text).strip()
    # Remove '],' from the disease name.
    # NOTE(review): a leading '[' is not removed here -- confirm against the
    # actual CSV format whether that can occur.
    disease = disease.replace('],', '').strip()
    return symptoms, disease


def preprocess(symptoms: Iterable[str]) -> str:
    """Normalise symptom strings into one space-separated string.

    Each symptom is run through spaCy; stop words and non-alphabetic
    tokens are dropped and the remaining tokens are replaced by their
    lower-cased lemma.

    Args:
        symptoms: Symptom strings to normalise.

    Returns:
        The processed symptoms joined by single spaces.
    """
    nlp = _get_nlp()
    processed_symptoms = []
    for symptom in symptoms:
        doc = nlp(symptom)
        processed_symptoms.append(
            ' '.join(
                token.lemma_.lower()
                for token in doc
                if not token.is_stop and token.is_alpha
            )
        )
    return ' '.join(processed_symptoms)


def load_dataset(csv_path: str = DATA_PATH) -> pd.DataFrame:
    """Read the CSV and derive the ``symptoms`` and ``disease`` columns.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        The frame without rows containing missing values, with the raw
        ``data`` column replaced by ``symptoms`` and ``disease``.
    """
    data = pd.read_csv(csv_path)
    # Check for any missing values and remove them.
    data = data.dropna()
    data['symptoms_and_diseases'] = data['data'].apply(
        separate_symptoms_and_diseases
    )
    data[['symptoms', 'disease']] = pd.DataFrame(
        data['symptoms_and_diseases'].tolist(), index=data.index
    )
    return data.drop(columns=['data', 'symptoms_and_diseases'])


def build_pipeline():
    """Create the unfitted TF-IDF (1-2 grams) + logistic-regression pipeline."""
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline

    return Pipeline([
        ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
        ('classifier', LogisticRegression(solver='liblinear', C=10)),
    ])


def main() -> None:
    """Train, evaluate, persist and smoke-test the disease classifier."""
    # joblib is a regular dependency: install it with ``pip install joblib``.
    # The notebook-only ``!pip install joblib`` line is not valid Python.
    import joblib
    from sklearn.metrics import accuracy_score, classification_report
    from sklearn.model_selection import train_test_split

    data = load_dataset()

    # Preprocess the symptoms column.
    data['symptoms_preprocessed'] = data['symptoms'].apply(preprocess)

    # Split the data into train and test sets.
    X_train, X_test, y_train, y_test = train_test_split(
        data['symptoms_preprocessed'],
        data['disease'],
        test_size=0.2,
        random_state=42,
    )

    # Train the model.
    pipeline = build_pipeline()
    pipeline.fit(X_train, y_train)

    # Evaluate the model.
    y_pred = pipeline.predict(X_test)
    print("Accuracy: ", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

    # Save the trained model.
    joblib.dump(pipeline, MODEL_PATH)

    # Load the saved model.  joblib files are pickle-based: only load files
    # from a trusted source (here, the file written just above).
    loaded_pipeline = joblib.load(MODEL_PATH)

    # Make predictions using the loaded model (example).
    sample_symptom = "Skin Rash"
    processed_symptom = preprocess([sample_symptom])
    prediction = loaded_pipeline.predict([processed_symptom])
    print("Predicted disease:", prediction[0])


if __name__ == "__main__":
    main()