tarak00003
/

tarakcse

Text Classification

Model card Files Files and versions Community

tarakcse / medicalsymptoms1.py

tarak00003's picture

Upload 3 files

f7eb132 over 1 year ago

history blame contribute delete

2.86 kB

	# -*- coding: utf-8 -*-
	"""medicalsymptoms1.ipynb

	Automatically generated by Colaboratory.

	Original file is located at
	https://colab.research.google.com/drive/1uRT7zfEMnu-tq74GyZoUUtAb-In4XtX8
	"""

	import pandas as pd
	import re
	import spacy
	from sklearn.model_selection import train_test_split
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.pipeline import Pipeline
	from sklearn.metrics import accuracy_score, classification_report
	from sklearn.linear_model import LogisticRegression

	# Read the raw records and discard every row that has a missing value
	data = pd.read_csv('symptomssingle.csv').dropna()

	# Define a function to separate symptoms and diseases from the text
	def separate_symptoms_and_diseases(text):
	    """Split one raw record into its symptom strings and the leftover disease text.

	    Every ``{"symptoms":"..."}`` fragment in *text* contributes its quoted
	    value to the symptom list. The disease is whatever remains once those
	    fragments (with any comma directly after them) and every ``],``
	    sequence have been removed.

	    NOTE(review): presumably records look like
	    ``{"symptoms":"a"},{"symptoms":"b"}],Disease`` -- confirm against the CSV.

	    Args:
	        text: Raw record string taken from the ``data`` column.

	    Returns:
	        A ``(symptoms, disease)`` tuple: the list of symptom strings in order
	        of appearance, and the whitespace-stripped disease string.
	    """
	    # Non-greedy capture so each fragment yields exactly one symptom
	    symptoms = re.findall(r'{"symptoms":"(.*?)"}', text)
	    # Drop whole runs of symptom fragments, including the commas between them
	    leftover = re.sub(r'(?:{"symptoms":".*?"},?)+', '', text).strip()
	    # Remove '],' from the disease name
	    disease = leftover.replace('],', '').strip()
	    return symptoms, disease

	# Parse every raw record into a (symptoms, disease) pair, kept in a scratch column
	data['symptoms_and_diseases'] = data['data'].apply(separate_symptoms_and_diseases)

	# Fan the pairs out into two dedicated columns, aligned on the existing row index
	data[['symptoms', 'disease']] = pd.DataFrame(
	    data['symptoms_and_diseases'].tolist(),
	    index=data.index,
	)

	# The raw text and the scratch column are not needed past this point
	data = data.drop(columns=['data', 'symptoms_and_diseases'])

	# Load spaCy's 'en_core_web_sm' pipeline once at module level; preprocess()
	# calls it on every symptom string to obtain lemmas and stop-word flags.
	# NOTE(review): the model package presumably has to be installed separately --
	# confirm the runtime environment provides it.
	nlp = spacy.load('en_core_web_sm')

	# Preprocessing function
	def preprocess(symptoms):
	    """Normalise a collection of symptom strings into one space-separated string.

	    Each symptom is run through the module-level spaCy pipeline ``nlp``. Stop
	    words and non-alphabetic tokens are dropped, and the remaining tokens are
	    replaced by their lower-cased lemmas.

	    Args:
	        symptoms: Iterable of symptom strings. Wrap a single symptom in a
	            list; a bare string would be iterated character by character.

	    Returns:
	        The processed symptoms joined by single spaces, or ``""`` when
	        *symptoms* is empty.
	    """
	    def normalise(symptom):
	        # Keep only alphabetic, non-stop-word tokens, reduced to their lemma
	        return ' '.join(
	            token.lemma_.lower()
	            for token in nlp(symptom)
	            if not token.is_stop and token.is_alpha
	        )

	    return ' '.join(normalise(symptom) for symptom in symptoms)

	# Collapse each row's symptom list into a single normalised text string
	data['symptoms_preprocessed'] = data['symptoms'].apply(preprocess)

	# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
	X_train, X_test, y_train, y_test = train_test_split(
	    data['symptoms_preprocessed'],
	    data['disease'],
	    test_size=0.2,
	    random_state=42,
	)

	# Text classifier: unigram + bigram TF-IDF features feeding a logistic regression
	pipeline = Pipeline(steps=[
	    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
	    ('classifier', LogisticRegression(solver='liblinear', C=10)),
	])

	# Fit on the training split, then label the held-out split
	y_pred = pipeline.fit(X_train, y_train).predict(X_test)

	# Report overall accuracy followed by the per-class breakdown
	print("Accuracy: ", accuracy_score(y_test, y_pred))
	print("Classification Report:\n", classification_report(y_test, y_pred))

	# NOTE: the notebook ran `!pip install joblib` here. That is IPython shell
	# syntax and a SyntaxError in a plain .py file, so it has been removed;
	# install joblib in the environment before running this script.
	import joblib

	# Single source of truth for the file the model is written to and read from
	MODEL_PATH = 'DiseasePredictionBasedonSymptoms.joblib'

	# Save the trained model
	joblib.dump(pipeline, MODEL_PATH)

	# Load the saved model back to confirm the round trip works.
	# NOTE(review): joblib files are presumably pickle-based -- only load model
	# files from a trusted source.
	loaded_pipeline = joblib.load(MODEL_PATH)

	# Make predictions using the loaded model (example)
	sample_symptom = "Skin Rash"
	# preprocess() expects a collection of symptoms, hence the one-element list
	processed_symptom = preprocess([sample_symptom])
	prediction = loaded_pipeline.predict([processed_symptom])

	print("Predicted disease:", prediction[0])