Maching-Learning-Models / Logistic_Regression.py

Upload 3 files

662d38c verified 4 months ago

6.16 kB

	'''
	Logistic Regression là một mô hình học có giám sát (supervised learning) dùng cho bài toán phân loại (classification).
	Nó dựa trên Linear Regression, nhưng thay vì dự đoán giá trị liên tục → nó biến đầu ra thành xác suất (0–1) bằng hàm sigmoid.

	🧠 2. CÔNG THỨC TOÁN HỌC
	2.1. Linear part
	Trước hết, ta tính tổng có trọng số:
	z = w0 + w1x1 + w2x2 + w3x3 + ... + wnxn
	2.2 Áp dụng hàm Sigmoid
	y^ = σ(z) = 1 / (1 + e^(-z))
	=> Kết quả y^ nằm trong khoảng (0, 1), được hiểu là xác suất đối tượng thuộc lớp 1.

	📊 3. QUYẾT ĐỊNH PHÂN LOẠI
	Nếu:
	y^ >= 0.5 ==> dự đoán 1
	y^ < 0.5 ==> dự đoán 0

	⚙️ 4. HÀM MẤT MÁT (LOSS FUNCTION)
	Ta dùng Binary Cross-Entropy Loss (Log Loss):
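	J(w) = -(1/m) * ∑_{i=1..m} [ y(i) * log(ŷ(i)) + (1 - y(i)) * log(1 - ŷ(i)) ]
	(trong đó m là số mẫu, y(i) là nhãn thật và ŷ(i) là xác suất dự đoán của mẫu thứ i)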
	Mục tiêu: tối thiểu hóa J(w) → tìm bộ trọng số w tốt nhất.

	5. QUÁ TRÌNH HỌC (TRAINING)
	Dùng Gradient Descent để cập nhật trọng số:
	w := w - α * ∂J/∂w # Trong đó: α là learning rate.
	'''

	# 💻 6. CODE MẪU PYTHON
	# Thư viện sử dụng cho Logistic Regression
	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt
	from sklearn.model_selection import train_test_split
	from sklearn.linear_model import LogisticRegression
	from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

	# Toy dataset: hours studied (1..10) and whether the exam was passed (1) or not (0)
	data = pd.DataFrame({
		'hours_studied': list(range(1, 11)),
		'pass_exam': [0] * 5 + [1] * 5,
	})
	X = data[['hours_studied']]
	y = data['pass_exam']

	# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
	X_train, X_test, y_train, y_test = train_test_split(
		X,
		y,
		test_size=0.2,
		random_state=42,
	)

	# Train the model (fit returns the estimator itself, so the call can be chained)
	model = LogisticRegression().fit(X_train, y_train)

	# Predict on the held-out rows and report the evaluation metrics
	y_pred = model.predict(X_test)
	print("Accuracy:", accuracy_score(y_test, y_pred))
	print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
	print("Classification Report:\n", classification_report(y_test, y_pred))

	# 📈 Ứng dụng thực tế của Logistic Regression
	'''
	Lĩnh vực   | Ứng dụng
	Y học      | Dự đoán bệnh (có/không)
	Email      | Phân loại spam/không spam
	Marketing  | Khách hàng mua/không mua
	Tài chính  | Dự đoán vỡ nợ/không vỡ nợ
	Nhận dạng  | Có khuôn mặt/không có khuôn mặt
	'''

	# Visualize the sigmoid function on 100 evenly spaced points in [-10, 10]
	z = np.linspace(-10, 10, num=100)
	sigmoid = 1 / (1 + np.exp(-z))

	# Draw the curve first, then decorate the axes and display the figure
	plt.plot(z, sigmoid)
	plt.grid()
	plt.xlabel('z')
	plt.ylabel('σ(z)')
	plt.title('Sigmoid Function')
	plt.show()
	''' → Đồ thị cong từ 0 → 1, giúp chuyển hóa giá trị tuyến tính thành xác suất. '''

	'''
	🧩 9. ƯU ĐIỂM & NHƯỢC ĐIỂM
	✅ Ưu điểm
	Đơn giản, dễ huấn luyện
	Dễ hiểu, giải thích rõ ràng
	Hiệu quả cho dữ liệu tuyến tính
	⚠️ Nhược điểm
	Kém hiệu quả với dữ liệu phi tuyến
	Không xử lý trực tiếp bài toán nhiều lớp (multi-class → phải dùng One-vs-Rest hoặc Multinomial/Softmax)
	Giả định quan hệ tuyến tính giữa biến độc lập và log-odds
	🧪 10. MỞ RỘNG
	Multinomial Logistic Regression → cho phân loại nhiều lớp
	Regularization (L1, L2) → chống overfitting
	Feature scaling → nên chuẩn hóa dữ liệu trước khi huấn luyện
	'''

	# Small application: predict the probability of passing after 7 hours of study
	hours = np.array([[7]])
	# The model was fitted on a DataFrame, so give the query the same column
	# labels; passing the bare ndarray makes scikit-learn warn that X does not
	# have valid feature names.
	hours_frame = pd.DataFrame(hours, columns=X.columns)
	pred = model.predict_proba(hours_frame)
	# predict_proba orders its columns like model.classes_, so column 1 is class 1 (pass)
	print(f"Xác suất đậu: {pred[0][1]*100:.2f}%")

	# 🧩 12. SO SÁNH VỚI LINEAR REGRESSION
	'''
	Đặc điểm      | Linear Regression    | Logistic Regression
	Đầu ra        | Giá trị liên tục     | Xác suất (0–1)
	Bài toán      | Dự đoán (Regression) | Phân loại (Classification)
	Hàm kích hoạt | Không có             | Sigmoid
	Hàm mất mát   | MSE                  | Log Loss
	'''

	# LOGISTIC REGRESSION (Dùng thư viện đơn giản)
	import numpy as np
	import matplotlib.pyplot as plt

	# Apply the ggplot look to the Matplotlib figures drawn from here on
	plt.style.use('ggplot')

	# Toy 1-D dataset: each row is [feature value, class label]
	dataset = np.array([
		[-10, 0], [-5, 0], [-7, 0], [0, 0], [-2, 0],
		[5, 1], [7, 1], [6, 1], [10, 1], [15, 1], [9, 1],
	])

	# The rows are ordered by class: the first five are class 0, the rest class 1
	negative_class, positive_class = np.split(dataset, [5])

	# Scatter the negative class first, then the positive class.
	# Transposing yields the feature column and the label column as x and y.
	plt.scatter(*negative_class.T, c='y', label='Class 0')
	plt.scatter(*positive_class.T, c='g', label='Class 1')
	plt.legend()
	plt.show()


	# -------------------- Logistic Regression Functions --------------------
	def get_prediction(m, b, x):
		"""
		Return the predicted probabilities y_hat for the given inputs.

		Applies the sigmoid function to the linear score ``m * x + b``.

		Args:
			m: Weight (slope) of the linear score.
			b: Bias (intercept) of the linear score.
			x: Input value or NumPy array of input values.

		Returns:
			``1 / (1 + exp(-(m * x + b)))``, element-wise for array input.
		"""
		z = m * x + b
		# sigmoid(z) = exp(-log(1 + exp(-z))). Using logaddexp avoids the
		# overflow that a direct np.exp(-z) hits for large negative scores.
		return np.exp(-np.logaddexp(0, -z))


	def get_cost(y, y_hat):
		"""
		Return the binary cross-entropy (log loss) averaged over the samples.

		Args:
			y: NumPy array of true labels (0 or 1).
			y_hat: NumPy array of predicted probabilities, same shape as ``y``.

		Returns:
			``-(1 / k) * sum(y * log(y_hat) + (1 - y) * log(1 - y_hat))``
			where ``k`` is the number of samples (``y.shape[0]``).
		"""
		k = y.shape[0]
		# Keep the probabilities strictly inside (0, 1): log(0) is -inf and
		# 0 * -inf evaluates to nan, which would poison the whole sum.
		eps = np.finfo(float).eps
		p = np.clip(y_hat, eps, 1 - eps)
		return (-1 / k) * np.sum(y * np.log(p) + (1 - y) * np.log(1 - p))


	def get_gradient(m, b, x, y, y_hat):
		"""
		Return the gradient of the loss function w.r.t. m and b.

		Args:
			m: Current weight. Not used in the computation; kept so the
				signature stays unchanged for callers.
			b: Current bias. Not used in the computation; kept for the
				same reason.
			x: NumPy array of input values.
			y: NumPy array of true labels.
			y_hat: NumPy array of predicted probabilities for ``x``.

		Returns:
			Tuple ``(dm, db)``: the mean of ``(y_hat - y) * x`` and the mean
			of ``(y_hat - y)`` over the ``y.shape[0]`` samples.
		"""
		k = y.shape[0]
		error = y_hat - y
		dm = (1 / k) * np.sum(error * x)
		db = (1 / k) * np.sum(error)
		return dm, db


	def get_accuracy(y, y_hat):
		"""
		Return the fraction of samples whose thresholded prediction equals the label.

		Args:
			y: NumPy array of true labels (0 or 1).
			y_hat: NumPy array of predicted probabilities, same shape as ``y``.

		Returns:
			Number of correct predictions divided by ``y.shape[0]``, where a
			probability of 0.5 or more counts as class 1.
		"""
		predicted = (y_hat >= 0.5).astype(int)
		return (predicted == y).sum() / y.shape[0]


	# -------------------- Gradient Descent --------------------
	# Initial parameters and hyper-parameters
	m = 1.0
	b = 10.0
	iterations = 200
	lr = 0.03
	# Column 0 of the dataset holds the feature, column 1 the class label
	x = dataset[:, 0]
	y = dataset[:, 1]
	costs = []

	for it in range(iterations):
		# Evaluate the current parameters before updating them
		y_hat = get_prediction(m, b, x)
		cost = get_cost(y, y_hat)
		accuracy = get_accuracy(y, y_hat)
		print(f"Iteration {it} - Cost: {cost:.4f}, Accuracy: {accuracy:.4f}")

		# Step both parameters against their gradients, scaled by the learning rate
		dm, db = get_gradient(m, b, x, y, y_hat)
		m -= lr * dm
		b -= lr * db
		costs.append(cost)

	# Plot cost over iterations
	plt.plot(costs)
	plt.xlabel("Iteration")
	plt.ylabel("Cost")
	plt.title("Cost Function over Time")
	plt.show()