Implementasi K-Means dengan Python
Setelah memahami konsep melalui simulasi interaktif, mari kita lihat bagaimana mengimplementasikan K-Means menggunakan Python. Berikut adalah contoh kode lengkap yang bisa Anda coba langsung di Google Colab.
Coba Langsung di Google Colab
Di notebook tersebut Anda akan menemukan pembahasan K-Means yang lengkap: implementasi algoritma dari nol (from scratch), visualisasi langkah demi langkah, studi kasus segmentasi pelanggan ritel, dan perbandingan dengan library scikit-learn.
Contoh Code: Implementasi K-Means dari Scratch
import numpy as np
import matplotlib.pyplot as plt
def find_closest_centroids(X, centroids):
    """
    Find the index of the closest centroid for each data point.

    Closeness is measured with the Euclidean distance. When a point is
    equally close to several centroids, the lowest centroid index wins
    (``np.argmin`` returns the first minimum).

    Args:
        X (ndarray): (m, n) Input values
        centroids (ndarray): (K, n) centroids

    Returns:
        idx (ndarray): (m,) integer index of the closest centroid per point
    """
    # Broadcast (m, 1, n) against (1, K, n) so every point-to-centroid
    # difference is computed at once instead of looping over m * K pairs.
    diffs = X[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    distances = np.linalg.norm(diffs, axis=2)  # shape (m, K)

    # Cast to the default integer type so the label dtype matches an
    # array created with ``dtype=int``.
    return np.argmin(distances, axis=1).astype(int)
def compute_centroids_means(X, idx, K, fallback_centroids=None):
    """
    Return the new centroids by computing the means of the
    data points assigned to each centroid.

    Args:
        X (ndarray): (m, n) data points, one per row
        idx (ndarray): (m,) cluster index assigned to each row of X
        K (int): number of centroids
        fallback_centroids (array_like, optional): values used for clusters
            that received no points; must broadcast to shape (K, n), e.g.
            the centroids of the previous iteration. When omitted, the row
            of an empty cluster is NaN.

    Returns:
        centroids (ndarray): (K, n) float array of cluster means
    """
    _, n = X.shape

    # Start from the value an empty cluster should end up with, then
    # overwrite only the rows of clusters that actually have members.
    centroids = np.full((K, n), np.nan)
    if fallback_centroids is not None:
        centroids[...] = fallback_centroids

    for k in range(K):
        members = X[idx == k]
        # Skip empty clusters: the mean of zero points is undefined and
        # np.mean would emit a RuntimeWarning for it.
        if len(members) > 0:
            centroids[k] = members.mean(axis=0)

    return centroids
def kMeans(X, initial_centroids, max_iters=10):
    """
    Run the K-Means algorithm on data matrix X.

    Each round assigns every point to its closest centroid and then moves
    each centroid to the mean of the points assigned to it. Progress is
    printed once per round.

    Args:
        X (ndarray): (m, n) data points, one per row
        initial_centroids (array_like): (K, n) starting centroids
        max_iters (int): number of assignment/update rounds to run

    Returns:
        centroids (ndarray): (K, n) centroids after the last round
        idx (ndarray): (m,) integer cluster index assigned to each point
    """
    m, _ = X.shape
    centroids = np.asarray(initial_centroids, dtype=float)
    K = centroids.shape[0]
    # Integer dtype so the result is a valid label array even when
    # max_iters is 0 and no assignment step ever runs.
    idx = np.zeros(m, dtype=int)

    for i in range(max_iters):
        print(f"K-Means iteration {i}/{max_iters-1}")

        # Assign points to closest centroids
        idx = find_closest_centroids(X, centroids)

        # Update centroids
        updated = compute_centroids_means(X, idx, K)

        # A cluster that received no points has no mean; keep its previous
        # centroid instead of whatever placeholder the update step produced
        # (a NaN centroid would corrupt every later distance comparison).
        empty = np.bincount(idx, minlength=K)[:K] == 0
        updated[empty] = centroids[empty]
        centroids = updated

    return centroids, idx
Contoh Penggunaan: Customer Segmentation
# Customer data: [annual income, annual spending]
X = np.array([
    [7.29, 2.75], [6.20, 2.68], [5.39, 2.27], [5.67, 2.96],
    [6.60, 3.07], [7.76, 3.16], [6.63, 3.15], [5.77, 3.14],
    # ... remaining data
])

# Initial centroids (one row per cluster, so K = 2 here)
initial_centroids = np.array([[5.39, 2.27], [5.53, 3.34]])

# Run K-Means
centroids, idx = kMeans(X, initial_centroids, max_iters=10)

# Visualize the result
colors = ['red', 'blue']
# Iterate over the labels that actually occur, so every cluster keeps its
# own colour and number even if another cluster ended up with no points.
for i in np.unique(idx):
    cluster_points = X[idx == i]
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1],
                c=colors[i], label=f'Cluster {i+1}')

plt.scatter(centroids[:, 0], centroids[:, 1],
            c='black', marker='x', s=200, label='Centroids')
plt.xlabel('Pendapatan Tahunan (ribuan $)')
plt.ylabel('Pengeluaran Tahunan (ratusan $)')
plt.legend()
plt.title('Customer Segmentation dengan K-Means')
plt.show()
Analisis Hasil
Dari implementasi di atas, kita bisa mengidentifikasi 2 segmen pelanggan:
Pengeluaran sangat dipengaruhi pendapatan. Target: Produk value-for-money, promosi diskon
Pengeluaran relatif stabil terlepas dari pendapatan. Target: Produk premium, layanan eksklusif
Evaluasi Model
def calculate_inertia(X, centroids, idx):
    """
    Return the Within-Cluster Sum of Squares (WCSS), also called inertia.

    Adds up, over all rows of X, the squared Euclidean distance between
    the row and the centroid selected by the matching entry of idx.
    """
    squared_distances = (
        np.linalg.norm(point - centroids[idx[row]]) ** 2
        for row, point in enumerate(X)
    )
    return sum(squared_distances)
# Compute the inertia (WCSS) for the centroids and assignments
inertia = calculate_inertia(X, centroids, idx)
print(f"Inertia: {inertia:.2f}")
Perbandingan dengan Scikit-Learn
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
# Generate sample data
X, _ = make_blobs(n_samples=300, centers=4, cluster_std=0.6, random_state=0)

# Scikit-learn implementation
# n_init is passed explicitly: its default changed from 10 to 'auto' in
# scikit-learn 1.4, and leaving it unset triggers a FutureWarning on the
# releases just before that. Pinning it keeps results version-independent.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
labels = kmeans.fit_predict(X)
centroids_sklearn = kmeans.cluster_centers_

# Visualization: points coloured by predicted label, centroids as red crosses
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', alpha=0.6)
plt.scatter(centroids_sklearn[:, 0], centroids_sklearn[:, 1],
            c='red', marker='x', s=200)
plt.title('K-Means dengan Scikit-Learn')
plt.show()
Resources Tambahan