ML Project – Lifestyle Health Risk Prediction using K-means Clustering

DataFlair Team
11 months ago
Machine Learning courses with 110+ real-time projects. Start now!
Program 1

import os  # For creating the CSV output directory when it is missing.

import matplotlib.pyplot as plt # For plotting graphs.
import numpy as np #For numerical operations and random data generation.
import pandas as pd # For handling tabular data.
from sklearn.cluster import KMeans   # Unsupervised clustering algorithm from sklearn.
from sklearn.preprocessing import StandardScaler  #Used to normalize features (mean = 0, std = 1).

# Build a reproducible synthetic lifestyle dataset, one row per person.
# The columns are drawn in the order listed so the seeded random stream is
# consumed the same way on every run:
#   Exercise (hrs/week)    - normal(mean=3, std=1.5), clipped to [0, 10]
#   Sleep (hrs/day)        - normal(mean=7, std=1.0), clipped to [3, 10]
#   Junk Food (times/week) - random integers 0..7 (the upper bound 8 is exclusive)
#   Screen Time (hrs/day)  - normal(mean=6, std=2.0), clipped to [2, 14]
np.random.seed(42)  # Fixed seed so the generated data is identical each run.
n_people = 200  # Number of synthetic individuals.

data = {'PersonID': np.arange(1, n_people + 1)}  # Sequential ids 1..n_people.
data['Exercise (hrs/week)'] = np.clip(np.random.normal(3, 1.5, n_people), 0, 10)
data['Sleep (hrs/day)'] = np.clip(np.random.normal(7, 1.0, n_people), 3, 10)
data['Junk Food (times/week)'] = np.random.randint(0, 8, n_people)
data['Screen Time (hrs/day)'] = np.clip(np.random.normal(6, 2.0, n_people), 2, 14)

df = pd.DataFrame(data)
df  # Notebook-style display of the frame; has no effect when run as a script.

# Save the raw synthetic dataset as CSV. DataFrame.to_csv does not create
# missing parent folders, so make sure the target directory exists first.
raw_csv_path = 'D://scikit_data/KMeans/lifestyle_health_risk.csv'
os.makedirs(os.path.dirname(raw_csv_path), exist_ok=True)
df.to_csv(raw_csv_path, index=False)  # index=False: do not write the row index.

# Feature Selection & Normalization

# Cluster on the four lifestyle columns only; PersonID is an identifier and is
# left out. StandardScaler rescales each column to mean 0 / std 1 so that every
# habit contributes equally during clustering regardless of its units.
features = [
    'Exercise (hrs/week)',
    'Sleep (hrs/day)',
    'Junk Food (times/week)',
    'Screen Time (hrs/day)',
]
X = df[features]
scaler = StandardScaler().fit(X)  # Learn per-column mean and std.
X_scaled = scaler.transform(X)  # Standardized array used for clustering.
X_scaled  # Notebook-style display; has no effect when run as a script.

# Apply K-Means clustering: split the standardized data into 3 groups.
# n_init is pinned explicitly because scikit-learn changed its default from 10
# to 'auto' in version 1.4 (and emitted a FutureWarning in 1.2/1.3). With the
# value fixed, the number of initializations no longer depends on which
# default the installed release uses.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
# The stored values are the cluster ids (0, 1, 2) assigned by K-Means; the
# numbers themselves are arbitrary labels and do not rank the groups by risk.
df['Health Risk Category'] = kmeans.fit_predict(X_scaled)
# Save the dataset again, now including the cluster label column.
# DataFrame.to_csv does not create missing parent folders, so make sure the
# target directory exists first.
labelled_csv_path = 'D://scikit_data/KMeans/lifestyle_health_risk1.csv'
os.makedirs(os.path.dirname(labelled_csv_path), exist_ok=True)
df.to_csv(labelled_csv_path, index=False)  # index=False: do not write the row index.

# Scatter plot of sleep against junk-food frequency, one point per person,
# coloured by assigned cluster, with the cluster centres overlaid. It gives a
# visual impression of how the groups differ in these two habits.

# The centres are stored in standardized space; map them back to the original
# units once. Columns follow the order of `features`, so column 1 is sleep and
# column 2 is junk food.
centroids = scaler.inverse_transform(kmeans.cluster_centers_)

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(
    df['Sleep (hrs/day)'],
    df['Junk Food (times/week)'],
    c=df['Health Risk Category'],
    cmap='coolwarm',
    s=60,
)
ax.scatter(
    centroids[:, 1],
    centroids[:, 2],
    s=200,
    c='black',
    marker='X',
    label='Centroids',
)

ax.set_title('Lifestyle Clustering: Sleep vs. Junk Food')
ax.set_xlabel('Sleep (hrs/day)')
ax.set_ylabel('Junk Food (times/week)')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()