# Machine Learning courses with 110+ Real-time projects Start Now!!
# Program 1
# YouTube Video Clustering Dataset
# YouTube Video Clustering Dataset 1
# YouTube Video Clustering by Views, Likes, and Watch Time
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# Step 1: Create synthetic YouTube video data
# A fixed seed makes every run generate the same 150 synthetic videos.
np.random.seed(42)
n_videos = 150
# The three random draws happen in dict order (Views, Likes, Watch Time);
# reordering the keys would change the generated values for a given seed.
data = {
    'VideoID': np.arange(1, n_videos + 1),
    # randint's upper bound is exclusive: views fall in [1000, 999999].
    'Views': np.random.randint(1000, 1000000, n_videos),
    # Likes fall in [100, 49999].
    'Likes': np.random.randint(100, 50000, n_videos),
    # Normal draw (mean 5000, std 1500) truncated to whole minutes.
    'Watch Time (mins)': np.random.normal(5000, 1500, n_videos).astype(int),
}
df = pd.DataFrame(data)
df.head()
# Save dataset
raw_csv_path = "D://scikit_data/KMeans/youtube_video_clustering.csv"
# to_csv does not create missing folders, so make sure the target
# directory exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(raw_csv_path), exist_ok=True)
df.to_csv(raw_csv_path, index=False)
# Step 2: Select features and scale them
# Only the three numeric engagement columns are clustered; VideoID is an
# identifier and is left out.
features = ['Views', 'Likes', 'Watch Time (mins)']
X = df.loc[:, features]
# Fit the scaler on the selected columns, then standardise them with the
# fitted scaler. The scaler is kept so centroids can be mapped back to the
# original units later.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_scaled
# Step 3: Apply KMeans clustering
# n_init is pinned explicitly: scikit-learn's default changed from 10 to
# 'auto' in version 1.4, so leaving it unset makes the resulting clusters
# depend on the installed version even with a fixed random_state.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
# Fit on the standardised features and store each video's cluster label.
df['Cluster'] = kmeans.fit_predict(X_scaled)
# Save the dataset again, now including the 'Cluster' column.
clustered_csv_path = "D://scikit_data/KMeans/youtube_video_clustering1.csv"
# to_csv does not create missing folders, so make sure the target
# directory exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(clustered_csv_path), exist_ok=True)
df.to_csv(clustered_csv_path, index=False)
# Step 4: Plotting the clusters using Views vs. Watch Time (2D view)
# Centroids live in standardised space; map them back to original units
# once so they can be drawn on the same axes as the raw data.
centroids = scaler.inverse_transform(kmeans.cluster_centers_)

fig, ax = plt.subplots(figsize=(8, 6))
# One point per video, coloured by its assigned cluster.
ax.scatter(
    df['Views'],
    df['Watch Time (mins)'],
    c=df['Cluster'],
    cmap='viridis',
    s=60,
)
# Column 0 is 'Views' and column 2 is 'Watch Time (mins)', following the
# order of the feature list the scaler was fitted on.
ax.scatter(
    centroids[:, 0],
    centroids[:, 2],
    s=200,
    c='red',
    marker='X',
    label='Centroids',
)
ax.set_title('YouTube Video Clustering by Views and Watch Time')
ax.set_xlabel('Views')
ax.set_ylabel('Watch Time (mins)')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()