import pandas as pd
# Load the bank-churn table, keyed by the client number column.
data = pd.read_csv('BankChurners.csv', index_col='CLIENTNUM')
data
Attrition_Flag Customer_Age Gender Dependent_count Education_Level Marital_Status Income_Category Card_Category Months_on_book Total_Relationship_Count ... Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
CLIENTNUM
768805383 Existing Customer 45 M 3 High School Married $60K - $80K Blue 39 5 ... 12691.0 777 11914.0 1.335 1144 42 1.625 0.061 0.000093 0.999910
818770008 Existing Customer 49 F 5 Graduate Single Less than $40K Blue 44 6 ... 8256.0 864 7392.0 1.541 1291 33 3.714 0.105 0.000057 0.999940
713982108 Existing Customer 51 M 3 Graduate Married $80K - $120K Blue 36 4 ... 3418.0 0 3418.0 2.594 1887 20 2.333 0.000 0.000021 0.999980
769911858 Existing Customer 40 F 4 High School Unknown Less than $40K Blue 34 3 ... 3313.0 2517 796.0 1.405 1171 20 2.333 0.760 0.000134 0.999870
709106358 Existing Customer 40 M 3 Uneducated Married $60K - $80K Blue 21 5 ... 4716.0 0 4716.0 2.175 816 28 2.500 0.000 0.000022 0.999980
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
772366833 Existing Customer 50 M 2 Graduate Single $40K - $60K Blue 40 3 ... 4003.0 1851 2152.0 0.703 15476 117 0.857 0.462 0.000191 0.999810
710638233 Attrited Customer 41 M 2 Unknown Divorced $40K - $60K Blue 25 4 ... 4277.0 2186 2091.0 0.804 8764 69 0.683 0.511 0.995270 0.004729
716506083 Attrited Customer 44 F 1 High School Married Less than $40K Blue 36 5 ... 5409.0 0 5409.0 0.819 10291 60 0.818 0.000 0.997880 0.002118
717406983 Attrited Customer 30 M 2 Graduate Unknown $40K - $60K Blue 36 4 ... 5281.0 0 5281.0 0.535 8395 62 0.722 0.000 0.996710 0.003294
714337233 Attrited Customer 43 F 2 Graduate Married Less than $40K Silver 25 6 ... 10388.0 1961 8427.0 0.703 10294 61 0.649 0.189 0.996620 0.003377

10127 rows × 22 columns

# Print column names, non-null counts and dtypes of the loaded frame.
data.info()
<class 'pandas.core.frame.DataFrame'>
Index: 10127 entries, 768805383 to 714337233
Data columns (total 22 columns):
 #   Column                                                                                                                              Non-Null Count  Dtype  
---  ------                                                                                                                              --------------  -----  
 0   Attrition_Flag                                                                                                                      10127 non-null  object 
 1   Customer_Age                                                                                                                        10127 non-null  int64  
 2   Gender                                                                                                                              10127 non-null  object 
 3   Dependent_count                                                                                                                     10127 non-null  int64  
 4   Education_Level                                                                                                                     10127 non-null  object 
 5   Marital_Status                                                                                                                      10127 non-null  object 
 6   Income_Category                                                                                                                     10127 non-null  object 
 7   Card_Category                                                                                                                       10127 non-null  object 
 8   Months_on_book                                                                                                                      10127 non-null  int64  
 9   Total_Relationship_Count                                                                                                            10127 non-null  int64  
 10  Months_Inactive_12_mon                                                                                                              10127 non-null  int64  
 11  Contacts_Count_12_mon                                                                                                               10127 non-null  int64  
 12  Credit_Limit                                                                                                                        10127 non-null  float64
 13  Total_Revolving_Bal                                                                                                                 10127 non-null  int64  
 14  Avg_Open_To_Buy                                                                                                                     10127 non-null  float64
 15  Total_Amt_Chng_Q4_Q1                                                                                                                10127 non-null  float64
 16  Total_Trans_Amt                                                                                                                     10127 non-null  int64  
 17  Total_Trans_Ct                                                                                                                      10127 non-null  int64  
 18  Total_Ct_Chng_Q4_Q1                                                                                                                 10127 non-null  float64
 19  Avg_Utilization_Ratio                                                                                                               10127 non-null  float64
 20  Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1  10127 non-null  float64
 21  Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2  10127 non-null  float64
dtypes: float64(7), int64(9), object(6)
memory usage: 1.8+ MB
# Show the distinct values in the Attrition_Flag column.
data['Attrition_Flag'].unique()
array(['Existing Customer', 'Attrited Customer'], dtype=object)
# Show the distinct values in the Education_Level column.
data['Education_Level'].unique()
array(['High School', 'Graduate', 'Uneducated', 'Unknown', 'College',
       'Post-Graduate', 'Doctorate'], dtype=object)
# Show the distinct values in the Marital_Status column.
data['Marital_Status'].unique()
array(['Married', 'Single', 'Unknown', 'Divorced'], dtype=object)
# Show the distinct values in the Income_Category column.
data['Income_Category'].unique()
array(['$60K - $80K', 'Less than $40K', '$80K - $120K', '$40K - $60K',
       '$120K +', 'Unknown'], dtype=object)
# Show the distinct values in the Card_Category column.
data['Card_Category'].unique()
array(['Blue', 'Gold', 'Silver', 'Platinum'], dtype=object)

33. Clustering

# Feature table for clustering: every column except the attrition flag and the
# two precomputed Naive Bayes classifier columns. `drop` returns a new frame,
# so `data` itself is left untouched.
excluded_columns = [
    'Attrition_Flag',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
]
data2 = data.drop(columns=excluded_columns)
data2.head()
Customer_Age Gender Dependent_count Education_Level Marital_Status Income_Category Card_Category Months_on_book Total_Relationship_Count Months_Inactive_12_mon Contacts_Count_12_mon Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio
CLIENTNUM
768805383 45 M 3 High School Married $60K - $80K Blue 39 5 1 3 12691.0 777 11914.0 1.335 1144 42 1.625 0.061
818770008 49 F 5 Graduate Single Less than $40K Blue 44 6 1 2 8256.0 864 7392.0 1.541 1291 33 3.714 0.105
713982108 51 M 3 Graduate Married $80K - $120K Blue 36 4 1 0 3418.0 0 3418.0 2.594 1887 20 2.333 0.000
769911858 40 F 4 High School Unknown Less than $40K Blue 34 3 4 1 3313.0 2517 796.0 1.405 1171 20 2.333 0.760
709106358 40 M 3 Uneducated Married $60K - $80K Blue 21 5 1 0 4716.0 0 4716.0 2.175 816 28 2.500 0.000
# One-hot encode the categorical columns of the feature table.
data2 = pd.get_dummies(data=data2)
from sklearn.cluster import KMeans
import numpy as np

# First attempt: five clusters fitted directly on the unscaled features.
kmeans = KMeans(random_state=42, n_clusters=5)
kmeans.fit(data2)
KMeans(n_clusters=5, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Assign every row of data2 to one of the fitted clusters and display the labels.
pred = kmeans.predict(data2)
pred
array([2, 3, 0, ..., 3, 3, 3], dtype=int32)
# prompt: fit a k-means with 5 clusters on data2 using a standard scaler, without using sklearn pipelines

from sklearn.preprocessing import StandardScaler

# Standardize the features: learn the per-column statistics and apply them
# in a single step.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data2)

# Cluster the standardized features into five groups.
kmeans = KMeans(n_clusters=5, n_init='auto', random_state=42)
kmeans.fit(scaled_data)

# Label every row with its cluster and display the result.
pred = kmeans.predict(scaled_data)
pred
# prompt: fit a k-means with 5 clusters on data2 using a standard scaler and scikit-learn pipelines

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Scaling and clustering chained into a single estimator.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('kmeans', KMeans(n_clusters=5, n_init='auto', random_state=42)),
])

# `fit` returns the pipeline itself, so fitting and labelling can be chained.
pred = pipeline.fit(data2).predict(data2)
pred
array([4, 1, 4, ..., 0, 4, 3], dtype=int32)
# Display the fitted pipeline.
pipeline
Pipeline(steps=[('scaler', StandardScaler()),
                ('kmeans', KMeans(n_clusters=5, random_state=42))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Look up the KMeans step of the pipeline by its step name.
pipeline.named_steps['kmeans']
KMeans(n_clusters=5, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# prompt: find the best K with the elbow method

import matplotlib.pyplot as plt

# Inertia of the k-means step for every candidate number of clusters
k_values = range(1, 101)  # candidate cluster counts: 1 through 100
inertia = []

for k in k_values:
    # Scale and cluster from scratch for this k
    pipeline = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('kmeans', KMeans(n_clusters=k, n_init='auto', random_state=42)),
    ])
    # `fit` returns the pipeline, so the fitted k-means step can be read directly
    inertia.append(pipeline.fit(data2).named_steps['kmeans'].inertia_)

# Draw the elbow curve
plt.plot(k_values, inertia, marker='o')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.show()
../_images/bea194235237bdb3a9f77d90673aec60f724083187d1451238f18ba0ba481137.png
# prompt: find the best k with the silhouette score

from sklearn.metrics import silhouette_score
from sklearn.metrics import pairwise_distances

# Calculate silhouette scores for different values of k
silhouette_scores = []
k_values = range(2, 21)  # Test k values from 2 to 20 (Silhouette score is not defined for k=1)

# The pipeline clusters standardized features, so the silhouette has to be
# measured in that same space. Fitting a StandardScaler on data2 reproduces
# the scaling the pipeline applies internally. The distance matrix does not
# depend on k, so it is computed once and reused for every candidate.
pwd = pairwise_distances(StandardScaler().fit_transform(data2))

for k in k_values:
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('kmeans', KMeans(n_clusters=k, n_init='auto', random_state=42))
    ])
    pipeline.fit(data2)
    labels = pipeline.predict(data2)
    silhouette_avg = silhouette_score(pwd, labels, metric='precomputed')
    silhouette_scores.append(silhouette_avg)

# Find the best k based on the highest silhouette score
best_k = k_values[np.argmax(silhouette_scores)]
print(f"Best k based on Silhouette score: {best_k}")

# Plot the silhouette scores
plt.plot(k_values, silhouette_scores, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score for Optimal k')
plt.show()
Best k based on Silhouette score: 2
../_images/992464b796ee5e3eb6285e6ea0ec624fc59d1485282c31830cdc7a3711807de2.png
# prompt: perform the silhouette analysis for k=range(1,6)
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_samples, silhouette_score

# Silhouette analysis of the scaler + k-means pipeline: one silhouette plot per
# k, followed by a summary curve of the average score.
silhouette_scores = []
k_values = range(2, 6)  # Test k values from 2 to 5 (Silhouette score is not defined for k=1)

for k in k_values:
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('kmeans', KMeans(n_clusters=k, n_init='auto', random_state=42))
    ])
    pipeline.fit(data2)
    cluster_labels = pipeline.predict(data2)

    # k-means ran on the standardized features, so the silhouette is measured
    # in that same space; on raw data2 the large-magnitude columns would
    # dominate the distances.
    scaled_features = pipeline.named_steps['scaler'].transform(data2)
    sample_silhouette_values = silhouette_samples(scaled_features, cluster_labels)
    # The average silhouette is the mean of the per-sample values, which avoids
    # a second full pairwise-distance pass through silhouette_score.
    silhouette_avg = sample_silhouette_values.mean()
    silhouette_scores.append(silhouette_avg)

    fig, ax1 = plt.subplots(1, 1)
    fig.set_size_inches(18, 7)

    # Stack the clusters vertically, leaving a 10-row gap between them so the
    # individual silhouettes are clearly separated.
    y_lower = 10
    for i in range(k):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / k)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title(f"The silhouette plot for k={k}")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

# Find the best k based on the highest silhouette score
best_k = k_values[np.argmax(silhouette_scores)]
print(f"Best k based on Silhouette score: {best_k}")

# Plot the silhouette scores
plt.figure(figsize=(10, 6))
plt.plot(k_values, silhouette_scores, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score for Optimal k')
plt.show()
Best k based on Silhouette score: 2
../_images/9226934363593c0aa46462969f04d11e1f346a14539b8a6e48d97a6635716940.png ../_images/124a91687179e941853b787fed44243f9136b6899a151a96c049c9dfd484a565.png ../_images/fd5c95ef0282fa30b331ab8bb7b44b44d64bfa0014e6a9fa86deb83cdc4e29ce.png ../_images/08c716f733ff9c2604487d0c633c47478620ce2931c09adcac2964499427a768.png ../_images/5f0c212da0d3dea45b4a44237ff609e2bea11b4817c0ec69443538e9f05be7a0.png
# prompt: can you find the best k for a GMM with elbow?

from sklearn.mixture import GaussianMixture

# Standardize once; every mixture below is fitted on the same matrix
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data2)

# BIC of a Gaussian mixture for each candidate number of components
k_values = range(2, 11)  # candidate component counts: 2 through 10
bic_scores = []

for k in k_values:
    # `fit` returns the estimator, so construction and fitting can be chained
    gmm = GaussianMixture(n_components=k, random_state=42, n_init=10).fit(scaled_data)
    bic_scores.append(gmm.bic(scaled_data))

# The candidate with the lowest BIC is reported as the best one
best_k = k_values[np.argmin(bic_scores)]
print(f"Best k based on BIC score: {best_k}")

# Draw the BIC curve
plt.plot(k_values, bic_scores, marker='o')
plt.title('BIC Score for Optimal k in GMM')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('BIC Score')
plt.show()
Best k based on BIC score: 10
../_images/f6052bdda36052ee19ea4e0b6ab72afb40b6ea383a5a2d35363e56feeaf75596.png
# prompt: perform the silhouette analysis for k=range(1,6)
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_samples, silhouette_score

# Silhouette analysis of Gaussian mixtures on the standardized features: one
# silhouette plot per k, followed by a summary curve of the average score.
silhouette_scores = []
k_values = range(2, 6)  # Test k values from 2 to 5 (Silhouette score is not defined for k=1)

scaler = StandardScaler()
scaled_data = scaler.fit_transform(data2)

for k in k_values:
    gmm = GaussianMixture(n_components=k, random_state=42, n_init=10)
    gmm.fit(scaled_data)
    # Labels come from the mixture fitted just above, on the same standardized
    # matrix it was trained on.
    cluster_labels = gmm.predict(scaled_data)

    # Measure the silhouette in the standardized space the mixture was fitted
    # in. The average is the mean of the per-sample values, which avoids a
    # second full pairwise-distance pass through silhouette_score.
    sample_silhouette_values = silhouette_samples(scaled_data, cluster_labels)
    silhouette_avg = sample_silhouette_values.mean()
    silhouette_scores.append(silhouette_avg)

    fig, ax1 = plt.subplots(1, 1)
    fig.set_size_inches(18, 7)

    # Stack the clusters vertically, leaving a 10-row gap between them so the
    # individual silhouettes are clearly separated.
    y_lower = 10
    for i in range(k):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / k)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title(f"The silhouette plot for k={k}")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

# Find the best k based on the highest silhouette score
best_k = k_values[np.argmax(silhouette_scores)]
print(f"Best k based on Silhouette score: {best_k}")

# Plot the silhouette scores
plt.figure(figsize=(10, 6))
plt.plot(k_values, silhouette_scores, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score for Optimal k')
plt.show()
Best k based on Silhouette score: 2
../_images/28c6de6ec73cb1b09ea454b1edcec5640d4b8785c8f532583df44c3c77670ae0.png ../_images/aa2db5e0f78788c45d3b74127fbb1a670091c1afe4fef2adde3938077717f6ab.png ../_images/78662f63d6094af0409ea6ba5cb384616cfb40e061797cf9d77c4fc7e26e58dc.png ../_images/08c716f733ff9c2604487d0c633c47478620ce2931c09adcac2964499427a768.png ../_images/a3c792b81e4b0cfa16afea5942a4219f8419eb0a0193077f4e16acc932896961.png
# prompt: run PCA on the "data2" dataset and show a scree plot

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize the features before the decomposition
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data2)

# Fit a PCA that keeps every component
pca = PCA()
pca.fit(scaled_data)

# Cumulative explained variance. The x axis starts at 1 so that each point
# reads as "variance explained by the first n components"; plotting the array
# alone would index it from 0 and shift every reading by one component.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure(figsize=(10, 6))
plt.plot(component_counts, cumulative_variance)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Scree Plot')
plt.grid(True)
plt.show()
../_images/fe58abf8b40d2bc51ab5afe7fbd7812444fe636547b53a2e8a7b4b957fa8cf00.png
# Cumulative explained-variance ratio at zero-based index 24, i.e. of the first 25 components.
np.cumsum(pca.explained_variance_ratio_)[24]
0.9472334124009314
# Keep the first 25 principal components and project the standardized data
# onto them (`fit` returns the estimator itself).
pca = PCA(n_components=25).fit(scaled_data)
x_pca = pca.transform(scaled_data)
# prompt: perform the silhouette analysis for k=range(1,6)
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_samples, silhouette_score

# Silhouette analysis of k-means on the PCA scores: one silhouette plot per k,
# followed by a summary curve of the average score.
silhouette_scores = []
k_values = range(2, 6)  # Test k values from 2 to 5 (Silhouette score is not defined for k=1)

# Keep only the first 25 components. A PCA that keeps every component is a
# pure rotation of the standardized data, so k-means on it would be equivalent
# to clustering scaled_data directly and the reduction would have no effect.
pca = PCA(n_components=25)
pca.fit(scaled_data)

x_pca = pca.transform(scaled_data)

for k in k_values:
    kmeans = KMeans(n_clusters=k, n_init='auto', random_state=42)
    kmeans.fit(x_pca)

    cluster_labels = kmeans.predict(x_pca)
    # Silhouette measured on the same PCA scores that were clustered. The
    # average is the mean of the per-sample values, which avoids a second full
    # pairwise-distance pass through silhouette_score.
    sample_silhouette_values = silhouette_samples(x_pca, cluster_labels)
    silhouette_avg = sample_silhouette_values.mean()
    silhouette_scores.append(silhouette_avg)

    fig, ax1 = plt.subplots(1, 1)
    fig.set_size_inches(18, 7)

    # Stack the clusters vertically, leaving a 10-row gap between them so the
    # individual silhouettes are clearly separated.
    y_lower = 10
    for i in range(k):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / k)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title(f"The silhouette plot for k={k}")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

# Find the best k based on the highest silhouette score
best_k = k_values[np.argmax(silhouette_scores)]
print(f"Best k based on Silhouette score: {best_k}")

# Plot the silhouette scores
plt.figure(figsize=(10, 6))
plt.plot(k_values, silhouette_scores, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score for Optimal k')
plt.show()
Best k based on Silhouette score: 3
../_images/6e56293edd8f60ab6da0ae64411fe558d8db8bf2d247ffcb864ea1767137d117.png ../_images/31f31f3386332aebd4c7ba050e70e6704831ae169a3ec272d1fb9bf6b981e984.png ../_images/8ef470263795f4ff85f1a85b11d4cc492f8b905060c2fb368139b20f3deed25e.png ../_images/d564800fbe232688e245b5169f4a00a391e629781069bf8423771bef85597e4f.png ../_images/0e04f39b4fbdac744b1c9b3b57c49041b2dc04e84b5a790fcde0534afe2733a3.png
import seaborn as sns

# Final model: three clusters fitted on the PCA scores
# (`fit` returns the estimator itself).
kmeans = KMeans(n_clusters=3, n_init='auto', random_state=42).fit(x_pca)
cluster_labels = kmeans.predict(x_pca)

# Show the clusters on the first two PCA score columns.
sns.scatterplot(
    x=x_pca[:, 0],
    y=x_pca[:, 1],
    hue=cluster_labels,
    palette=sns.color_palette('hls', 3),
)
<Axes: >
../_images/2ae42be87a20d7f4d2cb12c9baf164d5e42bdba239c0e2c5b9f9923d47e6d8b8.png
# Attach the cluster assignment to both the encoded and the original frame.
for frame in (data2, data):
    frame['cluster'] = cluster_labels
# Covariance matrix of the PCA score columns, multiplied by 100 and cast to
# int32 for a compact display.
(np.cov(x_pca.T)*100).astype(np.int32)
array([[438,   0,   0, ...,   0,   0,   0],
       [  0, 255,   0, ...,   0,   0,   0],
       [  0,   0, 194, ...,   0,   0,   0],
       ...,
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0]], dtype=int32)
# Shape of the array made of the first two PCA score columns.
x_pca[:,[0,1]].shape
(10127, 2)
# prompt: can you plot boxplots for numeric variables in data2 grouped by cluster?

import matplotlib.pyplot as plt
import seaborn as sns

# Numeric columns of the original frame. 'cluster' is numeric too, so it is
# removed: plotting the grouping variable against itself carries no
# information.
numeric_cols = data.select_dtypes(include=['number']).columns.drop('cluster')

# One notched boxplot per numeric variable, split by cluster
for col in numeric_cols:
    plt.figure(figsize=(8, 6))
    sns.boxplot(x='cluster', y=col, data=data, notch=True)
    plt.title(f'Boxplot of {col} by Cluster')
    plt.show()
../_images/9455351ca9ec43e5300825ccad245c4f9b1669fa82179da2106d41be95408565.png ../_images/91d6ae0a459042e7203c81db46716fd44fb3c67a5dbdac6da6153d9eb51bea89.png ../_images/27d4940564194634dc7e00a7b9c79ea7d3ac1c5953df573ae844ebb28b879bc4.png ../_images/dd11271ff589d8c9a73ff19f2addf6e014ed0d353d0197241acb894f84eef1cd.png ../_images/293a82ccc50442150f68767494027e18880ca407273fdc5e43187239d9d927bb.png ../_images/cbfdadcc160e9e680d11b3186170a0e317ec8553accf3f06a3c12bf414ae6366.png ../_images/e83f55791cacfed26815ff961875614504446bb5944b220419cc1580d56f2349.png ../_images/174aa05fe1e1c90e65ed28b09f8a255f78f7c1fbbc105d703089bb28a96faea3.png ../_images/64c5f6ef050d3391c1264172e1fb3c2152d263a89284404af9e8a1148b37e090.png ../_images/2a6b25b54de7a4949ac1e9d35dbca776d717a2d7b31daa96da1f2cca4d09d0c2.png ../_images/69fbd35f6fb66931edea8b305205b5d19dbec0c1957f9aa7e1fab0626920bdec.png ../_images/81ff54c7d717f059ebf7d5e5197d064256b1209be2dbcfbcb477dea3589b682c.png ../_images/9e37905675aec4e7e15c82328fd213676c160befbd21c45eefbebd6de7bf4e0a.png ../_images/c975d95a0d13d459e4420002f6009438c8001b6267a9dcdfa9f737c2a400ded3.png ../_images/1ea400077b43796ed6360a29bbac73d3dd7b21cfcd2cd184ea18919e55eec1a4.png ../_images/932c13f5b34329da76b874dafaeab7b8e2566812beb0bea375b82f5b70e90dbb.png ../_images/f2c689765e955130f8da149d80063601a169bbdd1a86fecf52c9c14dbe4b06af.png
# prompt: can you plot density estimates for numeric variables in data2 grouped by cluster?

# Numeric columns of the encoded frame. 'cluster' is excluded: within a single
# cluster it is constant, so its density estimate is skipped with a
# zero-variance warning and the figure ends up with an empty legend.
numeric_cols = data2.select_dtypes(include=['number']).columns.drop('cluster')

# One figure per numeric variable, with one density curve per cluster.
# groupby yields the clusters in sorted order, which keeps the legend stable.
for col in numeric_cols:
    plt.figure(figsize=(8, 6))
    for cluster, members in data2.groupby('cluster'):
        sns.kdeplot(members[col], label=f'Cluster {cluster}')
    plt.title(f'Density Plot of {col} by Cluster')
    plt.xlabel(col)
    plt.ylabel('Density')
    plt.legend()
    plt.show()
../_images/d1385f88d3263ebcfaf965884455881c79101b5bb4f56e5270e655b7c125187e.png ../_images/440f79173c318d05848d70a5ca5bf95d5fc327149de7575c5be08696e869fced.png ../_images/c4de583cc7899a14b0b05e3ac9152fd27271d30a4d82c82324450b7e945fbe24.png ../_images/52a13d8133efc38ce9370dee3f49ebae0174effcb6ecac2bb33f57ac752f6943.png ../_images/e1292f822de7d494ecf4d5a66534bdf9844bba4aa8b0e2f522a4108c857098e1.png ../_images/d77faa35f7058fbb1af988ac4701514a06a15dff3be2cb1c0bff60da03d8ee69.png ../_images/a2a2d15281677bac3a138558b9f5b6a113ae1af988dc4be1d4911fcc4134a554.png ../_images/59654256d7c798df6d25e13a8da50daea9801f0bc16b7791a0ac483ed41a76cd.png ../_images/400dc0899069ac9f27c76ebc50dc69ab53ea2b944ad9d34a8586e8479256b0d0.png ../_images/21cdc6755894a72e7070d80887adfaf351c43502eee09a72221f01acd4971edc.png ../_images/cf70d403b25a44005d1046e8a43e8b763f532fcc9f8d49dd443e271936bb2daf.png ../_images/edc727ad72dd8eb8a94cdc5857a3da5de04d4e59fdb937489c3670da45c4ed9d.png ../_images/13c77eabc42a0a062fdb02fe0cfaecab1987b7508c3acc4d53d2f1ac190ae352.png ../_images/8a2671280b2ca180aa1f4d7ca69770198dae7b42c1f7628121f524445aa12be9.png
<ipython-input-100-ccc587c35145>:13: UserWarning: Dataset has 0 variance; skipping density estimate. Pass `warn_singular=False` to disable this warning.
  sns.kdeplot(data2[data2['cluster'] == cluster][col], label=f'Cluster {cluster}')
<ipython-input-100-ccc587c35145>:13: UserWarning: Dataset has 0 variance; skipping density estimate. Pass `warn_singular=False` to disable this warning.
  sns.kdeplot(data2[data2['cluster'] == cluster][col], label=f'Cluster {cluster}')
<ipython-input-100-ccc587c35145>:13: UserWarning: Dataset has 0 variance; skipping density estimate. Pass `warn_singular=False` to disable this warning.
  sns.kdeplot(data2[data2['cluster'] == cluster][col], label=f'Cluster {cluster}')
WARNING:matplotlib.legend:No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
../_images/21c3ee7ac616fc9c66c4c8cee4f4698be5379384ca6c1778481279ec680f2895.png