Learning by practicing: Learning about KMeans and the Elbow Curve

Monday, March 23, 2020
Learning about KMeans and the Elbow Curve

This code is part of my deep learning journey and, as always, is being placed here so I can revisit it as I continue to expand my learning of this topic.
#!/usr/bin/env python3

'''
Continuing my journey learning about machine learning
This code is focused on learning about Clustering and KMeans with a specific focus on the Elbow Method
Author: Nik Alleyne
Author Blog: www.securitynik.com
Filename: KMeans-pkts-elbow.py
'''




import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot as plt



def _plot_clusters(pkt_df, km_model, label_col='km_scaled_predict'):
    '''
    Scatter plot the scaled packets, one colour per cluster, plus the centroids.

    pkt_df    -- DataFrame holding 'tcpdport', 'tcplen' and the label column
    km_model  -- fitted KMeans model; supplies n_clusters and cluster_centers_
    label_col -- name of the column holding each row's cluster label
    '''
    # Only as many groups as the model really has are drawn, so a K=2 model
    # no longer plots an always-empty "cluster 2" frame.
    colors = ('green', 'red', 'blue')
    for cluster_id, color in zip(range(km_model.n_clusters), colors):
        members = pkt_df[pkt_df[label_col] == cluster_id]
        plt.scatter(members['tcpdport'], members['tcplen'], color=color)

    # Centroids: all rows, first column [:, 0] (port) and second column [:, 1] (length)
    centers = km_model.cluster_centers_
    plt.scatter(centers[:, 0], centers[:, 1], color='black', marker='*', label='centroid')

    plt.xlabel('TCP Destination Port')
    plt.ylabel('TCP Packet Length')
    plt.legend()
    plt.show()


def _cluster_and_plot(pkt_df, n_clusters, random_state=None):
    '''
    Fit KMeans with n_clusters on the scaled port/length columns, store the
    labels in pkt_df['km_scaled_predict'] (overwriting any previous labels),
    print the results and show the scatter plot.
    '''
    # n_init is pinned to 10 (the historical default) so results do not
    # silently change with the installed scikit-learn version.
    km_scaled = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
    km_scaled_predict = km_scaled.fit_predict(pkt_df[['tcpdport', 'tcplen']])
    print('[*] Here are your new clusters \n{}'.format(km_scaled_predict))
    print('\n[*] Once again, here are your cluster centers \n{}'.format(km_scaled.cluster_centers_))

    # Add km_scaled_predict as a new column
    pkt_df['km_scaled_predict'] = km_scaled_predict
    print('[*] Here is the new data \n{}'.format(pkt_df))

    _plot_clusters(pkt_df, km_scaled)


def main(csv_path='/tmp/FeaturePktEnginFinal.csv', random_state=None):
    '''
    Cluster packet data with KMeans and use the Elbow Method to choose K.

    Steps: load the CSV, replace NULLs with 0, MinMax-scale 'tcpdport' and
    'tcplen', cluster and plot with K=3, plot the SSE (inertia) for K=1..9,
    then cluster and plot again with K=2.

    csv_path     -- CSV file to read; must contain 'tcpdport' and 'tcplen' columns
    random_state -- optional seed passed to every KMeans for repeatable runs
    '''
    # The 'verbose' keyword of read_csv is deprecated in recent pandas
    # releases, so it is no longer passed.
    pkt_df = pd.read_csv(csv_path)
    print(pkt_df.columns)

    # Looking for null data
    print('[*] Looking for NULL data \n{}'.format(pkt_df.isnull()))

    # Filling NULL data
    pkt_df.fillna(0, inplace=True)

    # Leveraging MinMax Scaler to attempt to give the data better representation
    min_max_scaler = MinMaxScaler()
    print('[*] Here is your MinMax Scaler information \n{}'.format(min_max_scaler))

    # Scaling the destination ports and tcp length (the scaler is refit per column)
    pkt_df['tcpdport'] = min_max_scaler.fit_transform(pkt_df[['tcpdport']])
    pkt_df['tcplen'] = min_max_scaler.fit_transform(pkt_df[['tcplen']])
    print('\n[*] Here the Min Max Scaled TCP Length \n{}'.format(pkt_df['tcplen']))
    print('\n[*] Here the Min Max Scaled TCP Destination Ports \n{}'.format(pkt_df['tcpdport']))

    # First pass: cluster the scaled data with K=3 and plot it
    _cluster_and_plot(pkt_df, n_clusters=3, random_state=random_state)

    # Finding the optimal K value using Elbow Method
    k_values = range(1, 10)
    sum_of_sqr_err = []
    for k in k_values:
        km_clf = KMeans(n_clusters=k, n_init=10, random_state=random_state)
        km_clf.fit(pkt_df[['tcplen', 'tcpdport']])

        # Inertia is the Sum Of Squares Error (SSE)
        sum_of_sqr_err.append(km_clf.inertia_)

    # Print the SSE values
    print('\n[*] Here are your SSE Values \n{}'.format(sum_of_sqr_err))

    # Plot the elbow graph
    plt.xlabel('K')
    plt.ylabel('Sum of Squared Errors (SSE)')
    plt.plot(k_values, sum_of_sqr_err)
    plt.show()

    # Plotting new clusters with K=2
    _cluster_and_plot(pkt_df, n_clusters=2, random_state=random_state)

    plt.close('all')


    

# Run the analysis only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()


'''
References:
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop.html
https://dev.to/nexttech/k-means-clustering-with-scikit-learn-14kk
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.fillna.html
https://www.youtube.com/watch?v=ZueoXMgCd1c
https://www.youtube.com/watch?v=EItlUEPCIzM
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
'''
dad
Learning by practicing

Monday, March 23, 2020

Learning about KMeans and the Elbow Curve

No comments:

Post a Comment