Monday, March 23, 2020

Learning about KMeans and the Elbow Curve

This code is part of my deep learning journey and, as always, is being placed here so I can revisit it as I continue to expand my learning of this topic.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/usr/bin/env python3

'''
Continuing my journey learning about machine learning
This code is focused on learning about Clustering and KMeans with a specific focus on the Elbow Method
Author: Nik Alleyne
Author Blog: www.securitynik.com
Filename: KMeans-pkts-elbow.py
'''




import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot as plt



# Scatter colours, indexed by cluster label (0 -> green, 1 -> red, 2 -> blue).
_CLUSTER_COLORS = ('green', 'red', 'blue')


def _cluster_and_plot(pkt_df, n_clusters):
    '''
    Fit KMeans on the 'tcpdport' / 'tcplen' columns and scatter plot the clusters.

    The predicted label of every row is written to (or overwrites) the
    'km_scaled_predict' column of pkt_df, so the frame is modified in place.
    One scatter series is drawn per cluster label, followed by the centroids,
    and the figure is shown with plt.show().

    pkt_df: DataFrame holding the 'tcpdport' and 'tcplen' columns
    n_clusters: number of clusters (K) to fit and to plot
    '''
    km_scaled = KMeans(n_clusters=n_clusters)
    km_scaled_predict = km_scaled.fit_predict(pkt_df[['tcpdport', 'tcplen']])
    centroids = km_scaled.cluster_centers_
    print('[*] Here are your new clusters \n{}'.format(km_scaled_predict))
    print('\n[*] Once again, here are your cluster centers \n{}'.format(centroids))

    # Add km_scaled_predict as a new column
    pkt_df['km_scaled_predict'] = km_scaled_predict
    print('[*] Here is the new data \n{}'.format(pkt_df))

    # One scatter series per cluster. Driving this from n_clusters means no
    # series is built for a label the model can never produce.
    for label in range(n_clusters):
        members = pkt_df[pkt_df.km_scaled_predict == label]
        # Colours repeat if more clusters than colours are ever requested.
        color = _CLUSTER_COLORS[label % len(_CLUSTER_COLORS)]
        plt.scatter(members.tcpdport, members['tcplen'], color=color)

    # Centroids: column 0 is tcpdport (x axis), column 1 is tcplen (y axis),
    # matching the column order passed to fit_predict above.
    plt.scatter(centroids[:, 0], centroids[:, 1], color='black', marker='*', label='centroid')

    plt.xlabel('TCP Destination Port')
    plt.ylabel('TCP Packet Length')
    plt.legend()
    plt.show()


def _plot_elbow(pkt_df, k_values):
    '''
    Fit one KMeans model per K in k_values and plot K against inertia (SSE).

    pkt_df: DataFrame holding the 'tcplen' and 'tcpdport' columns
    k_values: re-iterable sequence of cluster counts to try (e.g. a range)
    '''
    # The feature selection does not change between iterations, so do it once.
    features = pkt_df[['tcplen', 'tcpdport']]

    # Inertia is the Sum Of Squares Error (SSE)
    sum_of_sqr_err = [KMeans(n_clusters=k).fit(features).inertia_ for k in k_values]

    # Print the SSE values
    print('\n[*] Here are your SSE Values \n{}'.format(sum_of_sqr_err))

    # plot the elbow graph
    plt.xlabel('K')
    plt.ylabel('Sum of Squared')
    plt.plot(k_values, sum_of_sqr_err)
    plt.show()


def main():
    '''
    Walk through KMeans clustering and the Elbow Method on packet features.

    Steps:
      1. Read /tmp/FeaturePktEnginFinal.csv and replace NULL values with 0.
      2. Min-max scale the 'tcpdport' and 'tcplen' columns.
      3. Cluster with K=3 and plot the result.
      4. Plot the elbow curve (SSE for K = 1..9).
      5. Cluster again with K=2 and plot the result.

    Takes no arguments and returns nothing; output is printed text and
    matplotlib windows.
    '''
    # NOTE: the 'verbose' keyword previously passed here only printed parser
    # diagnostics and is deprecated in recent pandas releases, so it is omitted.
    pkt_df = pd.read_csv('/tmp/FeaturePktEnginFinal.csv')
    print(pkt_df.columns)

    # Looking for null data
    print('[*] Looking for NULL data \n{}'.format(pkt_df.isnull()))

    # Filling NULL data
    pkt_df = pkt_df.fillna(0)

    # Leveraging MinMax Scaler to attempt to give the data better representation
    min_max_scaler = MinMaxScaler()
    print('[*] Here is your MinMax Scaler information \n{}'.format(min_max_scaler))

    # Scaling the destination ports and tcp length. The scaler is re-fitted
    # for each column, so each one is scaled against its own min and max.
    for column in ('tcpdport', 'tcplen'):
        pkt_df[column] = min_max_scaler.fit_transform(pkt_df[[column]])
    print('\n[*] Here the Min Max Scaled TCP Length \n{}'.format(pkt_df['tcplen']))
    print('\n[*] Here the Min Max Scaled TCP Destination Ports \n{}'.format(pkt_df['tcpdport']))

    # First pass: cluster the scaled data with K=3
    _cluster_and_plot(pkt_df, n_clusters=3)

    # Finding the optimal K value using Elbow Method
    _plot_elbow(pkt_df, range(1, 10))

    # Plotting new clusters with K=2
    _cluster_and_plot(pkt_df, n_clusters=2)

    plt.close('all')


    

# Run the walkthrough only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


'''
References:
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop.html
https://dev.to/nexttech/k-means-clustering-with-scikit-learn-14kk
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.fillna.html
https://www.youtube.com/watch?v=ZueoXMgCd1c
https://www.youtube.com/watch?v=EItlUEPCIzM
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
'''



dad

No comments:

Post a Comment