#!/usr/bin/env python3

'''
    Continuing my journey learning about machine learning
    This code is focused on learning about Clustering and KMeans
    with a specific focus on the Elbow Method

    Author: Nik Alleyne
    Author Blog: www.securitynik.com
    Filename: KMeans-pkts-elbow.py
'''

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot as plt


# CSV read by main() when no other path is supplied
DEFAULT_CSV_PATH = '/tmp/FeaturePktEnginFinal.csv'

# Columns that get scaled and clustered: x axis first, y axis second
FEATURE_COLUMNS = ['tcpdport', 'tcplen']

# Scatter colour for cluster label 0, 1, 2 (reused cyclically beyond that)
CLUSTER_COLOURS = ('green', 'red', 'blue')

# Candidate K values evaluated by the Elbow Method
ELBOW_K_VALUES = range(1, 10)


def _cluster_and_plot(pkt_df, n_clusters):
    '''
    Fit KMeans with n_clusters on the feature columns, store the labels in
    pkt_df['km_scaled_predict'] (overwriting any previous labels) and show
    a scatter plot of every cluster together with the centroids.

    pkt_df: DataFrame holding the (already scaled) FEATURE_COLUMNS
    n_clusters: number of clusters to fit and to plot
    '''
    km_scaled = KMeans(n_clusters=n_clusters)
    km_scaled_predict = km_scaled.fit_predict(pkt_df[FEATURE_COLUMNS])
    print('[*] Here are your new clusters \n{}'.format(km_scaled_predict))
    print('\n[*] Once again, here are your cluster centers \n{}'.format(km_scaled.cluster_centers_))

    # Add km_scaled_predict as a new column
    pkt_df['km_scaled_predict'] = km_scaled_predict
    print('[*] Here is the new data \n{}'.format(pkt_df))

    # One scatter series per label that can actually occur. The earlier
    # copy-pasted version always plotted labels 0, 1 and 2, so the K=2 plot
    # carried an always-empty third series.
    for label in range(n_clusters):
        members = pkt_df[pkt_df.km_scaled_predict == label]
        colour = CLUSTER_COLOURS[label % len(CLUSTER_COLOURS)]
        plt.scatter(members.tcpdport, members['tcplen'], color=colour)

    # Add the centroids to the scatter plot
    #   all rows, first column  [:, 0] -> tcpdport
    #   all rows, second column [:, 1] -> tcplen
    centroids = km_scaled.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1], color='black', marker='*', label='centroid')

    plt.xlabel('TCP Destination Port')
    plt.ylabel('TCP Packet Length')
    plt.legend()
    plt.show()


def _plot_elbow(pkt_df, k_values=ELBOW_K_VALUES):
    '''
    Fit one KMeans model per value in k_values, print the resulting
    Sum of Squared Errors and show the elbow graph (K against SSE).

    pkt_df: DataFrame holding the (already scaled) FEATURE_COLUMNS
    k_values: iterable of cluster counts to try
    '''
    k_values = list(k_values)

    sum_of_sqr_err = []
    for k in k_values:
        km_clf = KMeans(n_clusters=k)
        km_clf.fit(pkt_df[FEATURE_COLUMNS])

        # Inertia is the Sum Of Squares Error (SSE)
        sum_of_sqr_err.append(km_clf.inertia_)

    # Print the SSE values
    print('\n[*] Here are your SSE Values \n{}'.format(sum_of_sqr_err))

    # plot the elbow graph
    plt.xlabel('K')
    plt.ylabel('Sum of Squared')
    plt.plot(k_values, sum_of_sqr_err)
    plt.show()


def main(csv_path=DEFAULT_CSV_PATH):
    '''
    Load the packet feature CSV, min-max scale the TCP destination port and
    TCP length columns, cluster them with K=3, draw the elbow graph for
    K in ELBOW_K_VALUES and finally cluster again with K=2.

    csv_path: CSV file to read; it must provide the 'tcpdport' and 'tcplen'
              columns. Defaults to DEFAULT_CSV_PATH.
    '''
    # read_csv's verbose= keyword is no longer passed: pandas deprecated it
    # in 2.2, and it only produced extra parser output.
    pkt_df = pd.read_csv(csv_path)
    print(pkt_df.columns)

    # Looking for null data - per-column counts, because printing the full
    # boolean frame gets truncated and cannot show whether NULLs exist
    print('[*] Looking for NULL data \n{}'.format(pkt_df.isnull().sum()))

    # Filling NULL data
    pkt_df.fillna(0, inplace=True)

    # Leveraging MinMax Scaler to attempt to give the data better representation
    min_max_scaler = MinMaxScaler()
    print('[*] Here is your MinMax Scaler information \n{}'.format(min_max_scaler))

    # Scaling the destination ports and tcp length
    # (fit_transform refits the scaler, so each column is scaled on its own)
    for column in FEATURE_COLUMNS:
        pkt_df[column] = min_max_scaler.fit_transform(pkt_df[[column]])

    print('\n[*] Here the Min Max Scaled TCP Length \n{}'.format(pkt_df['tcplen']))
    print('\n[*] Here the Min Max Scaled TCP Destination Ports \n{}'.format(pkt_df['tcpdport']))

    # Setup a new KMeans classifier to work on the scaled data
    _cluster_and_plot(pkt_df, n_clusters=3)

    # Finding the optimal K value using Elbow Method
    _plot_elbow(pkt_df)

    # Ploting new clusters with K=2
    _cluster_and_plot(pkt_df, n_clusters=2)
    plt.close('all')


if __name__ == '__main__':
    main()


'''
References:
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop.html
https://dev.to/nexttech/k-means-clustering-with-scikit-learn-14kk
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.fillna.html
https://www.youtube.com/watch?v=ZueoXMgCd1c
https://www.youtube.com/watch?v=EItlUEPCIzM
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
'''
dad
No comments:
Post a Comment