1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
| #!/usr/bin/env python3
'''
This code is me learning about Pandas Data Science.
This is based on the Pandas for Data Science training from Pentester Academy
I decided to do things from my own perspective to some extent
I drove around the neighbourhood and captured the Wi-Fi information, so that
I can get my own perspective from my own data
Feel free to use this code as you see fit
Author: Nik Alleyne
Author Blog: www.securitynik.com
'''
from io import StringIO
import netaddr
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import subprocess as sp
import sys
def usage():
print('[*] Usage Information: ')
print('[*] ./pandasWi-Fi.py <filename>. e.g. ./pandas-Wi-Fi.py my.csv')
print('[*] Author: Nik Alleyne')
print('[*] Author Blog: www.securitynik.com')
sys.exit(-1)
def wifi_data_analysis(csv_file):
print('[*] Opening the csv file ... ')
wifi_data = open(csv_file, 'r').read()
'''
Need to split the data into two sections before creating the Pandas dataframe
As can be seen below, there are two headers sections. One of these is for the Access Point
and the other is for the client
root@securitynik:~/PA-Pandas# cat securitynik-Wi-Fi-Test-01.csv | grep -i BSSID
BSSID, First time seen, Last time seen, channel, Speed, Privacy, Cipher, Authentication, Power, # beacons, # IV, LAN IP, ID-length, ESSID, Key
Station MAC, First time seen, Last time seen, Power, # packets, BSSID, Probed ESSIDs
'''
print('[+] Splitting the data into a AP and client section')
client_header = 'Station MAC, First time seen, Last time seen, Power, # packets, BSSID, Probed ESSIDs'
ap_client_split = wifi_data.index(client_header)
# Get AP section
wifi_ap_data = StringIO(wifi_data[:ap_client_split])
# Get Client section
wifi_client_data = StringIO(wifi_data[ap_client_split:])
'''
This was a pain in the ass. Kept getting errors when attempting to create the DataFrame.
Fortunately, this link helped to solve the problem
https://stackoverflow.com/questions/18039057/python-pandas-error-tokenizing-data
'''
print('[+] Creating Access Point DataFrame ...')
access_point_df = pd.read_csv(wifi_ap_data, sep=',', header=0, skipinitialspace=True, error_bad_lines=False, warn_bad_lines=False, parse_dates=['First time seen', 'Last time seen'])
print('\n[*] Access Point column information before conversion {}'.format(access_point_df.columns))
# My understanding is that we would be better off renaming those columns to something without space
access_point_df.rename(columns={ 'BSSID' : 'BSSID', 'First time seen' : 'FirstTimeSeen', 'Last time seen' : 'LastTimeSeen', 'channel' : 'channel', 'Speed' : 'Speed', 'Privacy' : 'Privacy', 'Cipher' : 'Cipher', 'Authentication' : 'Authentication', 'Power' : 'Power', '# beacons' : 'BeaconsCount', '# IV' : 'IV', 'LAN IP' : 'LAN-IP', 'ID-length' : 'ID-Length', 'ESSID' : 'ESSID', 'Key' : 'Key' }, inplace=True)
print('\n[*] Sample Access Point data \n {}'.format(access_point_df.head()))
print('\n[*] Getting over all count of Access Point Data \n {}'.format(access_point_df.count()))
print('\n[*] Overall you have {} rows and columns {} in the AP dataframe \n'.format(access_point_df.shape[0], access_point_df.shape[1]))
print('\n[*] Data types in the AP dataframe \n {}'.format(access_point_df.dtypes))
# Looking for the unique Access Point SSID
print('\n[*] Here are the ESSIDs found ... \n {}' .format(list(set(access_point_df.ESSID))))
# Get a count of the total unique SSIDs returned
print('\n[*] Total unique SSIDs returned was:{} \n' .format(len(list(set(access_point_df.ESSID)))))
# Looking for situatio where there is NAN
print('[*] Do we have any "nan" values \n {}'.format(access_point_df.ESSID.hasnans))
# Now that we see we have ESSID with NAN values, let's replace them
access_point_df.ESSID.fillna('HIDDEN ESSID', inplace=True)
# Let's now check again for those nan values
print('[*] Do we have any "nan" values \n {}'.format(access_point_df.ESSID.hasnans))
print('[*] First 10 records after the replacement of nans \n {}' .format(access_point_df.head()))
# Good stuff, we replaced all the nan values
# Looking at the frequency with which the SSIDs have been seen
print('[*] Frequency of the SSID seen \n {}'.format(access_point_df.ESSID.value_counts()))
# Plot the graph of the usage
access_point_df.ESSID.value_counts().plot(kind='pie', figsize=(10,5))
plt.show()
# Looking at the channels in use
print('\n[*] Frequency of the channels being seen \n {}'.format(access_point_df.channel.value_counts()))
access_point_df.channel.value_counts().plot(kind='bar', figsize=(10,5))
plt.show()
# Time now for some grouping
# first group by ESSID and the channels they are seen on
print('\n[*] Grouping by SSID and channel ... \n {}'.format(access_point_df.groupby(['ESSID', 'channel'])['channel'].count()))
# Looking at unstack
print('\n[*] Looking at unstacking ... \n {}'.format(access_point_df.groupby(['ESSID', 'channel'])['channel'].count().unstack()))
# The result above produced a number of channels with 'nan' values. Time to fill that with 0s
print('\n[*] Filled the NANs with 0 ... \n {}'.format(access_point_df.groupby(['ESSID', 'channel'])['channel'].count().unstack().fillna(0)))
# Create graph of the grouping information
access_point_df.groupby(['ESSID', 'channel'])['channel'].count().unstack().fillna(0).plot(kind='bar', stacked=True, figsize=(10,5)).legend(bbox_to_anchor=(1.1,1))
plt.show()
# Extract the OUI from the MAC address - basically the firs 3 bytes
oui_manufacturer = access_point_df.BSSID.str.extract('(..:..:..)', expand=False)
print('\n[*] Here is your top 10 manufacturers OUI \n {} '.format(oui_manufacturer.head(10)))
# Print the counts of each OUI
print('\n[*] Here is your manufacturers OUI with the count \n {} '.format(oui_manufacturer.value_counts()))
'''
Client information and analysis start from here
'''
print('*'*100)
print('[+] Creating Client DataFrame ...')
client_df = pd.read_csv(wifi_client_data, sep=',', header=0, skipinitialspace=True, error_bad_lines=False, warn_bad_lines=False, parse_dates=['First time seen', 'Last time seen'])
print('\n[*] Access Point column information before conversion {}'.format(client_df.columns))
# Once again, addressing the space issue between column names
client_df.rename(columns= {'Station MAC' : 'StationMAC', 'First time seen' : 'FirstTimeSeen', 'Last time seen' : 'LastTimeSeen', 'Power' : 'Power', '# packets' : 'PacketCount', 'BSSID' : 'BSSID', 'Probed ESSIDs' : 'ProbedESSIDs'}, inplace=True)
print('\n[*] Sample client data \n {}'.format(client_df.head()))
print('\n[*] Getting over all count of client Data \n {}'.format(client_df.count()))
print('\n[*] Overall you have {} rows and columns {} in the AP dataframe \n'.format(client_df.shape[0], client_df.shape[1]))
print('\n[*] Data types in the client dataframe \n {}'.format(client_df.dtypes))
# Taking a look at the client SSIDs
print('\n[*] Here are your client SSIDs \n {}'.format(client_df.BSSID.head()))
# Looking at the probd ESSIDS
print('\n[*] Here are the ESSIDs the clients are probing for ... \n {}'.format(client_df.ProbedESSIDs))
def main():
sp.call(['clear'])
sns.set_color_codes('dark')
# Checking the command line to ensure 1 argument is passed to the command
if (len(sys.argv) != 2 ):
usage()
else:
print('[*] Reading command line arguments ... ')
if (sys.argv[1].endswith('.csv')):
print('[*] Found a CSV file ... ')
else:
print('[!] File is not .csv file. Exiting!!')
sys.exit(-1)
# Reading the CSV file
wifi_data_analysis(sys.argv[1])
if __name__ == '__main__':
main()
|