Monday, March 23, 2020

Beginning Machine Learning - Detecting Fraudulent Transactions

This code is all part of my deep learning journey and, as always, is being placed here so I can revisit it as I continue to expand my learning of this topic.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env python3

'''
Continuing my machine learning journey

Author: Nik Alleyne
Author Blog: www.securitynik.com
file: payment_fraud.csv

'''

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix)


def _parse_feature_values(raw, expected_count):
    """Convert a comma-separated string into a list of floats.

    Args:
        raw: Text typed by the user, e.g. ``'1, 2, 3.5, 4, 5'``.
        expected_count: Number of values the classifier was fitted on.

    Returns:
        The parsed values, in the order they were entered.

    Raises:
        ValueError: If any value is not numeric or the number of values
            differs from ``expected_count``.
    """
    # float() rather than int(): int('4.75') raises, so fractional feature
    # values could never be entered with the previous parsing.
    values = [float(token) for token in raw.split(',')]
    if len(values) != expected_count:
        raise ValueError(
            'expected {} values but got {}'.format(expected_count, len(values))
        )
    return values


def main():
    """Train a logistic-regression fraud classifier and score one user record.

    Reads ``./payment_fraud.csv``, encodes the ``paymentMethod`` column as
    integers, splits the rows into train/test sets, fits a
    ``LogisticRegression`` model, prints accuracy, confusion matrix and
    classification report for the test set, then prompts on stdin for one
    comma-separated feature vector and prints its predicted label and class
    probabilities.

    Raises:
        ValueError: If ``paymentMethod`` contains a value that has no
            numeric code.
        SystemExit: If the feature values typed by the user cannot be parsed.
    """
    train_test_size = 0.33
    payment_codes = {'paypal': 0, 'creditcard': 1, 'storecredit': 2}

    # import dataset
    # NOTE: the read_csv ``verbose`` flag is deprecated in recent pandas
    # releases, so it is no longer passed here.
    fraud_df = pd.read_csv('./payment_fraud.csv')
    print('[*] First 10 records \n{}'.format(fraud_df.head(10)))
    print('\n[*] Here is another 10 sample records \n{}'.format(fraud_df.sample(10)))

    print('\n[*] Converting the payment method from categorical variable to numeric ...')
    # Fail early with a clear message instead of letting an unmapped string
    # (or a silent NaN) reach the classifier.
    unknown_methods = set(fraud_df['paymentMethod'].dropna().unique()) - set(payment_codes)
    if unknown_methods:
        raise ValueError(
            'Unrecognised paymentMethod value(s): {}'.format(sorted(unknown_methods))
        )
    # Assign the result back to the frame: an in-place replace on a column
    # obtained via attribute access is a chained operation that is not
    # guaranteed to modify ``fraud_df`` under pandas Copy-on-Write.
    fraud_df['paymentMethod'] = fraud_df['paymentMethod'].map(payment_codes)
    print('[*] Here is another 10 sample records after conversion \n{}'.format(fraud_df.sample(10)))

    # Create the X axis (every column except the target)
    X_axis = fraud_df.drop('label', axis=1)
    print('[*] Sample 10 records from the X_axis \n{}'.format(X_axis.sample(10)))

    # Create the Y axis (the target column)
    y_axis = fraud_df['label']
    print('\n[*] Sample 10 records from the y_axis \n{}'.format(y_axis.sample(10)))
    print('[*] Shape of the X_axis is: {}'.format(X_axis.shape))
    print('[*] Shape of the y_axis is: {}'.format(y_axis.shape))

    # Split the data into training and testing set
    print('[*] Splitting the data. Testing size is: {}%'.format(train_test_size*100))
    X_train, X_test, y_train, y_test = train_test_split(
        X_axis, y_axis, test_size=train_test_size, shuffle=True
    )
    print('\n[*] Shape of the X_train is: {}'.format(X_train.shape))
    print('[*] Shape of the X_test is: {}'.format(X_test.shape))
    print('\n[*] Shape of the y_train is: {}'.format(y_train.shape))
    print('[*] Shape of the y_test is: {}'.format(y_test.shape))

    # First use logistic regression classifier.
    # The solver was set to 'lbfgs' because of a warning which was
    # produced when the classifier was created without it.
    lr_clf = LogisticRegression(solver='lbfgs')
    lr_clf.fit(X_train, y_train)
    print('\n[*] Here is the lr_clf aftering being fitted \n{}'.format(lr_clf))

    # Making a prediction on the test data
    predict_fraud = lr_clf.predict(X_test)
    print('\n[*] Here are your prediction on possible fraudlent transactions \n{}'.format(list(predict_fraud)))

    # Test the accuracy. All three metrics take (y_true, y_pred) in that order.
    print('\n[*] Accuracy Score: {}'.format(accuracy_score(y_test, predict_fraud)))
    print('[*] Confusion Matrix: \n{}'.format(confusion_matrix(y_test, predict_fraud)))
    print('[*] Classification report on your prediction \n{}'.format(classification_report(y_test, predict_fraud)))

    # Let's make a prediction on a user's input
    feature_count = X_axis.shape[1]
    print('[*] Enter your {} feature values as comma separated'.format(feature_count))
    raw_input_text = input('[*] Example: 1,2,3,4,5: ')
    try:
        user_input = _parse_feature_values(raw_input_text, feature_count)
    except ValueError as err:
        raise SystemExit('[!] Invalid input: {}'.format(err))
    print('[*] You Entered: {}'.format(user_input))

    # Wrap the values in a frame that carries the training column names so
    # the classifier receives input shaped like what it was fitted on.
    user_record = pd.DataFrame([user_input], columns=X_axis.columns)
    print('[*] Here is your label:{} \n 0-Not Fraud \n 1-Fraud'.format(lr_clf.predict(user_record)))
    print('[*] Here is the probability score: {}'.format(lr_clf.predict_proba(user_record)))


# Run the walkthrough only when this file is executed as a script, so that
# importing the module does not trigger file reads or the input prompt.
if __name__ == '__main__':
    main()


'''
Reference:
https://raw.githubusercontent.com/oreilly-mlsec/book-resources/master/chapter2/datasets/payment_fraud.csv
https://www.amazon.com/Machine-Learning-Security-Protecting-Algorithms-dp-1491979909/dp/1491979909/
https://stackoverflow.com/questions/23307301/replacing-column-values-in-a-pandas-dataframe
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html

'''


No comments:

Post a Comment