Monday, March 23, 2020

Beginning Deep Learning Working With the Reuters Dataset

This code is part of my deep learning journey and, as always, is being placed here so I can revisit it as I continue to expand my learning on this topic.

#!/usr/bin/env python3


'''
 Beginning my deep learning journey -
 This part of the journey focuses on the single-label, multiclass classification problem.
 Learning to classify the Reuters newswire dataset into 46 mutually exclusive topics
 based on the text content


 File: dlReuters.py
 Author: Nik Alleyne
 Author Blog: www.securitynik.com
 Date: 2020-02-04
'''

import numpy as np
from keras import (models, layers)
from keras.datasets import reuters
from keras.utils.np_utils import to_categorical
from matplotlib import pyplot as plt
from keras.utils.vis_utils import plot_model


# Function used to vectorize the data, turning each sequence of word indices into a multi-hot vector of 1s and 0s
def vectorize_data(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1
    return results
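# Quick sanity check (a minimal sketch) of what vectorize_data produces:
# vectorize_data([[0, 2]], dimension=5) -> array([[1., 0., 1., 0., 0.]])
# Each row is a multi-hot vector with a 1 at every word index that appears
# in the sequence, no matter how many times it appears.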


def main():
    '''
    Split the dataset into training and testing,
    using the 10,000 most frequently occurring words
    '''
    (X_train, y_train), (X_test, y_test) = reuters.load_data(num_words=10000)

    # Get the shape of the training and testing data
    print('\n[*] X_train shape: {}'.format(X_train.shape))
    print('[*] y_train shape: {}'.format(y_train.shape))
    print('[*] X_test shape: {}'.format(X_test.shape))
    print('[*] y_test shape: {}'.format(y_test.shape))

    # Each sample record is a list of integers
    print('[*] Sample record from X_train: \n{}'.format(X_train[0]))

    # Vectorize the training and testing data
    X_train = vectorize_data(X_train).astype('float32')
    X_test = vectorize_data(X_test).astype('float32')

    # One Hot Encode the labels
    y_train = to_categorical(y_train)
    y_test = to_categorical(y_test)
    print('[*] Sample y_train data after one hot encoding \n{}'.format(y_train))
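    # Small illustration (sketch) of what to_categorical does:
    # to_categorical([0, 2], num_classes=3) -> [[1., 0., 0.],
    #                                           [0., 0., 1.]]
    # Here, each label becomes a 46-dimensional vector with a 1 at the label's index.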

    # Create a validation set
    X_train_val = X_train[:1000]
    y_train_val = y_train[:1000]

    X_train_partial = X_train[1000:]
    y_train_partial = y_train[1000:]


    # Build the 3-layer neural network
    model = models.Sequential()
    model.add(layers.Dense(64, activation='relu', input_shape=(10000,)))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(46, activation='softmax'))
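    # Architecture note (my understanding, following the Deep Learning with
    # Python book): the final layer has 46 units because there are 46 topics,
    # and softmax turns the outputs into a probability distribution over those
    # topics, which pairs with the categorical_crossentropy loss used below.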

    # Compile the model
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
    
    # Fit the model on the partial training data, validating against the held-out set
    # (fitting on the full X_train would leak the validation samples into training)
    nn_history = model.fit(X_train_partial, y_train_partial, epochs=20, batch_size=512, validation_data=(X_train_val, y_train_val))
    print('\n[*] Here is the content of nn_history.history \n{}'.format(nn_history.history))

    # Plot the loss and accuracy for the training and validation sets
    val_loss = nn_history.history['val_loss']
    train_loss = nn_history.history['loss']
    
    print('[*] Length of validation loss is {}'.format(len(val_loss)))
    print('[*] Length of training loss is {}'.format(len(train_loss)))

    val_acc = nn_history.history['val_acc']
    train_acc = nn_history.history['acc']
        
    epochs = range(1, len(val_loss) + 1)
    plt.figure(figsize=(8,8))
    plt.plot(epochs, val_loss, color='green', marker='+', label='Validation Loss', linestyle='dashed', linewidth=2, markersize=15)
    plt.plot(epochs, train_loss, color='red', marker='.', label='Training Loss', linestyle='dashed', linewidth=2, markersize=15)
    plt.plot(epochs, val_acc, color='blue', marker='*', label='Validation Accuracy', linestyle='dashed', linewidth=2, markersize=15)
    plt.plot(epochs, train_acc, color='orange', marker='d', label='Training Accuracy', linestyle='dashed', linewidth=2, markersize=15)
    plt.xlabel('Epoch')
    plt.ylabel('Loss/Accuracy')
    plt.title('Training/Validation Loss and Accuracy')
    plt.legend()
    plt.show()

    plt.close('all')
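    # Reading the plot (per the Deep Learning with Python book): validation loss
    # typically stops improving after roughly nine epochs, the point at which
    # the network begins to overfit the training data.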

    # Evaluate the test data
    results = model.evaluate(X_test, y_test)
    print('[*] Loss on the test data is: {}'.format(results[0]))
    print('[*] Accuracy on the test data is: {}'.format(results[1]))
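    # For context: with 46 classes, a naive baseline is useful. Per the Deep
    # Learning with Python book, randomly shuffled labels score only around 19%
    # on this dataset, so accuracy well above that means the model learned something.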

    # Perform predictions on the test set
    reuters_predict = model.predict(X_test)
    print('[*] Here are your predictions \n{}'.format(reuters_predict))

    # model.summary() prints the summary itself and returns None,
    # so call it directly rather than passing it to format()
    print('[*] Here is the model summary')
    model.summary()

    print('[*] Here is the shape of a single prediction: {}'.format(reuters_predict[0].shape))
    
    # Visualize the model via a graph
    plot_model(model, to_file='/tmp/reuters-model.png', show_shapes=True, show_layer_names=True)

    print('[*] Predicted topic for the first test sample (index of the largest prediction entry): {}'.format(np.argmax(reuters_predict[0])))
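    # Since the final layer is softmax, each prediction row is a probability
    # distribution over the 46 topics and should sum to ~1.0 (quick sanity check):
    print('[*] Sum of first prediction (should be ~1.0): {}'.format(np.sum(reuters_predict[0])))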



if __name__ == '__main__':
    main()


'''
References:
https://www.manning.com/books/deep-learning-with-python
https://matplotlib.org/3.1.0/api/_as_gen/matplotlib.pyplot.plot.html#matplotlib.pyplot.plot
https://matplotlib.org/3.1.0/api/markers_api.html#module-matplotlib.markers
'''
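As a small follow-up, here is a sketch (using the standard Keras API, separate from the script above) showing how to decode one of the integer-encoded newswires back into readable text. Each newswire is stored as a list of word indices, and reuters.get_word_index() returns the word-to-index mapping; indices 0, 1 and 2 are reserved for padding, start-of-sequence and unknown, so the lookup is offset by 3.

from keras.datasets import reuters

(X_train, y_train), (X_test, y_test) = reuters.load_data(num_words=10000)

# Build the index -> word mapping by inverting word -> index
word_index = reuters.get_word_index()
reverse_word_index = {value: key for (key, value) in word_index.items()}

# Offset by 3 because 0, 1 and 2 are reserved indices; '?' for unknown words
decoded_newswire = ' '.join(reverse_word_index.get(i - 3, '?') for i in X_train[0])
print('[*] Decoded newswire: \n{}'.format(decoded_newswire))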
