#!/usr/bin/env python3

'''
Beginning my deep learning journey - this part of the journey focuses on the
binary (two-class) classification problem: learning to classify the IMDB
dataset into positive and negative reviews based on the text content.

File: dlIMDB.py
Author: Nik Alleyne
Author Blog: www.securitynik.com
Date: 2020-01-31
'''

import numpy as np
from keras.datasets import imdb
from keras import (optimizers, layers, models)
from matplotlib import pyplot as plt
from keras.utils.vis_utils import plot_model


# Function used to vectorize the data, turning each review into a vector of 1s and 0s
def vectorize_data(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1
    return results


def main():
    '''
    Split the data into training and testing sets.
    Use the top 10,000 most frequently seen words and discard the words seen least often.
    '''
    (X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000, maxlen=None)

    # Looking for the total number of words in the entire set
    total_words = len(np.unique(np.hstack(X_train))) + len(np.unique(np.hstack(X_test)))
    print('[*] Total words in the dataset: {}'.format(total_words))

    # Taking a look at the data
    print('[*] X_train sample data:\n{}'.format(X_train[5]))
    print('\n[*] y_train sample data: {}'.format(y_train[5]))
    print('\n[*] X_test sample data:\n{}'.format(X_test[5]))
    print('\n[*] y_test sample data: {}'.format(y_test[5]))

    # Get the shape of both the training and testing sets
    print('\n[*] X_train shape: {}'.format(X_train.shape))
    print('[*] y_train shape: {}'.format(y_train.shape))
    print('[*] X_test shape: {}'.format(X_test.shape))
    print('[*] y_test shape: {}'.format(y_test.shape))

    '''
    Encode both the training and testing data so they can be fed to the
    neural network. First vectorize the training and testing data.
    '''
    X_train = vectorize_data(X_train).astype('float32')
    X_test = vectorize_data(X_test).astype('float32')
    print('\n[*] Encoded X_train data: \n {}'.format(X_train))

    # Convert the labels to NumPy arrays
    y_train = np.asarray(y_train).astype('float32')
    y_test = np.asarray(y_test).astype('float32')

    # Create a validation set of 10,000 records from the training set
    X_train_val = X_train[:10000]
    y_train_val = y_train[:10000]

    '''
    Create a partial training set from the remainder of X_train
    after the validation records have been extracted.
    '''
    X_train_partial = X_train[10000:]
    y_train_partial = y_train[10000:]

    print('\n[*] Here are your unique classes: {}'.format(np.unique(y_train)))

    # Build the neural network
    model = models.Sequential()
    model.add(layers.Dense(64, activation='relu', input_shape=(10000,)))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))

    # Compile the model
    model.compile(optimizer=optimizers.RMSprop(lr=0.001), loss='binary_crossentropy', metrics=['accuracy'])

    '''
    Fit the model and validate it against the validation data.
    model.fit() returns a History object whose .history member tracks the
    training accuracy & loss as well as the validation accuracy & loss;
    store it in the nn_history variable.
    '''
    nn_history = model.fit(X_train_partial, y_train_partial, epochs=3, batch_size=512, validation_data=(X_train_val, y_train_val))
    print('\n[*] Here are the results from nn_history\n{}'.format(nn_history.history))
    print('\n[*] Here are the keys for nn_history.history: {}'.format(nn_history.history.keys()))

    # Perform an evaluation of the model on the test data
    results = model.evaluate(X_test, y_test, verbose=1)
    print('[*] Here is the loss for the evaluated model: {}'.format(results[0]))
    print('[*] Here is the accuracy for the evaluated model: {}'.format(results[1]))

    # Plotting the training and validation loss
    nn_history_loss = nn_history.history['loss']
    nn_history_val_loss = nn_history.history['val_loss']
    epochs = range(1, len(nn_history_loss) + 1)
    plt.plot(epochs, nn_history_loss, 'bo', label='Training Loss')
    plt.plot(epochs, nn_history_val_loss, 'b', label='Validation Loss')
    plt.title('Training vs Validation Loss')
    plt.xlabel('epochs')
    plt.ylabel('loss')
    plt.legend()
    plt.show()

    '''
    Quite interesting for me: as the training loss decreased, the validation loss increased.
    '''

    # Plotting the training and validation accuracy
    nn_history_accuracy = nn_history.history['accuracy']
    nn_history_val_accuracy = nn_history.history['val_accuracy']
    plt.clf()
    plt.plot(epochs, nn_history_accuracy, 'bo', label='Training Accuracy')
    plt.plot(epochs, nn_history_val_accuracy, 'b', label='Validation Accuracy')
    plt.title('Training vs Validation Accuracy')
    plt.xlabel('epochs')
    plt.ylabel('accuracy')
    plt.legend()
    plt.show()
    plt.close('all')

    '''
    Another finding was that as the training accuracy increased, the validation
    accuracy basically flatlined. These findings tie back into overfitting: the
    model is doing well on the training data but poorly on the validation data.
    The fact that a model performs well on training data does not mean it will
    perform well on data it has never seen before. I adjusted the number of
    epochs a few times and it seems 3 might be the sweet spot for my example.
    '''

    # Time to make predictions with the trained model
    predict_sentiment = model.predict(X_test)
    print('[*] Here are your sentiments for the testing data \n{}'.format(predict_sentiment))

    print('\n[*] Model Summary Information \n{}'.format(model.summary()))

    # Create a visual plot of the model
    plot_model(model, to_file='/tmp/model.png', show_shapes=True, show_layer_names=True)


if __name__ == '__main__':
    main()

'''
References:
https://www.manning.com/books/deep-learning-with-python
https://keras.io/getting-started/sequential-model-guide/
https://keras.io/optimizers/
https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification
https://machinelearningmastery.com/predict-sentiment-movie-reviews-using-deep-learning/
https://towardsdatascience.com/machine-learning-word-embedding-sentiment-classification-using-keras-b83c28087456
https://www.stackabuse.com/python-for-nlp-movie-sentiment-analysis-using-deep-learning-in-keras/
https://machinelearningmastery.com/visualize-deep-learning-neural-network-model-keras/
https://machinelearningmastery.com/tutorial-first-neural-network-python-keras/
'''
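If you are curious what the integer-encoded reviews printed above actually say, here is a minimal sketch (not part of the original script) that decodes one review back into words with imdb.get_word_index(). It assumes the usual Keras IMDB convention that the first three indices are reserved, so word indices are offset by 3; any index that cannot be resolved is shown as '?'.

#!/usr/bin/env python3
# Hypothetical helper sketch: decode an integer-encoded IMDB review back to text.
# Assumption: standard Keras IMDB offsets (0 = padding, 1 = start, 2 = unknown).
from keras.datasets import imdb

(X_train, y_train), _ = imdb.load_data(num_words=10000)

# word -> index mapping supplied by Keras
word_index = imdb.get_word_index()

# invert it to index -> word
reverse_word_index = {index: word for word, index in word_index.items()}

# subtract 3 to undo the reserved-index offset; unknown indices become '?'
decoded_review = ' '.join(reverse_word_index.get(i - 3, '?') for i in X_train[5])

print('[*] Decoded review #5:\n{}'.format(decoded_review))
print('[*] Label (1 = positive, 0 = negative): {}'.format(y_train[5]))

Reading a decoded review next to its label is a quick sanity check that the multi-hot vectors fed to the network really do correspond to positive and negative movie reviews.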
Monday, March 23, 2020
Beginning Deep Learning with the IMDB Dataset
This code is all part of my deep learning journey and, as always, is being placed here so I can revisit it as I continue to expand my learning of this topic.