Introduction

Over the last few decades, machine vision has become an important part of human society. It has long been used in industrial fields such as manufacturing, but it is also becoming part of everyday life for individuals. Self-driving cars are a good example of machine vision applied for everyone's benefit, and the systems are now developed enough that these cars can drive long distances with very few accidents.

However, the development of this technology also poses a new threat to people, especially regarding privacy. Hundreds of cameras on the road can accurately identify specific individuals and track their movements and actions. Cars can also be identified automatically by their number plates, exposing information about individuals that could be used to harm them. In most of the world this is not happening yet because of ethical concerns, but some countries actively use the technology to monitor and control their citizens. It is important not only to learn about the technology, but also to understand what the right applications of machine vision are.

One remarkable thing about machine vision is that the development of many tools has made it much easier for individuals to use the technology. Even students like us can use tools such as PyTorch and TensorFlow to build a machine vision model that achieves a specific goal in a fairly short amount of time. For this project, we will focus on creating a model that can be helpful to people while avoiding the harms that can come with the technology.

Application of Machine Vision in Sign Languages

Creating a machine vision application that can correctly recognize sign language can help many deaf individuals. People who are born deaf often have problems with reading and writing because getting an education is harder for them. One study in Korea states that about 30% of deaf people in Korea are illiterate, which makes it hard for them to communicate with others and to put their ideas into written form.

An accurate machine vision model that can detect sign language and interpret it into speech or written words could greatly help people communicate with those who cannot hear. The stakeholders are people who want to communicate with deaf people, especially deaf people who cannot speak or write, as well as people who do not know sign language.

This is not easy, because building a model that accurately detects sign language is essentially building a program that translates between languages. Like any other translator, the model always carries the risk of translating a message incorrectly, which can lead to miscommunication. To prevent this, the model that recognizes sign language should be as accurate as possible to minimize errors.

With the skillset we have, it is almost impossible to create a machine vision model that will perfectly translate sign language into English. Considering that even Google has not managed to build a perfect translator, this is not a project that can be completed at the individual level. What we can do, however, is build a model that handles basic detection: sign language gestures to alphabet letters.

Using the Sign Language MNIST dataset (https://www.kaggle.com/datamunge/sign-language-mnist), we will build and improve a CNN (Convolutional Neural Network) machine vision model with TensorFlow that can accurately classify which sign a person is making. As stated earlier, our main goal is to make the model as accurate as possible to prevent the errors and miscommunication that could happen in real applications.

The dataset includes 27,455 training images of signs that represent alphabet letters, plus 7,172 test images for evaluating how well the model works after training. Both sets label the images from 0 (A) to 25 (Z) to cover all 26 letters. Each image is stored as a single row in a CSV file containing the 784 grayscale pixel values (0 to 255) needed for a 28 × 28 image, the size and value range typically used in MNIST-style applications. Unfortunately, the dataset has no samples for the letters J and Z, because signing them correctly involves motion, so labels 9 and 25 are missing. As this exception shows, most signs are not motionless, so the model is inherently limited even with high accuracy. Still, building a model that handles the very basics can help with the future creation of better models, or provide useful insight for improving an application.

A CNN is an effective way to build an efficient model for many machine vision applications, and sign language in particular. A CNN breaks an image down into features: the image is translated into sets of numbers that describe what tendencies it has and what the computer can interpret from it. Because each sign has a distinctive hand shape, if we can train the model to recognize these shapes well, we should be able to build a CNN model with high accuracy.

After building a working model, it can be evaluated on real-world data, possibly created by ourselves. Ideally, we could build real-time interpretation software that speaks or writes down the detected sign. Even just testing the model on our own photos of signs would be a useful check of whether it still translates correctly when the data is collected differently. However, if none of these real applications turn out to be feasible, we believe that building the most accurate model possible will be enough for this project.

Importing Dataset and Libraries

The dataset was downloaded in advance and uploaded to our Google Drive. After importing the necessary libraries for handling the dataset (especially pandas), the CSV files were loaded into Google Colab and checked to confirm they were imported correctly.
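A minimal sketch of this step, assuming the two CSV files from the Kaggle dataset were copied to the root of My Drive (the exact paths are an assumption):

```python
# Minimal loading sketch; the Drive paths below are assumptions.
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')

train_df = pd.read_csv('/content/drive/MyDrive/sign_mnist_train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/sign_mnist_test.csv')

# Sanity check: expect 27455 and 7172 rows, each with 785 columns (label + 784 pixels)
print(train_df.shape, test_df.shape)
train_df.head()
```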

Organize Dataset

The training and testing datasets were split into x (pixel) and y (label) parts to be used in training. The pixel values in the x parts were normalized for later calculations, and the x parts were reshaped back into 28 × 28 image form. The histogram shows that labels 9 and 25 (J and Z) are missing, as mentioned earlier, but otherwise the training data is evenly distributed across the labels, which allows balanced training.
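A sketch of the split, normalization, reshape, and label histogram described above (the variable names are our own):

```python
import numpy as np
import matplotlib.pyplot as plt

# Separate labels (y) from pixel columns (x)
y_train = train_df['label'].values
x_train = train_df.drop(columns='label').values
y_test = test_df['label'].values
x_test = test_df.drop(columns='label').values

# Normalize pixels to [0, 1] and reshape each 784-value row into a 28x28 grayscale image
x_train = (x_train / 255.0).reshape(-1, 28, 28, 1)
x_test = (x_test / 255.0).reshape(-1, 28, 28, 1)

# Label histogram; bins 9 (J) and 25 (Z) should be empty
plt.hist(y_train, bins=np.arange(27) - 0.5, rwidth=0.8)
plt.xlabel('label')
plt.ylabel('number of training images')
plt.show()
```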

Training The First Model

The first model was very simple. It had one convolutional layer, one max-pooling layer, and two dense layers. It has all the basics needed to form a CNN, but the results show that it quickly overfits the training data, reaching about 100% accuracy on the training data while staying around 85~88% on the test data. Because the model tuned itself so tightly to the training data, it does not generalize well to new images. The accuracy is not bad for a first try, but we want it higher for a real application.
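A sketch of a first model consistent with that description; the filter count, kernel size, and hidden-layer width are assumptions:

```python
import tensorflow as tf
from tensorflow.keras import layers

first_model = tf.keras.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(26, activation='softmax'),   # labels 0-25 (9 and 25 never occur)
])

first_model.compile(optimizer='adam',
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])
history = first_model.fit(x_train, y_train, validation_data=(x_test, y_test),
                          epochs=10, batch_size=100)
```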

Convolutional Layer

The most important layer in a CNN is the convolutional layer. Using kernels with different weights, the model tries to capture the relationships between the pixels in an image. As training continues, the convolutional layer adjusts the kernel weights so that it can effectively extract the desired features from images.

The figure above shows all 32 channels of the convolutional layer after applying 32 different kernels to one image. Each channel looks different, capturing a different aspect of the image. The weights of each kernel start as random values but gradually settle on values that yield the highest accuracy for our model.

Having more filters (kernels) can improve the accuracy of the model, but a higher number of filters always carries a risk of overfitting the data. So it is important to find the most suitable value for training.
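A sketch of how feature maps like the ones in the figure can be pulled out of the first convolutional layer (the layer index and the sample image are our own choices):

```python
import matplotlib.pyplot as plt
import tensorflow as tf

# Sub-model that stops at the first convolutional layer
feature_model = tf.keras.Model(inputs=first_model.input,
                               outputs=first_model.layers[0].output)

feature_maps = feature_model.predict(x_train[:1])   # shape (1, 26, 26, 32)

fig, axes = plt.subplots(4, 8, figsize=(12, 6))
for i, ax in enumerate(axes.flat):
    ax.imshow(feature_maps[0, :, :, i], cmap='gray')  # one channel per kernel
    ax.axis('off')
plt.show()
```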

Max Pooling Layer

Max pooling in a CNN is used to effectively reduce the amount of data the model has to deal with while maintaining its accuracy. You want to shrink the data so that training does not take forever, while keeping the feature information gained from the convolutional layer. With a 2 × 2 pool size, the resulting image is one quarter of the original, so you only have to deal with 25% of the original data. Reducing the data does not just mean faster training; it also helps keep the model from overfitting.

The effectiveness of pooling can be explained simply by counting the data. Every time data goes through a convolutional layer, the amount of data is multiplied by the number of filters, which is 32 in our model. If we stack 3 convolutional layers without pooling, the amount of data grows by roughly 32 × 32 × 32 = 32,768 times. If we apply 2 × 2 max pooling after each convolutional layer, this becomes roughly (32/4) × (32/4) × (32/4) = 512 times. Put simply, pooling reduces the amount of data the model has to deal with by a factor of 64.

Max pooling or average pooling is usually used. Max pooling keeps the most prominent value inside each pool, which tends to preserve the feature information you want for the model. Average pooling instead captures the general tendency inside the pool, which can also be helpful depending on the type of data.

We used max pooling for our model. Looking at each of the 32 channels produced by the pooling layer, the resolution is lower than in the convolutional layer output, but each channel still keeps its major shape, which shows why max pooling is effective in CNN applications.
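A small sketch of this reduction using standalone Keras layers ('same' padding is used here just to keep the arithmetic round):

```python
import tensorflow as tf

x = tf.random.normal((1, 28, 28, 1))                     # one grayscale image
conv = tf.keras.layers.Conv2D(32, (3, 3), padding='same', activation='relu')
pool = tf.keras.layers.MaxPooling2D((2, 2))

features = conv(x)        # (1, 28, 28, 32): 32 channels, 32x the input values
pooled = pool(features)   # (1, 14, 14, 32): 2x2 pooling keeps a quarter of them

print(features.shape, pooled.shape)
```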

Improving The Model

Optimizer

We chose Adam as our optimizer because we learned that it is efficient in most machine learning cases. After some experiments swapping optimizers, Adam trained in a reasonable amount of time while giving high accuracy. For example, using Adagrad on the first model gave only 47% accuracy in evaluation, and RMSprop gave slightly lower accuracy than Adam while taking more time to compute. Tuning the hyperparameters would probably improve the accuracy for each optimizer, but it did not seem like a useful investment of time since we already had a fast and accurate one.
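A sketch of how the comparison was run, assuming a hypothetical build_first_model() helper that returns a fresh, uncompiled copy of the first model above:

```python
import tensorflow as tf

def evaluate_optimizer(optimizer_name):
    """Train a fresh copy of the first model with the given optimizer and return test accuracy."""
    model = build_first_model()   # hypothetical helper returning an uncompiled model
    model.compile(optimizer=optimizer_name,
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(x_train, y_train, epochs=10, batch_size=100, verbose=0)
    _, accuracy = model.evaluate(x_test, y_test, verbose=0)
    return accuracy

for name in ['adam', 'adagrad', 'rmsprop']:
    print(name, evaluate_optimizer(name))
```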

Dropout

Dropout is an easy way to prevent overfitting by randomly discarding nodes during training. As mentioned several times already, having too many parameters always carries a risk of overfitting. By dropping random nodes during training, the model cannot rely too heavily on any particular node and is pushed toward more general features, which lowers the chance of overfitting the data.

By adding a single Dropout(0.25) layer, we saw a 1~2% increase in evaluation accuracy. The accuracy tended to increase with a higher dropout value, but only up to a certain point, around 50%. We assume that dropping too many nodes leaves the model too little capacity to learn the dataset.

By adding multiple dropout layers with different values, we could see that the model was less fitted to the training data: training accuracy no longer hit 100%, and evaluation accuracy increased. After several experiments, we found that two 50% dropout layers in between the dense layers raised the test accuracy by about 4~5%.
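A sketch of the classifier head this points to; placing one 50% dropout before and one after the hidden dense layer is our reading of "in between the dense layers", and the dense width is an assumption:

```python
import tensorflow as tf
from tensorflow.keras import layers

classifier_head = tf.keras.Sequential([
    layers.Flatten(),
    layers.Dropout(0.5),                    # 50% dropout before the hidden dense layer
    layers.Dense(512, activation='relu'),
    layers.Dropout(0.5),                    # and another before the output layer
    layers.Dense(26, activation='softmax'),
])
```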

Dense Layers

When we tried adding more dense layers, the accuracy dropped. However, deleting one of the two existing dense layers also dropped the accuracy, so one hidden dense layer (plus the output layer) appears to be right for this application.

The number of neurons also mattered: values around 256~512 gave the best accuracy. We assume that too few neurons do not give the network enough capacity to compute well, while too many overfit the model again.

We decided on ReLU as our activation function because it is easy to implement, efficient, and fast. ReLU is commonly used in most machine learning applications, especially machine vision, because it introduces the needed non-linearity while remaining cheap to compute.

Learning Rate

Decreasing the learning rate from the default of 0.001 to 0.0001 just made training slower and did not increase the accuracy at all. Raising it to 0.01 did not help either; the loss got stuck at one value and the model stopped improving. So we concluded that leaving the learning rate at the default is the best choice.
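A sketch of that comparison, again assuming a hypothetical build_model() helper that returns a fresh, uncompiled copy of the model being tuned (0.001 is Adam's default in Keras):

```python
import tensorflow as tf

for lr in [0.0001, 0.001, 0.01]:
    model = build_model()   # hypothetical helper returning an uncompiled model
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    history = model.fit(x_train, y_train, validation_data=(x_test, y_test),
                        epochs=10, batch_size=100, verbose=0)
    print(lr, max(history.history['val_accuracy']))
```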

Convolutional Layers

Adding one more convolutional layer raised the test accuracy to 94~95%, but it would not go above that. After the epoch that hit 95%, the accuracy started going down again, meaning the model begins overfitting the training data from that point. We then tried adding a third layer and found that the test accuracy tended to go up by a little bit more.

We then tweaked the kernel sizes and found that a larger kernel (5, 5) gives a better result for the first convolutional layer, while smaller kernels (3, 3) in the later layers helped raise the accuracy. One explanation is that a kernel that is too small on the larger input image produces overly specific channels of the picture and fails to capture the broader patterns that distinguish different signs.

The final change was the number of filters. A high filter count made training very slow but tended to give higher test accuracy in general. After some testing, gradually increasing the filter count from layer to layer had almost the same effect as using a high count everywhere, but needed much less training time.
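Put together, these experiments point to a convolutional stack like the sketch below: a 5x5 kernel first, 3x3 kernels afterward, and filter counts that grow from block to block (the exact counts 32/64/128 are assumptions):

```python
import tensorflow as tf
from tensorflow.keras import layers

conv_stack = tf.keras.Sequential([
    layers.Conv2D(32, (5, 5), activation='relu', input_shape=(28, 28, 1)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
])
conv_stack.summary()
```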

Batch Size and Epochs

Changing the batch size, as long as it was not too small or too large, did not change the model's maximum accuracy. Different batch sizes needed different numbers of epochs, but they eventually ended up in the same place. To make the computation faster, we decided on a batch size of 100, which gave good accuracy with a fair training time.

The Final Model

The final model was created using the following layers:
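Below is a sketch that combines the choices described in the previous sections (three conv/pool blocks with growing filters, two 50% dropout layers around a single hidden dense layer, Adam with the default learning rate, batch size 100); the exact filter counts, dense width, and epoch count are assumptions:

```python
import tensorflow as tf
from tensorflow.keras import layers

final_model = tf.keras.Sequential([
    layers.Conv2D(32, (5, 5), activation='relu', input_shape=(28, 28, 1)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dropout(0.5),
    layers.Dense(512, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(26, activation='softmax'),   # labels 0-25; 9 (J) and 25 (Z) never occur
])

final_model.compile(optimizer='adam',         # default learning rate 0.001
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])
final_model.fit(x_train, y_train, validation_data=(x_test, y_test),
                epochs=20, batch_size=100)
```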

It became a little more complex than the first model, and we can confirm that the new model is better by looking at the accuracy. Even though the training accuracy decreased slightly, the test accuracy increased from 85~88% to 96~97%. Getting roughly 1 in 25 signs wrong instead of roughly 1 in 8 wrong means the model was effectively improved through multiple iterations.

One observation from the accuracy graph is that the accuracy tends to fluctuate around its maximum (and the loss around its minimum). This is why we were not able to improve the model further. More hyperparameter tweaking might get better results, but at this point we are unsure how to approach it.

Wrong Values and Analysis

Top 9 Wrong Signs

To see which signs the model tends to get wrong, we visualized the top 9 signs most often mispredicted by the model. Looking at the percentage of missed predictions per label, we realized that the model is poor at recognizing certain signs in particular. The errors are not spread evenly across all signs; the 96~97% accuracy mostly comes from specific labels being guessed wrong.
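A sketch of how those top 9 labels can be found and plotted (the letter mapping simply offsets each label from 'A'):

```python
import numpy as np
import matplotlib.pyplot as plt

y_pred = np.argmax(final_model.predict(x_test), axis=1)

labels = np.unique(y_test)
miss_rate = np.array([np.mean(y_pred[y_test == c] != c) for c in labels])

worst = np.argsort(miss_rate)[::-1][:9]               # positions of the 9 worst labels
letters = [chr(ord('A') + c) for c in labels[worst]]

plt.bar(letters, miss_rate[worst])
plt.ylabel('fraction of test images predicted wrong')
plt.show()
```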

Actual Label and Prediction Visuals

For the top 2 most-missed labels, we compared images of the actual label with images of the predicted label. The two signs resemble each other in shape, for example both showing only a fist, or a similar number of fingers pointing in the same direction. The model was doing a reasonable job, but some signs share almost the same features, and since a CNN predicts based on the features it extracts from the images, these pairs are especially challenging.

This investigation suggested that the CNN approach has a real limitation here. Since every image is resized to a small resolution, some images are hard even for humans to interpret correctly. It is not surprising that the machine could not differentiate between some signs if they became harder to recognize while preparing the dataset for training. One possible solution is to use higher-resolution images, but bigger images mean longer training, which makes it take much more time to evaluate whether a model is effective.

Confusion Matrix

A confusion matrix was created to see the relationship between the classes. The matrix is normalized per class, since the signs do not all have the same number of samples. It looks good, with dark blue cells along the diagonal, but some light-blue cells off the diagonal show that some predictions were wrong. Most labels have a value of 1, meaning every prediction for them was right; some sit around 0.9-1.0, and a few have relatively low accuracy, showing false negatives and false positives.
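A sketch of how the normalized matrix can be produced, here using scikit-learn for brevity:

```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

y_pred = np.argmax(final_model.predict(x_test), axis=1)
labels = np.unique(y_test)

cm = confusion_matrix(y_test, y_pred, labels=labels, normalize='true')  # rows sum to 1
disp = ConfusionMatrixDisplay(cm, display_labels=[chr(ord('A') + c) for c in labels])
disp.plot(cmap='Blues')
plt.show()
```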

Overall, the model is effective for most signs, except for a few that it fails to recognize correctly.

Effectiveness and Limitations

The final model reaches 96~97% accuracy, meaning it interprets the sign language alphabet correctly most of the time. Because our team had the unusual circumstance of having only one member, we were not able to implement real-time interpretation of sign language or evaluate the model on data we created ourselves. Still, building a fully working CNN model and improving its accuracy from 85% to 97% was a valuable learning experience.

Given the nature of language, accuracy close to 100% is important to prevent any kind of misinterpretation. For example, even with a 97% chance of interpreting each letter correctly, the probability of getting a 5-letter message entirely right is 0.97^5 ≈ 86%, and for a 10-letter message only 0.97^10 ≈ 74%. Also, the alphabet is not the only thing used in sign languages; it is rather rare to communicate using fingerspelling alone. So the biggest limitations of this project are that, even with a CNN, it was hard to reach the very high accuracy needed for real-world usage, and the model cannot interpret signs that involve motion.

These limitations make the model hard to use in real-world applications, but we believe it can be a basic step toward building a functional sign language interpreter. Other machine learning models might achieve higher accuracy, or a different approach could handle motion-based signs. In either direction, we believe it is important for us as engineers to keep trying so that the technology eventually serves society without causing harm.