Implementation of Light CNN (Python Keras)


In this article, I implement a deep learning model called Light CNN (LCNN) and summarize the results. I first explain LCNN and its key feature, Max Feature Mapping (MFM), and then implement and evaluate the model. All code is written in Python, and LCNN is implemented with TensorFlow and Keras. Please refer to the code on GitHub for the full LCNN implementation. GitHub URL: Light CNN. LCNN was proposed by STC in 2015; it is a deep learning method researched by an institution called STC and is currently used in fields such as image classification and speech classification. LCNN is composed of 8 convolutional layers, and its major feature is that each layer uses Max Feature Mapping as its activation function. Max Feature Mapping: for more information on MFM, please refer to here. I will also cite the original paper: "A Light CNN for Deep Face Representation with Noisy Labels".


import tensorflow as tf
from keras.layers import Activation, Dense, BatchNormalization, MaxPool2D, Lambda, Input, Flatten, Dropout
from keras.layers.convolutional import Conv2D
from keras.models import Model
from keras.initializers import he_normal

#Custom layer
from .layers import Maxout

# Return the stack of a Conv2D layer followed by an MFM (Maxout) layer.
def MaxOutConv2D(x, dim, kernel_size, strides, padding='same'):
    """Apply a 2D convolution and then the Maxout (MFM) layer to ``x``.

    Args:
        x: Input tensor passed to the convolution.
        dim: Number of convolution filters. The Maxout layer is built with
            ``int(dim / 2)``.
        kernel_size: Kernel size forwarded to ``Conv2D``.
        strides: Strides forwarded to ``Conv2D``.
        padding: Padding mode forwarded to ``Conv2D`` (default ``'same'``).

    Returns:
        The output tensor of the Maxout layer.
    """
    convolved = Conv2D(
        dim,
        kernel_size=kernel_size,
        strides=strides,
        padding=padding,
    )(x)
    # NOTE(review): Maxout comes from the project-local ``.layers`` module;
    # presumably it halves the channel count by an element-wise max - confirm.
    half_dim = int(dim/2)
    return Maxout(half_dim)(convolved)

# Return the stack of a fully connected (Dense) layer followed by an MFM (Maxout) layer.
def MaxOutDense(x, dim):
    """Apply a Dense layer and then the Maxout (MFM) layer to ``x``.

    Args:
        x: Input tensor passed to the Dense layer.
        dim: Number of Dense units. The Maxout layer is built with
            ``int(dim / 2)``.

    Returns:
        The output tensor of the Maxout layer.
    """
    projected = Dense(dim)(x)
    # NOTE(review): Maxout comes from the project-local ``.layers`` module;
    # presumably it halves the feature count by an element-wise max - confirm.
    half_dim = int(dim/2)
    return Maxout(half_dim)(projected)

# Build the LCNN model.
def build_lcnn(shape, n_label=2):
    """Build the Light CNN (LCNN) Keras model.

    The network is a stack of nine ``MaxOutConv2D`` stages (convolution
    followed by MFM), interleaved with max pooling and batch normalization,
    followed by one ``MaxOutDense`` stage and a softmax classifier.

    Args:
        shape (list): Input shape for LCNN, without the batch dimension
            (example: ``[128, 128, 1]``).
        n_label (int): Number of labels that LCNN should predict.

    Returns:
        keras.models.Model: The uncompiled LCNN model mapping an input of
        ``shape`` to ``n_label`` softmax outputs.
    """
    # Named ``inputs`` (not ``input``) to avoid shadowing the builtin.
    inputs = Input(shape=shape)

    # Stage 1: 5x5 convolution + MFM, then downsample.
    conv2d_1 = MaxOutConv2D(inputs, 64, kernel_size=5, strides=1, padding='same')
    maxpool_1 = MaxPool2D(pool_size=(2, 2), strides=(2, 2))(conv2d_1)

    # Stages 2-3: 1x1 then 3x3 convolution, downsample after the 3x3.
    conv2d_2 = MaxOutConv2D(maxpool_1, 64, kernel_size=1, strides=1, padding='same')
    batch_norm_2 = BatchNormalization()(conv2d_2)

    conv2d_3 = MaxOutConv2D(batch_norm_2, 96, kernel_size=3, strides=1, padding='same')
    maxpool_3 = MaxPool2D(pool_size=(2, 2), strides=(2, 2))(conv2d_3)
    batch_norm_3 = BatchNormalization()(maxpool_3)

    # Stages 4-5: 1x1 then 3x3 convolution, downsample after the 3x3.
    conv2d_4 = MaxOutConv2D(batch_norm_3, 96, kernel_size=1, strides=1, padding='same')
    batch_norm_4 = BatchNormalization()(conv2d_4)

    conv2d_5 = MaxOutConv2D(batch_norm_4, 128, kernel_size=3, strides=1, padding='same')
    maxpool_5 = MaxPool2D(pool_size=(2, 2), strides=(2, 2))(conv2d_5)

    # Stages 6-9: alternating 1x1 / 3x3 convolutions, downsample at the end.
    conv2d_6 = MaxOutConv2D(maxpool_5, 128, kernel_size=1, strides=1, padding='same')
    batch_norm_6 = BatchNormalization()(conv2d_6)

    conv2d_7 = MaxOutConv2D(batch_norm_6, 64, kernel_size=3, strides=1, padding='same')
    batch_norm_7 = BatchNormalization()(conv2d_7)

    conv2d_8 = MaxOutConv2D(batch_norm_7, 64, kernel_size=1, strides=1, padding='same')
    batch_norm_8 = BatchNormalization()(conv2d_8)

    conv2d_9 = MaxOutConv2D(batch_norm_8, 64, kernel_size=3, strides=1, padding='same')
    maxpool_9 = MaxPool2D(pool_size=(2, 2), strides=(2, 2))(conv2d_9)
    flatten = Flatten()(maxpool_9)

    # Stage 10: fully connected layer + MFM, regularized with dropout.
    dense_10 = MaxOutDense(flatten, 160)
    batch_norm_10 = BatchNormalization()(dense_10)
    dropout_10 = Dropout(0.75)(batch_norm_10)

    # Classifier head.
    output = Dense(n_label, activation='softmax')(dropout_10)
    return Model(inputs=inputs, outputs=output)


The implemented LCNN is the model used in a speech recognition competition, but I tried it on MNIST and CIFAR10 to see how much performance it can achieve on simple image recognition. I did not tune the model at all, but it reached about 99% accuracy on MNIST and about 75% accuracy on CIFAR10. MNIST:

import numpy as np
from keras.callbacks import EarlyStopping
from keras.datasets import mnist
from keras.optimizers import Adam
from keras.utils import to_categorical

# Training hyperparameters.
lr = 0.001
epochs = 10
batch_size = 256

(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Scale pixel values by 1/255 and append a trailing channel axis of size 1.
x_train = x_train / 255
x_train = x_train.reshape((x_train.shape[0], x_train.shape[1], x_train.shape[2], 1))
y_train = to_categorical(y_train)
input_shape = x_train.shape[1:]

lcnn = build_lcnn(input_shape, n_label=10)
lcnn.compile(optimizer=Adam(learning_rate=lr), loss='categorical_crossentropy', metrics=['accuracy'])

# Stop training once the validation loss has not improved for 3 epochs.
es = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
history = lcnn.fit(
    x_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks=[es],
)

# Apply the same preprocessing to the test set before evaluating.
x_test = x_test / 255
x_test = x_test.reshape((x_test.shape[0], x_test.shape[1], x_test.shape[2], 1))
y_test = to_categorical(y_test)

loss, acc = lcnn.evaluate(x_test, y_test)

print(f'Accuracy : {acc*100}') # Result --> Accuracy : 99.90999794006348
print(f'Loss : {loss}')# Result --> Loss : 0.04250425341885457


import numpy as np
from keras.callbacks import EarlyStopping
from keras.datasets import cifar10
from keras.optimizers import Adam
from keras.utils import to_categorical

# Training hyperparameters.
lr = 0.001
epochs = 100
batch_size = 64

(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# Scale pixel values by 1/255; the images are used without reshaping.
x_train = x_train / 255
y_train = to_categorical(y_train)
input_shape = x_train.shape[1:]

lcnn = build_lcnn(input_shape, n_label=10)
lcnn.compile(optimizer=Adam(learning_rate=lr), loss='categorical_crossentropy', metrics=['accuracy'])

# Stop training once the validation loss has not improved for 5 epochs.
es = EarlyStopping(monitor='val_loss', patience=5 , verbose=1)
history = lcnn.fit(
    x_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks=[es],
)

# Apply the same preprocessing to the test set before evaluating.
x_test = x_test / 255
y_test = to_categorical(y_test)

loss, acc = lcnn.evaluate(x_test, y_test)
print(f'Accuracy : {acc*100}') # Result --> Accuracy : 75.1200020313263
print(f'Loss : {loss}')# Result --> Loss : 1.2616282165050507


I implemented a deep learning model called LCNN and summarized it in this article. I would be glad if you can use it as a reference. GitHub URL: (see the repository linked above). References: "A Light CNN for Deep Face Representation with Noisy Labels"; "STC Antispoofing Systems for the ASVspoof2019 Challenge"; "Audio replay attack detection with deep learning frameworks".

