.. raw:: latex
\diilbookstyleinputcell
.. code:: python
import torch
from torch import nn
from d2l import torch as d2l
def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
# Use `is_grad_enabled` to determine whether we are in training mode
if not torch.is_grad_enabled():
# In prediction mode, use mean and variance obtained by moving average
X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
else:
assert len(X.shape) in (2, 4)
if len(X.shape) == 2:
# When using a fully connected layer, calculate the mean and
# variance on the feature dimension
mean = X.mean(dim=0)
var = ((X - mean) ** 2).mean(dim=0)
else:
# When using a two-dimensional convolutional layer, calculate the
# mean and variance on the channel dimension (axis=1). Here we
# need to maintain the shape of `X`, so that the broadcasting
# operation can be carried out later
mean = X.mean(dim=(0, 2, 3), keepdim=True)
var = ((X - mean) ** 2).mean(dim=(0, 2, 3), keepdim=True)
# In training mode, the current mean and variance are used
X_hat = (X - mean) / torch.sqrt(var + eps)
# Update the mean and variance using moving average
moving_mean = (1.0 - momentum) * moving_mean + momentum * mean
moving_var = (1.0 - momentum) * moving_var + momentum * var
Y = gamma * X_hat + beta # Scale and shift
return Y, moving_mean.data, moving_var.data
.. raw:: html

.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
from mxnet import autograd, init, np, npx
from mxnet.gluon import nn
from d2l import mxnet as d2l
npx.set_np()
def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
# Use `autograd` to determine whether we are in training mode
if not autograd.is_training():
# In prediction mode, use mean and variance obtained by moving average
X_hat = (X - moving_mean) / np.sqrt(moving_var + eps)
else:
assert len(X.shape) in (2, 4)
if len(X.shape) == 2:
# When using a fully connected layer, calculate the mean and
# variance on the feature dimension
mean = X.mean(axis=0)
var = ((X - mean) ** 2).mean(axis=0)
else:
# When using a two-dimensional convolutional layer, calculate the
# mean and variance on the channel dimension (axis=1). Here we
# need to maintain the shape of `X`, so that the broadcasting
# operation can be carried out later
mean = X.mean(axis=(0, 2, 3), keepdims=True)
var = ((X - mean) ** 2).mean(axis=(0, 2, 3), keepdims=True)
# In training mode, the current mean and variance are used
X_hat = (X - mean) / np.sqrt(var + eps)
# Update the mean and variance using moving average
moving_mean = (1.0 - momentum) * moving_mean + momentum * mean
moving_var = (1.0 - momentum) * moving_var + momentum * var
Y = gamma * X_hat + beta # Scale and shift
return Y, moving_mean, moving_var
.. raw:: html

.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
import tensorflow as tf
from d2l import tensorflow as d2l
def batch_norm(X, gamma, beta, moving_mean, moving_var, eps):
# Compute reciprocal of square root of the moving variance elementwise
inv = tf.cast(tf.math.rsqrt(moving_var + eps), X.dtype)
# Scale and shift
inv *= gamma
Y = X * inv + (beta - moving_mean * inv)
return Y
.. raw:: html

.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
class BatchNorm(nn.Module):
# `num_features`: the number of outputs for a fully connected layer
# or the number of output channels for a convolutional layer. `num_dims`:
# 2 for a fully connected layer and 4 for a convolutional layer
def __init__(self, num_features, num_dims):
super().__init__()
if num_dims == 2:
shape = (1, num_features)
else:
shape = (1, num_features, 1, 1)
# The scale parameter and the shift parameter (model parameters) are
# initialized to 1 and 0, respectively
self.gamma = nn.Parameter(torch.ones(shape))
self.beta = nn.Parameter(torch.zeros(shape))
# The variables that are not model parameters are initialized to 0 and
# 1
self.moving_mean = torch.zeros(shape)
self.moving_var = torch.ones(shape)
def forward(self, X):
# If `X` is not on the main memory, copy `moving_mean` and
# `moving_var` to the device where `X` is located
if self.moving_mean.device != X.device:
self.moving_mean = self.moving_mean.to(X.device)
self.moving_var = self.moving_var.to(X.device)
# Save the updated `moving_mean` and `moving_var`
Y, self.moving_mean, self.moving_var = batch_norm(
X, self.gamma, self.beta, self.moving_mean,
self.moving_var, eps=1e-5, momentum=0.1)
return Y
.. raw:: html

.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
class BatchNorm(nn.Block):
# `num_features`: the number of outputs for a fully connected layer
# or the number of output channels for a convolutional layer. `num_dims`:
# 2 for a fully connected layer and 4 for a convolutional layer
def __init__(self, num_features, num_dims, **kwargs):
super().__init__(**kwargs)
if num_dims == 2:
shape = (1, num_features)
else:
shape = (1, num_features, 1, 1)
# The scale parameter and the shift parameter (model parameters) are
# initialized to 1 and 0, respectively
self.gamma = self.params.get('gamma', shape=shape, init=init.One())
self.beta = self.params.get('beta', shape=shape, init=init.Zero())
# The variables that are not model parameters are initialized to 0 and
# 1
self.moving_mean = np.zeros(shape)
self.moving_var = np.ones(shape)
def forward(self, X):
# If `X` is not on the main memory, copy `moving_mean` and
# `moving_var` to the device where `X` is located
if self.moving_mean.ctx != X.ctx:
self.moving_mean = self.moving_mean.copyto(X.ctx)
self.moving_var = self.moving_var.copyto(X.ctx)
# Save the updated `moving_mean` and `moving_var`
Y, self.moving_mean, self.moving_var = batch_norm(
X, self.gamma.data(), self.beta.data(), self.moving_mean,
self.moving_var, eps=1e-12, momentum=0.1)
return Y
.. raw:: html

.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
class BatchNorm(tf.keras.layers.Layer):
def __init__(self, **kwargs):
super(BatchNorm, self).__init__(**kwargs)
def build(self, input_shape):
weight_shape = [input_shape[-1], ]
# The scale parameter and the shift parameter (model parameters) are
# initialized to 1 and 0, respectively
self.gamma = self.add_weight(name='gamma', shape=weight_shape,
initializer=tf.initializers.ones, trainable=True)
self.beta = self.add_weight(name='beta', shape=weight_shape,
initializer=tf.initializers.zeros, trainable=True)
# The variables that are not model parameters are initialized to 0
self.moving_mean = self.add_weight(name='moving_mean',
shape=weight_shape, initializer=tf.initializers.zeros,
trainable=False)
self.moving_variance = self.add_weight(name='moving_variance',
shape=weight_shape, initializer=tf.initializers.ones,
trainable=False)
super(BatchNorm, self).build(input_shape)
def assign_moving_average(self, variable, value):
momentum = 0.1
delta = (1.0 - momentum) * variable + momentum * value
return variable.assign(delta)
@tf.function
def call(self, inputs, training):
if training:
axes = list(range(len(inputs.shape) - 1))
batch_mean = tf.reduce_mean(inputs, axes, keepdims=True)
batch_variance = tf.reduce_mean(tf.math.squared_difference(
inputs, tf.stop_gradient(batch_mean)), axes, keepdims=True)
batch_mean = tf.squeeze(batch_mean, axes)
batch_variance = tf.squeeze(batch_variance, axes)
mean_update = self.assign_moving_average(
self.moving_mean, batch_mean)
variance_update = self.assign_moving_average(
self.moving_variance, batch_variance)
self.add_update(mean_update)
self.add_update(variance_update)
mean, variance = batch_mean, batch_variance
else:
mean, variance = self.moving_mean, self.moving_variance
output = batch_norm(inputs, moving_mean=mean, moving_var=variance,
beta=self.beta, gamma=self.gamma, eps=1e-5)
return output
.. raw:: html

.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
class BNLeNetScratch(d2l.Classifier):
def __init__(self, lr=0.1, num_classes=10):
super().__init__()
self.save_hyperparameters()
self.net = nn.Sequential(
nn.LazyConv2d(6, kernel_size=5), BatchNorm(6, num_dims=4),
nn.Sigmoid(), nn.AvgPool2d(kernel_size=2, stride=2),
nn.LazyConv2d(16, kernel_size=5), BatchNorm(16, num_dims=4),
nn.Sigmoid(), nn.AvgPool2d(kernel_size=2, stride=2),
nn.Flatten(), nn.LazyLinear(120),
BatchNorm(120, num_dims=2), nn.Sigmoid(), nn.LazyLinear(84),
BatchNorm(84, num_dims=2), nn.Sigmoid(),
nn.LazyLinear(num_classes))
.. raw:: html

.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
class BNLeNetScratch(d2l.Classifier):
def __init__(self, lr=0.1, num_classes=10):
super().__init__()
self.save_hyperparameters()
self.net = nn.Sequential()
self.net.add(
nn.Conv2D(6, kernel_size=5), BatchNorm(6, num_dims=4),
nn.Activation('sigmoid'),
nn.AvgPool2D(pool_size=2, strides=2),
nn.Conv2D(16, kernel_size=5), BatchNorm(16, num_dims=4),
nn.Activation('sigmoid'),
nn.AvgPool2D(pool_size=2, strides=2), nn.Dense(120),
BatchNorm(120, num_dims=2), nn.Activation('sigmoid'),
nn.Dense(84), BatchNorm(84, num_dims=2),
nn.Activation('sigmoid'), nn.Dense(num_classes))
self.initialize()
.. raw:: html

.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
class BNLeNetScratch(d2l.Classifier):
def __init__(self, lr=0.1, num_classes=10):
super().__init__()
self.save_hyperparameters()
self.net = tf.keras.models.Sequential([
tf.keras.layers.Conv2D(filters=6, kernel_size=5,
input_shape=(28, 28, 1)),
BatchNorm(), tf.keras.layers.Activation('sigmoid'),
tf.keras.layers.AvgPool2D(pool_size=2, strides=2),
tf.keras.layers.Conv2D(filters=16, kernel_size=5),
BatchNorm(), tf.keras.layers.Activation('sigmoid'),
tf.keras.layers.AvgPool2D(pool_size=2, strides=2),
tf.keras.layers.Flatten(), tf.keras.layers.Dense(120),
BatchNorm(), tf.keras.layers.Activation('sigmoid'),
tf.keras.layers.Dense(84), BatchNorm(),
tf.keras.layers.Activation('sigmoid'),
tf.keras.layers.Dense(num_classes)])
.. raw:: html

.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
trainer = d2l.Trainer(max_epochs=10, num_gpus=1)
data = d2l.FashionMNIST(batch_size=128)
model = BNLeNetScratch(lr=0.1)
model.apply_init([next(iter(data.get_dataloader(True)))[0]], d2l.init_cnn)
trainer.fit(model, data)
.. figure:: output_batch-norm_cf033c_39_0.svg
.. raw:: html

.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
trainer = d2l.Trainer(max_epochs=10, num_gpus=1)
data = d2l.FashionMNIST(batch_size=128)
model = BNLeNetScratch(lr=0.1)
trainer.fit(model, data)
.. figure:: output_batch-norm_cf033c_42_0.svg
.. raw:: html

.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
trainer = d2l.Trainer(max_epochs=10)
data = d2l.FashionMNIST(batch_size=128)
with d2l.try_gpu():
model = BNLeNetScratch(lr=0.5)
trainer.fit(model, data)
.. figure:: output_batch-norm_cf033c_45_0.svg
.. raw:: html

.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
model.net[1].gamma.reshape((-1,)), model.net[1].beta.reshape((-1,))
.. raw:: latex
\diilbookstyleoutputcell
.. parsed-literal::
:class: output
(tensor([1.6421, 1.4214, 1.9684, 1.0295, 2.4108, 1.9272], device='cuda:0',
grad_fn=),
tensor([-0.2378, -1.5404, -1.2467, 0.4280, 1.4989, -1.3052], device='cuda:0',
grad_fn=))
.. raw:: html

.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
model.net[1].gamma.data().reshape(-1,), model.net[1].beta.data().reshape(-1,)
.. raw:: latex
\diilbookstyleoutputcell
.. parsed-literal::
:class: output
(array([1.9814706, 1.5121669, 1.6583698, 1.9642115, 2.1806524, 1.8383621], ctx=gpu(0)),
array([ 1.1152403 , 1.2426152 , -1.1037333 , 0.45082432, 0.52209014,
1.1782306 ], ctx=gpu(0)))
.. raw:: html

.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
tf.reshape(model.net.layers[1].gamma, (-1,)), tf.reshape(
model.net.layers[1].beta, (-1,))
.. raw:: latex
\diilbookstyleoutputcell
.. parsed-literal::
:class: output
(,
)
.. raw:: html

.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
class BNLeNet(d2l.Classifier):
def __init__(self, lr=0.1, num_classes=10):
super().__init__()
self.save_hyperparameters()
self.net = nn.Sequential(
nn.LazyConv2d(6, kernel_size=5), nn.LazyBatchNorm2d(),
nn.Sigmoid(), nn.AvgPool2d(kernel_size=2, stride=2),
nn.LazyConv2d(16, kernel_size=5), nn.LazyBatchNorm2d(),
nn.Sigmoid(), nn.AvgPool2d(kernel_size=2, stride=2),
nn.Flatten(), nn.LazyLinear(120), nn.LazyBatchNorm1d(),
nn.Sigmoid(), nn.LazyLinear(84), nn.LazyBatchNorm1d(),
nn.Sigmoid(), nn.LazyLinear(num_classes))
.. raw:: html

.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
class BNLeNet(d2l.Classifier):
def __init__(self, lr=0.1, num_classes=10):
super().__init__()
self.save_hyperparameters()
self.net = nn.Sequential()
self.net.add(
nn.Conv2D(6, kernel_size=5), nn.BatchNorm(),
nn.Activation('sigmoid'),
nn.AvgPool2D(pool_size=2, strides=2),
nn.Conv2D(16, kernel_size=5), nn.BatchNorm(),
nn.Activation('sigmoid'),
nn.AvgPool2D(pool_size=2, strides=2),
nn.Dense(120), nn.BatchNorm(), nn.Activation('sigmoid'),
nn.Dense(84), nn.BatchNorm(), nn.Activation('sigmoid'),
nn.Dense(num_classes))
self.initialize()
.. raw:: html

.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
class BNLeNet(d2l.Classifier):
def __init__(self, lr=0.1, num_classes=10):
super().__init__()
self.save_hyperparameters()
self.net = tf.keras.models.Sequential([
tf.keras.layers.Conv2D(filters=6, kernel_size=5,
input_shape=(28, 28, 1)),
tf.keras.layers.BatchNormalization(),
tf.keras.layers.Activation('sigmoid'),
tf.keras.layers.AvgPool2D(pool_size=2, strides=2),
tf.keras.layers.Conv2D(filters=16, kernel_size=5),
tf.keras.layers.BatchNormalization(),
tf.keras.layers.Activation('sigmoid'),
tf.keras.layers.AvgPool2D(pool_size=2, strides=2),
tf.keras.layers.Flatten(), tf.keras.layers.Dense(120),
tf.keras.layers.BatchNormalization(),
tf.keras.layers.Activation('sigmoid'),
tf.keras.layers.Dense(84),
tf.keras.layers.BatchNormalization(),
tf.keras.layers.Activation('sigmoid'),
tf.keras.layers.Dense(num_classes)])
.. raw:: html

.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
trainer = d2l.Trainer(max_epochs=10, num_gpus=1)
data = d2l.FashionMNIST(batch_size=128)
model = BNLeNet(lr=0.1)
model.apply_init([next(iter(data.get_dataloader(True)))[0]], d2l.init_cnn)
trainer.fit(model, data)
.. figure:: output_batch-norm_cf033c_75_0.svg
.. raw:: html

.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
trainer = d2l.Trainer(max_epochs=10, num_gpus=1)
data = d2l.FashionMNIST(batch_size=128)
model = BNLeNet(lr=0.1)
trainer.fit(model, data)
.. figure:: output_batch-norm_cf033c_78_0.svg
.. raw:: html

.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
trainer = d2l.Trainer(max_epochs=10)
data = d2l.FashionMNIST(batch_size=128)
with d2l.try_gpu():
model = BNLeNet(lr=0.5)
trainer.fit(model, data)
.. figure:: output_batch-norm_cf033c_81_0.svg
.. raw:: html

.. raw:: html
`Discussions `__
.. raw:: html

.. raw:: html
`Discussions `__
.. raw:: html

.. raw:: html
`Discussions `__
.. raw:: html

.. raw:: html