Back to Notes

CNN From Scratch

NumPy implementation of a 2D convolution layer

A minimal implementation of a CNN convolution layer using only NumPy. No PyTorch, no TensorFlow - just matrix operations.

Key Concepts

He initialization

Weights initialized with std = sqrt(2/fan_in) for better gradient flow

Padding

Zero-padding to control output dimensions

Stride

Step size for sliding the kernel

Output formula

⌊(H − K + 2P) / S⌋ + 1   (floor division, applied to each spatial dimension)

Implementation

python
import numpy as np


class CNNFromScratch:
    """Minimal 2D convolution layer implemented with NumPy only.

    Mirrors the shape conventions of ``torch.nn.Conv2d``:
    weight is (C_out, C_in, K, K), input is (B, C_in, H, W),
    output is (B, C_out, H_out, W_out).
    """

    def __init__(self, in_channels, out_channels, kernel_size, padding, stride=1, bias=False):
        """Create the layer and randomly initialize its parameters.

        Args:
            in_channels: number of input channels C_in.
            out_channels: number of output channels C_out (number of filters).
            kernel_size: side length K of the square kernel.
            padding: symmetric zero-padding P applied to height and width.
            stride: step size S of the sliding window (default 1).
            bias: if True, add a learnable per-output-channel bias.
        """
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.padding = padding
        self.stride = stride
        # Bias drawn first, then weight — keeps RNG call order identical to
        # the original implementation so seeded runs stay reproducible.
        self.bias = np.random.randn(self.out_channels) if bias else None  # (C_out,)

        # He initialization: std = sqrt(2 / fan_in), fan_in = K * K * C_in.
        # Suited to ReLU-style activations (keeps forward variance roughly constant).
        f_in = self.kernel_size ** 2 * self.in_channels
        std = np.sqrt(2 / f_in)
        self.weight = np.random.randn(
            self.out_channels, self.in_channels, self.kernel_size, self.kernel_size
        ) * std  # (C_out, C_in, K, K)

    def forward(self, x):
        """Apply the convolution to a batch of inputs.

        Args:
            x: input array of shape (B, C_in, H, W).

        Returns:
            Output array of shape (B, C_out, H_out, W_out) where
            H_out = floor((H - K + 2P) / S) + 1 (and likewise for W_out).

        Raises:
            ValueError: if x's channel count differs from ``in_channels``,
                or the kernel is larger than the padded input.
        """
        B, C, H, W = x.shape
        if C != self.in_channels:
            raise ValueError(
                f"expected {self.in_channels} input channels, got {C}"
            )

        # Zero-pad height and width symmetrically; batch/channel untouched.
        if self.padding > 0:
            x = np.pad(
                x,
                pad_width=(
                    (0, 0),                          # batch
                    (0, 0),                          # channel
                    (self.padding, self.padding),    # height
                    (self.padding, self.padding),    # width
                ),
                mode='constant',
            )  # x: (B, C_in, H + 2P, W + 2P)

        # Output dimensions (floor division, matching the standard conv formula).
        H_out = ((H - self.kernel_size + 2 * self.padding) // self.stride) + 1
        W_out = ((W - self.kernel_size + 2 * self.padding) // self.stride) + 1
        if H_out <= 0 or W_out <= 0:
            raise ValueError(
                f"kernel {self.kernel_size} too large for padded input "
                f"({H + 2 * self.padding}x{W + 2 * self.padding})"
            )

        output = np.zeros((B, self.out_channels, H_out, W_out))  # (B, C_out, H_out, W_out)

        # Slide the window over spatial positions only; the batch and
        # out-channel dimensions are handled in one vectorized tensordot,
        # which removes two Python-level loops from the hot path.
        for i in range(H_out):
            h_start = i * self.stride
            h_end = h_start + self.kernel_size
            for j in range(W_out):
                w_start = j * self.stride
                w_end = w_start + self.kernel_size

                patch = x[:, :, h_start:h_end, w_start:w_end]  # (B, C_in, K, K)
                # Contract (C_in, K, K) of patch against (C_in, K, K) of each
                # filter: result is (B, C_out) for this output position.
                output[:, :, i, j] = np.tensordot(
                    patch, self.weight, axes=([1, 2, 3], [1, 2, 3])
                )

        if self.bias is not None:
            # Broadcast (C_out,) across batch and spatial dims.
            output += self.bias.reshape(1, -1, 1, 1)

        return output  # (B, C_out, H_out, W_out)


# Test
# Smoke test: with K=3, P=1, S=1 the spatial dimensions are preserved.
if __name__ == "__main__":
    layer = CNNFromScratch(
        in_channels=3, out_channels=5, kernel_size=3, padding=1, stride=1, bias=True
    )
    sample = np.random.randn(1, 3, 8, 8)  # one 8x8 image with 3 channels
    result = layer.forward(sample)
    print(result.shape)  # expect (1, 5, 8, 8) -> (B, C_out, H_out, W_out)

Complexity

Time

O(B * C_out * H_out * W_out * C_in * K * K)

Space

O(B * C_out * H_out * W_out) for the output, plus O(B * C_in * (H + 2P) * (W + 2P)) for the padded copy of the input made by np.pad

Further Reading