PyTorchで独自(カスタム)レイヤーを実装する方法を解説

深層学習の研究やプロダクション開発では、既存のレイヤー(nn.Linear, nn.Conv2dなど)だけでは実現できない独自の処理が必要になることがあります。PyTorchでは、nn.Module を継承して独自のレイヤーやモデルを柔軟に定義できます。

特に最新の研究論文を読み解く際には、著者が実装したカスタムレイヤーのPyTorchコードを理解する能力が不可欠です。

本記事の内容

nn.Moduleの基本構造と設計思想
カスタム線形層の実装
カスタムAttention層の実装
パラメータの自動管理と勾配計算

前提知識

この記事を読む前に、PyTorchの基本的な使い方（テンソル操作、自動微分）を押さえておくと理解が深まります。

nn.Moduleの基本構造

PyTorchのすべてのレイヤーとモデルは nn.Module を継承しています。カスタムレイヤーを作成する際に最低限実装する必要があるのは以下の2つです。

__init__: パラメータ（重み、バイアス）の定義
forward: 順伝播の計算

import torch
import torch.nn as nn

class MyLayer(nn.Module):
    """Skeleton of a custom layer: subclass nn.Module and fill in two methods.

    NOTE: this is a template, not runnable code -- the ``...`` in the
    constructor signature stands for the layer's own arguments, and
    ``output`` must be computed inside ``forward``.
    """

    def __init__(self, ...):
        super().__init__()
        # Define the parameters here

    def forward(self, x):
        # Compute the forward pass here
        return output

nn.Parameter でラップしたテンソルを nn.Module の属性として代入すると、自動的にモデルのパラメータとして登録され、model.parameters() で取得できます。

カスタム線形層の実装

まず、nn.Linear と同等の機能を持つカスタム線形層を実装してみましょう。

線形層の順伝播は以下の数式で表されます。

$$ \bm{y} = \bm{x}\bm{W}^T + \bm{b} $$

import torch
import torch.nn as nn
import numpy as np

class CustomLinear(nn.Module):
    """Fully connected layer computing ``y = x @ W^T + b``.

    A from-scratch counterpart of ``nn.Linear`` built directly on
    ``nn.Parameter``.

    Args:
        in_features: Size of the last dimension of the input.
        out_features: Size of the last dimension of the output.
        bias: If False, no additive bias is learned and ``self.bias`` is None.
    """

    def __init__(self, in_features, out_features, bias=True):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        # Allocate uninitialized storage: every value is overwritten by the
        # Kaiming initialization below, so sampling random numbers here would
        # be wasted work.
        self.weight = nn.Parameter(torch.empty(out_features, in_features))
        if bias:
            # The bias starts at zero (nn.Linear samples it uniformly instead).
            self.bias = nn.Parameter(torch.zeros(out_features))
        else:
            # Register the name anyway so that ``self.bias`` exists and is None.
            self.register_parameter('bias', None)

        # Kaiming-uniform initialization with a = sqrt(5), which bounds the
        # weights by 1 / sqrt(in_features).
        nn.init.kaiming_uniform_(self.weight, a=5 ** 0.5)

    def forward(self, x):
        """Apply the affine map to the last dimension of ``x``.

        Args:
            x: Tensor whose last dimension has size ``in_features``.

        Returns:
            Tensor with the last dimension replaced by ``out_features``.
        """
        output = x @ self.weight.t()
        if self.bias is not None:
            output = output + self.bias
        return output

    def extra_repr(self):
        """Return the summary shown inside ``print(module)``, including the bias flag."""
        return (
            f'in_features={self.in_features}, '
            f'out_features={self.out_features}, '
            f'bias={self.bias is not None}'
        )

# --- Quick check: push a random batch through the layer ---
torch.manual_seed(42)  # fixed seed so the run is reproducible
custom_layer = CustomLinear(10, 5)
x = torch.randn(3, 10)
y = custom_layer(x)

# Report the input/output shapes, then every registered parameter
print("入力:", x.shape)
print("出力:", y.shape)
print("パラメータ一覧:")
for name, param in custom_layer.named_parameters():
    print("  {}: {}".format(name, param.shape))

カスタムScaled Dot-Product Attention

Transformerの基本であるScaled Dot-Product Attentionをカスタムレイヤーとして実装します。

$$ \text{Attention}(\bm{Q}, \bm{K}, \bm{V}) = \text{softmax}\left(\frac{\bm{Q}\bm{K}^T}{\sqrt{d_k}}\right)\bm{V} $$

import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt

class ScaledDotProductAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Projects the input to queries, keys and values with three linear layers
    and computes ``softmax(Q K^T / sqrt(d_k)) V``.

    Args:
        d_model: Feature size of the input.
        d_k: Feature size of the query and key projections.
        d_v: Feature size of the value projection (and of the output).
    """

    def __init__(self, d_model, d_k, d_v):
        super().__init__()
        self.d_k = d_k
        self.W_q = nn.Linear(d_model, d_k)
        self.W_k = nn.Linear(d_model, d_k)
        self.W_v = nn.Linear(d_model, d_v)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape (batch, seq_len, d_model).
            mask: Optional tensor broadcastable to the score matrix
                (batch, seq_len, seq_len). Positions where ``mask == 0``
                are excluded from attention.

        Returns:
            Tuple ``(output, attn_weights)`` of shapes (batch, seq_len, d_v)
            and (batch, seq_len, seq_len). A query whose keys are all masked
            receives all-zero weights and an all-zero output row rather
            than NaN.
        """
        Q = self.W_q(x)  # (batch, seq_len, d_k)
        K = self.W_k(x)  # (batch, seq_len, d_k)
        V = self.W_v(x)  # (batch, seq_len, d_v)

        # Scaled similarity between every query and every key
        scores = Q @ K.transpose(-2, -1) / (self.d_k ** 0.5)

        blocked = None
        if mask is not None:
            blocked = mask == 0
            scores = scores.masked_fill(blocked, float('-inf'))

        # Attention weights: each row is a distribution over the keys
        attn_weights = F.softmax(scores, dim=-1)

        if blocked is not None:
            # softmax over a row of only -inf is 0/0 = NaN; define the
            # attention of such fully masked queries as zero instead.
            fully_masked = blocked.all(dim=-1, keepdim=True)
            attn_weights = attn_weights.masked_fill(fully_masked, 0.0)

        # Weighted sum of the values
        output = attn_weights @ V
        return output, attn_weights

# --- Quick check: run self-attention on a random batch ---
torch.manual_seed(42)
d_model, d_k, d_v = 16, 8, 8
seq_len, batch_size = 5, 2

attention = ScaledDotProductAttention(d_model, d_k, d_v)
x = torch.randn(batch_size, seq_len, d_model)
output, weights = attention(x)

# Shapes of the input, the attended output and the weight matrix
print("入力:", x.shape)
print("出力:", output.shape)
print("Attention重み:", weights.shape)

# Weight matrix of the first batch element, rounded for readability
print("\nAttention重み（バッチ0）:")
print(weights[0].detach().numpy().round(3))

カスタムレイヤーを組み合わせたモデル

複数のカスタムレイヤーを組み合わせてモデルを構築します。

import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt

class CustomGatedLayer(nn.Module):
    """Linear layer whose ReLU activations are scaled by a learned sigmoid gate."""

    def __init__(self, in_features, out_features):
        super().__init__()
        # Two parallel projections of the same input: one carries the
        # values, the other decides how much of each value passes through.
        self.linear = nn.Linear(in_features, out_features)
        self.gate = nn.Linear(in_features, out_features)

    def forward(self, x):
        """Return ``relu(linear(x)) * sigmoid(gate(x))``, element-wise."""
        activations = self.linear(x).relu()
        # The sigmoid keeps every gate value between 0 and 1
        gate_values = self.gate(x).sigmoid()
        return activations * gate_values

class CustomModel(nn.Module):
    """Two stacked gated layers followed by a linear output head."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # input_dim -> hidden_dim -> hidden_dim -> output_dim
        self.gated1 = CustomGatedLayer(input_dim, hidden_dim)
        self.gated2 = CustomGatedLayer(hidden_dim, hidden_dim)
        self.output_layer = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Feed ``x`` through both gated layers, then the output head."""
        return self.output_layer(self.gated2(self.gated1(x)))

# --- Test on a regression task ---
torch.manual_seed(42)

# Synthetic data: a nonlinear function of three features plus Gaussian noise
n = 200
X = torch.randn(n, 3)
y_true = torch.sin(X[:, 0]) + 0.5 * X[:, 1] ** 2 - X[:, 2]
y = y_true + torch.randn(n) * 0.1

# Model and optimizer
model = CustomModel(input_dim=3, hidden_dim=32, output_dim=1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Full-batch training; the loss of every epoch is kept for plotting
losses = []
for epoch in range(500):
    # Drop only the trailing feature axis, (n, 1) -> (n,). A bare squeeze()
    # would also drop the batch axis for a single-sample batch and leave the
    # prediction with a different shape than the target.
    pred = model(X).squeeze(-1)
    loss = F.mse_loss(pred, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses.append(loss.item())

# Number of parameters and final training loss
total = sum(p.numel() for p in model.parameters())
print(f"総パラメータ数: {total}")
print(f"最終損失: {losses[-1]:.6f}")

# Learning curve
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(losses, 'b-')
ax.set_xlabel('Epoch')
ax.set_ylabel('MSE Loss')
ax.set_title('Training with Custom Gated Layers')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Print the module tree
print(f"\nモデル構造:\n{model}")

まとめ

本記事では、PyTorchでカスタムレイヤーを実装する方法を解説しました。

nn.Module を継承し、__init__ でパラメータを定義、forward で順伝播を実装する
nn.Module の属性として代入した nn.Parameter は、自動的にパラメータとして登録・管理される
extra_repr をオーバーライドすると、print(model) での表示をカスタマイズできる
カスタムレイヤーを組み合わせることで、ゲート機構やAttentionなど柔軟なアーキテクチャを実現できる

次のステップとして、以下の記事も参考にしてください。

機械学習と情報技術

PyTorchで独自(カスタム)レイヤーを実装する方法を解説

前提知識

nn.Moduleの基本構造

カスタム線形層の実装

カスタムScaled Dot-Product Attention

カスタムレイヤーを組み合わせたモデル

まとめ

ロジスティック回帰の理論と実装をわかりやすく解説

【初心者】覚えておきたいpytorchの基本(関数やクラス)

PyTorchで独自(カスタム)レイヤーを実装する方法を解説

前提知識

nn.Moduleの基本構造

カスタム線形層の実装

カスタムScaled Dot-Product Attention

カスタムレイヤーを組み合わせたモデル

まとめ

関連記事

Early Stoppingの理論 — L2正則化との等価性を導出する

データ拡張の手法一覧 — 画像・テキスト・時系列への適用

【PyTorch】CNNの仕組みと実装をわかりやすく解説

ロジスティック回帰の理論と実装をわかりやすく解説

【初心者】覚えておきたいpytorchの基本(関数やクラス)