
8th solution

617. LEAP - Atmospheric Physics using AI (ClimSim) | leap-atmospheric-physics-ai-climsim

Start: 2024-04-18  End: 2024-07-15  |  Weather forecasting  |  Data & algorithm competition
8th Place Solution - LEAP Atmospheric Physics AI ClimSim
Author: heng (Grandmaster)
Published: 2024-07-31
Competition rank: 8th

8th Place Solution

First of all, I would like to thank the organizers and Kaggle for hosting this competition. The quality of the competition data was excellent. Although there were some issues along the way, we ultimately reached a result that most participants can be satisfied with.

This is my first solo gold medal, and it also made me a Competitions Grandmaster. The seven-year journey has been long and exciting.

Solution Summary

My solution is quite simple: it is essentially a seq2seq model built on a BiLSTM.

(bs, 60, 25) --> seq2seq --> (bs, 60, 14) --> (bs, 368)
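The last step of that pipeline, mapping the per-level decoder output (bs, 60, 14) to the 368-dimensional submission vector, works as follows: the first 6 of the 14 channels are vertically resolved targets (6 × 60 = 360 values), and the remaining 8 are scalar targets averaged over the 60 levels. A minimal sketch of just that reshape:

```python
import torch

bs = 4
x = torch.randn(bs, 60, 14)  # per-level decoder output: 60 levels x 14 channels

# Channels 0-5: vertically resolved targets, flattened to (bs, 6 * 60) = (bs, 360)
o_s = x[:, :, :6].permute(0, 2, 1).reshape(bs, 360)
# Channels 6-13: scalar targets, averaged over the 60 levels -> (bs, 8)
o_g = x[:, :, 6:].mean(dim=1)

out = torch.cat([o_s, o_g], dim=1)
print(out.shape)  # torch.Size([4, 368])
```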

(Model architecture diagram)

Models

  • Validated on sample data from the last 6 months
Model                    CV (cross-validation)  LB (leaderboard)
BiLSTM (layers=6)        0.7844                 0.7812
BiGRU (layers=8)         0.7835                 0.7802
BiLSTM + Transformer     0.7858                 0.7821
BiLSTM + Attention       0.7865                 0.7834
BiLSTM + TCN             0.7855                 0.7832
BiLSTM + CNN             0.7842                 0.7821
Ensemble over models     0.7923                 0.7890
Ensemble over targets    0.7933                 0.7884
  • Base BiLSTM model:
import torch
import torch.nn as nn

class LeapModel(nn.Module):
    def __init__(self,
                 input_size,
                 seq_len,
                 hidden_size,
                 output_size,
                 num_layers=1,
                 bidirectional=False,
                 dropout=.3,
                 hidden_layers=[128, 256]):

        super().__init__()
        self.input_size = input_size
        self.seq_len = seq_len
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.output_size = output_size

        self.rnn = nn.LSTM(input_size=input_size,
                           hidden_size=hidden_size,
                           num_layers=num_layers,
                           bidirectional=bidirectional,
                           batch_first=True,
                           dropout=dropout)

        if hidden_layers and len(hidden_layers):
            first_layer  = nn.Linear(hidden_size*2 if bidirectional else hidden_size, hidden_layers[0])
            self.hidden_layers = nn.ModuleList(
                [first_layer] + \
                [nn.Linear(hidden_layers[i], hidden_layers[i+1]) for i in range(len(hidden_layers) - 1)]
            )
            for layer in self.hidden_layers:
                nn.init.kaiming_normal_(layer.weight.data)
            self.intermediate_layer = nn.Linear(hidden_layers[-1], self.input_size)
            self.output_layer = nn.Linear(hidden_layers[-1], output_size)
            nn.init.kaiming_normal_(self.output_layer.weight.data)
        else:
            self.hidden_layers = []
            self.intermediate_layer = nn.Linear(hidden_size*2 if bidirectional else hidden_size, self.input_size)
            self.output_layer = nn.Linear(hidden_size*2 if bidirectional else hidden_size, output_size)
            nn.init.kaiming_normal_(self.output_layer.weight.data)

        self.activation_fn = torch.nn.GELU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        outputs, hidden = self.rnn(x)

        x = self.dropout(self.activation_fn(outputs))
        for hidden_layer in self.hidden_layers:
            x = self.activation_fn(hidden_layer(x))
            x = self.dropout(x)
        x = self.output_layer(x)

        # (bs, 60, 14) -> (bs, 368)
        o_s = x[:, :, :6]
        o_s = o_s.permute(0,2,1).reshape(-1,360)
        o_g = x[:, :, 6:]
        o_g = o_g.mean(dim=1)
        out = torch.cat([o_s, o_g], dim=1)

        return out

input_size = 25
output_size = 14
seq_len = 60

hidden_size = 256
hidden_layers = [256, 512]
num_layers = 6
dropout = 0.1

model = LeapModel(
    input_size=input_size,
    seq_len=seq_len,
    hidden_size=hidden_size,
    output_size=output_size,
    num_layers=num_layers,
    hidden_layers=hidden_layers,
    dropout=dropout,
    bidirectional=True,
).to(device)

Reference: https://www.kaggle.com/code/brandenkmurray/seq2seq-rnn-with-gru

  • BiLSTM model with a Transformer encoder
class LeapModel(nn.Module):
    def __init__(self,
                 input_size,
                 seq_len,
                 hidden_size,
                 output_size,
                 num_layers=1,
                 bidirectional=False,
                 dropout=0.3,
                 hidden_layers=[128, 256],
                 nhead=8,
                 num_transformer_layers=2):

        super().__init__()
        self.input_size = input_size
        self.seq_len = seq_len
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.output_size = output_size

        # LSTM encoder
        self.rnn = nn.LSTM(input_size=input_size,
                           hidden_size=hidden_size,
                           num_layers=num_layers,
                           bidirectional=bidirectional,
                           batch_first=True,
                           dropout=dropout)

        # Transformer encoder; batch_first=True so it accepts the LSTM's (bs, seq, feat) output
        transformer_input_size = hidden_size * 2 if bidirectional else hidden_size
        self.transformer_layer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=transformer_input_size, nhead=nhead,
                                       dropout=dropout, batch_first=True),
            num_layers=num_transformer_layers
        )

        # Fully connected head
        if hidden_layers and len(hidden_layers):
            first_layer = nn.Linear(transformer_input_size, hidden_layers[0])
            self.hidden_layers = nn.ModuleList(
                [first_layer] + \
                [nn.Linear(hidden_layers[i], hidden_layers[i+1]) for i in range(len(hidden_layers) - 1)]
            )
            for layer in self.hidden_layers:
                nn.init.kaiming_normal_(layer.weight.data)
            self.intermediate_layer = nn.Linear(hidden_layers[-1], self.input_size)
            self.output_layer = nn.Linear(hidden_layers[-1], output_size)
            nn.init.kaiming_normal_(self.output_layer.weight.data)
        else:
            self.hidden_layers = []
            self.intermediate_layer = nn.Linear(transformer_input_size, self.input_size)
            self.output_layer = nn.Linear(transformer_input_size, output_size)
            nn.init.kaiming_normal_(self.output_layer.weight.data)

        self.activation_fn = torch.nn.GELU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # LSTM encoder
        lstm_output, _ = self.rnn(x)

        # Transformer encoder
        transformer_output = self.transformer_layer(lstm_output)

        # Activation and dropout
        x = self.dropout(self.activation_fn(transformer_output))
        
        # Fully connected head
        for hidden_layer in self.hidden_layers:
            x = self.activation_fn(hidden_layer(x))
            x = self.dropout(x)
        x = self.output_layer(x)

        # Reshape to (bs, 368)
        o_s = x[:, :, :6]
        o_s = o_s.permute(0, 2, 1).reshape(-1, 360)
        o_g = x[:, :, 6:]
        o_g = o_g.mean(dim=1)
        out = torch.cat([o_s, o_g], dim=1)

        return out

input_size = 25
output_size = 14
seq_len = 60

hidden_size = 256
hidden_layers = [256, 512]
num_layers = 6
dropout = 0.1
nhead = 8
num_transformer_layers = 1

model = LeapModel(
    input_size=input_size,
    seq_len=seq_len,
    hidden_size=hidden_size,
    output_size=output_size,
    num_layers=num_layers,
    hidden_layers=hidden_layers,
    dropout=dropout,
    bidirectional=True,
    nhead=nhead,
    num_transformer_layers=num_transformer_layers
).to(device)
  • BiLSTM model with a TCN
class TCNBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, dilation):
        super(TCNBlock, self).__init__()
        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, 
                              padding=(kernel_size-1) * dilation // 2, dilation=dilation)
        self.bn = nn.BatchNorm1d(out_channels)
        self.activation_fn = nn.GELU()

    def forward(self, x):
        return self.activation_fn(self.bn(self.conv(x)))

class LeapModel(nn.Module):
    def __init__(self,
                 input_size,
                 seq_len,
                 hidden_size,
                 output_size,
                 num_layers=1,
                 bidirectional=False,
                 dropout=0.3):

        super().__init__()
        self.input_size = input_size
        self.seq_len = seq_len
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.output_size = output_size

        # LSTM encoder
        self.rnn = nn.LSTM(input_size=input_size,
                           hidden_size=hidden_size,
                           num_layers=num_layers,
                           bidirectional=bidirectional,
                           batch_first=True,
                           dropout=dropout)

        self.se = nn.Sequential(
            nn.Linear(hidden_size*2, hidden_size//2),
            nn.GELU(),
            nn.Linear(hidden_size//2, hidden_size*2),
            nn.Sigmoid()
        )
        
        self.tcn = nn.Sequential(
            TCNBlock(hidden_size*2, hidden_size*2, kernel_size=3, dilation=1),
            TCNBlock(hidden_size*2, hidden_size*2, kernel_size=3, dilation=2),
            TCNBlock(hidden_size*2, hidden_size*2, kernel_size=3, dilation=4),
            TCNBlock(hidden_size*2, hidden_size*2, kernel_size=3, dilation=8),
        )
        
        self.fc = nn.Linear(hidden_size*2, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # BiLSTM encoder
        outputs, _ = self.rnn(x)
        
        se_weights = self.se(torch.mean(outputs, dim=1)).unsqueeze(1)
        outputs = outputs * se_weights
        
        tcn_input = outputs.permute(0, 2, 1)
        tcn_output = self.tcn(tcn_input)
        tcn_output = tcn_output.permute(0, 2, 1)
        
        x = self.dropout(tcn_output)
        x = self.fc(x)

        # Reshape to (bs, 368)
        o_s = x[:, :, :6]
        o_s = o_s.permute(0, 2, 1).reshape(-1, 360)
        o_g = x[:, :, 6:]
        o_g = o_g.mean(dim=1)
        out = torch.cat([o_s, o_g], dim=1)  # (bs,368)

        return out

input_size = 25
output_size = 14
seq_len = 60

hidden_size = 256
num_layers = 6
dropout = 0.1

model = LeapModel(
    input_size=input_size,
    seq_len=seq_len,
    hidden_size=hidden_size,
    output_size=output_size,
    num_layers=num_layers,
    dropout=dropout,
    bidirectional=True,
).to(device)
  • BiLSTM model with Attention
class LeapModel(nn.Module):
    def __init__(self,
                 input_size,
                 seq_len,
                 hidden_size,
                 output_size,
                 num_layers=1,
                 bidirectional=False,
                 dropout=.3,
                 hidden_layers=[128, 256]):

        super().__init__()
        self.input_size = input_size
        self.seq_len = seq_len
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.output_size = output_size
        self.rnn = nn.LSTM(input_size=input_size,
                           hidden_size=hidden_size,
                           num_layers=num_layers,
                           bidirectional=bidirectional,
                           batch_first=True,
                           dropout=dropout)

        self.attention = nn.MultiheadAttention(embed_dim=hidden_size*2 if bidirectional else hidden_size,
                                               num_heads=8,
                                               batch_first=True)

        if hidden_layers and len(hidden_layers):
            first_layer  = nn.Linear(hidden_size*2 if bidirectional else hidden_size, hidden_layers[0])
            self.hidden_layers = nn.ModuleList(
                [first_layer] + \
                [nn.Linear(hidden_layers[i], hidden_layers[i+1]) for i in range(len(hidden_layers) - 1)]
            )
            for layer in self.hidden_layers:
                nn.init.kaiming_normal_(layer.weight.data)
            self.intermediate_layer = nn.Linear(hidden_layers[-1], self.input_size)
            self.output_layer = nn.Linear(hidden_layers[-1], output_size)
            nn.init.kaiming_normal_(self.output_layer.weight.data)
        else:
            self.hidden_layers = []
            self.intermediate_layer = nn.Linear(hidden_size*2 if bidirectional else hidden_size, self.input_size)
            self.output_layer = nn.Linear(hidden_size*2 if bidirectional else hidden_size, output_size)
            nn.init.kaiming_normal_(self.output_layer.weight.data)

        self.activation_fn = torch.nn.GELU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        outputs, hidden = self.rnn(x)

        # nn.MultiheadAttention was created with batch_first=True,
        # so it consumes the LSTM's (batch, seq, feat) output directly
        attn_output, _ = self.attention(outputs, outputs, outputs)

        x = self.dropout(self.activation_fn(attn_output))
        for hidden_layer in self.hidden_layers:
            x = self.activation_fn(hidden_layer(x))
            x = self.dropout(x)
        x = self.output_layer(x)

        # (bs, 60, 14) -> (bs, 368)
        o_s = x[:, :, :6]
        o_s = o_s.permute(0,2,1).reshape(-1,360)
        o_g = x[:, :, 6:]
        o_g = o_g.mean(dim=1)
        out = torch.cat([o_s, o_g], dim=1)

        return out

input_size = 25
output_size = 14
seq_len = 60

hidden_size = 256
hidden_layers = [256, 512]
num_layers = 6
dropout = 0.1

model = LeapModel(
    input_size=input_size,
    seq_len=seq_len,
    hidden_size=hidden_size,
    output_size=output_size,
    num_layers=num_layers,
    hidden_layers=hidden_layers,
    dropout=dropout,
    bidirectional=True,
).to(device)

Dataset

  1. Download all of the 0001-02 through 0009-01 (low-resolution) data from Hugging Face.
  2. Use 0001-02 through 0008-06 as the training set and 0008-07 through 0009-01 as the validation set (subsampled to about 625,000 rows).
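That split can be written as a small helper. The `month_range` function and the `YYYY-MM` naming below are my own sketch based on the ranges quoted above, not the author's code:

```python
def month_range(start, end):
    """Enumerate simulation-month labels from `start` to `end` inclusive,
    using the "YYYY-MM" format of the ClimSim low-res folders."""
    sy, sm = map(int, start.split("-"))
    ey, em = map(int, end.split("-"))
    months = []
    y, m = sy, sm
    while (y, m) <= (ey, em):
        months.append(f"{y:04d}-{m:02d}")
        m += 1
        if m > 12:
            y, m = y + 1, 1
    return months

all_months = month_range("0001-02", "0009-01")    # 96 months total
train_months = month_range("0001-02", "0008-06")  # training set
valid_months = month_range("0008-07", "0009-01")  # validation set
```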

Some Training Details

  1. Loss: nn.SmoothL1Loss(reduction='mean') (0.005-0.008 better than MSE)
  2. Scheduler: get_cosine_schedule_with_warmup
  3. Activation: GELU (0.002-0.004 better than ReLU)
  4. Hardware: 4x RTX 4090 with 360 GB RAM; with 7.5 years of training data, one epoch took about 1 hour
(Training monitor screenshot)
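A minimal sketch of the loss and schedule listed above. The author used Hugging Face's `get_cosine_schedule_with_warmup`; the `LambdaLR` below only reproduces its shape (linear warmup, then cosine decay) in pure PyTorch, and the stand-in model and step counts are illustrative:

```python
import math
import torch
import torch.nn as nn

model = nn.Linear(25, 14)  # stand-in for the real LeapModel
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

num_training_steps, num_warmup_steps = 1000, 50  # illustrative values

def lr_lambda(step):
    # Linear warmup to the base lr, then cosine decay to 0
    if step < num_warmup_steps:
        return step / max(1, num_warmup_steps)
    progress = (step - num_warmup_steps) / max(1, num_training_steps - num_warmup_steps)
    return 0.5 * (1.0 + math.cos(math.pi * progress))

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

criterion = nn.SmoothL1Loss(reduction="mean")  # reported 0.005-0.008 better than MSE
loss = criterion(model(torch.randn(8, 25)), torch.randn(8, 14))
```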

Post-processing

# Zero out the targets that the metric does not score (weight == 0)
targets_unpredictable = []
for target in weights:
    if weights[target] == 0.:
        targets_unpredictable.append(target)
for target in targets_unpredictable:
    df_pred[target] = 0.
# For q0002 levels 12-27, replace the model output with the analytic tendency
# -state * weight / 1200, i.e. the tendency that zeroes the state over the 1200 s timestep
for target in [f'ptend_q0002_{i}' for i in range(12, 28)]:
    df_pred[target] = -df_test[target.replace("ptend", "state")] * weights[target] / 1200.

Reference: https://www.kaggle.com/competitions/leap-atmospheric-physics-ai-climsim/discussion/502484

Ensemble

  1. Model ensemble: w0 * pred0 + w1 * pred1 + ... + w5 * pred5
  2. Target ensemble:
from tqdm import tqdm
from sklearn.metrics import r2_score

# For each target column, rank the candidate models by validation R^2
# and keep the indices of the top 4
selects = []
for idx_t, target in tqdm(enumerate(TARGETCOLS), total=len(TARGETCOLS)):
    di = {}
    for idx_p, prob in enumerate(probs):
        di[idx_p] = r2_score(df_valid[target], probs[idx_p][:, idx_t])
    selects.append(sorted(di, key=di.get, reverse=True)[:4])
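How `selects` is then consumed is not shown; a plausible completion (my assumption, not the author's code) averages, for each target column, only the top-4 models picked above. With random placeholders standing in for `probs` and `selects`:

```python
import numpy as np

# 6 candidate models, 100 validation rows, 368 target columns (illustrative sizes)
n_models, n_rows, n_targets = 6, 100, 368
rng = np.random.default_rng(0)
probs = [rng.normal(size=(n_rows, n_targets)) for _ in range(n_models)]
selects = [[0, 1, 2, 3]] * n_targets  # placeholder for the R^2-ranked model indices

# Per-target blend: average only the selected models' predictions for that column
blend = np.zeros((n_rows, n_targets))
for idx_t in range(n_targets):
    blend[:, idx_t] = np.mean([probs[i][:, idx_t] for i in selects[idx_t]], axis=0)
```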

What Didn't Work

  1. Training on the full 8-year dataset for a fixed number of epochs, without validation.
  2. Data augmentation: masking 10% of the inputs, and TTA (test-time augmentation).