Problem of freeze metrics after first epoch

3 weeks ago 12
ARTICLE AD BOX

I encountered a problem with metrics fading after the first training epoch. During the first epoch, the model training proceeds normally. The loss metrics decrease, and the accuracy increases. The accuracy reaches 20% for training and 6% for validation. After that, the accuracy remains unchanged or even starts to decrease, and the loss increases for both training and validation.
My implemented 1D-CNN model:

class Model(nn.Module): def __init__(self, num_classes=8705, input_length=224): super(Model, self).__init__() self.act_fn = nn.LeakyReLU(negative_slope=0.05) self.conv1_a = nn.Conv1d(1, 32, kernel_size=1) self.bn1_a = nn.BatchNorm1d(32) self.conv1_b = nn.Conv1d(1, 32, kernel_size=3, padding=1) self.bn1_b = nn.BatchNorm1d(32) self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2) self.conv2_a = nn.Conv1d(64, 32, kernel_size=1) self.bn2_a = nn.BatchNorm1d(32) self.conv2_b = nn.Conv1d(64, 32, kernel_size=3, padding=1) self.bn2_b = nn.BatchNorm1d(32) self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2) self.conv3_a = nn.Conv1d(64, 32, kernel_size=1) self.bn3_a = nn.BatchNorm1d(32) self.conv3_b = nn.Conv1d(64, 32, kernel_size=3, padding=1) self.bn3_b = nn.BatchNorm1d(32) self.pool3 = nn.MaxPool1d(kernel_size=2, stride=2) self.conv4_a = nn.Conv1d(64, 32, kernel_size=1) self.bn4_a = nn.BatchNorm1d(32) self.conv4_b = nn.Conv1d(64, 32, kernel_size=3, padding=1) self.bn4_b = nn.BatchNorm1d(32) self.pool4 = nn.MaxPool1d(kernel_size=2, stride=2) self.conv5_a = nn.Conv1d(64, 32, kernel_size=1) self.bn5_a = nn.BatchNorm1d(32) self.conv5_b = nn.Conv1d(64, 32, kernel_size=3, padding=1) self.bn5_b = nn.BatchNorm1d(32) self.pool5 = nn.MaxPool1d(kernel_size=2, stride=2) self.final_conv = nn.Conv1d(64, num_classes, kernel_size=1) self.final_bn = nn.BatchNorm1d(num_classes) self.gap = nn.AdaptiveAvgPool1d(1) def forward(self, x): b1_a = self.act_fn(self.bn1_a(self.conv1_a(x))) b1_b = self.act_fn(self.bn1_b(self.conv1_b(x))) x = torch.cat([b1_a, b1_b], dim=1) # Concat по каналам (32+32=64) x = self.pool1(x) b2_a = self.act_fn(self.bn2_a(self.conv2_a(x))) b2_b = self.act_fn(self.bn2_b(self.conv2_b(x))) x = torch.cat([b2_a, b2_b], dim=1) x = self.pool2(x) b3_a = self.act_fn(self.bn3_a(self.conv3_a(x))) b3_b = self.act_fn(self.bn3_b(self.conv3_b(x))) x = torch.cat([b3_a, b3_b], dim=1) x = self.pool3(x) b4_a = self.act_fn(self.bn4_a(self.conv4_a(x))) b4_b = self.act_fn(self.bn4_b(self.conv4_b(x))) x = torch.cat([b4_a, b4_b], dim=1) x = self.pool4(x) b5_a = self.act_fn(self.bn5_a(self.conv5_a(x))) b5_b = self.act_fn(self.bn5_b(self.conv5_b(x))) x = torch.cat([b5_a, b5_b], dim=1) x = self.pool5(x) x = self.final_conv(x) x = self.final_bn(x) x = self.act_fn(x) x = self.gap(x) x = x.squeeze(-1) return x

The model takes input data in the form of a 1x224 vector. The model must work with 8705 classes. The data is implemented in two forms: 8705*3600 in training and 8705 in validation. The training set is also divided into 80/20 proportions of 3600, while maintaining 8705 classes. I use SGD, CrossEntropyLoss, and Step in training. LR = 0.01, Momentum = 0.9, weight_decay = 0.00005, lr_step_size = 10000, lr_gamma = 0.01

def train_epoch(self, epoch, total_epochs): self.model.train() running_loss = 0.0 correct = 0 total = 0 pbar = tqdm( self.train_loader, desc=f"Ep {epoch}/{total_epochs}", leave=False, ncols=80, bar_format='{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{postfix}]' ) for batch_idx, (data, targets) in enumerate(pbar): data, targets = data.to(self.device), targets.to(self.device) self.optimizer.zero_grad() outputs = self.model(data) loss = self.criterion(outputs, targets) loss.backward() torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0) self.optimizer.step() batch_loss = loss.item() _, predicted = torch.max(outputs.data, 1) batch_correct = (predicted == targets).sum().item() batch_total = targets.size(0) running_loss += batch_loss correct += batch_correct total += batch_total avg_loss = running_loss / (batch_idx + 1) avg_acc = 100 * correct / total pbar.set_postfix_str( f"Loss={avg_loss:.4f}, Acc={avg_acc:.2f}%, LR={self.optimizer.param_groups[0]['lr']:.1e}" ) self.history['batch_loss'].append(avg_loss) self.history['batch_acc'].append(avg_acc) return running_loss / len(self.train_loader), 100 * correct / total

Can you tell me what the problem is, because I’ve been struggling with this issue for a long time. I’ve tried switching from SGD to AdamW optimizer, and the metrics have improved significantly, but I find it suspicious.

Thanks

Read Entire Article