# Context (from the original question): `valid_loss_manual` is a manually
# computed batch-average validation loss, while `val_loss_epoch` is the value
# Lightning reports; the two always differ, and the per-step validation loss
# does not show on the (rich) progress bar.
#
# Root cause: `self.log(..., on_epoch=True)` aggregates step losses as a
# BATCH-SIZE-WEIGHTED mean, whereas `torch.stack(outputs).mean()` is an
# unweighted mean over batches — they diverge whenever batches are unequal
# (e.g. a smaller last batch). Fixed below by (a) passing `batch_size` to
# `self.log`, and (b) computing the manual average as a weighted mean, so both
# numbers agree. Note: the progress bar only updates `val_loss_step` while the
# validation loop itself is running — validation loss is never plotted per
# *training* step.


class RegressiveModule(pl.LightningModule):
    """Generic regression wrapper around a dynamically loaded sequence model.

    The concrete model class is imported by name from a sibling module
    (``snake_case.py`` file -> ``CamelCase`` class) and trained with an MSE
    loss between the predicted and target sequences.
    """

    def __init__(self, model_name, train_config, model_config):
        super().__init__()
        # `model_name` is excluded so a checkpoint does not pin the class to
        # a specific module-path string.
        self.save_hyperparameters(ignore=["model_name"])
        self.load_model(model_name, model_config)
        self.train_config = train_config
        self.criterion = torch.nn.MSELoss()
        # Per-step buffers; cleared at each epoch end to free memory.
        self.training_step_outputs = []
        self.validation_step_outputs = []
        self.test_step_outputs = []
        self.testTime = []
        self.predictions = []
        self.targets = []

    def load_model(self, model_name, model_config):
        """Import ``<package>.<model_name>`` and instantiate its CamelCase class.

        Model files must be named ``snake_case.py`` and contain a class named
        with the corresponding ``CamelCase``.
        """
        name = model_name
        camel_name = ''.join(i.capitalize() for i in name.split('_'))
        try:
            Model = getattr(
                importlib.import_module('.' + name, package=__package__),
                camel_name)
        # Narrowed from a bare `except:` so unrelated errors (e.g.
        # KeyboardInterrupt) are not swallowed; chain the cause for debugging.
        except (ImportError, AttributeError) as err:
            raise ValueError(
                f'Invalid Module File Name or Invalid Class Name {name}.{camel_name}!') from err
        self.model = self.instancialize(Model)

    def instancialize(self, Model, **other_args):
        """Instantiate ``Model``, filling constructor args from ``self.hparams``.

        Any keyword in ``other_args`` overrides the corresponding hparams value.
        """
        # inspect.getargspec() was removed in Python 3.11;
        # inspect.signature() is the supported replacement.
        class_args = list(inspect.signature(Model.__init__).parameters)[1:]
        inkeys = self.hparams.keys()
        args1 = {arg: getattr(self.hparams, arg) for arg in class_args if arg in inkeys}
        args1.update(other_args)
        return Model(**args1)

    def forward(self, img):
        return self.model(img)

    def training_step(self, batch, batch_idx):
        # NOTE(review): the old `print('in train loop')` was removed — it
        # interleaved with, and visually broke, the rich progress bar.
        signal, time_feat, target_seq, pattern_labels = batch
        water_pred_seq = self.model(signal, time_feat)
        loss = self.criterion(water_pred_seq, target_seq)
        # detach(): keeping the graph-attached loss in a list would retain the
        # whole autograd graph of every batch until epoch end.
        self.training_step_outputs.append(loss.detach())
        # batch_size= makes Lightning's epoch aggregation exact even when the
        # last batch is smaller.
        self.log("train_loss", loss, on_step=True, on_epoch=True,
                 prog_bar=True, logger=True, batch_size=target_seq.size(0))
        return {"loss": loss}

    def on_train_epoch_end(self, *arg, **kwargs):
        # Unweighted mean over batches (auxiliary metric only; the canonical
        # value is the logged `train_loss_epoch`).
        epoch_average = torch.stack(self.training_step_outputs).mean()
        self.log("training_epoch_average", epoch_average)
        self.training_step_outputs.clear()  # free memory

    def validation_step(self, batch, batch_idx):
        signal, time_feat, target_seq, pattern_labels = batch
        pred_seq = self.model(signal, time_feat)
        val_loss = self.criterion(pred_seq, target_seq)
        # Store (loss, batch_size) so the manual epoch average can be weighted
        # exactly like Lightning's `on_epoch=True` aggregation.
        self.validation_step_outputs.append((val_loss.detach(), target_seq.size(0)))
        # on_epoch=True aggregates the per-step losses into `val_loss_epoch`.
        self.log_dict({'val_loss': val_loss}, on_step=True, on_epoch=True,
                      prog_bar=True, logger=True, batch_size=target_seq.size(0))
        return {"val_loss": val_loss}

    def on_validation_epoch_end(self, *arg, **kwargs):
        # Batch-size-weighted mean: this matches Lightning's `val_loss_epoch`.
        # An unweighted torch.stack(...).mean() differs whenever batch sizes
        # are unequal — that was the source of the reported discrepancy.
        losses = torch.stack([loss for loss, _ in self.validation_step_outputs])
        counts = torch.tensor([n for _, n in self.validation_step_outputs],
                              dtype=losses.dtype, device=losses.device)
        epoch_average = (losses * counts).sum() / counts.sum()
        self.log("validation_epoch_average", epoch_average)
        # NOTE(review): the `print("valid_loss_manual", ...)` was removed — it
        # corrupted the rich progress-bar rendering, and the value now equals
        # the logged `val_loss_epoch` anyway.
        self.validation_step_outputs.clear()

    def configure_optimizers(self):
        """Adam + ReduceLROnPlateau monitoring `val_loss` once per epoch."""
        optimizer = torch.optim.Adam(
            self.parameters(),
            lr=self.train_config.get('learningRate', 1e-3),
            weight_decay=self.train_config.get('weight_decay', 1e-3),
            betas=(0.9, 0.999),
            eps=1e-08,
        )
        # Fallback defaults fixed: `patience` is an epoch count and `factor`
        # a decay fraction — the previous 1e-3 fallbacks were nonsensical for
        # both (explicit values from train_config are unaffected).
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            patience=self.train_config.get('patience', 10),
            factor=self.train_config.get('factor', 0.1),
        )
        # Modern Lightning scheduler format: `monitor`/`interval`/`frequency`
        # must live inside the `lr_scheduler` dict; at the top level they are
        # silently ignored by recent versions.
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "monitor": "val_loss",
                "interval": "epoch",
                "frequency": 1,
            },
        }
valid_loss_manual tensor(0.4408, device='cuda:0')Epoch 13/199 ━━━━━━━━━━━━━━━━━ 9/9 0:00:00 • 0:00:00 95.71it/s v_num: 273.000 train_loss_step: 0.539 val_loss_step: 0.501 val_loss_epoch: 0.421 train_loss_epoch: 0.313 in train loopin train loopin train loopin train loopin train loopin train loopin train loopin train loopin train loopvalid_loss_manual tensor(0.3893, device='cuda:0')Epoch 14/199 ━━━━━━━━━━━━━━━━━ 9/9 0:00:00 • 0:00:00 95.85it/s v_num: 273.000 train_loss_step: 0.489 val_loss_step: 0.388 val_loss_epoch: 0.390 train_loss_epoch: 0.309 in train loopin train loopin train loopin train loopin train loopin train loopin train loopin train loopin train loopvalid_loss_manual tensor(0.3756, device='cuda:0')Epoch 15/199 ━━━━━━━━━━━━━━━━━ 9/9 0:00:00 • 0:00:00 95.63it/s v_num: 273.000 train_loss_step: 0.465 val_loss_step: 0.420 val_loss_epoch: 0.361 train_loss_epoch: 0.306
|