From 1b209a456204bca4f73c786d36159d2d8d1fdfe7 Mon Sep 17 00:00:00 2001 From: Victor Mylle Date: Fri, 29 Dec 2023 09:03:09 +0000 Subject: [PATCH] Added training script to remotely train diffusion model --- src/losses/crps_metric.py | 31 ++++++++++- src/trainers/diffusion_trainer.py | 37 ++++++++++--- src/training_scripts/diffusion_training.py | 60 ++++++++++++++++++++++ 3 files changed, 120 insertions(+), 8 deletions(-) create mode 100644 src/training_scripts/diffusion_training.py diff --git a/src/losses/crps_metric.py b/src/losses/crps_metric.py index fea9a97..57fb4e4 100644 --- a/src/losses/crps_metric.py +++ b/src/losses/crps_metric.py @@ -43,4 +43,33 @@ class CRPSLoss(nn.Module): crps_per_sample = np.trapz(tiled_probs ** 2, imbalances, axis=-1) crps = np.mean(crps_per_sample) - return crps \ No newline at end of file + return crps + +def crps_from_samples(samples, targets): + """ + Compute the Continuous Ranked Probability Score (CRPS) from multi-day samples and targets + using a vectorized approach with PyTorch tensors. + + :param samples: (day, n_samples, n_timesteps) tensor of forecasted samples + :param targets: (day, n_timesteps) tensor of observed values + :return: (day, n_timesteps) tensor of CRPS for each timestep for each day + """ + days, n_samples, n_timesteps = samples.shape + + # Reshape targets to broadcast along the samples dimension (n_samples) + targets_reshaped = targets.unsqueeze(1) + + # Compute the absolute differences of forecasts and observations + abs_diff = torch.abs(samples - targets_reshaped) + # Compute the average of the absolute differences along the samples dimension + term1 = torch.mean(abs_diff, dim=1) + + # Compute the pairwise absolute differences between all samples for each day + pairwise_abs_diff = torch.abs(samples.unsqueeze(2) - samples.unsqueeze(1)) + # Compute the average of these differences along the sample dimensions + term2 = torch.mean(pairwise_abs_diff, dim=(1, 2)) / 2 + + # CRPS for each timestep for each day + crps = term1 - term2 + + return crps diff --git a/src/trainers/diffusion_trainer.py b/src/trainers/diffusion_trainer.py index caba111..f4dcfe7 100644 --- a/src/trainers/diffusion_trainer.py +++ b/src/trainers/diffusion_trainer.py @@ -2,7 +2,7 @@ from clearml import Task import torch import torch.nn as nn from torchinfo import summary -from tqdm import tqdm +from src.losses.crps_metric import crps_from_samples from src.data.preprocessing import DataProcessor from src.models.diffusion_model import DiffusionModel @@ -51,7 +51,7 @@ class DiffusionTrainer: model.eval() with torch.no_grad(): x = torch.randn(inputs.shape[0], self.ts_length).to(self.device) - for i in tqdm(reversed(range(1, self.noise_steps)), position=0): + for i in reversed(range(1, self.noise_steps)): t = (torch.ones(inputs.shape[0]) * i).long().to(self.device) predicted_noise = model(x, t, inputs) alpha = self.alpha[t][:, None] @@ -127,6 +127,9 @@ class DiffusionTrainer: running_loss /= len(train_loader.dataset) + if epoch % 20 == 0 and epoch != 0: + self.test(test_loader, epoch, task) + if task: task.get_logger().report_scalar( title=criterion.__class__.__name__, @@ -196,12 +199,32 @@ class DiffusionTrainer: plt.close() - def test(self, data_loader: torch.utils.data.DataLoader): + def test(self, data_loader: torch.utils.data.DataLoader, epoch: int, task: Task = None): + all_crps = [] for inputs, targets, _ in data_loader: inputs, targets = inputs.to(self.device), targets.to(self.device) - sample = self.sample(self.model, 10, inputs) + number_of_samples = 100 + sample = self.sample(self.model, number_of_samples, inputs) - # reduce sample from (batch_size, time_steps) to (batch_size / 10, time_steps) by taking mean of each 10 samples - sample = sample.view(-1, 10, self.ts_length) - sample = torch.mean(sample, dim=1) \ No newline at end of file + # reduce samples from (batch_size*number_of_samples, time_steps) to (batch_size, number_of_samples, time_steps) + samples_batched = sample.reshape(inputs.shape[0], number_of_samples, 96) + + # calculate crps + crps = crps_from_samples(samples_batched, targets) + crps_mean = crps.mean(axis=1) + + # add all values from crps_mean to all_crps + all_crps.extend(crps_mean.tolist()) + + all_crps = np.array(all_crps) + mean_crps = all_crps.mean() + + if task: + task.get_logger().report_scalar( + title="CRPS", + series='test', + value=mean_crps, + iteration=epoch + ) + diff --git a/src/training_scripts/diffusion_training.py b/src/training_scripts/diffusion_training.py new file mode 100644 index 0000000..83f4b85 --- /dev/null +++ b/src/training_scripts/diffusion_training.py @@ -0,0 +1,60 @@ +from clearml import Task +from src.data import DataProcessor, DataConfig +from src.trainers.trainer import Trainer +from src.utils.clearml import ClearMLHelper +from src.models import * +from src.losses import * +import torch +import numpy as np +from torch.nn import MSELoss, L1Loss +from datetime import datetime +import torch.nn as nn +from src.models.time_embedding_layer import TimeEmbedding +from src.models.diffusion_model import SimpleDiffusionModel +from src.trainers.diffusion_trainer import DiffusionTrainer + + +clearml_helper = ClearMLHelper(project_name="Thesis/NrvForecast") +task = clearml_helper.get_task(task_name="Diffusion Training") + +# execute remotely +task.execute_remotely(queue_name="default", exit_process=True) + + +#### Data Processor #### +data_config = DataConfig() +data_config.NRV_HISTORY = True +data_config.LOAD_HISTORY = True +data_config.LOAD_FORECAST = True + +data_config.WIND_FORECAST = True +data_config.WIND_HISTORY = True + +data_config.QUARTER = False +data_config.DAY_OF_WEEK = False + +data_config.NOMINAL_NET_POSITION = True + +data_config = Task.connect(data_config, name="data_features") + +data_processor = DataProcessor(data_config, path="", lstm=False) +data_processor.set_batch_size(8192) +data_processor.set_full_day_skip(True) + +inputDim = data_processor.get_input_size() + +model_parameters = { + "epochs": 5000, + "learning_rate": 0.0001, + "hidden_sizes": [512, 512, 512], + "time_dim": 64, +} + +model_parameters = task.connect(model_parameters, name="model_parameters") + +#### Model #### +model = SimpleDiffusionModel(96, model_parameters["hidden_sizes"], other_inputs_dim=inputDim[1], time_dim=model_parameters["time_dim"]) + +#### Trainer #### +trainer = DiffusionTrainer(model, data_processor, "cuda") +trainer.train(model_parameters["epochs"], model_parameters["learning_rate"], task) \ No newline at end of file