Add training script to remotely train the diffusion model

This commit is contained in:
Victor Mylle
2023-12-29 09:03:09 +00:00
parent 3264b5ac53
commit 1b209a4562
3 changed files with 120 additions and 8 deletions

View File

@@ -43,4 +43,33 @@ class CRPSLoss(nn.Module):
crps_per_sample = np.trapz(tiled_probs ** 2, imbalances, axis=-1)
crps = np.mean(crps_per_sample)
return crps
return crps
def crps_from_samples(samples, targets):
    """
    Compute the Continuous Ranked Probability Score (CRPS) from multi-day samples and
    targets using a vectorized approach with PyTorch tensors.

    Uses the empirical ensemble estimator

        CRPS = E|X - y| - 0.5 * E|X - X'|

    where X, X' are independent forecast samples and y is the observation.

    :param samples: (day, n_samples, n_timesteps) tensor of forecasted samples
    :param targets: (day, n_timesteps) tensor of observed values
    :return: (day, n_timesteps) tensor of CRPS for each timestep for each day
    """
    # Reshape targets to (day, 1, n_timesteps) so they broadcast along the
    # samples dimension.
    targets_reshaped = targets.unsqueeze(1)
    # term1 = E|X - y|: mean absolute difference between forecasts and observation.
    term1 = torch.mean(torch.abs(samples - targets_reshaped), dim=1)
    # term2 = 0.5 * E|X - X'|: mean pairwise absolute difference between samples.
    # NOTE: this materializes a (day, n_samples, n_samples, n_timesteps) tensor,
    # so memory grows quadratically in n_samples.
    pairwise_abs_diff = torch.abs(samples.unsqueeze(2) - samples.unsqueeze(1))
    term2 = torch.mean(pairwise_abs_diff, dim=(1, 2)) / 2
    # CRPS for each timestep for each day.
    return term1 - term2

View File

@@ -2,7 +2,7 @@ from clearml import Task
import torch
import torch.nn as nn
from torchinfo import summary
from tqdm import tqdm
from src.losses.crps_metric import crps_from_samples
from src.data.preprocessing import DataProcessor
from src.models.diffusion_model import DiffusionModel
@@ -51,7 +51,7 @@ class DiffusionTrainer:
model.eval()
with torch.no_grad():
x = torch.randn(inputs.shape[0], self.ts_length).to(self.device)
for i in tqdm(reversed(range(1, self.noise_steps)), position=0):
for i in reversed(range(1, self.noise_steps)):
t = (torch.ones(inputs.shape[0]) * i).long().to(self.device)
predicted_noise = model(x, t, inputs)
alpha = self.alpha[t][:, None]
@@ -127,6 +127,9 @@ class DiffusionTrainer:
running_loss /= len(train_loader.dataset)
if epoch % 20 == 0 and epoch != 0:
self.test(test_loader, epoch, task)
if task:
task.get_logger().report_scalar(
title=criterion.__class__.__name__,
@@ -196,12 +199,32 @@ class DiffusionTrainer:
plt.close()
# NOTE(review): this span is extracted from a unified diff without +/- markers,
# so PRE-change (removed) and POST-change (added) lines appear interleaved.
# The next line is the PRE-change signature, removed by this commit.
def test(self, data_loader: torch.utils.data.DataLoader):
# POST-change signature: adds `epoch` (for scalar logging) and an optional
# ClearML `task` used to report the mean CRPS.
def test(self, data_loader: torch.utils.data.DataLoader, epoch: int, task: Task = None):
# Accumulates per-day mean CRPS values across all batches of the loader.
all_crps = []
for inputs, targets, _ in data_loader:
inputs, targets = inputs.to(self.device), targets.to(self.device)
# PRE-change line (removed): drew a fixed 10 samples per input.
sample = self.sample(self.model, 10, inputs)
# POST-change: draw 100 samples per input for the CRPS ensemble estimate.
number_of_samples = 100
sample = self.sample(self.model, number_of_samples, inputs)
# PRE-change lines (removed): collapsed each group of 10 samples to its mean.
# reduce sample from (batch_size, time_steps) to (batch_size / 10, time_steps) by taking mean of each 10 samples
sample = sample.view(-1, 10, self.ts_length)
sample = torch.mean(sample, dim=1)
# POST-change: keep all samples, reshaped per batch element.
# NOTE(review): 96 is a hard-coded timestep count here — presumably equals
# self.ts_length; confirm against the trainer's configuration.
# reduce samples from (batch_size*number_of_samples, time_steps) to (batch_size, number_of_samples, time_steps)
samples_batched = sample.reshape(inputs.shape[0], number_of_samples, 96)
# calculate crps
crps = crps_from_samples(samples_batched, targets)
# Average CRPS over timesteps -> one value per day in the batch.
crps_mean = crps.mean(axis=1)
# add all values from crps_mean to all_crps
all_crps.extend(crps_mean.tolist())
all_crps = np.array(all_crps)
mean_crps = all_crps.mean()
# Report the dataset-wide mean CRPS to ClearML when a task was supplied.
if task:
task.get_logger().report_scalar(
title="CRPS",
series='test',
value=mean_crps,
iteration=epoch
)

View File

@@ -0,0 +1,60 @@
from clearml import Task
from src.data import DataProcessor, DataConfig
from src.trainers.trainer import Trainer
from src.utils.clearml import ClearMLHelper
from src.models import *
from src.losses import *
import torch
import numpy as np
from torch.nn import MSELoss, L1Loss
from datetime import datetime
import torch.nn as nn
from src.models.time_embedding_layer import TimeEmbedding
from src.models.diffusion_model import SimpleDiffusionModel
from src.trainers.diffusion_trainer import DiffusionTrainer

# Register this run with ClearML, then hand execution off to a remote worker:
# everything below runs on the "default" queue, and the local process exits.
clearml_helper = ClearMLHelper(project_name="Thesis/NrvForecast")
task = clearml_helper.get_task(task_name="Diffusion Training")
# execute remotely
task.execute_remotely(queue_name="default", exit_process=True)

#### Data Processor ####
# Feature flags selecting which input series the DataProcessor builds.
data_config = DataConfig()
data_config.NRV_HISTORY = True
data_config.LOAD_HISTORY = True
data_config.LOAD_FORECAST = True
data_config.WIND_FORECAST = True
data_config.WIND_HISTORY = True
data_config.QUARTER = False
data_config.DAY_OF_WEEK = False
data_config.NOMINAL_NET_POSITION = True
# BUG FIX: `connect` is an instance method — calling it on the Task class
# would pass data_config as `self`. Use the task instance, consistent with
# the model_parameters connection below.
data_config = task.connect(data_config, name="data_features")
data_processor = DataProcessor(data_config, path="", lstm=False)
data_processor.set_batch_size(8192)
data_processor.set_full_day_skip(True)
inputDim = data_processor.get_input_size()

# Hyperparameters, connected to ClearML so the remote run can override them.
model_parameters = {
    "epochs": 5000,
    "learning_rate": 0.0001,
    "hidden_sizes": [512, 512, 512],
    "time_dim": 64,
}
model_parameters = task.connect(model_parameters, name="model_parameters")

#### Model ####
# 96 = timesteps per day (15-minute resolution) predicted by the model.
model = SimpleDiffusionModel(96, model_parameters["hidden_sizes"], other_inputs_dim=inputDim[1], time_dim=model_parameters["time_dim"])

#### Trainer ####
trainer = DiffusionTrainer(model, data_processor, "cuda")
trainer.train(model_parameters["epochs"], model_parameters["learning_rate"], task)