Added trainer for Diffusion model
This commit is contained in:
207
src/trainers/diffusion_trainer.py
Normal file
207
src/trainers/diffusion_trainer.py
Normal file
@@ -0,0 +1,207 @@
|
||||
from clearml import Task
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torchinfo import summary
|
||||
from tqdm import tqdm
|
||||
from src.data.preprocessing import DataProcessor
|
||||
|
||||
from src.models.diffusion_model import DiffusionModel
|
||||
import numpy as np
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
import matplotlib.patches as mpatches
|
||||
|
||||
class DiffusionTrainer:
    """Trains a conditional DDPM-style diffusion model on fixed-length time series.

    Holds the linear beta noise schedule and the derived cumulative alpha
    products used by both the forward (noising) and reverse (sampling)
    processes.
    """

    def __init__(
        self,
        model: nn.Module,
        data_processor: DataProcessor,
        device: torch.device,
        noise_steps: int = 1000,
        beta_start: float = 1e-4,
        beta_end: float = 0.02,
        ts_length: int = 96,
    ):
        """
        Args:
            model: noise-prediction network; called as model(x_t, t, condition).
            data_processor: provides train/test loaders via get_dataloaders().
            device: device the schedule tensors and model inputs live on.
            noise_steps: number of diffusion steps T (default 1000).
            beta_start: first value of the linear beta schedule.
            beta_end: last value of the linear beta schedule.
            ts_length: length of the generated time series (default 96).
        """
        self.model = model
        self.device = device
        self.data_processor = data_processor

        # Diffusion hyper-parameters; previously hard-coded, now overridable
        # via backward-compatible keyword defaults.
        self.noise_steps = noise_steps
        self.beta_start = beta_start
        self.beta_end = beta_end
        self.ts_length = ts_length

        # Linear noise schedule and cumulative products (DDPM, Ho et al. 2020).
        self.beta = torch.linspace(self.beta_start, self.beta_end, self.noise_steps).to(self.device)
        self.alpha = 1. - self.beta
        self.alpha_hat = torch.cumprod(self.alpha, dim=0)
def noise_time_series(self, x: torch.tensor, t: int):
|
||||
""" Add noise to time series
|
||||
Args:
|
||||
x (torch.tensor): shape (batch_size, time_steps)
|
||||
t (int): index of time step
|
||||
"""
|
||||
sqrt_alpha_hat = torch.sqrt(self.alpha_hat[t])[:, None]
|
||||
sqrt_one_minus_alpha_hat = torch.sqrt(1. - self.alpha_hat[t])[:, None]
|
||||
noise = torch.randn_like(x)
|
||||
return sqrt_alpha_hat * x + sqrt_one_minus_alpha_hat * noise, noise
|
||||
|
||||
def sample_timesteps(self, n: int):
|
||||
""" Sample timesteps for noise
|
||||
Args:
|
||||
n (int): number of samples
|
||||
"""
|
||||
return torch.randint(low=1, high=self.noise_steps, size=(n,))
|
||||
|
||||
def sample(self, model: DiffusionModel, n: int, inputs: torch.tensor):
|
||||
inputs = inputs.repeat(n, 1).to(self.device)
|
||||
model.eval()
|
||||
with torch.no_grad():
|
||||
x = torch.randn(inputs.shape[0], self.ts_length).to(self.device)
|
||||
for i in tqdm(reversed(range(1, self.noise_steps)), position=0):
|
||||
t = (torch.ones(inputs.shape[0]) * i).long().to(self.device)
|
||||
predicted_noise = model(x, t, inputs)
|
||||
alpha = self.alpha[t][:, None]
|
||||
alpha_hat = self.alpha_hat[t][:, None]
|
||||
beta = self.beta[t][:, None]
|
||||
|
||||
if i > 1:
|
||||
noise = torch.randn_like(x)
|
||||
else:
|
||||
noise = torch.zeros_like(x)
|
||||
|
||||
x = 1/torch.sqrt(alpha) * (x-((1-alpha) / (torch.sqrt(1 - alpha_hat))) * predicted_noise) + torch.sqrt(beta) * noise
|
||||
model.train()
|
||||
return x
|
||||
|
||||
def random_samples(self, train: bool = True, num_samples: int = 10):
|
||||
train_loader, test_loader = self.data_processor.get_dataloaders(
|
||||
predict_sequence_length=96
|
||||
)
|
||||
|
||||
if train:
|
||||
loader = train_loader
|
||||
else:
|
||||
loader = test_loader
|
||||
|
||||
indices = np.random.randint(0, len(loader.dataset) - 1, size=num_samples)
|
||||
return indices
|
||||
|
||||
def init_clearml_task(self, task):
|
||||
task.add_tags(self.model.__class__.__name__)
|
||||
task.add_tags(self.__class__.__name__)
|
||||
|
||||
input_data = torch.randn(1024, 96).to(self.device)
|
||||
time_steps = torch.randn(1024).long().to(self.device)
|
||||
other_input_data = torch.randn(1024, self.model.other_inputs_dim).to(self.device)
|
||||
|
||||
task.set_configuration_object("model", str(summary(self.model, input_data=[input_data, time_steps, other_input_data])))
|
||||
|
||||
self.data_processor = task.connect(self.data_processor, name="data_processor")
|
||||
|
||||
def train(self, epochs: int, learning_rate: float, task: Task = None):
|
||||
optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate)
|
||||
criterion = nn.MSELoss()
|
||||
self.model.to(self.device)
|
||||
|
||||
if task:
|
||||
self.init_clearml_task(task)
|
||||
|
||||
train_loader, test_loader = self.data_processor.get_dataloaders(
|
||||
predict_sequence_length=self.ts_length
|
||||
)
|
||||
|
||||
train_sample_indices = self.random_samples(train=True, num_samples=10)
|
||||
test_sample_indices = self.random_samples(train=False, num_samples=10)
|
||||
|
||||
for epoch in range(epochs):
|
||||
running_loss = 0.0
|
||||
for i, k in enumerate(train_loader):
|
||||
time_series, base_pattern = k[1], k[0]
|
||||
time_series = time_series.to(self.device)
|
||||
base_pattern = base_pattern.to(self.device)
|
||||
|
||||
t = self.sample_timesteps(time_series.shape[0]).to(self.device)
|
||||
x_t, noise = self.noise_time_series(time_series, t)
|
||||
predicted_noise = self.model(x_t, t, base_pattern)
|
||||
loss = criterion(predicted_noise, noise)
|
||||
|
||||
running_loss += loss.item()
|
||||
|
||||
optimizer.zero_grad()
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
|
||||
running_loss /= len(train_loader.dataset)
|
||||
|
||||
if task:
|
||||
task.get_logger().report_scalar(
|
||||
title=criterion.__class__.__name__,
|
||||
series='train',
|
||||
iteration=epoch,
|
||||
value=loss.item(),
|
||||
)
|
||||
|
||||
if epoch % 100 == 0 and epoch != 0:
|
||||
self.debug_plots(task, True, train_loader, train_sample_indices, epoch)
|
||||
self.debug_plots(task, False, test_loader, test_sample_indices, epoch)
|
||||
|
||||
if task:
|
||||
task.close()
|
||||
|
||||
|
||||
def debug_plots(self, task, training: bool, data_loader, sample_indices, epoch):
|
||||
for i, idx in enumerate(sample_indices):
|
||||
features, target, _ = data_loader.dataset[idx]
|
||||
|
||||
features = features.to(self.device)
|
||||
|
||||
self.model.eval()
|
||||
with torch.no_grad():
|
||||
samples = self.sample(self.model, 100, features).cpu().numpy()
|
||||
|
||||
ci_99_upper = np.quantile(samples, 0.99, axis=0)
|
||||
ci_99_lower = np.quantile(samples, 0.01, axis=0)
|
||||
|
||||
ci_95_upper = np.quantile(samples, 0.95, axis=0)
|
||||
ci_95_lower = np.quantile(samples, 0.05, axis=0)
|
||||
|
||||
ci_90_upper = np.quantile(samples, 0.9, axis=0)
|
||||
ci_90_lower = np.quantile(samples, 0.1, axis=0)
|
||||
|
||||
ci_50_upper = np.quantile(samples, 0.5, axis=0)
|
||||
ci_50_lower = np.quantile(samples, 0.5, axis=0)
|
||||
|
||||
sns.set_theme()
|
||||
time_steps = np.arange(0, 96)
|
||||
|
||||
fig, ax = plt.subplots(figsize=(20, 10))
|
||||
ax.plot(time_steps, samples.mean(axis=0), label="Mean of NRV samples", linewidth=3)
|
||||
# ax.fill_between(time_steps, ci_lower, ci_upper, color='b', alpha=0.2, label='Full Interval')
|
||||
|
||||
ax.fill_between(time_steps, ci_99_lower, ci_99_upper, color='b', alpha=0.2, label='99% Interval')
|
||||
ax.fill_between(time_steps, ci_95_lower, ci_95_upper, color='b', alpha=0.2, label='95% Interval')
|
||||
ax.fill_between(time_steps, ci_90_lower, ci_90_upper, color='b', alpha=0.2, label='90% Interval')
|
||||
ax.fill_between(time_steps, ci_50_lower, ci_50_upper, color='b', alpha=0.2, label='50% Interval')
|
||||
|
||||
ax.plot(target, label="Real NRV", linewidth=3)
|
||||
# full_interval_patch = mpatches.Patch(color='b', alpha=0.2, label='Full Interval')
|
||||
ci_99_patch = mpatches.Patch(color='b', alpha=0.3, label='99% Interval')
|
||||
ci_95_patch = mpatches.Patch(color='b', alpha=0.4, label='95% Interval')
|
||||
ci_90_patch = mpatches.Patch(color='b', alpha=0.5, label='90% Interval')
|
||||
ci_50_patch = mpatches.Patch(color='b', alpha=0.6, label='50% Interval')
|
||||
|
||||
|
||||
ax.legend(handles=[ci_99_patch, ci_95_patch, ci_90_patch, ci_50_patch, ax.lines[0], ax.lines[1]])
|
||||
|
||||
task.get_logger().report_matplotlib_figure(
|
||||
title="Training" if training else "Testing",
|
||||
series=f'Sample {i}',
|
||||
iteration=epoch,
|
||||
figure=fig,
|
||||
)
|
||||
|
||||
plt.close()
|
||||
|
||||
    def test(self, data_loader: torch.utils.data.DataLoader):
        """For each batch, draw 10 diffusion samples per row and average them.

        NOTE(review): this method neither returns nor stores the averaged
        samples, so as written it has no observable effect — it looks
        unfinished (possibly truncated in this view); confirm intent.
        """
        for inputs, targets, _ in data_loader:
            inputs, targets = inputs.to(self.device), targets.to(self.device)

            # sample() tiles the batch 10 times via repeat(10, 1), so its rows
            # are ordered [b0..bB-1, b0..bB-1, ...], not grouped per input row.
            sample = self.sample(self.model, 10, inputs)

            # reduce sample from (batch_size, time_steps) to (batch_size / 10, time_steps) by taking mean of each 10 samples
            # NOTE(review): view(-1, 10, ts_length) groups 10 *consecutive*
            # rows; given the tiling order above, each group mixes 10
            # different inputs whenever batch_size > 1 — verify against
            # sample()'s repeat order before trusting these averages.
            sample = sample.view(-1, 10, self.ts_length)
            sample = torch.mean(sample, dim=1)
|
||||
Reference in New Issue
Block a user