Sped up sampling 20x

This commit is contained in:
Victor Mylle
2023-11-25 18:09:42 +00:00
parent 5de3f64a1a
commit 300f268286
10 changed files with 498 additions and 238 deletions

View File

@@ -7,8 +7,18 @@ import numpy as np
import plotly.subplots as sp
from plotly.subplots import make_subplots
class Trainer:
def __init__(self, model: torch.nn.Module, optimizer: torch.optim.Optimizer, criterion: torch.nn.Module, data_processor: DataProcessor, device: torch.device, clearml_helper: ClearMLHelper = None, debug: bool = True):
def __init__(
self,
model: torch.nn.Module,
optimizer: torch.optim.Optimizer,
criterion: torch.nn.Module,
data_processor: DataProcessor,
device: torch.device,
clearml_helper: ClearMLHelper = None,
debug: bool = True,
):
self.model = model
self.optimizer = optimizer
self.criterion = criterion
@@ -49,7 +59,7 @@ class Trainer:
task = self.clearml_helper.get_task(task_name=task_name)
if self.debug:
task.add_tags('Debug')
task.add_tags("Debug")
change_description = input("Enter a change description: ")
if change_description:
@@ -70,9 +80,11 @@ class Trainer:
task.connect(self.data_processor.data_config, name="data_features")
return task
def random_samples(self, train: bool = True, num_samples: int = 10):
train_loader, test_loader = self.data_processor.get_dataloaders(predict_sequence_length=self.model.output_size)
train_loader, test_loader = self.data_processor.get_dataloaders(
predict_sequence_length=self.model.output_size
)
if train:
loader = train_loader
@@ -82,10 +94,11 @@ class Trainer:
indices = np.random.randint(0, len(loader.dataset) - 1, size=num_samples)
return indices
def train(self, epochs: int):
try:
train_loader, test_loader = self.data_processor.get_dataloaders(predict_sequence_length=self.model.output_size)
train_loader, test_loader = self.data_processor.get_dataloaders(
predict_sequence_length=self.model.output_size
)
train_samples = self.random_samples(train=True)
test_samples = self.random_samples(train=False)
@@ -99,7 +112,7 @@ class Trainer:
self.model.train()
running_loss = 0.0
for inputs, targets in train_loader:
for inputs, targets, _ in train_loader:
inputs, targets = inputs.to(self.device), targets.to(self.device)
self.optimizer.zero_grad()
@@ -110,33 +123,48 @@ class Trainer:
self.optimizer.step()
running_loss += loss.item()
running_loss /= len(train_loader.dataset)
test_loss = self.test(test_loader)
if self.patience is not None:
if self.best_score is None or test_loss < self.best_score + self.delta:
if (
self.best_score is None
or test_loss < self.best_score + self.delta
):
self.save_checkpoint(test_loss, task, epoch)
counter = 0
else:
counter += 1
if counter >= self.patience:
print('Early stopping triggered')
print("Early stopping triggered")
break
if task:
task.get_logger().report_scalar(title=self.criterion.__class__.__name__, series="train", value=running_loss, iteration=epoch)
task.get_logger().report_scalar(title=self.criterion.__class__.__name__, series="test", value=test_loss, iteration=epoch)
task.get_logger().report_scalar(
title=self.criterion.__class__.__name__,
series="train",
value=running_loss,
iteration=epoch,
)
task.get_logger().report_scalar(
title=self.criterion.__class__.__name__,
series="test",
value=test_loss,
iteration=epoch,
)
if epoch % self.plot_every_n_epochs == 0:
self.debug_plots(task, True, train_loader, train_samples, epoch)
self.debug_plots(task, False, test_loader, test_samples, epoch)
if hasattr(self, 'plot_quantile_percentages'):
self.plot_quantile_percentages(task, train_loader, True, epoch)
self.plot_quantile_percentages(task, test_loader, False, epoch)
if hasattr(self, "plot_quantile_percentages"):
self.plot_quantile_percentages(
task, train_loader, True, epoch
)
self.plot_quantile_percentages(
task, test_loader, False, epoch
)
if task:
self.finish_training(task=task)
@@ -147,23 +175,32 @@ class Trainer:
task.set_archived(True)
raise
def log_final_metrics(self, task, dataloader, train: bool = True):
metrics = { metric.__class__.__name__: 0.0 for metric in self.metrics_to_track }
transformed_metrics = { metric.__class__.__name__: 0.0 for metric in self.metrics_to_track }
metrics = {metric.__class__.__name__: 0.0 for metric in self.metrics_to_track}
transformed_metrics = {
metric.__class__.__name__: 0.0 for metric in self.metrics_to_track
}
with torch.no_grad():
for inputs, targets in dataloader:
for inputs, targets, _ in dataloader:
inputs, targets = inputs.to(self.device), targets
outputs = self.model(inputs)
inversed_outputs = torch.tensor(self.data_processor.inverse_transform(outputs))
inversed_inputs = torch.tensor(self.data_processor.inverse_transform(targets))
inversed_outputs = torch.tensor(
self.data_processor.inverse_transform(outputs)
)
inversed_inputs = torch.tensor(
self.data_processor.inverse_transform(targets)
)
for metric in self.metrics_to_track:
transformed_metrics[metric.__class__.__name__] += metric(outputs, targets.to(self.device))
metrics[metric.__class__.__name__] += metric(inversed_outputs, inversed_inputs)
transformed_metrics[metric.__class__.__name__] += metric(
outputs, targets.to(self.device)
)
metrics[metric.__class__.__name__] += metric(
inversed_outputs, inversed_inputs
)
for metric in self.metrics_to_track:
metrics[metric.__class__.__name__] /= len(dataloader)
@@ -171,74 +208,109 @@ class Trainer:
for metric_name, metric_value in metrics.items():
if train:
metric_name = f'train_{metric_name}'
metric_name = f"train_{metric_name}"
else:
metric_name = f'test_{metric_name}'
task.get_logger().report_single_value(name=metric_name, value=metric_value)
metric_name = f"test_{metric_name}"
task.get_logger().report_single_value(
name=metric_name, value=metric_value
)
for metric_name, metric_value in transformed_metrics.items():
if train:
metric_name = f'train_transformed_{metric_name}'
metric_name = f"train_transformed_{metric_name}"
else:
metric_name = f'test_transformed_{metric_name}'
metric_name = f"test_transformed_{metric_name}"
task.get_logger().report_single_value(name=metric_name, value=metric_value)
task.get_logger().report_single_value(
name=metric_name, value=metric_value
)
def finish_training(self, task):
if self.best_score is not None:
self.model.load_state_dict(torch.load('checkpoint.pt'))
self.model.load_state_dict(torch.load("checkpoint.pt"))
self.model.eval()
train_loader, test_loader = self.data_processor.get_dataloaders(predict_sequence_length=self.model.output_size)
train_loader, test_loader = self.data_processor.get_dataloaders(
predict_sequence_length=self.model.output_size
)
if not hasattr(self, 'plot_quantile_percentages'):
if not hasattr(self, "plot_quantile_percentages"):
self.log_final_metrics(task, train_loader, train=True)
self.log_final_metrics(task, test_loader, train=False)
def test(self, test_loader: torch.utils.data.DataLoader):
self.model.eval()
test_loss = 0
with torch.no_grad():
for data, target in test_loader:
for data, target, _ in test_loader:
data, target = data.to(self.device), target.to(self.device)
output = self.model(data)
test_loss += self.criterion(output, target).item()
test_loss /= len(test_loader.dataset)
return test_loss
def save_checkpoint(self, val_loss, task, iteration: int):
torch.save(self.model.state_dict(), 'checkpoint.pt')
task.update_output_model(model_path='checkpoint.pt', iteration=iteration, auto_delete_file=False)
torch.save(self.model.state_dict(), "checkpoint.pt")
task.update_output_model(
model_path="checkpoint.pt", iteration=iteration, auto_delete_file=False
)
self.best_score = val_loss
def get_plot(self, current_day, next_day, predictions, show_legend: bool = True):
def get_plot(
self,
current_day,
next_day,
predictions,
show_legend: bool = True,
retransform: bool = True,
):
if retransform:
current_day = self.data_processor.inverse_transform(current_day)
next_day = self.data_processor.inverse_transform(next_day)
predictions = self.data_processor.inverse_transform(predictions)
fig = go.Figure()
fig.add_trace(go.Scatter(x=np.arange(96), y=current_day.view(-1).cpu().numpy(), name="Current Day"))
fig.add_trace(go.Scatter(x=96 + np.arange(96), y=next_day.view(-1).cpu().numpy(), name="Next Day"))
fig.add_trace(
go.Scatter(
x=np.arange(96),
y=current_day.view(-1).cpu().numpy(),
name="Current Day",
)
)
fig.add_trace(
go.Scatter(
x=96 + np.arange(96), y=next_day.view(-1).cpu().numpy(), name="Next Day"
)
)
fig.add_trace(go.Scatter(x=96 + np.arange(96), y=predictions.reshape(-1), name="Predictions"))
fig.add_trace(
go.Scatter(
x=96 + np.arange(96), y=predictions.reshape(-1), name="Predictions"
)
)
fig.update_layout(title="Predictions of the Linear Model")
return fig
def debug_plots(self, task, train: bool, data_loader, sample_indices, epoch):
num_samples = len(sample_indices)
rows = num_samples # One row per sample since we only want one column
cols = 1
fig = make_subplots(rows=rows, cols=cols, subplot_titles=[f'Sample {i+1}' for i in range(num_samples)])
fig = make_subplots(
rows=rows,
cols=cols,
subplot_titles=[f"Sample {i+1}" for i in range(num_samples)],
)
for i, idx in enumerate(sample_indices):
features, target = data_loader.dataset[idx]
features, target, _ = data_loader.dataset[idx]
features = features.to(self.device)
target = target.to(self.device)
@@ -247,29 +319,29 @@ class Trainer:
with torch.no_grad():
predictions = self.model(features).cpu()
sub_fig = self.get_plot(features[:96], target, predictions, show_legend=(i == 0))
sub_fig = self.get_plot(
features[:96], target, predictions, show_legend=(i == 0)
)
row = i + 1
col = 1
for trace in sub_fig.data:
fig.add_trace(trace, row=row, col=col)
# loss = self.criterion(predictions.to(self.device), target.squeeze(-1).to(self.device)).item()
# fig['layout']['annotations'][i].update(text=f"{loss.__class__.__name__}: {loss:.6f}")
# y axis same for all plots
fig.update_yaxes(range=[-1, 1], col=1)
# fig.update_yaxes(range=[-1, 1], col=1)
fig.update_layout(height=300 * rows)
fig.update_layout(height=1000 * rows)
task.get_logger().report_plotly(
title=f"{'Training' if train else 'Test'} Samples",
series="full_day",
iteration=epoch,
figure=fig
figure=fig,
)
def debug_scatter_plot(self, task, train: bool, samples, epoch):
@@ -285,7 +357,11 @@ class Trainer:
rows = -(-num_samples // 2) # Ceiling division to handle odd number of samples
cols = 2
fig = make_subplots(rows=rows, cols=cols, subplot_titles=[f'Sample {i+1}' for i in range(num_samples)])
fig = make_subplots(
rows=rows,
cols=cols,
subplot_titles=[f"Sample {i+1}" for i in range(num_samples)],
)
for i, (current_day, next_value, pred) in enumerate(zip(X, y, predictions)):
sub_fig = self.scatter_plot(current_day, pred, next_value)
@@ -299,14 +375,16 @@ class Trainer:
title=f"{'Training' if train else 'Test'} Samples",
series="scatter",
iteration=epoch,
figure=fig
figure=fig,
)
def scatter_plot(self, x, y, real_y):
fig = go.Figure()
# 96 values of x
fig.add_trace(go.Scatter(x=np.arange(96), y=x.view(-1).cpu().numpy(), name="Current Day"))
fig.add_trace(
go.Scatter(x=np.arange(96), y=x.view(-1).cpu().numpy(), name="Current Day")
)
# add one value of y
fig.add_trace(go.Scatter(x=[96], y=[y.item()], name="Next Day"))
@@ -315,4 +393,4 @@ class Trainer:
fig.add_trace(go.Scatter(x=[96], y=[real_y.item()], name="Real Next Day"))
fig.update_layout(title="Predictions of the Linear Model")
return fig
return fig