diff --git a/experiments/datasets/mnist_rot/own_transforms.py b/experiments/datasets/mnist_rot/own_transforms.py
index 1e1d08c..6f04364 100644
--- a/experiments/datasets/mnist_rot/own_transforms.py
+++ b/experiments/datasets/mnist_rot/own_transforms.py
@@ -116,7 +116,7 @@ def __call__(self, img):
         :type tensor: torch.FloatTensor
         :param tensor: image tensor to which channel is added
         """
-        img = np.array(img, np.float32, copy=False)[np.newaxis, ...]  # add channel dimension
+        img = np.array(img, np.float32, copy=False)[np.newaxis, ...].copy()  # add channel dimension
         return torch.from_numpy(img)
diff --git a/experiments/optimizers_L1L2.py b/experiments/optimizers_L1L2.py
index 38ffc23..6e1a51d 100644
--- a/experiments/optimizers_L1L2.py
+++ b/experiments/optimizers_L1L2.py
@@ -189,7 +189,7 @@ def step(self, closure=None):
 
 
 class Adam(Optimizer):
-    """Implements Adam algorithm.
+    r"""Implements Adam algorithm.
 
     It has been proposed in `Adam: A Method for Stochastic Optimization`_.
 
@@ -243,13 +243,13 @@ def step(self, closure=None):
                state['step'] += 1
 
                if group['lamb_L1'] != 0:
-                    grad.add_(group['lamb_L1'], p.sign().data)
+                    grad.add_(p.sign().data, alpha=group['lamb_L1'])
                if group['lamb_L2'] != 0:
-                    grad.add_(group['lamb_L2'], p.data)
+                    grad.add_(p.data, alpha=group['lamb_L2'])
 
                # Decay the first and second moment running average coefficient
-                exp_avg.mul_(beta1).add_(1 - beta1, grad)
-                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
+                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
+                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
 
                denom = exp_avg_sq.sqrt().add_(group['eps'])
 
@@ -257,8 +257,8 @@ def step(self, closure=None):
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
 
-                p.data.addcdiv_(-step_size, exp_avg, denom)
-
+                p.data.addcdiv_(exp_avg, denom, value=-step_size)
+
        return loss
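
Note: the diff above makes two kinds of changes: the transform now passes torch.from_numpy an array it owns and can write to (via .copy()), and the Adam step switches the in-place tensor ops from the old positional-scalar overloads (deprecated in newer PyTorch releases) to the keyword-argument forms. The snippet below is a minimal, self-contained sketch of both, assuming a recent NumPy/PyTorch; all tensor names and values are illustrative placeholders, not taken from the repository.

    import numpy as np
    import torch

    # from_numpy fix: .copy() yields a fresh, writable, contiguous array,
    # so torch.from_numpy does not wrap a read-only view of the source image.
    img = np.zeros((28, 28), dtype=np.uint8)                      # stand-in for a PIL image
    arr = np.array(img, np.float32, copy=False)[np.newaxis, ...]  # add channel dimension
    tensor = torch.from_numpy(arr.copy())

    # Keyword-argument forms of the in-place Adam ops (numerically identical
    # to the removed positional-scalar calls).
    grad = torch.randn(4)
    exp_avg = torch.zeros(4)
    exp_avg_sq = torch.zeros(4)
    p = torch.randn(4)
    beta1, beta2, eps, step_size = 0.9, 0.999, 1e-8, 1e-3

    exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)               # was: add_(1 - beta1, grad)
    exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)  # was: addcmul_(1 - beta2, grad, grad)
    denom = exp_avg_sq.sqrt().add_(eps)
    p.addcdiv_(exp_avg, denom, value=-step_size)                  # was: addcdiv_(-step_size, exp_avg, denom)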