import numpy as np

###############################################
# Thanks to Dr. Chuck Anderson for his neural #
# network with optimizers implementation      #
# https://www.cs.colostate.edu/~anderson/wp/  #
###############################################


class Optimizers():

    def __init__(self, all_weights):
        '''all_weights is a vector of all of a neural network's weights
        concatenated into a one-dimensional vector'''

        self.all_weights = all_weights

        # The following initializations are only used by adam.
        # Initializing m, v, beta1t and beta2t here (rather than in adam) allows
        # multiple calls to adam to handle training with multiple subsets
        # (batches) of training data.
        self.mt = np.zeros_like(all_weights)
        self.vt = np.zeros_like(all_weights)
        self.beta1 = 0.9
        self.beta2 = 0.999
        self.beta1t = 1
        self.beta2t = 1

    def sgd(self, error_f, gradient_f, fargs=[], n_epochs=100, learning_rate=0.001,
            verbose=True, error_convert_f=None):
        '''
        error_f: function that requires X and T as arguments (given in fargs) and
            returns mean squared error.
        gradient_f: function that requires X and T as arguments (in fargs) and
            returns gradient of mean squared error with respect to each weight.
        error_convert_f: function that converts the standardized error from
            error_f to original T units.
        '''

        error_trace = []
        epochs_per_print = n_epochs // 10

        for epoch in range(n_epochs):

            error = error_f(*fargs)
            grad = gradient_f(*fargs)

            # Update all weights using -= to modify their values in-place.
            self.all_weights -= learning_rate * grad

            if error_convert_f:
                error = error_convert_f(error)
            error_trace.append(error)

            if verbose and ((epoch + 1) % max(1, epochs_per_print) == 0):
                print(f'sgd: Epoch {epoch + 1:d} Error={error:.5f}')

        return error_trace

    def adam(self, error_f, gradient_f, fargs=[], n_epochs=100, learning_rate=0.001,
             verbose=True, error_convert_f=None):
        '''
        error_f: function that requires X and T as arguments (given in fargs) and
            returns mean squared error.
        gradient_f: function that requires X and T as arguments (in fargs) and
            returns gradient of mean squared error with respect to each weight.
        error_convert_f: function that converts the standardized error from
            error_f to original T units.
        '''

        alpha = learning_rate  # the learning rate is called alpha in the original Adam paper
        epsilon = 1e-8
        error_trace = []
        epochs_per_print = n_epochs // 10

        for epoch in range(n_epochs):

            error = error_f(*fargs)
            grad = gradient_f(*fargs)

            # Update biased first and second moment estimates of the gradient.
            self.mt[:] = self.beta1 * self.mt + (1 - self.beta1) * grad
            self.vt[:] = self.beta2 * self.vt + (1 - self.beta2) * grad * grad
            self.beta1t *= self.beta1
            self.beta2t *= self.beta2

            # Bias-corrected moment estimates.
            m_hat = self.mt / (1 - self.beta1t)
            v_hat = self.vt / (1 - self.beta2t)

            # Update all weights using -= to modify their values in-place.
            self.all_weights -= alpha * m_hat / (np.sqrt(v_hat) + epsilon)

            if error_convert_f:
                error = error_convert_f(error)
            error_trace.append(error)

            if verbose and ((epoch + 1) % max(1, epochs_per_print) == 0):
                print(f'Adam: Epoch {epoch + 1:d} Error={error:.5f}')

        return error_trace


if __name__ == '__main__':

    import matplotlib.pyplot as plt
    plt.ion()

    def parabola(wmin):
        return ((w - wmin) ** 2)[0]

    def parabola_gradient(wmin):
        return 2 * (w - wmin)

    w = np.array([0.0])
    optimizer = Optimizers(w)

    wmin = 5
    optimizer.sgd(parabola, parabola_gradient, [wmin], n_epochs=500, learning_rate=0.1)
    print(f'sgd: Minimum of parabola is at {wmin}. Value found is {w}')

    w = np.array([0.0])
    optimizer = Optimizers(w)

    optimizer.adam(parabola, parabola_gradient, [wmin], n_epochs=500, learning_rate=0.1)
    print(f'adam: Minimum of parabola is at {wmin}. Value found is {w}')
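
    # Optional extra demo (not part of the original file): a minimal sketch showing
    # how the returned error traces can be used. It assumes matplotlib is available
    # (it is already imported above) and reuses the same parabola problem to compare
    # how quickly sgd and adam reduce the error.
    w = np.array([0.0])
    sgd_trace = Optimizers(w).sgd(parabola, parabola_gradient, [wmin],
                                  n_epochs=500, learning_rate=0.1, verbose=False)

    w = np.array([0.0])
    adam_trace = Optimizers(w).adam(parabola, parabola_gradient, [wmin],
                                    n_epochs=500, learning_rate=0.1, verbose=False)

    plt.figure()
    plt.plot(sgd_trace, label='sgd')
    plt.plot(adam_trace, label='adam')
    plt.xlabel('Epoch')
    plt.ylabel('Error')
    plt.legend()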