Gradient Descent Construction#

# %pip install -q --progress-bar off d2l==1.0.0a1.post0 # replace ! with % for jupyter notebook

Ref: https://d2l.ai/chapter_optimization/gd.html

import copy
from abc import ABC, abstractmethod
from typing import Any, Callable, List, Optional, Tuple, TypeVar, Union

import autograd.numpy as np  # import numpy as np
import torch
from autograd import grad
from d2l import torch as d2l

---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[2], line 8
      6 import torch
      7 from autograd import grad
----> 8 from d2l import torch as d2l

ModuleNotFoundError: No module named 'd2l'

T = TypeVar("T", bound=Union[float, np.ndarray, torch.Tensor])

class Function:
    def __init__(self, f: Callable) -> None:
        self._f = f

    def __call__(self, x: T, *args: Any, **kwargs: Any) -> T:
        # a function returns a scalar float
        fx = self._f(*args, **kwargs)
        return fx


class GradFunction:
    def __init__(self, grad_f: Callable) -> None:
        self._grad_f = grad_f

    def __call__(self, x: T, *args: Any, **kwargs: Any) -> T:
        # grad fn/vector returns an array of vector
        grad_fx = self._grad_f(*args, **kwargs)
        return grad_fx

Base#

# pylint: disable=invalid-name, redefined-outer-name, too-few-public-methods
class Optimizer(ABC):
    """Abstract class for optimizers.
    Currently, you need to define the f and grad_f functions in the constructor.
    """

    def __init__(self, f: Callable, grad_f: Callable, lr: float, **kwargs: Any) -> None:
        self.f = f
        self.grad_f = grad_f
        self.lr = lr
        self.kwargs = kwargs

    @abstractmethod
    def step(
        self,
        weights: T,
        biases: Optional[T] = None,
    ) -> None:
        """Update the weights and biases.

        Args:
            weights (np.ndarray): The weights to update.
            biases (np.ndarray): The biases to update.

        Raises:
            NotImplementedError:
        """
        raise NotImplementedError

Gradient Descent#

class GradientDescent(Optimizer):
    """Gradient descent optimizer."""

    weights_history: List[T]

    def __init__(self, f: Callable, grad_f: Callable, lr: float, **kwargs: Any) -> None:
        super().__init__(f, grad_f, lr, **kwargs)
        self.weights_history = []

    def step(
        self,
        weights: T,
        biases: Optional[T] = None,
    ) -> Tuple[T, T, List[T]]:
        # grad vec at each point
        gradient_vector = self.grad_f(weights)
        weights -= self.lr * gradient_vector

        if biases is not None:
            raise NotImplementedError("Biases are not implemented yet.")

        print(f"weights {weights}, biases {biases}")

        # must use copy if not weights (np array) mutatable:
        # https://tinyurl.com/bdff69by
        self.weights_history.append(copy.copy(weights))
        return weights, biases, self.weights_history

def f(w: T) -> T:  # Objective function
    """f(w) = w^2"""
    return w**2


def grad_f(w: T) -> T:  # Gradient (derivative) of the objective function
    """f'(w) = 2w"""
    return 2 * w

batch_gd = GradientDescent(f=f, grad_f=grad_f, lr=0.2)

initial_weight = 10
initial_bias = None # we ignore bias for simplicity
num_epochs = 10
for epoch in range(num_epochs):
    initial_weight, initial_bias, weights_history = batch_gd.step(
        initial_weight, initial_bias
    )

weights 6.0, biases None
weights 3.5999999999999996, biases None
weights 2.1599999999999997, biases None
weights 1.2959999999999998, biases None
weights 0.7775999999999998, biases None
weights 0.46655999999999986, biases None
weights 0.2799359999999999, biases None
weights 0.16796159999999993, biases None
weights 0.10077695999999996, biases None
weights 0.06046617599999997, biases None

def show_trace(f: Callable, results: List[T]) -> None:
    """Show the trace of variables during optimization."""
    n = max(abs(min(results)), abs(max(results)))
    f_line = torch.arange(-n, n, 0.01)
    d2l.set_figsize()
    d2l.plot(
        [f_line, results],
        [[f(x) for x in f_line], [f(x) for x in results]],
        "x",
        "f(x)",
        fmts=["-", "-o"],
    )
    d2l.plt.show()

show_trace(f, weights_history)

../../_images/8b561b7676bffe4101ecd1a541edadb29a0eb056f2ad574d6c3017a524cdd9d8.svg

2D Example#

Let the problem be a linear model involving 2 variables:

\[ y = X \beta = \beta_1 x_1 + \beta_2 x_2 \]

As usual, we know \(X\) is the design matrix and we want to find optimal weights for \(\beta\).

Later we will use \(\beta\) and \(w\) interchangeably.

Let the loss function be

\[ L(\beta_1, \beta_2) = \beta_1^2 + 2\beta_2^2 \]

Then its gradient vector at a point \((\beta_1, \beta_2)\) is:

\[\begin{split} \nabla L(\beta_1, \beta_2) = \begin{bmatrix} 2\beta_1 \\ 4\beta_2 \end{bmatrix} \end{split}\]

In the original text, the author defines the function as follows:

def f_2d(w1, w2):  # Objective function
    return w1 ** 2 + 2 * w2 ** 2

def grad_f_2d(w1, w2):  # Gradient of the objective function
    return (2 * w1, 4 * w2)

We reframe the same functions below to take in only 1 weight vector so autograd can do its work.

def f_2d(w: T) -> T:  # Objective function
    """w = [w1, w2] \\in R^2.
    f(w) = w_1^2 + 2w_2^2"""
    w1, w2 = w[0], w[1]
    return w1**2 + 2 * w2**2


def grad_f_2d(w: T) -> T:  # Gradient (derivative) of the objective function
    """\\nabla f = f'(w) = [2w_1, 4w_2] \\in R^2."""
    return np.asarray([2 * w[0], 4 * w[1]])

initial_weights = np.asarray([-5, -2]).astype('float32') 
initial_biases = None # we ignore bias for simplicity
num_epochs = 20

We can use autograd to directly compute the gradient vector for us.

autograd_f = grad(f_2d)
assert autograd_f(initial_weights).all() == grad_f_2d(initial_weights).all() == np.asarray([10, -8]).all()

batch_gd_no_autograd = GradientDescent(f=f_2d, grad_f=grad_f_2d, lr=0.1)  # no autograd
batch_gd_with_autograd = GradientDescent(f=f_2d, grad_f=autograd_f, lr=0.1)  # autograd

for epoch in range(num_epochs):
    initial_weights, initial_biases, weights_history = batch_gd_no_autograd.step(
        initial_weights, initial_biases
    )

weights [-4.  -1.2], biases None
weights [-3.2  -0.72], biases None
weights [-2.56  -0.432], biases None
weights [-2.0479999 -0.2592   ], biases None
weights [-1.6383998 -0.15552  ], biases None
weights [-1.3107198 -0.093312 ], biases None
weights [-1.0485759 -0.0559872], biases None
weights [-0.8388607  -0.03359232], biases None
weights [-0.6710886  -0.02015539], biases None
weights [-0.53687084 -0.01209324], biases None
weights [-0.42949668 -0.00725594], biases None
weights [-0.34359735 -0.00435357], biases None
weights [-0.27487788 -0.00261214], biases None
weights [-0.2199023  -0.00156728], biases None
weights [-0.17592184 -0.00094037], biases None
weights [-0.14073747 -0.00056422], biases None
weights [-0.11258998 -0.00033853], biases None
weights [-0.09007198 -0.00020312], biases None
weights [-0.07205759 -0.00012187], biases None
weights [-5.7646073e-02 -7.3123170e-05], biases None

initial_weights = np.asarray([-5, -2]).astype('float32') 
initial_biases = None # we ignore bias for simplicity
num_epochs = 20

for epoch in range(num_epochs):
    initial_weights, initial_biases, weights_history = batch_gd_with_autograd.step(
        initial_weights, initial_biases
    )

weights [-4.  -1.2], biases None
weights [-3.2  -0.72], biases None
weights [-2.56  -0.432], biases None
weights [-2.0479999 -0.2592   ], biases None
weights [-1.6383998  -0.15551999], biases None
weights [-1.3107198 -0.093312 ], biases None
weights [-1.0485759 -0.0559872], biases None
weights [-0.8388607  -0.03359232], biases None
weights [-0.6710886  -0.02015539], biases None
weights [-0.53687084 -0.01209324], biases None
weights [-0.42949668 -0.00725594], biases None
weights [-0.34359735 -0.00435356], biases None
weights [-0.27487788 -0.00261214], biases None
weights [-0.2199023  -0.00156728], biases None
weights [-0.17592184 -0.00094037], biases None
weights [-0.14073747 -0.00056422], biases None
weights [-0.11258998 -0.00033853], biases None
weights [-0.09007198 -0.00020312], biases None
weights [-0.07205759 -0.00012187], biases None
weights [-5.7646073e-02 -7.3123156e-05], biases None

weights_history

[array([-4. , -1.2], dtype=float32),
 array([-3.2 , -0.72], dtype=float32),
 array([-2.56 , -0.432], dtype=float32),
 array([-2.0479999, -0.2592   ], dtype=float32),
 array([-1.6383998 , -0.15551999], dtype=float32),
 array([-1.3107198, -0.093312 ], dtype=float32),
 array([-1.0485759, -0.0559872], dtype=float32),
 array([-0.8388607 , -0.03359232], dtype=float32),
 array([-0.6710886 , -0.02015539], dtype=float32),
 array([-0.53687084, -0.01209324], dtype=float32),
 array([-0.42949668, -0.00725594], dtype=float32),
 array([-0.34359735, -0.00435356], dtype=float32),
 array([-0.27487788, -0.00261214], dtype=float32),
 array([-0.2199023 , -0.00156728], dtype=float32),
 array([-0.17592184, -0.00094037], dtype=float32),
 array([-0.14073747, -0.00056422], dtype=float32),
 array([-0.11258998, -0.00033853], dtype=float32),
 array([-0.09007198, -0.00020312], dtype=float32),
 array([-0.07205759, -0.00012187], dtype=float32),
 array([-5.7646073e-02, -7.3123156e-05], dtype=float32)]

def show_trace_2d(f: Callable, results: List[T]) -> None:
    """Show the trace of 2D variables during optimization."""
    d2l.set_figsize()
    d2l.plt.plot(*zip(*results), "-o", color="#ff7f0e")
    x1, x2 = torch.meshgrid(
        torch.arange(-5.5, 1.0, 0.1), torch.arange(-3.0, 1.0, 0.1), indexing="ij"
    )
    d2l.plt.contour(x1, x2, f([x1, x2]), colors="#1f77b4")
    d2l.plt.xlabel("x1")
    d2l.plt.ylabel("x2")
    d2l.plt.show()

show_trace_2d(f_2d, weights_history)

../../_images/39e051779da0a647589d7bf33fac4f6acd575ec748aa721319304171abe3cda3.svg

Probability & Statistics

Gradient Descent Construction

Contents

Gradient Descent Construction#

Base#

Gradient Descent#

2D Example#