import torch

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10
# Create random Tensors to hold inputs and outputs.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),  # nonlinearity between the two linear layers
    torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(reduction='sum')
# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use AdamW; the optim package contains many other
# optimization algorithms. The first argument to the AdamW constructor tells the
# optimizer which Tensors it should update.
learning_rate = 1e-4
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01, amsgrad=False)
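# AdamW uses decoupled weight decay: weight_decay=0.01 (the AdamW default) is
# applied directly to the weights rather than folded into the gradient, and
# amsgrad=False keeps the standard Adam step instead of the AMSGrad variant.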
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(x)
    # Compute and print loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())
    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the Tensors it will update (which are the learnable weights
    # of the model).
    optimizer.zero_grad()
    # Backward pass: compute gradient of the loss with respect to model parameters.
    loss.backward()
    # Calling the step function on an Optimizer makes an update to its parameters.
    optimizer.step()
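# Note: gradients accumulate in each parameter's .grad attribute across
# backward() calls, which is why zero_grad() must run before every backward
# pass in the loop above.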