import tensorflow as tf

opt = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9)
var = tf.Variable(1.0)
val0 = var.value()
loss = lambda: (var ** 2) / 2.0  # d(loss)/d(var) = var
# First step is `- learning_rate * grad`
step_count = opt.minimize(loss, [var]).numpy()
val1 = var.value()
print((val0 - val1).numpy())  # 0.1
# On later steps, the step size grows because momentum accumulates past gradients
step_count = opt.minimize(loss, [var]).numpy()
val2 = var.value()
print((val1 - val2).numpy())  # 0.18
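
# A minimal hand-computed sketch of why the second delta is larger, assuming
# the velocity form of momentum that Keras documents for SGD
# (velocity = momentum * velocity - learning_rate * grad; var += velocity);
# `w`, `v`, `g` are illustrative names, not part of the API above.
lr, mom = 0.1, 0.9
w, v = 1.0, 0.0
for step in (1, 2):
    g = w                     # gradient of (w ** 2) / 2 is w
    v = mom * v - lr * g      # velocity folds in the previous step's velocity
    prev, w = w, w + v
    print(step, round(prev - w, 6))  # 1 0.1, then 2 0.18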