Lecture 4: Maximum likelihood and logit ¶
Antoine Chapel (Sciences Po & PSE) ¶
Alfred Galichon's math+econ+code prerequisite class on numerical optimization and econometrics, in Python ¶
Class content by Antoine Chapel. Past and present support from Alfred Galichon's ERC grant CoG-866274 is acknowledged, as well as inputs from contributors listed here. If you reuse material from this class, please cite as:
Antoine Chapel, 'math+econ+code' prerequisite class on numerical optimization and econometrics, January 2023
References:¶
- Train, K. (2009), Discrete Choice Methods with Simulation, 2nd ed.
- Cameron, A. C., and Trivedi, P. K. (2005), Microeconometrics: Methods and Applications.
Introduction and Outline¶
The Math+Econ+Code January masterclass is divided into an optimization-oriented part and an econometrics-oriented part. To understand how to apply Optimal Transport to econometrics, some econometric building blocks are essential, logit theory in particular. We will study this topic carefully to prepare you for the masterclass.
First, we will discuss Maximum Likelihood, which is central to this class. As an application whose solution requires numerical methods, we will consider Poisson regression. Then, we will rederive the logit estimation technique step by step.
Across this lecture, we will make use of numerical examples in Python.
Maximum Likelihood:¶
Econometrics starts with a sample and a distributional assumption. If you have the following sample: [81, 75, 100, 92, 64], what is the chance that it was generated by a normal distribution with mean 10, $\mathcal{N}(10, 1)$? The chance is very low, actually close to 0. The chance, or likelihood, that it was generated by $\mathcal{N}(82, 1)$ is much higher. Maximum likelihood allows us to formalize this idea through an optimization process.
Let us start by deriving a normal estimator through Maximum Likelihood, as a first example. You have a sample $y = \{y_i\}_{i=1}^N$ and assume that it was generated by a normal random variable. What are the most likely parameters of a normal distribution to have generated the sample you observe?
\begin{align} f(y_i | \mu, \sigma^2) = \frac{1}{\sqrt{2\pi \sigma^2}} e^{-\frac{ (y_i - \mu)^2 }{2\sigma^2}} \end{align}
The likelihood function is equal to the joint density of the distribution:
\begin{align} \mathcal{L} (\mu, \sigma^2 | y) = \prod_{i=1}^N \frac{1}{\sqrt{2\pi \sigma^2}} e^{-\frac{ (y_i - \mu)^2 }{2\sigma^2}} \end{align}
\begin{align} \mathcal{L} (\mu, \sigma^2 | y) = \Bigg(\frac{1}{\sqrt{2\pi \sigma^2}}\Bigg)^n e^{{-\frac{1}{2 \sigma^2}} \sum_{i=1}^n (y_i - \mu)^2 } \end{align}
\begin{align} \log \mathcal{L} = -\frac{n}{2} \log(2\pi\sigma^2) - \frac{1}{2 \sigma^2} \sum_{i=1}^n (y_i - \mu)^2 \end{align}
\begin{align} = -\frac{n}{2} \log(2\pi) - \frac{n}{2}\log(\sigma^2)-\frac{1}{2\sigma^2} \sum_{i=1}^n (y_i^2 - 2y_i\mu + \mu^2) \end{align}
Now, we find the most likely value for $\mu$, denoted $\hat{\mu}$, by taking the first-order condition (FOC).
\begin{align} \frac{\partial \log \mathcal{L}}{\partial \mu} = -\frac{1}{2 \sigma^2} 2 \sum (\hat{\mu} - y_i) = \frac{1}{\sigma^2} \sum_{i=1}^n (y_i - \hat{\mu}) \hspace{2pt} \bigg|_{\hat{\mu}} = 0 \end{align} \begin{align} \hat{\mu} = \frac{1}{n} \sum_{i} y_i \end{align}
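As a sanity check, we can verify numerically that the sample mean maximizes the normal log-likelihood. The sketch below (not part of the derivation above) minimizes the negative log-likelihood in $\mu$ with scipy, dropping the additive constants that do not depend on $\mu$:

```python
import numpy as np
from scipy.optimize import minimize

y = np.array([81, 75, 100, 92, 64], dtype=float)

def neg_logL(mu, y, sigma2=1.0):
    # Negative normal log-likelihood in mu (sigma^2 fixed; constants dropped
    # since they do not affect the argmax)
    return 0.5 * np.sum((y - mu) ** 2) / sigma2

res = minimize(neg_logL, x0=np.array([0.0]), args=(y,))
# The numerical maximizer coincides with the sample mean
print(res.x[0], y.mean())
```

The optimizer lands on the sample mean (82.4 here), exactly as the FOC predicts.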
Exercise 1: follow the same procedure for variance estimator $\hat{\sigma}^2$
Exercise 2: find the OLS estimator (for simple regression) through maximum likelihood. Hint: the distributional assumption is that $\epsilon_i \sim \mathcal{N}(0, \sigma^2)$, so $y_i \sim \mathcal{N}(\beta_0 + \beta_1 x_i, \sigma^2)$
Poisson Regression¶
Here, let us start from the assumption that data is distributed according to the Poisson distribution. This allows us to do Poisson Regression.
PMF of the Poisson distribution: $f(y|\lambda) = \frac{e^{-\lambda} \lambda^y}{y!}$, with $E[y] = Var[y] = \lambda$.
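A quick simulation illustrates the equal-mean-and-variance property (the value $\lambda = 3.5$ below is arbitrary):

```python
import numpy as np

rng = np.random.default_rng(0)
lam = 3.5
draws = rng.poisson(lam, size=200_000)
# Both the sample mean and the sample variance should be close to lambda
print(draws.mean(), draws.var())
```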
First, what is Poisson regression, and why should we ever bother with anything more complicated than OLS?
When you do OLS, you rely on the assumption that the endogenous variable is normally distributed. This works fine most of the time, but sometimes this assumption is not realistic. Poisson regression is used to model situations where the endogenous variable is $(i)$ discrete and $(ii)$ non-negative. The typical example is the number of visits to the doctor that an individual makes per year. A lot of people only go once a year, fewer go twice a year, very few go more than 6 times a year. Nobody goes -2 times a year. This process is therefore better approximated by a Poisson distribution than by a normal distribution. Although OLS would give some results, the Poisson assumption is better suited to the nature of the process we want to model.
OLS amounts to assuming that $E[Y|X] = X'\beta$.
Generalized Linear Models, of which Poisson regression is perhaps the most used subtype, instead assume that:
\begin{align} g(E[Y|X]) &= X'\beta \\ E[Y|X] &= g^{-1}(X'\beta) \end{align}
Where $g$ is called the link function. OLS can technically be seen as a subtype of GLM, where the link function is the identity: $g(x) = x$. In Poisson regression, the link function is the natural logarithm. So, we have:
\begin{align} E[Y|X] = e^{X'\beta} \end{align}
Let us go back to the Poisson distribution. The density of the Poisson regression model for one observation is as follows: \begin{align} f(y_i|x_i, \beta) = \frac{e^{-\exp(x_i' \beta)} \exp(x_i' \beta)^{y_i}}{y_i!} \end{align}
If the observations are i.i.d, we can derive the joint density of a sample of size $N$ as:
\begin{align} \mathcal{L} = \prod_{i=1}^N \frac{e^{-\exp(x_i' \beta)} \exp(x_i' \beta)^{y_i}}{y_i!} \end{align}
This is the likelihood function ($\mathcal{L}$), which we want to maximize over the parameter (here, $\beta$). Since a sum is much easier to differentiate than a product, it is better to work with the (average) log-likelihood: \begin{align} \log \mathcal{L} = \frac{1}{N}\sum_{i=1}^N \log\Big(\frac{e^{-\exp(x_i' \beta)} \exp(x_i' \beta)^{y_i}}{y_i!}\Big) = \frac{1}{N} \sum_{i=1}^N \Big(-\exp(x_i' \beta) + y_i x_i' \beta - \log y_i !\Big) \end{align}
\begin{align} \frac{\partial \log\mathcal{L}}{\partial \beta} = \frac{1}{N} \sum_{i=1}^N (y_i - \exp(x_i'\beta))x_i' = 0 \end{align}
Given that this equation has no closed-form solution, we need to rely on numerical optimization methods, such as the highly optimized ones provided by scikit-learn.
import numpy as np
from sklearn import linear_model
Poisson_reg = linear_model.PoissonRegressor()
X_val = np.array([[1, 2], [2, 3], [3, 5], [4, 3]])
y_val = np.array([12, 17, 22, 21])
Poisson_reg.fit(X_val, y_val)
Poisson_reg.score(X_val, y_val)
0.9716538991248989
Poisson_reg.coef_
array([0.13488995, 0.09463608])
DIY approach: let's recycle our lecture on numerical optimisation: we want to find $\hat{\beta} = \arg \max_\beta \log \mathcal{L}(X, y; \beta)$
from scipy.optimize import minimize
#Since we are doing things by hand, let's add a column of ones to account for a constant
X_val = np.concatenate((np.ones((X_val.shape[0], 1)), X_val), axis=1)
X_val
array([[1., 1., 2.],
       [1., 2., 3.],
       [1., 3., 5.],
       [1., 4., 3.]])
def logL_Poisson(β, X, y):
    sum_logl = 0
    for i, x_i in enumerate(X):
        y_i = y[i]
        sum_logl += -np.exp(x_i @ β) + y_i*(x_i @ β) #- np.log(float(factorial(y_i)))
    nobs = X.shape[0]
    return -(sum_logl/nobs)
#Why '-' ? The scipy optimization function is written explicitly for minimization. Therefore,
#we will minimize (-likelihood), which is equivalent to maximizing the likelihood.
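As an aside, the loop above can be vectorized with numpy. This hypothetical `logL_Poisson_vec` helper computes the same negative average log-likelihood (again dropping the $\log y_i!$ term, which does not depend on $\beta$):

```python
import numpy as np

def logL_Poisson_vec(beta, X, y):
    """Negative average Poisson log-likelihood, vectorized
    (the log y! constant is dropped: it does not affect the maximizer)."""
    Xb = X @ beta
    return -np.mean(-np.exp(Xb) + y * Xb)

# Same design matrix as above (constant column included)
X = np.array([[1., 1., 2.], [1., 2., 3.], [1., 3., 5.], [1., 4., 3.]])
y = np.array([12., 17., 22., 21.])
print(logL_Poisson_vec(np.zeros(3), X, y))  # 1.0 at beta = 0
```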
#Option 1: half-DIY
minimize(logL_Poisson, x0 = np.array([0, 0, 0]), args=(X_val, y_val))
      fun: -34.46797636613751
 hess_inv: array([[ 0.69498571, -0.0820981 , -0.12018362],
       [-0.0820981 ,  0.05822026, -0.02222006],
       [-0.12018362, -0.02222006,  0.05237927]])
      jac: array([4.76837158e-07, 1.43051147e-06, 2.86102295e-06])
  message: 'Optimization terminated successfully.'
     nfev: 70
      nit: 10
     njev: 14
   status: 0
  success: True
        x: array([2.19900248, 0.14059105, 0.09672533])
# To go full DIY, we need to adapt our functions from yesterday a bit to fit maximum likelihood
#Small coding tutorial: we minimize our likelihood function with respect to the (vector) parameter β, but it uses
#sample data (X, y) as additional arguments. To avoid hard-coding (X, y) into our function, we use *args, which
#will unpack any additional arguments we wish to pass to the function we want to minimize.
def num_gradient(f, x, *args):
    partial_derivatives_vector = np.empty(x.shape[0])
    for index in range(x.shape[0]):
        e_ind = np.zeros(x.shape[0])
        e_ind[index] = 1.0
        h = 1e-5
        partial_derivatives_vector[index] = (f(x + e_ind*h, *args) - f(x - e_ind*h, *args))/(2*h)
    return partial_derivatives_vector
def num_hessian(f, x, *args):
    hessian = np.empty((x.shape[0], x.shape[0]))
    for index in range(x.shape[0]):
        for jindex in range(x.shape[0]):
            e_ind = np.zeros(x.shape[0])
            e_ind[jindex] = 1.0
            h = 1e-5
            hessian[index, jindex] = (num_gradient(f, x + e_ind*h, *args)[index] - num_gradient(f, x - e_ind*h, *args)[index])/(2*h)
    return hessian
def gradient_descent(f, x0, *args, step=1, tol=1e-5, maxiter=100, verbose=False):
    """Minimizes f via the Newton-Raphson algorithm (a descent method whose step is
    scaled by the inverse Hessian). Gradients and Hessians are computed numerically.\n
    x0: starting point of the iterative process\n
    *args: additional arguments of f (e.g. for maximum likelihood: X, y)\n
    step: step size of the descent process (default: step=1) \n
    tol: gradient sup-norm below which the descent stops (default: tol=1e-5) \n
    maxiter: maximum number of iterations (default: maxiter=100) \n
    verbose: prints intermediary steps (default: verbose=False)
    """
    error = 1e6
    x_hist = []
    x = x0
    n_iter = 1
    while error > tol and n_iter <= maxiter:
        g_k = num_gradient(f, x, *args)
        if verbose:
            print(f'iteration: {n_iter}, θ_hat: {np.round(x, 3)}, gradient: {np.round(g_k, 3)}, likelihood: {np.round(f(x, *args), 3)}')
        H_k = num_hessian(f, x, *args)
        try:
            A_k = -np.linalg.inv(H_k)
        except np.linalg.LinAlgError:
            # Fallback: random diagonal scaling when the Hessian cannot be inverted
            A_k = np.random.normal(0, 1, size=x.shape[0])*np.identity(x.shape[0])
            print("ERROR: Failure to invert the Hessian")
        x = x + step*A_k@g_k
        x_hist.append(x)
        error = max(abs(g_k))
        n_iter += 1
    return x, x_hist
beta_hat, beta_hist = gradient_descent(logL_Poisson, np.zeros(3), X_val, y_val, verbose=True)
iteration: 1, θ_hat: [0. 0. 0.], gradient: [-17. -46.5 -58.75], likelihood: 1.0
iteration: 2, θ_hat: [5.571 2.343 1.714], gradient: [5.24741957e+08 1.70511137e+09 2.35691439e+09], likelihood: 524741654.067
iteration: 3, θ_hat: [4.571 2.343 1.714], gradient: [1.93041730e+08 6.27275294e+08 8.67060434e+08], likelihood: 193041444.819
iteration: 4, θ_hat: [3.571 2.343 1.714], gradient: [7.10160932e+07 2.30761695e+08 3.18973692e+08], likelihood: 71015825.809
iteration: 5, θ_hat: [2.571 2.343 1.714], gradient: [2.61253529e+07 8.48924787e+07 1.17343854e+08], likelihood: 26125103.548
iteration: 6, θ_hat: [1.571 2.343 1.714], gradient: [ 9610974.302 31230188.948 43168381.77 ], likelihood: 9610742.934
iteration: 7, θ_hat: [0.57 2.343 1.714], gradient: [ 3535673.589 11488932.313 15880744.275], likelihood: 3535460.222
iteration: 8, θ_hat: [-0.427 2.343 1.714], gradient: [1300696.892 4226528.713 5842183.073], likelihood: 1300501.513
iteration: 9, θ_hat: [-1.42 2.341 1.714], gradient: [ 478494.644 1554840.033 2149202.754], likelihood: 478317.243
iteration: 10, θ_hat: [-2.401 2.338 1.712], gradient: [176023.447 571980.856 790631.213], likelihood: 175863.98
iteration: 11, θ_hat: [-3.341 2.327 1.706], gradient: [ 64750.684 210407.113 290840.679], likelihood: 64609.008
iteration: 12, θ_hat: [-4.181 2.298 1.692], gradient: [ 23815.799 77391.701 106978.233], likelihood: 23691.568
iteration: 13, θ_hat: [-4.774 2.224 1.655], gradient: [ 8756.858 28458.403 39339.515], likelihood: 8649.215
iteration: 14, θ_hat: [-4.825 2.052 1.568], gradient: [ 3217.397 10457.645 14457.83 ], likelihood: 3124.459
iteration: 15, θ_hat: [-4.031 1.728 1.404], gradient: [1180.326 3837.019 5306.629], likelihood: 1099.143
iteration: 16, θ_hat: [-2.615 1.295 1.182], gradient: [ 431.67 1402.737 1942.127], likelihood: 360.014
iteration: 17, θ_hat: [-1.182 0.868 0.954], gradient: [156.384 507.287 704.556], likelihood: 93.967
iteration: 18, θ_hat: [0.049 0.503 0.735], gradient: [ 55.054 177.696 248.933], likelihood: 1.952
iteration: 19, θ_hat: [1.029 0.243 0.517], gradient: [17.867 56.945 81.604], likelihood: -26.635
iteration: 20, θ_hat: [1.726 0.13 0.303], gradient: [ 4.68 14.519 21.913], likelihood: -33.488
iteration: 21, θ_hat: [2.098 0.129 0.146], gradient: [0.743 2.226 3.606], likelihood: -34.429
iteration: 22, θ_hat: [2.194 0.14 0.099], gradient: [0.036 0.106 0.177], likelihood: -34.468
iteration: 23, θ_hat: [2.199 0.141 0.097], gradient: [0. 0. 0.001], likelihood: -34.468
iteration: 24, θ_hat: [2.199 0.141 0.097], gradient: [0. 0. 0.], likelihood: -34.468
beta_hat
array([2.19900227, 0.1405911 , 0.09672535])
We get nearly the same coefficients as with the sklearn function (the first value is the constant, and the next two, 0.141 and 0.097, are the weights on our two variables). The small differences arise because sklearn's PoissonRegressor applies an L2 penalty by default (alpha=1).
Overall, we can model $\hat{y} = e^{2.2 + 0.14 X_1 + 0.09 X_2}$
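To see how well this fitted model tracks the data, we can plug the estimated coefficients back into $\hat{y} = e^{X'\hat{\beta}}$ (coefficient values copied from the output above, rounded):

```python
import numpy as np

# Coefficients estimated above (constant first, then the two slopes)
beta_hat = np.array([2.199, 0.1406, 0.0967])
X_val = np.array([[1, 2], [2, 3], [3, 5], [4, 3]], dtype=float)
y_val = np.array([12, 17, 22, 21], dtype=float)

# Add the constant column, then predict y_hat = exp(X'beta)
X_design = np.column_stack([np.ones(len(X_val)), X_val])
y_hat = np.exp(X_design @ beta_hat)
print(np.round(y_hat, 2))
```

The predictions land close to the observed counts [12, 17, 22, 21].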
Logit¶
Logit is a tool at the basis of discrete choice models. We live in a discrete world: when you choose a level of insurance or an airplane ticket, you choose between basic, premium, elite, or whatever the tiers are called. If you want to go from Paris to London, you need to choose between airplane, train, boat, car, or swimming. What drives your choice? Probably comfort, speed, and price, among others.
As you may already know, OLS is unfit for modelling this type of process. Let us say that you can only choose between plane ($y_i = 1$) and train ($y_i = 0$). For any explanatory variable $x$, we are looking for the probability $P(y_i = 1 | x)$. OLS doesn't know that we are looking for a probability, so it may give probabilities $> 1$ or $< 0$, which make no sense. For this reason the logit framework is more appropriate than OLS.
Logit relies on an important concept, unobserved heterogeneity: the "things" that make a given alternative more or less desirable, but that we cannot observe in the data. For example, if you study the market for chairs, it may be easy to observe the material used, but much harder to observe the level of comfort.
In the logit model, this unobserved heterogeneity is assumed to follow the Gumbel distribution, also called the Extreme Value Type 1 distribution.
The CDF of this distribution is the following: $F(\epsilon) = e^{-e^{-\epsilon}}$
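Inverting this CDF gives a simple way to sample from the Gumbel distribution: if $u \sim U(0,1)$, then $\epsilon = -\log(-\log u)$ is standard Gumbel. A minimal check (the seed and sample size below are arbitrary):

```python
import numpy as np

rng = np.random.default_rng(0)
u = rng.uniform(size=100_000)
# Inverse of F(eps) = exp(-exp(-eps)):  eps = -log(-log(u))
eps = -np.log(-np.log(u))

# The mean of the standard Gumbel is the Euler-Mascheroni constant (about 0.577)
print(eps.mean())
```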
import numpy as np
import seaborn as sns
X = np.random.gumbel(size=100000)
sns.kdeplot(X, label="Standard Gumbel Distribution", bw_adjust=1)
Y = np.random.gumbel(size=100000)
Z = X-Y
sns.kdeplot(Z, label="Logistic Distribution", bw_adjust=1)
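The plot above illustrates a useful fact: the difference of two independent standard Gumbel draws follows the logistic distribution, whose CDF is $1/(1+e^{-x})$. A quick numerical check (seed and sample size arbitrary):

```python
import numpy as np

rng = np.random.default_rng(0)
d = rng.gumbel(size=200_000) - rng.gumbel(size=200_000)
# Empirical CDF at a few points vs the logistic CDF 1/(1+exp(-x))
for x in (-1.0, 0.0, 1.0):
    print(x, (d <= x).mean(), 1 / (1 + np.exp(-x)))
```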
Multinomial Logit¶
An agent $i$ chooses among alternatives $j \in \mathcal{J}$. The utility derived by this agent from choosing alternative $j$ is written as follows:
\begin{align} U_{ij} = V_{ij} + \epsilon_{ij}, \hspace{5pt} \forall j \end{align}
Where $V$ is deterministic, a function of product characteristics (like $V$ = $X'\beta$), and $\epsilon$ is the unobserved heterogeneity.
The probability that this agent chooses $j$ rather than $k$ given $\epsilon_{ij}$ is thus:
\begin{align} P_{ij|\epsilon} &= P(U_{ik} < U_{ij} | \epsilon_{ij} \hspace{5pt} \forall k \neq j) \\ &= P(V_{ik} + \epsilon_{ik} < V_{ij} + \epsilon_{ij} \hspace{5pt} \forall k \neq j) \\ &= P(\epsilon_{ik} < \epsilon_{ij} + V_{ij} - V_{ik} \hspace{5pt} \forall k \neq j)\\ &= F(\epsilon_{ij} + V_{ij} - V_{ik} \hspace{5pt} \forall k \neq j)\\ &= \prod_{k \neq j}e^{-e^{-{\epsilon_{ij} + V_{ij} - V_{ik}}}} \end{align}
The next steps are quite algebraic, and you only need to know and be able to use the final result. But it is good to rederive it once in your life. Since we do not observe $\epsilon_{ij}$, we need to integrate over its possible values to go from $P_{ij|\epsilon}$ to $P_{ij}$.
\begin{align} P_{ij} = \int_{-\infty}^{+\infty} \prod_{k \neq j} e^{-e^{-{\epsilon_{ij} + V_{ij} - V_{ik}}}} f(\epsilon_{ij}) d \epsilon_{ij} \end{align} We know that: \begin{align} f(\epsilon_{ij}) = e^{-\epsilon_{ij}} e^{-e^{-\epsilon_{ij}}} = e^{-\epsilon_{ij}} e^{-e^{-\epsilon_{ij} + V_{ij} - V_{ij}}} \end{align}
\begin{align} P_{ij} = \int_{-\infty}^{+\infty} \Bigg(\prod_{k} e^{-e^{-{(\epsilon_{ij} + V_{ij} - V_{ik})}}}\Bigg) e^{-\epsilon_{ij}} d \epsilon_{ij} \\ = \int_{-\infty}^{+\infty} \exp\Bigg( -\sum_k e^{-(\epsilon_{ij} + V_{ij} - V_{ik})} \Bigg) e^{-\epsilon_{ij}} d\epsilon_{ij} \\ = \int_{-\infty}^{+\infty} \exp\Bigg( -e^{-\epsilon_{ij}} \sum_k e^{-(V_{ij} - V_{ik})} \Bigg) e^{-\epsilon_{ij}} d\epsilon_{ij} \end{align}
Change of variables: define $t = e^{-\epsilon_{ij}}$, so that $\frac{dt}{d\epsilon_{ij}} = -e^{-\epsilon_{ij}}$, i.e. $dt = -e^{-\epsilon_{ij}}d\epsilon_{ij}$.
We rewrite the integral in terms of $t$, noting that as $\epsilon_{ij} \to +\infty$, $t \to 0$, and as $\epsilon_{ij} \to -\infty$, $t \to +\infty$.
\begin{align} P_{ij} &= \int_{+\infty}^0 \exp \Bigg( -t \sum_k e^{-(V_{ij} - V_{ik})} \Bigg) (-dt) \\ &= \int_{0}^{+\infty} \exp \Bigg( -t \sum_k e^{-(V_{ij} - V_{ik})} \Bigg) dt \end{align}
This expression is then relatively easy to integrate since it's based on an exponential:
\begin{align} &= \Bigg[ \frac{\exp \Big( -t \sum_k e^{-(V_{ij} - V_{ik})} \Big)}{-\sum_k e^{-(V_{ij} - V_{ik})} } \Bigg]_0^{+\infty} \\ &= \frac{1}{\sum_k e^{-(V_{ij} - V_{ik})}} \\ &= \frac{e^{V_{ij}}}{\sum_k e^{V_{ik}}} \end{align}
This remarkably succinct closed-form expression, made possible by the distributional assumption on the vector of $\epsilon$, allows us to transform observable characteristics into choice probabilities. Based on "what we can observe" about alternative $j$, we can express the probability that an individual chooses it. How do we now go from this to a regression that allows us to estimate parameters?
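This closed form is just the softmax of the deterministic utilities. A minimal sketch, with hypothetical utility values:

```python
import numpy as np

def choice_probs(V):
    # Logit formula: P_j = exp(V_j) / sum_k exp(V_k).
    # Subtracting max(V) avoids overflow without changing the result.
    eV = np.exp(V - V.max())
    return eV / eV.sum()

# Hypothetical deterministic utilities for three alternatives
V = np.array([1.0, 2.0, 0.5])
P = choice_probs(V)
print(np.round(P, 3), P.sum())
```

The probabilities sum to 1, and the alternative with the highest $V_j$ gets the largest share.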
An IO application¶
Let us normalize the representative utility of alternative $j=0$ to $V_0 = 0$, and drop the individual subscript for now: we interpret $P_{j}$ as the "market share" of alternative $j$, which we denote $s_j$. These market shares are usually easily observed in the data. We will typically assume that $V_j$ is linear in the characteristics of $j$: $V_j = X_j'\theta + \xi_j$, where $\xi_j$ is a normally distributed error term. The question is now: how can we estimate $\theta$?
\begin{align} s_j = \frac{e^{V_j}}{\sum_k e^{V_k}} \\ s_0 = \frac{e^{V_0}}{\sum_k e^{V_k}} \end{align}
\begin{align} \frac{s_j}{s_0} &= \frac{e^{V_j}}{e^{V_0}} = \frac{e^{V_j}}{e^0} = e^{V_j} \\ \log(s_j) - \log(s_0) &= V_j \\ \log(s_j) - \log(s_0) &= X_j'\theta + \xi_j \end{align}
This last expression is just an OLS regression equation, which you can estimate since you observe $s_j$ and $X_j$ for every alternative $j$. This model is relatively simple; in empirical Industrial Organization, more subtle methods have been developed, one of which (Berry-Levinsohn-Pakes) we will study tomorrow and in the masterclass.
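As an illustration, we can simulate this inversion end to end: draw hypothetical characteristics and a true $\theta$, build market shares from the logit formula with an outside option, and recover $\theta$ by OLS on $\log(s_j) - \log(s_0)$. All numbers below are made up for the example:

```python
import numpy as np

rng = np.random.default_rng(0)
J, K = 50, 2                         # alternatives, characteristics (hypothetical)
theta_true = np.array([1.0, -0.5])
X = rng.normal(size=(J, K))
xi = rng.normal(scale=0.1, size=J)   # unobserved quality term

# Representative utilities, with V_0 = 0 for the outside option
V = X @ theta_true + xi
shares = np.exp(V) / (1 + np.exp(V).sum())   # s_j; s_0 = 1 / (1 + sum_k exp(V_k))
s0 = 1 - shares.sum()

# Inversion: log(s_j) - log(s_0) = X_j' theta + xi_j, estimated by OLS
y = np.log(shares) - np.log(s0)
theta_hat = np.linalg.lstsq(X, y, rcond=None)[0]
print(np.round(theta_hat, 2))
```

The OLS estimate lands close to the true $\theta = (1, -0.5)$, up to the noise in $\xi$.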
Question: in IO (industrial organization), the price of alternative $j$ is often included among the characteristics of $j$ used in the regression. On the left-hand side of the regression equation sits a product's market share, and on the right-hand side its price. What issue can arise there? How would you solve it?
A word on adding fixed effects by hand¶
Now, how should we proceed if we want to add fixed effects (fe) to a regression? Think of an individual fe as an individual-specific intercept, which allows us to account for heterogeneity across individuals. If we wanted to add a constant for everyone, we would simply append a vector of ones to the data, as we did with the Poisson regression earlier. Now we want to distinguish between individuals, so instead of a vector we append a matrix, which would look like this for 3 individuals and 2 alternatives:
\begin{bmatrix} 1 & 0 & 0 \\ 1 & 0 & 0 \\ 0 & 1 & 0 \\ 0 & 1 & 0 \\ 0 & 0 & 1 \\ 0 & 0 & 1 \\ \end{bmatrix}
Such matrices are built easily with the Kronecker product $\otimes$. If you are not familiar with how it works, make sure you understand it before attending the masterclass. The matrix above can be built the following way:
\begin{align} \mathcal{I}_3 \otimes \mathcal{1}_2 = \begin{bmatrix} 1 & 0 & 0 \\ 0 & 1 & 0 \\ 0 & 0 & 1 \end{bmatrix} \otimes \begin{bmatrix} 1 \\ 1 \end{bmatrix} = \begin{bmatrix} 1 & 0 & 0 \\ 1 & 0 & 0 \\ 0 & 1 & 0 \\ 0 & 1 & 0 \\ 0 & 0 & 1 \\ 0 & 0 & 1 \\ \end{bmatrix} \end{align}
Intuitively, $A \otimes B$ multiplies each individual entry of $A$ by the whole matrix $B$, which allows us to (sort of) keep the structure of the first matrix.
#in numpy: implemented by np.kron
I = 3
J = 2
fe_matrix = np.kron(np.identity(I), np.ones((J, 1)))
fe_matrix
array([[1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.]])