import torch
import numpy as np
import sympy as sp
from scipy.optimize import linprog


def gradient(f, X):
    """Gradient of f at X via torch autograd."""
    X = X.clone().detach().requires_grad_(True)
    y = f(*X)
    y.backward()
    return X.grad
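
# Quick sanity check (illustrative values): for f(x1, x2) = x1**2 + x2**2
# the gradient at (1, 2) is (2, 4).
assert torch.allclose(
    gradient(lambda x1, x2: x1 ** 2 + x2 ** 2, torch.tensor([1.0, 2.0])),
    torch.tensor([2.0, 4.0]))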

def to_series(X):
    """Coefficients for the split representation E_j = u_j - v_j:
    [x_0, -x_0, x_1, -x_1, ...]."""
    return torch.stack([X, -X], dim=1).reshape(-1)


def lpt(nabla_f, A):
    """Direction-finding LP: minimise nabla_f . E subject to A E <= 0 for the
    active constraint rows A, with E_j = u_j - v_j, u, v >= 0 and the
    normalisation sum(u) + sum(v) <= 1."""
    l = nabla_f.shape[0]
    c = to_series(nabla_f).detach().cpu().numpy()
    a = [np.ones(2 * l)]                        # normalisation row: sum z <= 1
    for row in A:
        a.append(to_series(row).detach().cpu().numpy())   # A_i . E <= 0
    b = np.zeros(A.shape[0] + 1)
    b[0] = 1
    bounds = [(0, None)] * (2 * l)
    # 'revised simplex' was removed from recent SciPy; 'highs' is the current
    # solver family.
    res = linprog(c, A_ub=a, b_ub=b, bounds=bounds, method='highs')
    if res.fun > -1e-6:
        # No usable descent direction remains: KKT point up to tolerance.
        return torch.zeros(l, dtype=torch.float64)
    z = torch.from_numpy(res.x)
    return z[0::2] - z[1::2]                    # recover E_j = u_j - v_j
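
# Illustrative check with hypothetical inputs: for gradient (1, 1) and one
# active constraint row (1, 1), any optimal direction E must satisfy
# A . E <= 0 (stay feasible) and nabla_f . E < 0 (descend).
_g = torch.tensor([1.0, 1.0], dtype=torch.float64)
_A = torch.tensor([[1.0, 1.0]], dtype=torch.float64)
_E = lpt(_g, _A)
assert torch.sum(_A[0] * _E).item() <= 1e-9
assert torch.sum(_g * _E).item() < 0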

def get_h(f, X0, Xi):
    """Restriction of f to the ray X0 + r*Xi, as a numeric callable of r."""
    r = sp.symbols('r')
    Xir = [x0.item() + xi.item() * r for x0, xi in zip(X0, Xi)]
    h = sp.simplify(f(*Xir))
    # heuristic: if sympy prints the expression with a leading minus, flip the
    # sign so that gss (a minimiser) sees a convex bowl
    if str(h)[0] == '-':
        h = -h
    return sp.lambdify(r, h)

def gss(f, a, b, eps=1e-8):
    """Golden-section search for the minimiser of f on [a, b]."""
    gr = (np.sqrt(5) - 1) / 2
    lmbda = a + (1 - gr) * (b - a)
    mu = a + gr * (b - a)
    while abs(a - b) > eps:
        if f(lmbda) < f(mu):
            b = mu
        else:
            a = lmbda
        lmbda = a + (1 - gr) * (b - a)
        mu = a + gr * (b - a)
    return (b + a) / 2
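
# Sanity check (illustrative): the minimiser of (r - 3)**2 on [0, 10] is r = 3.
assert abs(gss(lambda r: (r - 3) ** 2, 0, 10) - 3) < 1e-6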

def calc_r(f, X0, Xi):
    """Exact line search: minimise f along the ray X0 + r*Xi,
    bracketing r in [-1000, 1000]."""
    h = get_h(f, X0, Xi)
    return gss(h, -1000, 1000)
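
# Illustrative line-search example (hypothetical point and direction): from
# X0 = (0, 0) along E = (1, 0), f = (x1-4)**2 + (x2-2)**2 restricts to
# h(r) = (r - 4)**2 + 4, so the optimal step is r = 4.
_f = lambda x1, x2: (x1 - 4) ** 2 + (x2 - 2) ** 2
_r = calc_r(_f, torch.tensor([0.0, 0.0]), torch.tensor([1.0, 0.0]))
assert abs(_r - 4) < 1e-4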

def gradient_descent(f, X0, A, B, precision=1e-3, eps=1e-9):
    """Feasible-direction descent for  min f(x)  s.t.  A x <= B,  x >= 0."""
    i = 0
    while True:
        ax = torch.sum(A * X0, 1)
        # constraints active at X0, with a small tolerance so that points that
        # land on a face numerically are still recognised as boundary points
        active = torch.where(ax >= B - eps)[0]
        if active.numel() == 0:
            E = -gradient(f, X0)                # interior: steepest descent
        else:
            nabla_f = gradient(f, X0)
            E = lpt(nabla_f, A[active])         # boundary: feasible-direction LP
        if torch.norm(E).item() <= precision:
            break
        # largest step before an inactive constraint A_i x <= b_i is violated
        ae = torch.sum(A * E, 1)
        alpha_is = torch.where(ae > eps, (B - ax) / ae,
                               torch.full_like(ae, float('inf')))
        alpha_i = alpha_is.min().item()
        # largest step before a coordinate bound x_j >= 0 is violated
        alpha_js = torch.where(E < -eps, -X0 / E,
                               torch.full_like(E, float('inf')))
        alpha_j = alpha_js.min().item()
        # unconstrained minimiser of f along the ray
        alpha_star = calc_r(f, X0, E)
        r = min(alpha_i, alpha_j, alpha_star)
        # print(f'iter {i}: E = {E}, X0 = {X0}, alpha_i = {alpha_i}, '
        #       f'alpha_j = {alpha_j}, alpha_* = {alpha_star}, r = {r}, '
        #       f'norm = {torch.norm(E).item()}')
        X0 = X0 + r * E
        i += 1
    return X0.detach(), i

X0 = torch.tensor([1/2, 17/12], dtype=torch.float64)
A = torch.tensor([[1, 1],
                  [1, 2]], dtype=torch.float64)
B = torch.tensor([3, 4], dtype=torch.float64)
f = lambda x1, x2: (x1 - 4) ** 2 + (x2 - 2) ** 2
X, i = gradient_descent(f, X0, A, B, 1e-3)
print(f'Minimum = {X}\nf(Minimum) = {f(*X)}\nnumber of iterations = {i}')
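
# The analytic optimum is the projection of the unconstrained minimiser (4, 2)
# onto the feasible region: x* = (2.5, 0.5) on the face x1 + x2 = 3, with
# f(x*) = 4.5, so the result can be verified directly.
assert torch.allclose(X, torch.tensor([2.5, 0.5], dtype=torch.float64), atol=1e-2)
assert abs(f(*X) - 4.5) < 1e-2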