Least squares

Linear LS

Let's generate some noisy data

In [1]:
import numpy as np
import matplotlib.pyplot as plt
figsize = (10,10)

np.random.seed(123)
In [2]:
x_max = 10
x_step = 0.01
x = np.arange(0, x_max, x_step)
y = 3*x-3

# add noise to data
std = 1
y = y+np.random.normal(scale=std, size=x.shape)

plt.figure(figsize=figsize)
plt.plot(x, y, '*')
plt.title("noisy data")
Out[2]:
Text(0.5, 1.0, 'noisy data')

Calculate the LS matrices and solve for the coefficients:

$$ Xb = y $$
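The system is overdetermined, so it is solved in the least-squares sense via the normal equations (this is exactly what the cells below compute):

$$ b = (X^T X)^{-1} X^T y $$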
In [3]:
x_vec = x.reshape(-1, 1)
X = np.concatenate((x_vec, np.ones(x_vec.shape)), axis=1)
print(X.shape)
y_vec = y.reshape(-1, 1)
(1000, 2)
In [4]:
b = np.linalg.inv(X.T@X)@X.T@y_vec
print(b)

# plot fit results
plt.figure(figsize=figsize)
plt.plot(x, y, '*')
plt.plot(x, b[0]*x+b[1], 'r')
plt.title("noisy data + best LS fit. $b^T$="+str(b.T))
[[ 2.98942061]
 [-2.98672009]]
Out[4]:
Text(0.5, 1.0, 'noisy data + best LS fit. $b^T$=[[ 2.98942061 -2.98672009]]')

Comparison to np.linalg.lstsq

In [5]:
b_np = np.linalg.lstsq(X, y_vec, rcond=None)[0]
mse = np.mean((b-b_np)**2)
print(mse)
1.9129876951609536e-29
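A numerically friendlier alternative to the explicit inverse used above is to solve the normal equations directly; a minimal sketch, reusing X, y_vec and b_np from the cells above:

In [ ]:
# solve (X^T X) b = X^T y without forming the inverse explicitly
b_solve = np.linalg.solve(X.T @ X, X.T @ y_vec)
print(np.mean((b_solve - b_np)**2))  # should again be close to zero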

Vertical dataset

As mentioned in the lecture, LS is not good at fitting a nearly vertical dataset: the model $y = b_0 x + b_1$ measures errors only along $y$, and a vertical line would require an infinite slope. Let's generate such data:

In [6]:
data_sz = 100
x = np.ones(data_sz)
y = np.arange(data_sz)

# add noise to data
std = 0.1
x = x+np.random.normal(scale=std, size=x.shape)

plt.figure(figsize=figsize)
plt.plot(x, y, '*')
axes = plt.gca()
axes.set_xlim([-3, 3])
plt.title("noisy vertical data")
Out[6]:
Text(0.5, 1.0, 'noisy vertical data')

Calculate the LS matrices and the resulting fit:

In [7]:
x_vec = x.reshape(-1, 1)
X = np.concatenate((x_vec, np.ones(x_vec.shape)), axis=1)
y_vec = y.reshape(-1, 1)
In [8]:
b = np.linalg.inv(X.T@X)@X.T@y_vec
print(b)

# plot fit results
plt.figure(figsize=figsize)
plt.plot(x, y, '*')
x_axis = np.arange(3)
plt.plot(x_axis, b[0]*x_axis+b[1], 'r')
plt.title("vertical data + best LS fit. $b^T$="+str(b.T))
[[29.75660996]
 [19.74053351]]
Out[8]:
Text(0.5, 1.0, 'vertical data + best LS fit. $b^T$=[[29.75660996 19.74053351]]')

TLS

Same vertical data, now fitted with total least squares (TLS):
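TLS minimizes the perpendicular distance from each point to the line rather than the vertical distance. With the data centered around its mean, the normal $(a, b)$ of the best-fit line is the eigenvector of $X^T X$ associated with the smallest eigenvalue:

$$ \min_{a^2+b^2=1} \sum_i \left( a(x_i-\bar{x}) + b(y_i-\bar{y}) \right)^2, \qquad a x + b y + c = 0, \quad c = -a\bar{x} - b\bar{y} $$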

In [9]:
X = np.concatenate((x.reshape(-1, 1)-np.mean(x),
                    y.reshape(-1, 1)-np.mean(y)), axis=1)


def linear_tls(X):
    # the TLS line normal is the eigenvector of X^T X with the smallest eigenvalue
    w, v = np.linalg.eig(X.T@X)
    return v[:, np.argmin(w)]


tls_res = linear_tls(X)

a = tls_res[0]
b = tls_res[1]

c = -a*np.mean(x)-b*np.mean(y)
x_fit = np.array([x.min(), x.max()])
y_fit = -a/b*x_fit-c/b

plt.figure(figsize=figsize)
plt.plot(x, y, '*')
plt.plot(x_fit, y_fit)
axes = plt.gca()
axes.set_xlim([-3, 3])
axes.set_ylim([0, 100])
plt.title("noisy vertical data + linear TLS fit")
Out[9]:
Text(0.5, 1.0, 'noisy vertical data + linear TLS fit')

Example of LS with a nonlinear model
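The model $y = b_0 x^2 + b_1 x + b_2$ is nonlinear in $x$ but still linear in the coefficients, so ordinary LS applies; the design matrix just gains a quadratic column:

$$ X = \begin{bmatrix} x^2 & x & 1 \end{bmatrix}, \qquad Xb = y $$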

In [10]:
x_step = 0.01
x = np.arange(-10, 10+x_step, x_step)

y = 0.5*x**2+2*x+5

# add noise to data
std = 5
y = y+np.random.normal(scale=std, size=y.shape)

plt.figure(figsize=figsize)
plt.plot(x, y, '*')
plt.title("noisy parabola data")
Out[10]:
Text(0.5, 1.0, 'noisy parabola data')
In [11]:
# calc LS matrices
x_vec = x.reshape(-1, 1)
X = np.concatenate((x_vec**2, x_vec, np.ones(x_vec.shape)), axis=1)
y_vec = y.reshape(-1, 1)

b = np.linalg.lstsq(X, y_vec, rcond=None)[0]
print(b)
[[0.49962701]
 [1.99396636]
 [5.08918783]]
In [12]:
plt.figure(figsize=figsize)
plt.plot(x, y, '*')
plt.plot(x, b[0]*x**2+b[1]*x+b[2], 'r')
plt.title("data + best LS fit. $b^T$="+str(b.T))
Out[12]:
Text(0.5, 1.0, 'data + best LS fit. $b^T$=[[0.49962701 1.99396636 5.08918783]]')

Outliers

As mentioned, LS is sensitive to outliers: the squared error gives a single bad point a disproportionately large influence on the fit.

In [13]:
x_max = 10

x = np.arange(10)
y = 4*x+2

# let's change the last data point
y[-1] -= 20

plt.figure(figsize=figsize)
plt.plot(x, y, '*')
plt.title("data with outlier")
Out[13]:
Text(0.5, 1.0, 'data with outlier')
In [14]:
# calc LS matrices
x_vec = x.reshape(-1, 1)
X = np.concatenate((x_vec, np.ones(x_vec.shape)), axis=1)
y_vec = y.reshape(-1, 1)
In [15]:
b = np.linalg.inv(X.T@X)@X.T@y_vec
print(b)

# plot fit results
plt.figure(figsize=figsize)
plt.plot(x, y, '*')
x_axis = np.arange(x_max)
plt.plot(x_axis, b[0]*x_axis+b[1], 'r')
plt.title("data with outlier + best LS fit. $b^T$="+str(b.T))
[[2.90909091]
 [4.90909091]]
Out[15]:
Text(0.5, 1.0, 'data with outlier + best LS fit. $b^T$=[[2.90909091 4.90909091]]')

RANSAC

Arrange the data: a noisy line plus uniformly scattered outliers.

In [16]:
x = np.arange(0, 10, 0.1)
y = 3*x-3

# add noise to data
std = 1
y = y+np.random.normal(scale=std, size=x.shape)

# add random outliers unrelated to the noisy line
noise_sz = int(x.shape[0]*1)  # number of outliers as a fraction of the data size (here 100%)
x_noise = np.random.uniform(x.min(), x.max(), size=noise_sz)
y_noise = np.random.uniform(y.min(), y.max(), size=noise_sz)

plt.figure(figsize=figsize)
plt.plot(x, y, '*')
plt.plot(x_noise, y_noise, '*')
Out[16]:
[<matplotlib.lines.Line2D at 0x16d360eeda0>]
In [17]:
x = np.concatenate((x, x_noise))
y = np.concatenate((y, y_noise))

Run RANSAC
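
As a rough side note (not computed in this notebook), the number of random sampling rounds needed so that at least one minimal sample is outlier-free with probability $p$, given an inlier ratio $w$ and a minimal sample size $n$ (here $n=2$ points per line), is commonly estimated as

$$ k = \frac{\log(1-p)}{\log(1-w^{n})} $$

Below we simply fix a small number of cycles.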

In [18]:
def basic_ransac(x, y, TH):
    #====== choose 2 distinct random indices
    rand_indices = np.random.choice(x.shape[0], size=2, replace=False)

    #====== build LS data:
    x_vec = x[rand_indices].reshape(-1, 1)
    X = np.concatenate((x_vec, np.ones(x_vec.shape)), axis=1)
    y_vec = y[rand_indices].reshape(-1, 1)

    b = np.linalg.lstsq(X, y_vec, rcond=None)[0].flatten()

    #====== build fitted line
    line_p1 = np.array([x.min(), b[0]*x.min()+b[1]])
    line_p2 = np.array([x.max(), b[0]*x.max()+b[1]])
    inliers_ind = []
    
    #====== distance of fit line from each sample to determine inliers
    for j in range(x.shape[0]):
        p_j = np.array([x[j], y[j]])

        # https://en.wikipedia.org/wiki/Cross_product#Geometric_meaning
        # |a X b| = |a||b|sin(t) -> |a X b|/|b| = |a|sin(t)
        d_j = np.linalg.norm(np.cross(line_p1-p_j, line_p2-line_p1))/np.linalg.norm(line_p2-line_p1)
        if d_j <= TH:
            inliers_ind.append(j)

    inliers_ind = np.array(inliers_ind)
    return b, inliers_ind
In [19]:
TH = 1
num_cycles = 10

num_best_inliers = 0
best_cycle_ind = -1
inliers_ind_list = []
b_list = []

for i in range(num_cycles):
    b, inliers_ind = basic_ransac(x, y, TH)
    inliers_ind_list.append(inliers_ind)
    b_list.append(b)

    #====== save best model
    if num_best_inliers < inliers_ind.shape[0]:
        num_best_inliers = inliers_ind.shape[0]
        best_cycle_ind = i
In [20]:
# plot best fit
plt.rcParams['figure.figsize'] = [20, 20]

for i in range(num_cycles):
    plt.subplot(int(num_cycles/2), 2, i+1) 
    ax = plt.gca()
    x_axis = np.arange(11)
    ax.plot(x_axis, b_list[i][0]*x_axis+ b_list[i][1], 'r')
    ax.plot(x, y, '*')
    ax.plot(x[inliers_ind_list[i]], y[inliers_ind_list[i]], '*k')
    ax.set_xlim([0, 10])
    ax.set_ylim([0, 25])
    if i == best_cycle_ind:
        plt.title("!!! BEST FIT !!! num inliers: "+ str(inliers_ind_list[i].shape[0]))
    else:
        plt.title("num inliers: "+ str(inliers_ind_list[i].shape[0]))

Test RANSAC with the sklearn package (a well-known machine-learning package for Python)

Code taken from: https://scikit-learn.org/stable/auto_examples/linear_model/plot_ransac.html

In [21]:
from sklearn import linear_model

X = x.reshape(-1, 1)


lr = linear_model.LinearRegression()
lr.fit(X, y)

# Robustly fit linear model with RANSAC algorithm
ransac = linear_model.RANSACRegressor()
ransac.fit(X, y)
inlier_mask = ransac.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)

# Predict data of estimated models
line_X = np.arange(X.min(), X.max())[:, np.newaxis]
line_y = lr.predict(line_X)
line_y_ransac = ransac.predict(line_X)

plt.figure()
plt.scatter(X[inlier_mask], y[inlier_mask], color='yellowgreen', marker='.',
            label='Inliers')
plt.scatter(X[outlier_mask], y[outlier_mask], color='red', marker='.',
            label='Outliers')
plt.plot(line_X, line_y, color='navy', linewidth=2, label='Linear regressor')
plt.plot(line_X, line_y_ransac, color='cornflowerblue', linewidth=2,
         label='RANSAC regressor')
plt.legend(loc='lower right')
plt.title("RANSAC using sklearn package")
plt.show()
In [ ]: