## Linear LS

Let's generate some noisy data

In [1]:
import numpy as np
import matplotlib.pyplot as plt
figsize = (10,10)

np.random.seed(123)

In [2]:
x_max = 10
x_step = 0.01
x = np.arange(0, x_max, x_step)
y = 3*x-3

std = 1
y = y+np.random.normal(scale=std, size=x.shape)

plt.figure(figsize=figsize)
plt.plot(x, y, '*')
plt.title("noisy data")

Out[2]:
Text(0.5, 1.0, 'noisy data')

### calc LS matrices and result:

$$Xb = y$$
In [3]:
x_vec = x.reshape(-1, 1)
X = np.concatenate((x_vec, np.ones(x_vec.shape)), axis=1)
print(X.shape)
y_vec = y.reshape(-1, 1)

(1000, 2)

In [4]:
b = np.linalg.inv(X.T@X)@X.T@y_vec
print(b)

# plot fit results
plt.figure(figsize=figsize)
plt.plot(x, y, '*')
plt.plot(x, b[0]*x+b[1], 'r')
plt.title("noisy data + best LS fit. $b^T$="+str(b.T))

[[ 2.98942061]
[-2.98672009]]

Out[4]:
Text(0.5, 1.0, 'noisy data + best LS fit. $b^T$=[[ 2.98942061 -2.98672009]]')

## Comparison to np.linalg.lstsq

In [5]:
b_np = np.linalg.lstsq(X, y_vec, rcond=None)[0]
mse = np.mean((b-b_np)**2)
print(mse)

1.9129876951609536e-29


## vertical dataset

As mentioned in the lecture, LS is not goog at fitting vertical dataset. Let's generate data:

In [6]:
data_sz = 100
x = np.ones(data_sz)
y = np.arange(data_sz)

std = 0.1
x = x+np.random.normal(scale=std, size=x.shape)

plt.figure(figsize=figsize)
plt.plot(x, y, '*')
axes = plt.gca()
axes.set_xlim([-3, 3])
plt.title("noisy vertical data")

Out[6]:
Text(0.5, 1.0, 'noisy vertical data')

Calc LS matrices and result

In [7]:
x_vec = x.reshape(-1, 1)
X = np.concatenate((x_vec, np.ones(x_vec.shape)), axis=1)
y_vec = y.reshape(-1, 1)

In [8]:
b = np.linalg.inv(X.T@X)@X.T@y_vec
print(b)

# plot fit results
plt.figure(figsize=figsize)
plt.plot(x, y, '*')
x_axis = np.arange(3)
plt.plot(x_axis, b[0]*x_axis+b[1], 'r')
plt.title("vertical data + best LS fit. $b^T$="+str(b.T))

[[29.75660996]
[19.74053351]]

Out[8]:
Text(0.5, 1.0, 'vertical data + best LS fit. $b^T$=[[29.75660996 19.74053351]]')

## TLS

Same vertical data, now with total least squares

In [9]:
X = np.concatenate((x.reshape(-1, 1)-np.mean(x),
y.reshape(-1, 1)-np.mean(y)), axis=1)

def linear_tls(X):
w, v = np.linalg.eig(X.T@X)
return v[:, np.argmin(w)]

tls_res = linear_tls(X)

a = tls_res[0]
b = tls_res[1]

c = -a*np.mean(x)-b*np.mean(y)
x_fit = np.array([x.min(), x.max()])
y_fit = -a/b*x_fit-c/b

plt.figure(figsize=figsize)
plt.plot(x, y, '*')
plt.plot(x_fit, y_fit)
axes = plt.gca()
axes.set_xlim([-3, 3])
axes.set_ylim([0, 100])
plt.title("noisy vertical data + linear TLS fit")

Out[9]:
Text(0.5, 1.0, 'noisy vertical data + linear TLS fit')

## Example for non linear LS

In [10]:
x_step = 0.01
x = np.arange(-10, 10+x_step, x_step)

y = 0.5*x**2+2*x+5

std = 5
y = y+np.random.normal(scale=std, size=y.shape)

plt.figure(figsize=figsize)
plt.plot(x, y, '*')
plt.title("noisy parabola data")

Out[10]:
Text(0.5, 1.0, 'noisy parabola data')
In [11]:
# calc LS matrices
x_vec = x.reshape(-1, 1)
X = np.concatenate((x_vec**2, x_vec, np.ones(x_vec.shape)), axis=1)
y_vec = y.reshape(-1, 1)

b = np.linalg.lstsq(X, y_vec, rcond=None)[0]
print(b)

[[0.49962701]
[1.99396636]
[5.08918783]]

In [12]:
plt.figure(figsize=figsize)
plt.plot(x, y, '*')
plt.plot(x, b[0]*x**2+b[1]*x+b[2], 'r')
plt.title("data + best LS fit. $b^T$="+str(b.T))

Out[12]:
Text(0.5, 1.0, 'data + best LS fit. $b^T$=[[0.49962701 1.99396636 5.08918783]]')

## Outliers

As mentioned, LS has a problem with outliers:

In [13]:
x_max = 10

x = np.arange(10)
y = 4*x+2

# let's change the last data point
y[-1] -= 20

plt.figure(figsize=figsize)
plt.plot(x, y, '*')
plt.title("data with outlier")

Out[13]:
Text(0.5, 1.0, 'data with outlier')
In [14]:
# calc LS matrices
x_vec = x.reshape(-1, 1)
X = np.concatenate((x_vec, np.ones(x_vec.shape)), axis=1)
y_vec = y.reshape(-1, 1)

In [15]:
b = np.linalg.inv(X.T@X)@X.T@y_vec
print(b)

# plot fit results
plt.figure(figsize=figsize)
plt.plot(x, y, '*')
x_axis = np.arange(x_max)
plt.plot(x_axis, b[0]*x_axis+b[1], 'r')
plt.title("data with outlier + best LS fit. $b^T$="+str(b.T))

[[2.90909091]
[4.90909091]]

Out[15]:
Text(0.5, 1.0, 'data with outlier + best LS fit. $b^T$=[[2.90909091 4.90909091]]')

## RANSAC

### arrange data

In [16]:
x = np.arange(0, 10, 0.1)
y = 3*x-3

std = 1
y = y+np.random.normal(scale=std, size=x.shape)

# add random noise unrelated to noisy line
noise_sz = int(x.shape[0]*1)
x_noise = np.random.uniform(x.min(), x.max(), size=noise_sz)
y_noise = np.random.uniform(y.min(), y.max(), size=noise_sz)

plt.figure(figsize=figsize)
plt.plot(x, y, '*')
plt.plot(x_noise, y_noise, '*')

Out[16]:
[<matplotlib.lines.Line2D at 0x16d360eeda0>]
In [17]:
x = np.concatenate((x, x_noise))
y = np.concatenate((y, y_noise))


### Run RANSAC

In [18]:
def basic_ransac(x,TH):
#====== choose 2 random inds
rand_indices = np.random.choice(x.shape[0], size=2)

#====== build LS data:
x_vec = x[rand_indices].reshape(-1, 1)
X = np.concatenate((x_vec, np.ones(x_vec.shape)), axis=1)
y_vec = y[rand_indices].reshape(-1, 1)

b = np.linalg.lstsq(X, y_vec, rcond=None)[0].flatten()

#====== build fitted line
line_p1 = np.array([x.min(), b[0]*x.min()+b[1]])
line_p2 = np.array([x.max(), b[0]*x.max()+b[1]])
inliers_ind = []

#====== distance of fit line from each sample to determine inliers
for j in range(x.shape[0]):
p_j = np.array([x[j], y[j]])

# https://en.wikipedia.org/wiki/Cross_product#Geometric_meaning
# |a X b| = |a||b|sin(t) -> |a X b|/|b| = |a|sin(t)
d_j = np.linalg.norm(np.cross(line_p1-p_j, line_p2-line_p1))/np.linalg.norm(line_p2-line_p1)
if d_j <= TH:
inliers_ind.append(j)

inliers_ind = np.array(inliers_ind)
return b, inliers_ind

In [19]:
TH = 1
num_cycles = 10

num_best_inliers = 0
best_cycle_ind = -1
inliers_ind_list = []
b_list = []

for i in range(num_cycles):
b, inliers_ind = basic_ransac(x,TH)
inliers_ind_list.append(inliers_ind)
b_list.append(b)

#====== save best model
if num_best_inliers < inliers_ind.shape[0]:
num_best_inliers = inliers_ind.shape[0]
best_cycle_ind = i

In [20]:
# plot best fit
plt.rcParams['figure.figsize'] = [20, 20]

for i in range(num_cycles):
plt.subplot(int(num_cycles/2), 2, i+1)
ax = plt.gca()
x_axis = np.arange(11)
ax.plot(x_axis, b_list[i][0]*x_axis+ b_list[i][1], 'r')
ax.plot(x, y, '*')
ax.plot(x[inliers_ind_list[i]], y[inliers_ind_list[i]], '*k')
ax.set_xlim([0, 10])
ax.set_ylim([0, 25])
if i== best_cycle_ind:
plt.title("!!! BEST FIT !!! num inliers: "+ str(inliers_ind_list[i].shape[0]))
else:
plt.title("num inliers: "+ str(inliers_ind_list[i].shape[0]))


### Test RANSAC with sklearn package (a known machine learning package in python)

In [21]:
from sklearn import linear_model

X = x.reshape(-1, 1)

lr = linear_model.LinearRegression()
lr.fit(X, y)

# Robustly fit linear model with RANSAC algorithm
ransac = linear_model.RANSACRegressor()
ransac.fit(X, y)

# Predict data of estimated models
line_X = np.arange(X.min(), X.max())[:, np.newaxis]
line_y = lr.predict(line_X)
line_y_ransac = ransac.predict(line_X)

plt.figure()