Data Mining Basics
Objective 1
Become familiar with vectorized operations for manipulating multi-dimensional arrays in NumPy.
- Using numpy, initialize two arrays of random numbers (1000 rows each, with 50 and 10 columns respectively), every value drawn uniformly between 0 and 1;
- Create the correlation matrix (shape=[1000, 1000]) of Pearson correlations between all pairs of rows from above;
- Using matplotlib, plot a 100-bin histogram of the values from the lower triangle of the \(1000\times1000\) correlation-coefficient (r-value) matrix obtained above (omit the diagonal and all cells above the diagonal);
- Using the histogram, estimate the probability of obtaining an r-value > 0.75 or < -0.75 when correlating two random vectors of size 50 or 10.
import numpy as np
import matplotlib.pyplot as plt

num_row = 1000
num_cols = [50, 10]
num_bins = 100
# Indices of the lower triangle, excluding the diagonal
idx_low = np.tril_indices(num_row, -1)

fig, axs = plt.subplots(2, constrained_layout=True)
for idx, col in enumerate(num_cols):
    arr = np.random.rand(num_row, col)
    # Pearson correlations between all pairs of rows: shape (1000, 1000)
    corr_mat = np.corrcoef(arr)
    low_tri_arr = corr_mat[idx_low]
    hist, bin_edges = np.histogram(low_tri_arr, bins=num_bins, density=True)
    # Bins whose left edge lies beyond +/-0.75
    indices = np.logical_or(bin_edges[:-1] < -0.75, bin_edges[:-1] > 0.75)
    # Integrate the density over those bins: sum of height * bin width
    est = np.sum(hist[indices] * np.diff(bin_edges)[indices])
    axs[idx].hist(low_tri_arr, bins=num_bins, density=True)
    axs[idx].set_title('Probability of r-value > 0.75 or < -0.75 is {:.2f}% (size {})'.format(est * 100, col))
    axs[idx].set(xlabel='r-value', ylabel='probability')
plt.show()
Both histograms are approximately Gaussian and centered at zero; the distribution for the size-10 vectors is noticeably wider, so extreme r-values are far more likely with shorter vectors.
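As a rough sanity check on the histogram estimate, one can assume the r-values between random vectors of length \(n\) are approximately normal with mean 0 and standard deviation \(1/\sqrt{n-1}\), and compute the two-tailed probability analytically. This is a sketch under that normality assumption, not part of the original exercise:

import math

# Normal approximation: r ~ N(0, 1/(n-1)) for random vectors of length n
for n in [50, 10]:
    sigma = 1 / math.sqrt(n - 1)
    # Two-tailed probability P(|r| > 0.75) for a zero-mean normal
    p = math.erfc(0.75 / (sigma * math.sqrt(2)))
    print('size {}: P(|r| > 0.75) ~= {:.4g}'.format(n, p))

For size 50 this probability is essentially zero, while for size 10 it is on the order of a few percent, which should agree with the widths of the two histograms.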
Objective 2
Basic data visualization (scatter plots and line charts).
- Get the Linnerud dataset (import using: from sklearn.datasets import load_linnerud). Weight, waist, and heartrate are attributes; chinups, situps, and jumps are outcomes;
- Using numpy matrix functions (np.dot, np.transpose, etc.), compute the linear-least-squares solution, finding the intercept and slope of the best-fit line for each [attribute, outcome] pair (attribute on the x-axis, outcome on the y-axis). Make sure to augment the attribute vectors with a column of 1's so LLS can find the intercept; a sketch of the normal-equation solution follows this list.
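For reference, here is a minimal sketch of the closed-form normal-equation solution using only np.dot and np.transpose; the code below instead uses np.linalg.lstsq, which computes the same least-squares fit more robustly (the helper name lls_normal_equations is ours, not part of the exercise):

import numpy as np

def lls_normal_equations(x, y):
    # Augment x with a column of ones so the solution is [slope, intercept]
    A = np.vstack([x, np.ones(len(x))]).T
    # Solve the normal equations (A^T A) w = A^T y
    AtA = np.dot(np.transpose(A), A)
    Aty = np.dot(np.transpose(A), y)
    slope, intercept = np.dot(np.linalg.inv(AtA), Aty)
    return slope, intercept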
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_linnerud
(data, target) = load_linnerud(return_X_y=True)
x_weight = target[:,0]
x_waist = target[:,1]
x_heartrate = target[:,2]
y_chinups = data[:,0]
y_situps = data[:,1]
y_jumps = data[:,2]
A_weight = np.vstack([x_weight, np.ones(len(x_weight))]).T
A_waist = np.vstack([x_waist, np.ones(len(x_waist))]).T
A_heartrate = np.vstack([x_heartrate, np.ones(len(x_heartrate))]).T
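# Passing the full (20, 3) outcome matrix as the right-hand side solves all
# three outcomes at once: the solution has shape (2, 3), so each m* below
# holds three slopes and each c* three intercepts (one per outcome column).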
m1, c1 = np.linalg.lstsq(A_weight, data, rcond=None)[0]
m2, c2 = np.linalg.lstsq(A_waist, data, rcond=None)[0]
m3, c3 = np.linalg.lstsq(A_heartrate, data, rcond=None)[0]
fig, axs = plt.subplots(3, 3, constrained_layout=True)
axs[0, 0].plot(x_weight, y_chinups, 'o')
axs[0, 0].plot(x_weight, m1[0]*x_weight + c1[0])
axs[0, 0].set_title('Slope: {:.2f}\nIntercept: {:.2f}'.format(m1[0], c1[0]))
axs[0, 0].set(xlabel='weight', ylabel='chinups')
axs[0, 1].plot(x_weight, y_situps, 'o')
axs[0, 1].plot(x_weight, m1[1]*x_weight + c1[1])
axs[0, 1].set_title('Slope: {:.2f}\nIntercept: {:.2f}'.format(m1[1], c1[1]))
axs[0, 1].set(xlabel='weight', ylabel='situps')
axs[0, 2].plot(x_weight, y_jumps, 'o')
axs[0, 2].plot(x_weight, m1[2]*x_weight + c1[2])
axs[0, 2].set_title('Slope: {:.2f}\nIntercept: {:.2f}'.format(m1[2], c1[2]))
axs[0, 2].set(xlabel='weight', ylabel='jumps')
axs[1, 0].plot(x_waist, y_chinups, 'o')
axs[1, 0].plot(x_waist, m2[0]*x_waist + c2[0])
axs[1, 0].set_title('Slope: {:.2f}\nIntercept: {:.2f}'.format(m2[0], c2[0]))
axs[1, 0].set(xlabel='waist', ylabel='chinups')
axs[1, 1].plot(x_waist, y_situps, 'o')
axs[1, 1].plot(x_waist, m2[1]*x_waist + c2[1])
axs[1, 1].set_title('Slope: {:.2f}\nIntercept: {:.2f}'.format(m2[1], c2[1]))
axs[1, 1].set(xlabel='waist', ylabel='situps')
axs[1, 2].plot(x_waist, y_jumps, 'o')
axs[1, 2].plot(x_waist, m2[2]*x_waist + c2[2])
axs[1, 2].set_title('Slope: {:.2f}\nIntercept: {:.2f}'.format(m2[2], c2[2]))
axs[1, 2].set(xlabel='waist', ylabel='jumps')
axs[2, 0].plot(x_heartrate, y_chinups, 'o')
axs[2, 0].plot(x_heartrate, m3[0]*x_heartrate + c3[0])
axs[2, 0].set_title('Slope: {:.2f}\nIntercept: {:.2f}'.format(m3[0], c3[0]))
axs[2, 0].set(xlabel='heartrate', ylabel='chinups')
axs[2, 1].plot(x_heartrate, y_situps, 'o')
axs[2, 1].plot(x_heartrate, m3[1]*x_heartrate + c3[1])
axs[2, 1].set_title('Slope: {:.2f}\nIntercept: {:.2f}'.format(m3[1], c3[1]))
axs[2, 1].set(xlabel='heartrate', ylabel='situps')
axs[2, 2].plot(x_heartrate, y_jumps, 'o')
axs[2, 2].plot(x_heartrate, m3[2]*x_heartrate + c3[2])
axs[2, 2].set_title('Slope: {:.2f}\nIntercept: {:.2f}'.format(m3[2], c3[2]))
axs[2, 2].set(xlabel='heartrate', ylabel='jumps')
plt.show()
Objective 3
To understand basic machine learning algorithms, I implement the following two algorithms from scratch in Python (using only the numpy import) and test them on the Linnerud dataset (import using: from sklearn.datasets import load_linnerud), using all 3 attributes and only the chinups outcome. Define a new vector assigning binary classes to the chinups outcome as follows:
if (chinups > median(chinups)) then class = 1 else class = 0
import numpy as np
from sklearn.datasets import load_linnerud
(data, target) = load_linnerud(return_X_y=True)
chinups = data[:,0]
median = np.median(chinups)
chinups = np.where(chinups > median, 1, 0)
These classes are then used to build the Gaussian Naive Bayes probability table and to train the perceptron.
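A quick check of the resulting class balance (our addition, not part of the exercise); the class counts determine the prior probabilities used below:

# Count instances per class; ties at the median fall into class 0
print(np.bincount(chinups))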
Gaussian Naive Bayes
Gaussian Naive Bayes is a probabilistic modeling algorithm: each feature is modeled with a per-class Gaussian, and the per-feature likelihoods are multiplied together under the naive (conditional-independence) assumption.
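Concretely, the predicted posterior for class 1 given feature vector \(x\) is

\( P(y=1 \mid x) = \frac{P(y=1)\,\prod_i \mathcal{N}(x_i;\, \mu_{1,i},\, \sigma_{1,i})}{\sum_{c \in \{0,1\}} P(y=c)\,\prod_i \mathcal{N}(x_i;\, \mu_{c,i},\, \sigma_{c,i})} \)

which is exactly what pred below computes: the per-class means and standard deviations come from fit, and the priors from the class frequencies.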
class GNB():
    def __init__(self):
        self._mu0 = []
        self._mu1 = []
        self._sigma0 = []
        self._sigma1 = []
        self._prior = {}

    def _prior_prob(self, y):
        # Class priors from relative class frequencies
        unique, counts = np.unique(y, return_counts=True)
        prob = counts / len(y)
        return dict(zip(unique, prob))

    def fit(self, X, y):
        # Per-class, per-feature means and standard deviations
        self._mu0 = np.mean(X[y == 0], axis=0)
        self._mu1 = np.mean(X[y == 1], axis=0)
        self._sigma0 = np.std(X[y == 0], axis=0)
        self._sigma1 = np.std(X[y == 1], axis=0)
        self._prior = self._prior_prob(y)

    def pred(self, x):
        probs0 = np.ones(x.shape)
        probs1 = np.ones(x.shape)
        # Gaussian density of each feature under each class model
        for i in range(x.shape[1]):
            probs0[:, i] = (1 / (np.sqrt(2 * np.pi) * self._sigma0[i])
                            * np.exp(-(x[:, i] - self._mu0[i]) ** 2 / (2 * self._sigma0[i] ** 2)))
            probs1[:, i] = (1 / (np.sqrt(2 * np.pi) * self._sigma1[i])
                            * np.exp(-(x[:, i] - self._mu1[i]) ** 2 / (2 * self._sigma1[i] ** 2)))
        # Joint likelihood of each class: product over features times the prior
        likeli0 = np.prod(probs0, axis=1) * self._prior[0]
        likeli1 = np.prod(probs1, axis=1) * self._prior[1]
        # Posterior probability of class 1
        post = likeli1 / (likeli0 + likeli1)
        return post
gnb = GNB()
gnb.fit(target, chinups)
outputs = gnb.pred(target)
print(((outputs > 0.5) == chinups).sum() / len(chinups))
0.7
Perceptron
The perceptron learning rule is a linear modeling algorithm: it learns a weight vector and bias defining a separating hyperplane, adjusting them whenever an instance is misclassified.
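The update for each misclassified instance \(x\) with label \(y\) and prediction \(\hat{y}\) is

\( w \leftarrow w + (y - \hat{y})\,x, \qquad b \leftarrow b + (y - \hat{y}) \)

with an implicit learning rate of 1; this is the update the fit method below applies.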
class Perceptron:
    # if the perceptron does not converge, run for 1000 iterations
    def __init__(self, iters=1000):
        self._iters = iters
        self._weights = []

    def fit(self, X, y):
        # set all weights (and the bias, stored at index 0) to zero
        self._weights = np.zeros(X.shape[1] + 1)
        # until all instances in the training set are correctly classified,
        # or the iteration budget runs out
        for _ in range(self._iters):
            y_pred = self.pred(X) > 0
            if np.all(y_pred == y):
                break
            for inst, label in zip(X, y):  # for each instance
                pred = self._pred(inst)
                if pred != label:  # if inst is classified incorrectly
                    # if inst belongs to the positive class, add it to the
                    # weight vector; otherwise subtract it
                    self._weights[1:] += (label - pred) * inst
                    self._weights[0] += (label - pred)

    def _pred(self, inst):
        # threshold the weighted sum to get the predicted class
        prediction = np.dot(inst, self._weights[1:]) + self._weights[0]
        return 1 if prediction > 0 else 0

    def pred(self, x):
        # raw weighted sums (pre-threshold activations)
        return np.dot(x, self._weights[1:]) + self._weights[0]
perceptron = Perceptron()
perceptron.fit(target, chinups)
outputs = perceptron.pred(target)
print(((outputs > 0) == chinups).sum() / len(chinups))
0.5
An accuracy of 0.5 is chance level, which suggests the two classes are not linearly separable in these three attributes (the perceptron is only guaranteed to converge when the data is linearly separable).