Importing things
%matplotlib inline
import numpy as np
from matplotlib import pyplot as plt
Load the data with "dtype=None" to load it as strings (not all of them are numbers).
csv = np.genfromtxt("../data/training.csv", delimiter=",", dtype=None)
print csv
It seems the first line is a header, the first column is an event ID and can be ignored. We decide to also ignore the two last columns (a probability for the simulation, and the label). There are actually two types of events: 'b' for background event, and 's' for signal events.
The loaded csv data were in string formats, for the actual "data" features, we force it to be interpreted as 'float' (real number).
head = csv[0,:]
data = np.array(csv[1:,1:-2], dtype=float) # col 1 is an ID, last are weight and label
weights = np.array(csv[1:,-2], dtype=float)
labels = csv[1:,-1]
print head.shape
print labels.shape
print data.shape
print "Missing values: ",(data == -999).sum()
print "Rows with missing values: ", (data == -999).any(axis=1).sum()
print "Rows without missing values: ", (data != -999).all(axis=1).sum()
indicesOfCleanRows = (data != -999).all(axis=1)
onlyclean = data[ indicesOfCleanRows, :]
onlycleanlabels = labels[ indicesOfCleanRows ]
print onlycleanlabels.shape
print onlyclean.shape
assert len(onlyclean)>0
assert len(onlyclean)<len(data)
mu = onlyclean.mean(0)
mu.shape
std = (onlyclean - mu).std(0)
std.shape
cov = np.matrix(onlyclean - mu)/std
cov = cov.T * cov
cov.shape
plt.imshow(cov, interpolation='nearest')
We extract the eigenvectors and eigenvalues (we print the eigenvalues, we'll only use the vectors)
eigenValues,v = np.linalg.eig(cov)
print eigenValues
Extract and plot the first 4 eigenvectors.
firstVs = v[:,0:4]
plt.imshow(firstVs, interpolation='nearest')
firstVs.shape
proj = ((onlyclean-mu)/std) * firstVs
proj.shape
s = proj[onlycleanlabels == 's'] #signal
b = proj[onlycleanlabels == 'b'] #background
plt.scatter(s[:,0], s[:,1], c='r', marker='o')
plt.scatter(b[:,0], b[:,1], c='g', marker='o', alpha=.3)
plt.show()
Make a plotting function. Plot both red over white and white over red.
def draw(d1, d2, sub1, sub2):
print "Plotting principal component",d1,"versus principal component", d2
fig,(p1,p2) = plt.subplots(nrows=2)
p1.scatter(sub1[:,d1], sub1[:,d2], c='r')
p1.scatter(sub2[:,d1], sub2[:,d2], c='g', alpha=.5)
p2.scatter(sub2[:,d1], sub2[:,d2], c='g')
p2.scatter(sub1[:,d1], sub1[:,d2], c='r', alpha=.5)
plt.show()
s = proj[onlycleanlabels == 's'] #signal
b = proj[onlycleanlabels == 'b'] #background
draw(0,1, s,b)
draw(2,3, s,b)
draw(0,2, s,b)
draw(1,3, s,b)
data.shape
pow2 = 1 << np.arange(0, 30)
def binaryInfo(pow2):
for i in (pow2[::4]):
print "{0:b}".format(i)
print "{0:b}".format(pow2.sum())
print pow2.shape
binaryInfo(pow2)
group = ((data == -999) * pow2).sum(axis=1)
types = set(group)
# may be better to just use the group==t indices
splits = {}
splitLabels = {}
for t in types:
splits[t] = data[group == t,:]
splitLabels[t] = labels[group == t]
print t,":",splits[t].shape,splitLabels[t].shape
def testStd():
N=10
a = np.linspace(1, 10, N)
w = np.linspace(1, 5, N)
print a.std()
print (a - a.mean()).std()
c = a - a.mean()
print np.sqrt((c*c).sum()/len(c))
c = a - np.average(a, weights=w)
print np.sqrt( (c*c*w).sum()/w.sum())
a = np.array([1,1,3])
w = np.array([1,1,2])
print np.average(a, weights=w)
c = a - np.average(a, weights=w)
print np.sqrt( (c*c*w).sum()/w.sum())
a = np.array([1,1,3,3])
w = np.array([1,1,1,1])
print np.average(a, weights=w)
c = a - np.average(a, weights=w)
print np.sqrt( (c*c*w).sum()/w.sum())
testStd()
We expect the -999 to vanish by whitening etc.
d = data[group==0]
w = weights[group==0]
W = w.sum()
mu = np.average(d, axis=0, weights=w)
print mu
def whitenData(d, w):
W = w.sum()
mu = np.average(d, axis=0, weights=w)
wrepeated = np.repeat(w.reshape((-1,1)),d.shape[1], axis=1)
c = d - mu
std = (c*c*wrepeated).sum(axis=0) / W
std = np.sqrt(std) #d.std(0)
std[std==0] = 1 # avoid div by 0 (happens with constant -999 values)
print std
return (d-mu)/std
def doEigenStuff(d, labels):
cov = np.dot(d.T, d)
eigenValues,v = np.linalg.eig(cov)
firstVs = v[:,:4]
proj = np.dot(d, firstVs)
s = proj[labels == 's'] #signal
b = proj[labels == 'b'] #background
fig,(p0,p1,p2) = plt.subplots(ncols=3)
p0.imshow(firstVs, interpolation='nearest')
p1.scatter(s[:,0], s[:,1], c='r')
p1.scatter(b[:,0], b[:,1], c='g', alpha=.5)
p2.scatter(b[:,0], b[:,1], c='g')
p2.scatter(s[:,0], s[:,1], c='r', alpha=.5)
plt.show()
#doEigenStuff(whitenData(onlyclean), onlycleanlabels)
for t in types:
selector = group==t
print 'Working on {0:b} of size {1}'.format(t, selector.sum())
doEigenStuff(whitenData(data[selector], weights[selector]), labels[selector])