Importing things

In [1]:
%matplotlib inline
import numpy as np
from matplotlib import pyplot as plt

Load the data with "dtype=None": since not all of the fields are numbers (there is a text header row and a text label column), everything comes back as strings.

In [5]:
csv = np.genfromtxt("../data/training.csv", delimiter=",", dtype=None)
In [6]:
print csv
[['EventId' 'DER_mass_MMC' 'DER_mass_transverse_met_lep' ...,
  'PRI_jet_all_pt' 'Weight' 'Label']
 ['100000' '138.47' '51.655' ..., '113.497' '0.00265331133733' 's']
 ['100001' '160.937' '68.768' ..., '46.226' '2.23358448717' 'b']
 ..., 
 ['349997' '105.457' '60.526' ..., '41.992' '0.018636116672' 's']
 ['349998' '94.951' '19.362' ..., '0.0' '1.68161144262' 'b']
 ['349999' '-999.0' '72.756' ..., '0.0' '1.87747381063' 'b']]
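
As a one-line sanity check that everything really was read as strings (a sketch using the csv array just loaded):

print csv.dtype   # should be a fixed-width string dtype, something like '|S...'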

It seems the first line is a header and the first column is an event ID, which we can drop. The last two columns are special as well: the event weight from the simulation and the label, where 's' marks a signal event and 'b' a background event. We keep those two aside rather than treating them as features.
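
A quick sketch to confirm that layout, using the raw csv array (nothing here is needed later):

print csv[0, -2:]             # last two header names: should be 'Weight' and 'Label'
print np.unique(csv[1:, -1])  # the label column should only contain 'b' and 's'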

The loaded CSV values are strings; for the actual "data" features we force them to be interpreted as 'float' (real numbers).

In [95]:
head = csv[0,:]
data = np.array(csv[1:,1:-2], dtype=float) # first col is an ID, last two are weight and label
weights = np.array(csv[1:,-2], dtype=float)
labels = csv[1:,-1]
In [8]:
print head.shape
print labels.shape
print data.shape
(33,)
(250000,)
(250000, 30)

In [9]:
print "Missing values: ",(data == -999).sum()
print "Rows with missing values: ", (data == -999).any(axis=1).sum()
print "Rows without missing values: ", (data != -999).all(axis=1).sum()
Missing values:  1580052
Rows with missing values:  181886
Rows without missing values:  68114
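
To see which features are responsible for the missing values, a short sketch that counts the -999s per column, reusing the head array loaded above:

# count the -999s per feature; only print the affected columns
for name, count in zip(head[1:-2], (data == -999).sum(axis=0)):
    if count > 0:
        print name, count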

In [12]:
indicesOfCleanRows = (data != -999).all(axis=1) # boolean mask: rows without any -999
onlyclean = data[ indicesOfCleanRows, :]
onlycleanlabels = labels[ indicesOfCleanRows ]
print onlycleanlabels.shape
print onlyclean.shape
assert len(onlyclean)>0
assert len(onlyclean)<len(data)
(68114,)
(68114, 30)

In [13]:
mu = onlyclean.mean(0)
mu.shape
Out[13]:
(30,)
In [14]:
std = (onlyclean - mu).std(0)
std.shape
Out[14]:
(30,)
In [15]:
cov = np.matrix(onlyclean - mu)/std # whitened data: centered and scaled per feature
cov = cov.T * cov # (unnormalized) covariance matrix of the whitened data
cov.shape
Out[15]:
(30, 30)
In [16]:
plt.imshow(cov, interpolation='nearest')
Out[16]:
<matplotlib.image.AxesImage at 0x49f4890>

We extract the eigenvectors and eigenvalues (we print the eigenvalues, but we will only use the vectors).

In [17]:
eigenValues,v = np.linalg.eig(cov)
print eigenValues
[  4.23978262e+05   2.30349604e+05   1.78049697e+05   1.19636842e+05
   1.18767172e+05   1.17582688e+05   1.09316022e+05   8.47009200e+04
   7.26893494e+04   7.08421850e+04   7.21005472e+04   6.11520676e+04
   5.42604906e+04   4.41399754e+04   4.23068089e+04   3.98430356e+04
   3.69762429e+04   3.55157362e+04   2.70862010e+04   2.68289269e+04
   2.08626077e+04   1.34699297e+04   1.22331591e+04   8.74368204e+03
   6.77054802e+03   7.35478192e+03   3.70977403e+03   2.35462071e+03
   1.79812192e+03   7.47080856e-07]
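
Note that np.linalg.eig makes no promise about the ordering of the eigenvalues (the list above is only roughly descending). Here the four largest happen to come first, so taking the first four columns of v below works, but a more defensive sketch would sort the eigenpairs explicitly (sortedVs is just an illustrative name):

# sketch: order the eigenpairs by decreasing eigenvalue before picking components
order = np.argsort(eigenValues)[::-1]
print eigenValues[order][:4]   # the four largest eigenvalues
sortedVs = v[:, order]         # the eigenvectors, in the same order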

Extract and plot the first 4 eigenvectors.

In [18]:
firstVs = v[:,0:4]
plt.imshow(firstVs, interpolation='nearest')
firstVs.shape
Out[18]:
(30, 4)
In [19]:
proj = ((onlyclean-mu)/std) * firstVs # matrix product: firstVs is an np.matrix, so * is not elementwise
proj.shape
Out[19]:
(68114, 4)
In [20]:
s = proj[onlycleanlabels == 's'] #signal
b = proj[onlycleanlabels == 'b'] #background
plt.scatter(s[:,0], s[:,1], c='r', marker='o')
plt.scatter(b[:,0], b[:,1], c='g', marker='o', alpha=.3)
plt.show()

Make a plotting function. Each pair of components is plotted twice: once with the background (green) drawn over the signal (red), and once with the signal drawn over the background, so neither class hides the other.

In [21]:
def draw(d1, d2, sub1, sub2):
    print "Plotting principal component",d1,"versus principal component", d2
    fig,(p1,p2) = plt.subplots(nrows=2)
    p1.scatter(sub1[:,d1], sub1[:,d2], c='r')
    p1.scatter(sub2[:,d1], sub2[:,d2], c='g', alpha=.5)
    p2.scatter(sub2[:,d1], sub2[:,d2], c='g')
    p2.scatter(sub1[:,d1], sub1[:,d2], c='r', alpha=.5)
    plt.show()
In [22]:
s = proj[onlycleanlabels == 's'] #signal
b = proj[onlycleanlabels == 'b'] #background
draw(0,1, s,b)
draw(2,3, s,b)
draw(0,2, s,b)
draw(1,3, s,b)
Plotting principal component 0 versus principal component 1

Plotting principal component 2 versus principal component 3

Plotting principal component 0 versus principal component 2

Plotting principal component 1 versus principal component 3

Now we will split the dataset based on which values are missing. Each row gets a 30-bit code, with one bit per feature that is set when that feature equals -999, so rows with the same pattern of missing values end up with the same code.

In [23]:
data.shape
Out[23]:
(250000, 30)
In [42]:
pow2 = 1 << np.arange(0, 30)
def binaryInfo(pow2):
  # sanity check: print every 4th power of two in binary, then their sum
  # (which should have all 30 bits set), then the shape of the array
  for i in (pow2[::4]):
    print "{0:b}".format(i)
  print "{0:b}".format(pow2.sum())
  print pow2.shape
binaryInfo(pow2)
1
10000
100000000
1000000000000
10000000000000000
100000000000000000000
1000000000000000000000000
10000000000000000000000000000
111111111111111111111111111111
(30,)
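
To make the encoding concrete, a tiny sketch with a toy row in which features 2 and 5 are missing (the row and its code are purely illustrative):

toyRow = np.zeros(30)
toyRow[[2, 5]] = -999
print "{0:b}".format(int(((toyRow == -999) * pow2).sum()))   # prints 100100, i.e. bits 2 and 5 set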

In [53]:
group = ((data == -999) * pow2).sum(axis=1)
types = set(group)
In [61]:
# may be better to just use the group==t indices
splits = {}
splitLabels = {}
for t in types:
    splits[t] = data[group == t,:]
    splitLabels[t] = labels[group == t]
    print t,":",splits[t].shape,splitLabels[t].shape
0 : (68114, 30) (68114,)
528486512 : (73790, 30) (73790,)
1 : (4429, 30) (4429,)
469766257 : (7562, 30) (7562,)
469766256 : (69982, 30) (69982,)
528486513 : (26123, 30) (26123,)
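
Two quick checks (a sketch, reusing splits, types, pow2 and head from above): the splits should add up to all 250000 rows, and each group code can be decoded back into the names of the features that are missing in that split:

print sum(split.shape[0] for split in splits.values())   # expected: 250000
for t in sorted(types):
    missing = np.nonzero((t & pow2) != 0)[0]
    print "{0:b}".format(t), ":", list(head[1:-2][missing])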

For each split (data with the same pattern of missing values), do a PCA, etc. First, a quick sanity check of the weighted mean and weighted standard deviation formulas we will use.

In [171]:
def testStd():
    # sanity checks: the plain std, the explicit root-mean-square formula,
    # and its weighted generalization should agree on these simple examples
    N=10
    a = np.linspace(1, 10, N)
    w = np.linspace(1, 5, N)
    print a.std()
    print (a - a.mean()).std()
    c = a - a.mean()
    print np.sqrt((c*c).sum()/len(c))
    c = a - np.average(a, weights=w)
    print np.sqrt( (c*c*w).sum()/w.sum())
    a = np.array([1,1,3])
    w = np.array([1,1,2])
    print np.average(a, weights=w)
    c = a - np.average(a, weights=w)
    print np.sqrt( (c*c*w).sum()/w.sum())
    a = np.array([1,1,3,3])
    w = np.array([1,1,1,1])
    print np.average(a, weights=w)
    c = a - np.average(a, weights=w)
    print np.sqrt( (c*c*w).sum()/w.sum())
testStd()
2.87228132327
2.87228132327
2.87228132327
2.5992639034
2.0
1.0
2.0
1.0

Within each split the -999 columns are constant, so after centering and whitening we expect them to essentially vanish (carry no information); we check this below, once whitenData is defined.

In [198]:
d = data[group==0]
w = weights[group==0]
W = w.sum()
mu = np.average(d, axis=0, weights=w)
print mu
[  1.39709464e+02   5.31850747e+01   9.48183441e+01   8.58254122e+01
   1.60410486e+00   1.93575845e+02   6.16525876e-01   2.28489113e+00
   2.63916521e+01   2.51132786e+02   1.91248897e+00   1.79742160e-01
   2.97783653e-01   3.66657838e+01   1.18168236e-02   1.33727544e-02
   5.78010991e+01   1.81423195e-02   3.14800454e-02   4.54222566e+01
  -2.07163008e-02   3.04063638e+02   2.26492407e+00   8.89084360e+01
  -6.62149377e-03  -1.02241927e-02   5.22824423e+01  -8.40270243e-03
   1.55279656e-02   1.56665911e+02]

In [208]:
def whitenData(d, w):
    W = w.sum()
    mu = np.average(d, axis=0, weights=w)
    wrepeated = np.repeat(w.reshape((-1,1)),d.shape[1], axis=1)
    c = d - mu
    std = (c*c*wrepeated).sum(axis=0) / W
    std = np.sqrt(std) #d.std(0)
    std[std==0] = 1 # avoid div by 0 (happens with constant -999 values)
    print std
    return (d-mu)/std

def doEigenStuff(d, labels):
    cov = np.dot(d.T, d)
    eigenValues,v = np.linalg.eig(cov)
    firstVs = v[:,:4]
    proj = np.dot(d, firstVs)
    s = proj[labels == 's'] #signal
    b = proj[labels == 'b'] #background
    fig,(p0,p1,p2) = plt.subplots(ncols=3)
    p0.imshow(firstVs, interpolation='nearest')
    p1.scatter(s[:,0], s[:,1], c='r')
    p1.scatter(b[:,0], b[:,1], c='g', alpha=.5)
    p2.scatter(b[:,0], b[:,1], c='g')
    p2.scatter(s[:,0], s[:,1], c='r', alpha=.5)
    plt.show()
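
As a quick check of the claim above that the -999 columns vanish within a split (a sketch; 469766256 is one of the group codes listed earlier, and white / constantCols are just illustrative names):

sel = group == 469766256
white = whitenData(data[sel], weights[sel])       # note: this also prints the stds
constantCols = (data[sel] == -999).all(axis=0)    # the columns that are all -999 in this split
print white[:, constantCols].std(axis=0)          # expected: essentially zero everywhere
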
In [209]:
#doEigenStuff(whitenData(onlyclean), onlycleanlabels)
for t in types:
    selector = group==t
    print 'Working on {0:b} of size {1}'.format(t, selector.sum())
    doEigenStuff(whitenData(data[selector], weights[selector]), labels[selector])
Working on 0 of size 68114
[  95.72471658   35.63796695   64.45604224   61.38773175    1.24266815
  179.83572194    2.74210648    0.9170581    26.86001606  119.09940376
    1.32642089    1.06082582    0.36360603   22.7944168     1.29093161
    1.82374955   32.61846307    1.31720365    1.81548554   33.40005216
    1.81714854  128.83966928    0.44129277   59.0481216     1.52737555
    1.80669604   29.93360613    1.73707743    1.81258596   99.49358554]

Working on 11111100000000001000001110000 of size 73790
[  6.20258853e+01   2.51303859e+01   4.18374129e+01   1.31601905e+01
   7.81028575e-11   7.81028575e-11   7.81028575e-11   5.78708208e-01
   1.31601905e+01   2.31436001e+01   5.45016782e-01   8.71378650e-01
   7.81028575e-11   1.29850720e+01   1.27026729e+00   1.81198746e+00
   1.44380275e+01   1.37009342e+00   1.81832738e+00   1.13663798e+01
   1.80815053e+00   5.18965455e+01   1.00000000e+00   7.81028575e-11
   7.81028575e-11   7.81028575e-11   7.81028575e-11   7.81028575e-11
   7.81028575e-11   1.00000000e+00]
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-209-cde613b13afd> in <module>()
      3     selector = group==t
      4     print 'Working on {0:b} of size {1}'.format(t, selector.sum())
----> 5     doEigenStuff(whitenData(data[selector], weights[selector]), labels[selector])

<ipython-input-208-b78d8facebcd> in doEigenStuff(d, labels)
     18     b = proj[labels == 'b'] #background
     19     fig,(p0,p1,p2) = plt.subplots(ncols=3)
---> 20     p0.imshow(firstVs, interpolation='nearest')
     21     p1.scatter(s[:,0], s[:,1], c='r')
     22     p1.scatter(b[:,0], b[:,1], c='g', alpha=.5)

/usr/lib/pymodules/python2.7/matplotlib/axes.pyc in imshow(self, X, cmap, norm, aspect, interpolation, alpha, vmin, vmax, origin, extent, shape, filternorm, filterrad, imlim, resample, url, **kwargs)
   7103                        filterrad=filterrad, resample=resample, **kwargs)
   7104 
-> 7105         im.set_data(X)
   7106         im.set_alpha(alpha)
   7107         self._set_artist_props(im)

/usr/lib/pymodules/python2.7/matplotlib/image.pyc in set_data(self, A)
    416 
    417         if self._A.dtype != np.uint8 and not np.can_cast(self._A.dtype, np.float):
--> 418             raise TypeError("Image data can not convert to float")
    419 
    420         if (self._A.ndim not in (2, 3) or

TypeError: Image data can not convert to float
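
The failure above most likely comes from np.linalg.eig: on a symmetric matrix with (numerically) repeated eigenvalues it can return complex eigenvectors, and imshow refuses to draw complex data. A minimal sketch of a fix, under that assumption, is to switch to np.linalg.eigh, which is designed for symmetric matrices and always returns real eigenvalues and eigenvectors, in ascending order (doEigenStuffSym is just an illustrative name):

def doEigenStuffSym(d, labels):
    # same plots as doEigenStuff, but with eigh on the symmetric matrix d.T d;
    # eigh returns eigenvalues in ascending order, so the leading components
    # are the last columns (reversed here so the largest comes first)
    cov = np.dot(d.T, d)
    eigenValues, v = np.linalg.eigh(cov)
    firstVs = v[:, -4:][:, ::-1]
    proj = np.dot(d, firstVs)
    s = proj[labels == 's'] #signal
    b = proj[labels == 'b'] #background
    fig, (p0, p1, p2) = plt.subplots(ncols=3)
    p0.imshow(firstVs, interpolation='nearest')
    p1.scatter(s[:,0], s[:,1], c='r')
    p1.scatter(b[:,0], b[:,1], c='g', alpha=.5)
    p2.scatter(b[:,0], b[:,1], c='g')
    p2.scatter(s[:,0], s[:,1], c='r', alpha=.5)
    plt.show()

A second thing visible in the printed stds is that the constant -999 columns do not hit the std == 0 branch exactly (they come out around 1e-10 because of floating-point round-off in the weighted mean), so a tolerance-based check in whitenData (e.g. std < 1e-8) might be more robust than an exact comparison.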


In []: