In [4]:

data = np.loadtxt('subtraining.csv')
print data

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-4-5928ea808f7e> in <module>()
----> 1 data = np.loadtxt('subtraining.csv')
      2 print data

/usr/local/lib/python2.7/dist-packages/numpy/lib/npyio.pyc in loadtxt(fname, dtype, comments, delimiter, converters, skiprows, usecols, unpack, ndmin)
    858 
    859             # Convert each value according to its column and store
--> 860             items = [conv(val) for (conv, val) in zip(converters, vals)]
    861             # Then pack it according to the dtype's nesting
    862             items = pack_items(items, packing)

ValueError: could not convert string to float: EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,DER_sum_pt,DER_pt_ratio_lep_tau,DER_met_p

In [5]:

data = np.loadtxt('subtraining.csv', delimiter=',')
print data

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-5-e20f5f5eef2d> in <module>()
----> 1 data = np.loadtxt('subtraining.csv', delimiter=',')
      2 print data

/usr/local/lib/python2.7/dist-packages/numpy/lib/npyio.pyc in loadtxt(fname, dtype, comments, delimiter, converters, skiprows, usecols, unpack, ndmin)
    858 
    859             # Convert each value according to its column and store
--> 860             items = [conv(val) for (conv, val) in zip(converters, vals)]
    861             # Then pack it according to the dtype's nesting
    862             items = pack_items(items, packing)

ValueError: could not convert string to float: EventId

In [6]:

data = np.loadtxt('subtraining.csv', delimiter=',', skiprows=1, )
print data

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-6-97fcc3397220> in <module>()
----> 1 data = np.loadtxt('subtraining.csv', delimiter=',', skiprows=1)
      2 print data

/usr/local/lib/python2.7/dist-packages/numpy/lib/npyio.pyc in loadtxt(fname, dtype, comments, delimiter, converters, skiprows, usecols, unpack, ndmin)
    858 
    859             # Convert each value according to its column and store
--> 860             items = [conv(val) for (conv, val) in zip(converters, vals)]
    861             # Then pack it according to the dtype's nesting
    862             items = pack_items(items, packing)

ValueError: could not convert string to float: b

In [24]:

def bs_to_float(v):
    if v == 'b':
        return 0
    elif v == 's':
        return 1
    raise Exception('bs_to_float is expecting the value to be b or s')

In [25]:

print bs_to_float("b")
print bs_to_float("s")

0
1

In [26]:

bs_to_float("whatever")

---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
<ipython-input-26-e84ea0969111> in <module>()
----> 1 bs_to_float("whatever")

<ipython-input-24-87497aed0157> in bs_to_float(v)
      4     elif v == 's':
      5         return 1
----> 6     raise Exception('bs_to_float is expecting the value to be b or s')

Exception: bs_to_float is expecting the value to be b or s

In [29]:

assert bs_to_float("b") == 0
assert bs_to_float("s") == 1

In [32]:

assert bs_to_float("b") == 1

---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-32-191cebef8ef6> in <module>()
----> 1 assert bs_to_float("b") == 1

AssertionError:

In [33]:

assert bs_to_float("b") == 1, "bs_to_float of 'b' should be 1" # WRONG

---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-33-cfb6680cef6c> in <module>()
----> 1 assert bs_to_float("b") == 1, "bs_to_float of 'b' should be 1" # WRONG

AssertionError: bs_to_float of 'b' should be 1

In [34]:

def bs_to_float(v):
    if v == 'b':
        return 0
    elif v == 's':
        return 1
    raise Exception('bs_to_float is expecting the value to be b or s')

In [35]:

the_index_of_the_label_column = -1
converter_dictionary = { the_index_of_the_label_column: bs_to_float }

In [36]:

print converter_dictionary

{-1: <function bs_to_float at 0x7f469c1047d0>}

In [38]:

data = np.loadtxt('subtraining.csv', delimiter=',',
                  skiprows=1, converters=converter_dictionary)
print data

[[  3.29331000e+05  -9.99000000e+02   9.58280000e+01 ...,   0.00000000e+00
    4.27704409e+00   0.00000000e+00]
 [  2.16562000e+05   1.36580000e+02   1.41240000e+01 ...,  -0.00000000e+00
    1.86361167e-02   1.00000000e+00]
 [  1.28010000e+05   1.61549000e+02   6.43490000e+01 ...,   3.47070000e+01
    1.86361167e-02   1.00000000e+00]
 ..., 
 [  3.05892000e+05   1.21728000e+02   2.64640000e+01 ...,   1.66913000e+02
    7.13571366e-02   0.00000000e+00]
 [  3.43618000e+05  -9.99000000e+02   9.31840000e+01 ...,   1.52125000e+02
    7.44056247e-01   0.00000000e+00]
 [  2.14522000e+05   8.60860000e+01   2.38160000e+01 ...,   2.06055000e+02
    8.34140313e-02   0.00000000e+00]]

In [39]:

!head -3 subtraining.csv

EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,DER_sum_pt,DER_pt_ratio_lep_tau,DER_met_phi_centrality,DER_lep_eta_centrality,PRI_tau_pt,PRI_tau_eta,PRI_tau_phi,PRI_lep_pt,PRI_lep_eta,PRI_lep_phi,PRI_met,PRI_met_phi,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight,Label
329331,-999.0,95.828,67.588,26.101,-999.0,-999.0,-999.0,1.968,26.101,79.446,0.971,-1.383,-999.0,40.301,-1.807,-1.786,39.146,-2.279,2.586,65.596,0.108,102.852,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,4.27704409469,b
216562,136.58,14.124,99.436,24.068,-999.0,-999.0,-999.0,2.974,24.068,97.301,0.754,1.393,-999.0,55.488,1.348,2.835,41.813,1.897,-0.526,23.603,-0.979,195.45,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,0.018636116672,s

In [40]:

data.shape

Out[40]:

(10001, 33)

In [42]:

n_signal = np.sum(data[:, -1])
print "There are", n_signal, "'signal' data points"

There are 3431.0 'signal' data points

In [46]:

print data.shape[0]

In [52]:

print np.sum(data[:][-1]) # wrong

216353.898414

In [49]:

print data[1][1]
print data[1, 1]

136.58
136.58

In [55]:

# why it doesn't work
print data[0]
print data[:]
#print data[:][-1]

[  3.29331000e+05  -9.99000000e+02   9.58280000e+01   6.75880000e+01
   2.61010000e+01  -9.99000000e+02  -9.99000000e+02  -9.99000000e+02
   1.96800000e+00   2.61010000e+01   7.94460000e+01   9.71000000e-01
  -1.38300000e+00  -9.99000000e+02   4.03010000e+01  -1.80700000e+00
  -1.78600000e+00   3.91460000e+01  -2.27900000e+00   2.58600000e+00
   6.55960000e+01   1.08000000e-01   1.02852000e+02   0.00000000e+00
  -9.99000000e+02  -9.99000000e+02  -9.99000000e+02  -9.99000000e+02
  -9.99000000e+02  -9.99000000e+02   0.00000000e+00   4.27704409e+00
   0.00000000e+00]
[[  3.29331000e+05  -9.99000000e+02   9.58280000e+01 ...,   0.00000000e+00
    4.27704409e+00   0.00000000e+00]
 [  2.16562000e+05   1.36580000e+02   1.41240000e+01 ...,  -0.00000000e+00
    1.86361167e-02   1.00000000e+00]
 [  1.28010000e+05   1.61549000e+02   6.43490000e+01 ...,   3.47070000e+01
    1.86361167e-02   1.00000000e+00]
 ..., 
 [  3.05892000e+05   1.21728000e+02   2.64640000e+01 ...,   1.66913000e+02
    7.13571366e-02   0.00000000e+00]
 [  3.43618000e+05  -9.99000000e+02   9.31840000e+01 ...,   1.52125000e+02
    7.44056247e-01   0.00000000e+00]
 [  2.14522000e+05   8.60860000e+01   2.38160000e+01 ...,   2.06055000e+02
    8.34140313e-02   0.00000000e+00]]

In [56]:

print data.shape[0] - n_signal

6570.0

In [58]:

last_column = data[:, -1]
print last_column == 0
print np.sum(last_column == 0)

[ True False False ...,  True  True  True]
6570

In [59]:

!head -1 subtraining.csv

EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,DER_sum_pt,DER_pt_ratio_lep_tau,DER_met_phi_centrality,DER_lep_eta_centrality,PRI_tau_pt,PRI_tau_eta,PRI_tau_phi,PRI_lep_pt,PRI_lep_eta,PRI_lep_phi,PRI_met,PRI_met_phi,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight,Label

In [60]:

print np.mean(data, axis=0)

[  2.25826013e+05  -5.04341019e+01   4.98140767e+01   8.11949666e+01
   5.68582436e+01  -7.11032207e+02  -6.09370462e+02  -7.11948695e+02
   2.38059754e+00   1.91193526e+01   1.56611679e+02   1.44153055e+00
  -1.28430757e-01  -7.11585811e+02   3.85340469e+01   2.10578942e-03
   3.58691131e-02   4.66502712e+01  -2.41238876e-02   1.66051395e-02
   4.13051991e+01  -1.38857114e-02   2.08597238e+02   9.70402960e-01
  -3.56916776e+02  -4.06344336e+02  -4.06353159e+02  -6.95286099e+02
  -7.11712852e+02  -7.11719353e+02   7.14273654e+01   1.63861198e+00
   3.43065693e-01]

In [62]:

print np.min(data, axis=0)

[  1.00004000e+05  -9.99000000e+02   1.60000000e-02   1.00670000e+01
   0.00000000e+00  -9.99000000e+02  -9.99000000e+02  -9.99000000e+02
   3.71000000e-01   0.00000000e+00   4.62270000e+01   1.05000000e-01
  -1.41400000e+00  -9.99000000e+02   2.00010000e+01  -2.47900000e+00
  -3.14200000e+00   2.60010000e+01  -2.48600000e+00  -3.14200000e+00
   2.92000000e-01  -3.14100000e+00   2.09250000e+01   0.00000000e+00
  -9.99000000e+02  -9.99000000e+02  -9.99000000e+02  -9.99000000e+02
  -9.99000000e+02  -9.99000000e+02   0.00000000e+00   1.50187016e-03
   0.00000000e+00]

In [64]:

print np.sum(data == -999)

In [67]:

# the number of rows (data points) that have no undefined (-999) values
print data == -999
print np.sum(data != -999, axis=1)
print np.sum(data != -999, axis=1).shape

[[False  True False ..., False False False]
 [False False False ..., False False False]
 [False False False ..., False False False]
 ..., 
 [False False False ..., False False False]
 [False  True False ..., False False False]
 [False False False ..., False False False]]
[22 23 26 ..., 33 32 33]
(10001,)

In [69]:

print data.shape
print data.shape[1]
n_cols = data.shape[1]

(10001, 33)
33

In [71]:

defined_in_each_row = np.sum(data != -999, axis=1)
print defined_in_each_row

[22 23 26 ..., 33 32 33]

In [72]:

print np.sum( defined_in_each_row == n_cols )

In [73]:

undefined_in_each_row = np.sum(data == -999, axis=1)
print np.sum( undefined_in_each_row == 0 )

In [76]:

is_row_undefined = np.any(data == -999, axis=1)
print is_row_undefined

[ True  True  True ..., False  True False]

In [78]:

print np.sum(is_row_undefined)
print np.sum(is_row_undefined == False)

7306
2695

In [80]:

is_row_defined = is_row_undefined == False
print is_row_defined
print np.sum(is_row_defined)

[False False False ...,  True False  True]
2695

In [82]:

defined_data = data[ is_row_defined ]
print defined_data.shape

(2695, 33)

In [83]:

%matplotlib inline

In [84]:

import matplotlib.pyplot as plt

In [85]:

centralized_data = defined_data - defined_data.mean(axis=0)
centralized_data = centralized_data / centralized_data.std(axis=0)

In [86]:

print centralized_data.shape

(2695, 33)

In [96]:

cmatrix = np.matrix(centralized_data)
cov = cmatrix.T * cmatrix
print cov.shape
print cov

(33, 33)
[[ 2695.           118.72624927   -65.33328004 ...,    57.53428738
    -35.50570029    68.84113044]
 [  118.72624927  2695.           775.24613261 ...,   -68.06535895
    493.76985698    28.83710758]
 [  -65.33328004   775.24613261  2695.         ...,  -229.64590077
    979.97513352  -645.64680942]
 ..., 
 [   57.53428738   -68.06535895  -229.64590077 ...,  2695.          -426.59523992
   -138.03757627]
 [  -35.50570029   493.76985698   979.97513352 ...,  -426.59523992  2695.
  -1481.7651259 ]
 [   68.84113044    28.83710758  -645.64680942 ...,  -138.03757627
  -1481.7651259   2695.        ]]

In [105]:

plt.imshow(cov, interpolation='nearest')
print centralized_data[0,0]

0.728675763548