import numpy as np
# Three progressively more specific attempts at reading the CSV.  Each
# attempt rebinds `data` and prints it; an exception in any attempt stops
# the script at that point, exactly as separate statements would.
# NOTE(review): presumably the file is comma-separated with one header row,
# which is why the delimiter and skiprows options are introduced in turn.
for load_options in (
    {},                                # NumPy defaults
    {'delimiter': ','},                # split the fields on commas
    {'delimiter': ',', 'skiprows': 1}, # ...and skip the first line
):
    data = np.loadtxt('subtraining.csv', **load_options)
    print(data)
def bs_to_float(v):
    """Convert a 'b'/'s' class label into a float.

    Parameters
    ----------
    v : str or bytes
        The raw label. Bytes are accepted as well as text because some
        NumPy versions pass raw bytes to ``np.loadtxt`` converters.

    Returns
    -------
    float
        0.0 for 'b' and 1.0 for 's'.

    Raises
    ------
    ValueError
        If the value is neither 'b' nor 's'. ``ValueError`` derives from
        ``Exception``, so code that catches ``Exception`` keeps working.
    """
    # Normalise bytes to text so b'b' and 'b' are treated the same way.
    if isinstance(v, bytes):
        v = v.decode('ascii', 'replace')
    if v == 'b':
        return 0.0
    if v == 's':
        return 1.0
    raise ValueError('bs_to_float is expecting the value to be b or s')
# Manual checks of bs_to_float on the two labels it accepts.
print bs_to_float("b")
print bs_to_float("s")
# Any other value makes bs_to_float raise.  When this file is executed as a
# plain script (rather than cell by cell) the uncaught exception stops
# execution here, so the statements below are never reached.
bs_to_float("whatever")
# These two assertions hold: 'b' maps to 0 and 's' maps to 1.
assert bs_to_float("b") == 0
assert bs_to_float("s") == 1
# The next two assertions fail, because bs_to_float("b") returns 0, not 1.
# The second one also shows the optional assertion message.
# NOTE(review): these look like deliberate demonstrations of a failing
# assert (the author tagged the last one WRONG) -- confirm before removing.
assert bs_to_float("b") == 1
assert bs_to_float("b") == 1, "bs_to_float of 'b' should be 1" # WRONG
def bs_to_float(v):
    """Convert a 'b'/'s' class label into a float.

    Parameters
    ----------
    v : str or bytes
        The raw label. Bytes are accepted as well as text because some
        NumPy versions pass raw bytes to ``np.loadtxt`` converters.

    Returns
    -------
    float
        0.0 for 'b' and 1.0 for 's'.

    Raises
    ------
    ValueError
        If the value is neither 'b' nor 's'. ``ValueError`` derives from
        ``Exception``, so code that catches ``Exception`` keeps working.
    """
    # Normalise bytes to text so b'b' and 'b' are treated the same way.
    if isinstance(v, bytes):
        v = v.decode('ascii', 'replace')
    if v == 'b':
        return 0.0
    if v == 's':
        return 1.0
    raise ValueError('bs_to_float is expecting the value to be b or s')
# Map the last column (index -1) to bs_to_float so that np.loadtxt runs the
# label text through it instead of trying to parse it as a number.
the_index_of_the_label_column = -1
converter_dictionary = {}
converter_dictionary[the_index_of_the_label_column] = bs_to_float
print(converter_dictionary)

# Same read as before (comma-separated, first line skipped), now with the
# converter applied to the label column.
data = np.loadtxt(
    'subtraining.csv',
    converters=converter_dictionary,
    skiprows=1,
    delimiter=',',
)
print(data)
!head -3 subtraining.csv
# Bare expression: it only displays a value in an interactive session and
# has no effect when the file is run as a script.
data.shape

# The last column holds the converted labels, so its sum is the number of
# rows whose label is 1.
n_signal = data[:, -1].sum()
print("There are %s 'signal' data points" % (n_signal,))
print(len(data))

# wrong: data[:] is the whole array again, so [-1] selects the last ROW
# rather than the last column.
print(data[:][-1].sum())

# Chained indexing and tuple indexing reach the same single element.
print(data[1][1])
print(data[1, 1])

# why it doesn't work: data[0] is the first row and data[:] is every row.
print(data[0])
print(data[:])
#print data[:][-1]

# Rows that are not counted in n_signal.
print(len(data) - n_signal)

# Same count obtained by testing the label column against 0 directly.
last_column = data[:, -1]
print(last_column == 0)
print((last_column == 0).sum())
!head -1 subtraining.csv
# Per-column mean and minimum (axis=0 reduces over the rows).
print(data.mean(axis=0))
print(data.min(axis=0))
# Total number of cells, over the whole array, that equal -999.
print((data == -999).sum())
# Count the rows (data points) that contain no -999 value, in three
# equivalent ways, then keep only those rows.
# NOTE(review): -999 is presumably the dataset's "undefined" marker -- confirm.

# Element-wise mask of cells equal to -999, and per-row counts of the
# cells that are NOT -999 (axis=1 reduces over the columns).
print(data == -999)
print(np.sum(data != -999, axis=1))
print(np.sum(data != -999, axis=1).shape)
print(data.shape)
print(data.shape[1])

# Way 1: a row is fully defined when all n_cols of its cells differ from -999.
n_cols = data.shape[1]
defined_in_each_row = np.sum(data != -999, axis=1)
print(defined_in_each_row)
print(np.sum(defined_in_each_row == n_cols))

# Way 2: a row is fully defined when it has zero cells equal to -999.
undefined_in_each_row = np.sum(data == -999, axis=1)
print(np.sum(undefined_in_each_row == 0))

# Way 3: flag rows containing at least one -999, then negate the flag.
is_row_undefined = np.any(data == -999, axis=1)
print(is_row_undefined)
print(np.sum(is_row_undefined))
# `~` negates a boolean array element-wise; it replaces the `== False`
# comparison and yields the same boolean mask.
print(np.sum(~is_row_undefined))
is_row_defined = ~is_row_undefined
print(is_row_defined)
print(np.sum(is_row_defined))

# Boolean-mask indexing keeps only the rows flagged True.
defined_data = data[is_row_defined]
print(defined_data.shape)
%matplotlib inline
import matplotlib.pyplot as plt
# Standardise every column: subtract the column mean, then divide by the
# standard deviation of the centred column.
column_means = defined_data.mean(axis=0)
centralized_data = defined_data - column_means
column_stds = centralized_data.std(axis=0)
centralized_data = centralized_data / column_stds
print(centralized_data.shape)

# With np.matrix operands `*` is a matrix product, so this is X^T X.
# NOTE(review): the product is not divided by the number of rows, so `cov`
# is a scaled version of the covariance of the standardised columns --
# confirm whether a normalised matrix is wanted.
cmatrix = np.matrix(centralized_data)
cov = cmatrix.transpose() * cmatrix
print(cov.shape)
print(cov)

# Show the matrix as an image, one unsmoothed pixel per entry.
plt.imshow(cov, interpolation='nearest')
print(centralized_data[0, 0])