project3
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
# Prepare data: download the E. coli table, integer-encode the class column and
# split the result into a feature matrix `x` and a label vector `y`.
data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/ecoli/ecoli.data'
# Raw string: '\s' is a regex escape, not a valid Python string escape.
data = pd.read_table(data_url, sep=r'\s+', header=None)
# Column 0 is not used as a feature (presumably a per-row identifier --
# confirm against the data set description).
data = data.drop([0], axis=1)
label_col = data.columns[-1]
# Replace the class names by integer codes starting at 0 (0-7 for this data set).
# Assigning the whole column lets it take an integer dtype.
data[label_col] = pd.factorize(data[label_col])[0]
data = np.array(data)
# Input: every column except the label. (Slicing with `0:-2` would also throw
# away the last feature column.)
x = data[:, :-1].astype(float)
# Output: integer class labels.
y = data[:, -1].astype(int)
def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` element-wise.

    Accepts a scalar or an array-like; an array input yields an array of the
    same shape, a scalar input yields a scalar.

    The value is computed from ``exp(-|x|)``, which lies in [0, 1], so large
    magnitude inputs cannot overflow ``exp`` (the naive formula overflows for
    very negative ``x``). For ``x >= 0`` the expression evaluated is exactly
    ``1 / (1 + exp(-x))``.
    """
    x = np.asarray(x)
    e = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- algebraically equal.
    result = np.where(x >= 0, 1 / (1 + e), e / (1 + e))
    # Indexing with an empty tuple turns a 0-d result back into a scalar and
    # leaves higher-dimensional arrays untouched.
    return result[()]
def train_ecoc_classify(x=x, y=y, error2stop=0.1, max_iter=150, code_length=16, method='sigmoid'):
    """Train a single-layer classifier on randomly generated binary output codes.

    Every class label is assigned a random 0/1 code word of ``code_length``
    bits. A weight matrix is then fitted iteratively so that the thresholded
    activation of ``x_scaled . w`` reproduces the code word of each sample.

    Parameters
    ----------
    x : 2-D array-like, one row per sample and one column per feature.
    y : integer class labels, used directly as row indices into the code
        book (so they should start at 0).
    error2stop : training stops once the fraction of wrong code bits is no
        larger than this value.
    max_iter : upper bound on the number of weight updates.
    code_length : number of bits in each class code word.
    method : ``'sigmoid'`` (features scaled to [0, 1], threshold 0.5) or
        ``'tanh'`` (features scaled to [-1, 1], threshold 0).

    Returns
    -------
    list
        ``[errors, w]`` where ``errors`` is an array holding the bit error
        rate of every iteration and ``w`` is the weight matrix of shape
        ``(n_features, code_length)``.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'sigmoid'`` nor ``'tanh'``.

    Notes
    -----
    The global NumPy random state is re-seeded with a fixed value so that the
    code book and the initial weights are identical on every call.
    """
    if method == 'sigmoid':
        feature_range, threshold = (0, 1), 0.5
    elif method == 'tanh':
        feature_range, threshold = (-1, 1), 0
    else:
        raise ValueError("method must be 'sigmoid' or 'tanh', got %r" % (method,))

    y = np.asarray(y)
    sample_n, feature_n = np.shape(x)
    # One code-book row per label value. Counting unique labels alone is not
    # enough: if a label is missing from `y`, the largest label would index
    # past the end of the code book.
    y_types = max(len(np.unique(y)), int(np.max(y)) + 1)

    np.random.seed(132)  # fixed seed so the code book and initial weights are reproducible
    code_book = np.random.randint(low=0, high=2, size=[y_types, code_length])
    y_matrix = code_book[y]  # target code word for every sample
    w = np.random.uniform(low=-1, high=1, size=[feature_n, code_length])

    x_scaled = preprocessing.MinMaxScaler(feature_range=feature_range).fit_transform(x)

    learning_rate = 0.005
    all_error = []
    error = np.inf
    i = 0
    # Keep updating only while the error target is still missed AND there is
    # iteration budget left.
    while error > error2stop and i < max_iter:
        i += 1
        z = x_scaled.dot(w)
        if method == 'sigmoid':
            q = sigmoid(z)
            slope = q - q**2  # derivative of the sigmoid: q * (1 - q)
        else:
            q = np.tanh(z)
            slope = 1 - q**2  # derivative of tanh
        f = (q >= threshold).astype(float)  # hard 0/1 decision per code bit
        wrong = np.abs(f - y_matrix)  # 1 where the decided bit is wrong, else 0
        # Only wrong bits contribute; their activation is pushed towards (and
        # across) the decision threshold, scaled by the activation slope.
        w = w + learning_rate * x_scaled.T.dot(wrong * (threshold - q) * slope)
        error = np.sum(wrong) / (code_length * sample_n)
        all_error.append(error)
    return [np.array(all_error), w]
def predict_ecoc_classify(x, y, w, method):
    """Predict class labels with weights produced by ``train_ecoc_classify``.

    The samples are scaled, projected through ``w``, thresholded into 0/1 code
    bits, and each resulting bit vector is assigned to the class whose code
    word differs from it in the fewest positions.

    Parameters
    ----------
    x : 2-D array-like, one row per sample and one column per feature.
    y : integer class labels; used only to decide how many code words
        (classes) the code book contains.
    w : weight matrix of shape ``(n_features, code_length)``.
    method : ``'sigmoid'`` or ``'tanh'``.

    Returns
    -------
    numpy.ndarray
        Predicted integer label for every row of ``x``.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'sigmoid'`` nor ``'tanh'``.

    Notes
    -----
    The global NumPy random state is re-seeded with a fixed value to
    regenerate the code book.
    """
    if method == 'sigmoid':
        feature_range, threshold, activation = (0, 1), 0.5, sigmoid
    elif method == 'tanh':
        feature_range, threshold, activation = (-1, 1), 0, np.tanh
    else:
        raise ValueError("method must be 'sigmoid' or 'tanh', got %r" % (method,))

    code_length = w.shape[1]
    y = np.asarray(y)
    # Size the code book by the largest label as well as by the number of
    # distinct labels, so a label value that happens to be absent from `y`
    # does not shrink the set of classes that can be predicted.
    y_types = max(len(np.unique(y)), int(np.max(y)) + 1)
    np.random.seed(132)  # fixed seed so the code book is reproducible
    code_book = np.random.randint(low=0, high=2, size=[y_types, code_length])

    # NOTE(review): the scaler is fitted on the `x` passed here, so features
    # are scaled by the min/max of this data set -- confirm that this is
    # intended when `x` is not the data the weights were trained on.
    x_scaled = preprocessing.MinMaxScaler(feature_range=feature_range).fit_transform(x)

    y_pred = (activation(x_scaled.dot(w)) >= threshold).astype(int)
    # Number of differing bits between every sample and every code word:
    # shape (n_samples, n_classes).
    mismatches = np.abs(y_pred[:, np.newaxis, :] - code_book[np.newaxis, :, :]).sum(axis=2)
    # Ties are resolved in favour of the lowest class index.
    return np.argmin(mismatches, axis=1)
# Hold out 20% of the samples for evaluation.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Notebook only: render figures inline. Written as a function call (instead of
# the `%matplotlib inline` magic) so the file is still valid Python when it is
# run outside IPython/Jupyter, where `get_ipython` does not exist.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

methods = ['sigmoid', 'tanh']
code_lengths = [14, 16, 18, 20]

# Figure 1: training error per iteration for every activation / code length.
# The fitted weights are kept so the confusion matrices below can reuse them;
# training re-seeds its random state on every call, so training again with the
# same arguments would only reproduce the same weights.
trained_w = {}
fig, ax = plt.subplots(1, len(methods), sharey=True, figsize=[18, 8])
fig.suptitle('Comparing code length')
for i, m in enumerate(methods):
    for v in code_lengths:
        errors, w = train_ecoc_classify(
            x=x_train, y=y_train, error2stop=0.1, max_iter=20000, code_length=v, method=m)
        trained_w[m, v] = w
        ax[i].plot(errors, label='code length: %d' % v)
    ax[i].legend(loc='upper right')
    ax[i].set_title(m + ' activation function')
    ax[i].set_xlabel('iter')
    if i == 0:  # only set one y label
        ax[i].set_ylabel('error')
# fig.savefig('Figure 1.tif', dpi=300)

# Figures 2 and 3: one row of confusion matrices per activation function,
# one panel per code length, evaluated on the held-out samples.
for fig_no, m in enumerate(methods, start=2):
    fig, ax = plt.subplots(1, len(code_lengths), figsize=[20, 5])
    fig.suptitle('Comparing code length on prediction results (%s)' % m)
    for i, v in enumerate(code_lengths):
        y_pred = predict_ecoc_classify(x_test, y_test, trained_w[m, v], m)
        confusion_m = confusion_matrix(y_test, y_pred)
        # Fraction of samples that fall off the diagonal, i.e. are misclassified.
        miss_rate = 1 - np.sum(np.diag(confusion_m)) / np.sum(confusion_m)
        # Transposed so that columns are true labels and rows are predictions.
        sns.heatmap(confusion_m.T, square=True, annot=True, fmt='d', cbar=False, ax=ax[i])
        ax[i].set_title('code length: %d (miss_rate: %f)' % (v, miss_rate))
        ax[i].set_xlabel('true label')
        if i == 0:  # only set one y label
            ax[i].set_ylabel('predicted label')
    # fig.savefig('Figure %d.tif' % fig_no, dpi=300)

# Needed to display the figures when run as a plain script.
plt.show()
Last updated
Was this helpful?