# project3

import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
# Prepare data: download the table, encode the class column as integers and
# split it into an input matrix `x` and a label vector `y`.
data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/ecoli/ecoli.data'
# Raw string: `\s` must reach the regex engine, it is not a valid string escape.
data = pd.read_table(data_url, sep=r'\s+', header=None)
data = data.drop([0], axis=1)  # the first column is not used
data.iloc[:, 7] = pd.factorize(data.iloc[:, 7])[0]  # 0---7 coding
data = np.array(data)
# Every column except the last one (the label) is an input feature.  The
# previous slice `0:-2` silently discarded the feature right before the label.
# The explicit float cast guards against `data` having a non-numeric dtype.
x = data[:, :-1].astype(float)  # input
y = data[:, -1].astype(int)  # output


def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` element-wise.

    The value is computed from ``exp(-|x|)``, which never exceeds 1, so very
    negative inputs no longer trigger an overflow warning in ``np.exp``.

    Parameters
    ----------
    x : scalar or array-like of numbers

    Returns
    -------
    NumPy float scalar for scalar input, otherwise an array shaped like ``x``.
    """
    x = np.asarray(x, dtype=float)
    decay = np.exp(-np.abs(x))  # always in [0, 1], cannot overflow
    # For x >= 0 this is the textbook formula; for x < 0 the algebraically
    # equal form exp(x) / (1 + exp(x)) is used instead.
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    return result[()]  # unwrap 0-d results so a scalar in gives a scalar out


def train_ecoc_classify(x=x, y=y, error2stop=0.1, max_iter=150, code_length=16, method='sigmoid'):
    """Train a single-layer classifier on randomly coded class labels.

    Every class label is mapped to a random binary code word of length
    ``code_length``.  One weight column per code bit is then updated
    iteratively until the fraction of wrong bits drops to ``error2stop`` or
    ``max_iter`` iterations have been performed, whichever happens first.

    Parameters
    ----------
    x : 2-D array (samples x features)
        Inputs.  Defaults to the module-level ``x`` as it was when this
        function was defined.
    y : 1-D array of integer labels
        Labels are used as row indices into the code book, so they should be
        0, 1, 2, ...  Defaults to the module-level ``y``.
    error2stop : float
        Training stops once the bit error rate is no longer above this value.
    max_iter : int
        Upper bound on the number of weight updates.
    code_length : int
        Number of bits in each class code word.
    method : {'sigmoid', 'tanh'}
        Activation function; also selects the input scaling range.

    Returns
    -------
    list
        ``[errors, w]`` where ``errors`` is an array holding the bit error
        rate of every iteration and ``w`` is the weight matrix of shape
        (features, code_length).

    Raises
    ------
    ValueError
        If ``method`` is neither ``'sigmoid'`` nor ``'tanh'``.
    """
    if method not in ('sigmoid', 'tanh'):
        raise ValueError("method must be 'sigmoid' or 'tanh', got %r" % (method,))

    y = np.asarray(y).astype(int)
    feature_n = np.shape(x)[1]
    # One code word per label value.  Sizing by the largest label as well
    # keeps the lookup below valid when some label value is absent from `y`.
    y_types = max(len(np.unique(y)), int(y.max()) + 1)

    # Fixed seed so the code book and initial weights are reproducible.
    # Keep the draw order (code book first, then weights).
    np.random.seed(132)
    code_book = np.random.randint(low=0, high=2, size=[y_types, code_length])
    y_matrix = code_book[y]  # target code word of every sample
    w = np.random.uniform(low=-1, high=1, size=[feature_n, code_length])

    # Scale the inputs to (0, 1) for sigmoid and to (-1, 1) for tanh.
    feature_range = (0, 1) if method == 'sigmoid' else (-1, 1)
    x_scaled = preprocessing.MinMaxScaler(feature_range=feature_range).fit_transform(x)

    i = 0
    error = 5  # sentinel above any possible error rate so the loop is entered
    all_error = []
    # Continue only while the error is still too high AND iterations remain.
    # The former condition (`error < error2stop or i < max_iter`) never
    # stopped early and looped forever once the error fell below the target.
    while error > error2stop and i < max_iter:
        i += 1
        z = x_scaled.dot(w)
        if method == 'sigmoid':
            q = sigmoid(z)
            f = (q >= 0.5).astype(float)  # hard 0/1 decision per bit
            slope = (0.5 - q) * (q - q**2)
        else:
            q = np.tanh(z)
            f = (q >= 0).astype(float)  # hard 0/1 decision per bit
            slope = (0 - q) * (1 - q**2)
        wrong_bits = np.abs(f - y_matrix)  # 1 where the decided bit is wrong
        # Only wrongly decided bits contribute to the weight update.
        w = w + 0.005 * x_scaled.T.dot(wrong_bits * slope)
        error = np.sum(wrong_bits) / (code_length * x_scaled.shape[0])
        all_error.append(error)
    return [np.array(all_error), w]


def predict_ecoc_classify(x, y, w, method):
    """Predict class labels with weights learned on randomly coded labels.

    The code book is regenerated with the same fixed seed as used for
    training, every sample is turned into a vector of 0/1 bits and the label
    whose code word differs in the fewest bits is returned.

    Parameters
    ----------
    x : 2-D array (samples x features)
        Inputs to classify.
    y : 1-D array of integer labels
        Only used to determine how many code words to generate; labels
        should be 0, 1, 2, ...
    w : 2-D array (features x code_length)
        Weight matrix; its second dimension gives the code length.
    method : {'sigmoid', 'tanh'}
        Activation function; also selects the input scaling range.

    Returns
    -------
    numpy.ndarray
        Predicted integer label of every row of ``x``.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'sigmoid'`` nor ``'tanh'``.
    """
    if method not in ('sigmoid', 'tanh'):
        raise ValueError("method must be 'sigmoid' or 'tanh', got %r" % (method,))

    code_length = w.shape[1]
    y = np.asarray(y).astype(int)
    # Size the code book by the largest label as well, so a label value that
    # is absent from `y` does not shrink the set of candidate classes.
    y_types = max(len(np.unique(y)), int(y.max()) + 1)
    np.random.seed(132)  # same seed as used when the code book is first drawn
    code_book = np.random.randint(low=0, high=2, size=[y_types, code_length])

    # NOTE(review): the scaler is fitted on the `x` passed in here, not on the
    # data the weights were trained with, so the scaling may differ between
    # the two.  Sharing the fitted scaler would need an interface change.
    if method == 'sigmoid':
        x_scaled = preprocessing.MinMaxScaler().fit_transform(x)  # default (0, 1)
        bits = sigmoid(x_scaled.dot(w)) >= 0.5
    else:
        x_scaled = preprocessing.MinMaxScaler(feature_range=(-1, 1)).fit_transform(x)
        bits = np.tanh(x_scaled.dot(w)) >= 0
    y_pred = bits.astype(int)

    # Number of differing bits between every sample and every code word.
    # Dividing by code_length (as before) would not change the argmin.
    mismatches = np.abs(y_pred[:, np.newaxis, :] - code_book[np.newaxis, :, :]).sum(axis=2)
    # argmin returns the first minimum, so ties resolve to the lowest label.
    return np.argmin(mismatches, axis=1)


# Hold out 20% of the samples for testing.  The split is seeded like the rest
# of the script so that the figures below are reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=132)

# `%matplotlib inline` is IPython-only syntax; issue it through the IPython
# API when running inside IPython and skip it in a plain Python interpreter.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# Figure 1: training error curves, one panel per activation function and one
# curve per code length.
fig, ax = plt.subplots(1, 2, sharey=True, figsize=[18, 8])
fig.suptitle('Comparing code length')
for i, m in enumerate(['sigmoid', 'tanh']):
    for v in [14, 16, 18, 20]:
        errors, w = train_ecoc_classify(
            x=x_train, y=y_train, error2stop=0.1, max_iter=20000, code_length=v, method=m)
        ax[i].plot(errors, label='code length: %d' % v)
    # Per-panel decoration only needs to happen once, after all curves exist.
    ax[i].legend(loc='upper right')
    ax[i].set_title(m + ' activation function')
    ax[i].set_xlabel('iter')
    if i == 0:  # only set one y label
        ax[i].set_ylabel('error')
#fig.savefig('Figure 1.tif',dpi=300)
def _plot_confusion_by_code_length(method, code_lengths=(14, 16, 18, 20)):
    """Draw one test-set confusion matrix per code length for ``method``.

    For every code length a classifier is trained on the training split,
    applied to the test split, and the resulting confusion matrix is shown
    as a heat map with the miss rate in the panel title.

    Parameters
    ----------
    method : {'sigmoid', 'tanh'}
        Activation function passed to training and prediction.
    code_lengths : sequence of int
        Code lengths to compare, one panel each.

    Returns
    -------
    The created matplotlib figure.
    """
    fig, ax = plt.subplots(1, len(code_lengths), figsize=[20, 5])
    ax = np.atleast_1d(ax)  # a single panel is returned as a bare Axes
    fig.suptitle('Comparing code length on prediction results (%s)' % method)
    for i, v in enumerate(code_lengths):
        _, w = train_ecoc_classify(x=x_train, y=y_train, error2stop=0.1,
                                   max_iter=20000, code_length=v, method=method)
        y_pred = predict_ecoc_classify(x_test, y_test, w, method)
        confusion_m = confusion_matrix(y_test, y_pred)
        # Correct predictions sit on the diagonal of the confusion matrix.
        miss_rate = 1 - np.trace(confusion_m) / np.sum(confusion_m)
        # Transposed so that true labels run along the x axis.
        sns.heatmap(confusion_m.T, square=True, annot=True, fmt='d', cbar=False, ax=ax[i])
        ax[i].set_title('code length: %d (miss_rate: %f)' % (v, miss_rate))
        ax[i].set_xlabel('true label')
        if i == 0:  # only set one y label
            ax[i].set_ylabel('predicted label')
    return fig


# Figure 2: prediction results with the sigmoid activation.
fig = _plot_confusion_by_code_length('sigmoid')
#fig.savefig('Figure 2.tif',dpi=300)

# Figure 3: prediction results with the tanh activation.
fig = _plot_confusion_by_code_length('tanh')
#fig.savefig('Figure 3.tif',dpi=300)

# Last updated

# Was this helpful?