Data Prog with Python
  • Introduction
  • Week2
  • Week3
  • Week4
  • Week5
  • Week6
  • Week7
  • Week8
  • Week9
  • Week10
  • project1
  • project2
  • project3
  • Useful codes for exam
Powered by GitBook
On this page

Was this helpful?

project2

import sklearn.preprocessing as skp
import numpy as np
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from scipy.cluster.vq import kmeans,vq
%matplotlib inline
#=======================Task1.Load the data set in and summarize the data=====================================================

# Location of the project data set (a headerless CSV with three numeric columns).
# NOTE: this is the author's local path -- point it at your own copy of the file.
file_path=r'C:\Users\Zhipeng\Desktop\UCD python online\project2\Project2_dataset(1).csv'
x = np.genfromtxt(file_path,delimiter=',')

#summarize the data
df=pd.DataFrame(x)
df.columns=['x','y','z']
# Print explicitly so the summary is shown even when this is not the last
# expression of a notebook cell (or when the file is run as a script).
print(df.describe())


#=======================Task3.Implement the Subtractive clustering algorithm in Python========================================
def substractive_clustering(x,ra=0.5,rb_factor=1.25,epsilon_upper=0.5,epsilon_lower=0.15):
    """Find cluster centers in ``x`` with the subtractive clustering algorithm.

    Every feature is min-max scaled, each point receives a potential
    ``sum_j exp(-4 * ||x_i - x_j||^2 / ra^2)``, and the point with the highest
    potential becomes the first center. The potential around each accepted
    center is then subtracted (using radius ``rb = ra * rb_factor``) and the
    next best candidate is accepted, rejected or used to stop the search
    depending on its potential relative to the first center's potential.

    Parameters
    ----------
    x : 2-D array of shape (n_points, n_feature)
        Data to cluster, one point per row.
    ra : float
        Radius (in scaled coordinates) used for the initial potentials and
        for the distance test on borderline candidates.
    rb_factor : float
        Multiplier applied to ``ra`` to obtain the radius ``rb`` used when
        subtracting the potential around an accepted center.
    epsilon_upper : float
        A candidate whose potential ratio is above this value is accepted
        as a center outright.
    epsilon_lower : float
        The search stops once the best candidate's potential ratio drops
        below this value.

    Returns
    -------
    numpy.ndarray or None
        Array of shape (n_centers, n_feature) holding the centers in the
        original (unscaled) units, or ``None`` when ``x`` has one feature.

    Notes
    -----
    Progress for every iteration and the final centers are printed.
    """
    n_points,n_feature=np.shape(x)
    rb=ra*rb_factor
    if n_feature==1:
        print('This function currently only works for points with at least 2 features')
        return None

    # Step1. Normalize each point into a unit hyperbox (MinMaxScaler feature
    # scaling) so that the radii mean the same thing in every dimension.
    scaler = skp.MinMaxScaler()
    x_scaled = scaler.fit_transform(x)

    # Step2. Compute the initial potential of each data point.
    # The point itself must be taken from the *scaled* data: differences
    # between an unscaled point and scaled neighbours are meaningless.
    p=np.zeros([n_points,1])
    for n in range(n_points):
        dx=(x_scaled[n,:]-x_scaled)/ra
        p[n]=np.sum(np.exp(-4*np.sum(dx**2,1)))

    # Step3. The first cluster center is the point with the highest potential.
    p1=np.max(p)
    c1_idx=np.argmax(p)
    center1=x_scaled[c1_idx,:]

    # Step4-7. Starting from the first center, alternately reduce the
    # potentials around the latest center and evaluate the next candidate.
    final_center=center1.reshape(-1,n_feature)
    updated_data=x_scaled
    updated_max_idx=c1_idx
    reduced_p=p
    keep_searching=True
    num_iteration=1
    # True when the last candidate was accepted, i.e. its influence still has
    # to be subtracted from the remaining potentials.
    need_calculate_p=True
    while keep_searching:
        if need_calculate_p:
            # Step7. Subtract the influence of the most recent center.
            dx_reduce=(updated_data-updated_data[updated_max_idx,:])/rb
            influence=np.exp(-4*np.sum(dx_reduce**2,1)).reshape(-1,1)
            reduced_p=reduced_p-reduced_p[updated_max_idx]*influence

            # Remove the center itself: only the remaining data are candidates.
            reduced_p=np.delete(reduced_p,updated_max_idx,axis=0)
            updated_data=np.delete(updated_data,updated_max_idx,axis=0)
            if reduced_p.size==0:
                # Every point has been used as a center; nothing left to test.
                break
            updated_max_p=np.max(reduced_p)
            updated_max_idx=np.argmax(reduced_p)
            updated_center=updated_data[updated_max_idx,:]

        # Step6. Decide what to do with the current best candidate.
        ratio=updated_max_p/p1
        if ratio<=0 or ratio<epsilon_lower:
            # No candidate with a meaningful (positive) potential remains.
            keep_searching=False
        elif ratio>epsilon_upper:
            final_center=np.vstack([final_center,updated_center.reshape(-1,n_feature)])
            need_calculate_p=True
        else:
            # Borderline potential: accept only if the candidate is also far
            # enough from every center found so far.
            dist_vector=np.sqrt(np.sum((final_center-updated_center)**2,1))
            dist_min=np.min(dist_vector)

            test=dist_min/ra+ratio
            if test>=1:
                final_center=np.vstack([final_center,updated_center.reshape(-1,n_feature)])
                need_calculate_p=True
            else:
                # Reject: zero its potential and move on to the next best
                # point without subtracting anything.
                reduced_p[updated_max_idx]=0
                updated_max_idx=np.argmax(reduced_p)
                updated_center=updated_data[updated_max_idx,:]
                updated_max_p=reduced_p[updated_max_idx,0]
                need_calculate_p=False
        print('iteration:'+str(num_iteration)+' with px/p1: '+str(updated_max_p/p1))
        num_iteration+=1

    # Scale the centers back to the original units.
    original_center=scaler.inverse_transform(final_center)
    print('Find %d cluster center(s):' %len(original_center))
    print(original_center)
    return original_center
#=======================Task4.Obtain a clustering for the synthetic data set using substractive clustering algorithm=====================
# Cluster with the default parameters and assign every point to its nearest center.
centers=substractive_clustering(x,ra=0.5,rb_factor=1.25,epsilon_upper=0.5,epsilon_lower=0.15)
labels,_ = vq(x,centers)


fig = plt.figure(figsize=[12,5])
fig.suptitle('Substractive Clustering with Default paremeters \n (ra=0.5,rb=1.25ra,e_up=0.5,e_low=0.15)')

# Left panel: raw data in grey with the cluster centers marked in red.
ax = fig.add_subplot(1, 2, 1,projection='3d')
ax.scatter(centers[:,0],centers[:,1],centers[:,2],c='r',marker='x',s=60,label='cluster centers')
ax.scatter(x[:,0],x[:,1],x[:,2], c='grey', s=20,marker='+')
# ax.xaxis/yaxis/zaxis replace the w_xaxis/w_yaxis/w_zaxis aliases that
# newer Matplotlib releases no longer provide.
ax.xaxis.set_ticklabels([])
ax.yaxis.set_ticklabels([])
ax.zaxis.set_ticklabels([])
ax.legend(loc=4)
ax.set_title('Clustering results')

# Right panel: data coloured by assigned cluster.
ax = fig.add_subplot(1, 2, 2,projection='3d')
# The builtin float replaces the np.float alias, which newer NumPy no longer has.
ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=labels.astype(float), edgecolor='k')
ax.xaxis.set_ticklabels([])
ax.yaxis.set_ticklabels([])
ax.zaxis.set_ticklabels([])
ax.set_title('Clustering results')




#=======================Task2.Test a number of parameter values==================================================
# Re-run the clustering while changing one parameter at a time.
centers1=substractive_clustering(x,ra=0.5,rb_factor=1.5,epsilon_upper=0.5,epsilon_lower=0.15)
labels1,_ = vq(x,centers1)
centers2=substractive_clustering(x,ra=0.8,rb_factor=1.25,epsilon_upper=0.5,epsilon_lower=0.15)
labels2,_ = vq(x,centers2)
centers3=substractive_clustering(x,ra=0.5,rb_factor=1.25,epsilon_upper=0.5,epsilon_lower=0.08)
labels3,_ = vq(x,centers3)
centers4=substractive_clustering(x,ra=0.5,rb_factor=1.25,epsilon_upper=1,epsilon_lower=0.15)
labels4,_ = vq(x,centers4)

fig = plt.figure(figsize=[18,3])
fig.suptitle('Test substractive clustering on different parameters')

# One 3-D panel per parameter setting, coloured by assigned cluster.
panels=[
    (labels1,'rb=1.5ra'),
    (labels2,'ra=0.8'),
    (labels3,'e_low=0.08'),
    (labels4,'e_up=1'),
]
for position,(panel_labels,panel_title) in enumerate(panels,start=1):
    ax = fig.add_subplot(1, 4, position,projection='3d')
    # The builtin float replaces the np.float alias, which newer NumPy no longer has.
    ax.scatter(x[:, 0], x[:, 1], x[:, 2],c=panel_labels.astype(float), edgecolor='k')
    # ax.xaxis/yaxis/zaxis replace the w_*axis aliases that newer Matplotlib
    # releases no longer provide.
    for axis in (ax.xaxis,ax.yaxis,ax.zaxis):
        axis.set_ticklabels([])
    ax.set_title(panel_title)


#=======================Task5.Obtain a clustering for the synthetic data set using the K-means algorithm===================================
# Run k-means for test_n consecutive cluster counts (2, 3, ...) and plot each result.
test_n=4
fig = plt.figure(figsize=[test_n*4,3])
fig.suptitle('Test k-means with different cluster number')
for n in range(test_n):
    cluster_n=n+2
    centroids,_ = kmeans(x,cluster_n)
    labels,_ = vq(x,centroids)
    ax = fig.add_subplot(1, test_n, n+1,projection='3d')
    # The builtin float replaces the np.float alias, which newer NumPy no longer has.
    ax.scatter(x[:, 0], x[:, 1], x[:, 2],c=labels.astype(float), edgecolor='k')
    # ax.xaxis/yaxis/zaxis replace the w_*axis aliases that newer Matplotlib
    # releases no longer provide.
    ax.xaxis.set_ticklabels([])
    ax.yaxis.set_ticklabels([])
    ax.zaxis.set_ticklabels([])
    ax.set_title('Cluster number: '+str(cluster_n))
#=======================Task6.Compare the clustering results from different algorithms
#1. print centers
# Centers from both algorithms on the same data.
kmean_c,_ = kmeans(x,3)
subclust_c=substractive_clustering(x,ra=0.5,rb_factor=1.25,epsilon_upper=0.5,epsilon_lower=0.15)

# Assign every point to its nearest center for each set of centers.
kmeans_labels,_ = vq(x,kmean_c)
subclust_labels,_=vq(x,subclust_c)

print(kmean_c)
print(subclust_c)

#2. plot
fig = plt.figure(figsize=[16,5])
fig.suptitle('compare clustering results from different algorithms')

# Panel 1: both sets of centers in the same axes.
ax = fig.add_subplot(1, 3, 1,projection='3d')
ax.scatter(subclust_c[:,0],subclust_c[:,1],subclust_c[:,2],c='r',marker='x',s=60,label='substractive')
ax.scatter(kmean_c[:,0],kmean_c[:,1],kmean_c[:,2],c='b',marker='o',s=60,label='k-means')
# ax.xaxis/yaxis/zaxis replace the w_*axis aliases that newer Matplotlib
# releases no longer provide.
ax.xaxis.set_ticklabels([])
ax.yaxis.set_ticklabels([])
ax.zaxis.set_ticklabels([])
ax.legend(loc=1)
ax.set_title('centers')

# Panel 2: data coloured by k-means assignment.
ax = fig.add_subplot(1, 3, 2,projection='3d')
# The builtin float replaces the np.float alias, which newer NumPy no longer has.
ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=kmeans_labels.astype(float), edgecolor='k')
ax.xaxis.set_ticklabels([])
ax.yaxis.set_ticklabels([])
ax.zaxis.set_ticklabels([])
ax.set_title('k-means')

# Panel 3: data coloured by subtractive-clustering assignment.
ax = fig.add_subplot(1, 3, 3,projection='3d')
ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=subclust_labels.astype(float), edgecolor='k')
ax.xaxis.set_ticklabels([])
ax.yaxis.set_ticklabels([])
ax.zaxis.set_ticklabels([])
ax.set_title('Subclust')

#3.calculate sum distance
def sum_dist(data=x, center=kmean_c):
    """Return the sum of squared distances to the center for every cluster.

    Each row of ``data`` is assigned to its nearest row of ``center`` with
    ``vq``; for every cluster that received at least one point, the squared
    Euclidean distances from its points to its center are summed.

    Parameters
    ----------
    data : 2-D array, one point per row.
        Defaults to the module-level ``x`` as bound when this function is
        defined.
    center : 2-D array, one cluster center per row.
        Defaults to the module-level ``kmean_c`` as bound when this function
        is defined.

    Returns
    -------
    dict
        Maps ``'cluster<m>'`` (``m`` being the row index into ``center``) to
        that cluster's sum of squared distances.
    """
    labels,_ = vq(data,center)
    dist={}
    for m in np.unique(labels):
        cluster_points=data[labels==m]
        # Broadcasting subtracts the single center row from every point.
        sq_dist=np.sum((cluster_points-center[m])**2,1)
        dist['cluster'+str(m)]=np.sum(sq_dist)
    return dist

# These calls belong at module level: placed after the return statement
# inside the function they could never run.
print(sum_dist(x,subclust_c))
print(sum_dist(x,kmean_c))

# Task7/8 are included in text or previous steps
Previous: project1 | Next: project3

Last updated 5 years ago

Was this helpful?