Project 2
import sklearn.preprocessing as skp
import numpy as np
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from scipy.cluster.vq import kmeans,vq
%matplotlib inline
#=======================Task1.Load the data set in and summarize the data=====================================================
file_path = r'C:\Users\Zhipeng\Desktop\UCD python online\project2\Project2_dataset(1).csv'  # adjust to the local path of the data set
x = np.genfromtxt(file_path, delimiter=',')
#summarize the data
df=pd.DataFrame(x)
df.columns=['x','y','z']
df.describe()
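# Quick sanity check (a sketch, not part of the original tasks): np.genfromtxt silently turns any
# unparseable field (e.g. a header row) into NaN, so it is worth confirming the frame is all numeric
# before clustering.
print('rows containing NaN:', int(df.isna().any(axis=1).sum()))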
#=======================Task3.Implement the Subtractive clustering algorithm in Python========================================
def substractive_clustering(x, ra=0.5, rb_factor=1.25, epsilon_upper=0.5, epsilon_lower=0.15):
    """Subtractive clustering on the rows of x; returns the cluster centers in the original scale."""
    n_points, n_feature = np.shape(x)
    rb = ra * rb_factor
    if n_feature == 1:
        print('This function currently only works for points with at least 2 features')
        return None
    # Step 1. Normalize each point into a unit hyperbox so every dimension is treated equally
    # (this is MinMaxScaler feature scaling)
    scaler = skp.MinMaxScaler()
    scaler.fit(x)
    x_scaled = scaler.transform(x)
    # Step 2. Compute the initial potential for each data point
    radii_vector = ra * np.ones([n_points, n_feature])
    p = np.zeros([n_points, 1])
    for n in range(n_points):
        point = x_scaled[n, :]  # use the scaled point so distances match the scaled data
        point_vector = np.array([point, ] * n_points)
        dx = (point_vector - x_scaled) / radii_vector
        p[n] = np.sum(np.exp(-4 * np.sum(dx ** 2, 1)))
    # Step 3. The first cluster center is the point with the highest potential
    p1 = np.max(p)
    c1_idx = np.argmax(p)
    center1 = x_scaled[c1_idx, :]
    # Steps 4-7
    # Accept the first center, then iteratively revise the potentials of the remaining data
    final_center = center1.reshape(1, -1)
    updated_data = x_scaled
    updated_max_idx = c1_idx
    reduced_p = p
    keep_searching = 1
    num_iteration = 1
    need_calculate_p = 1
    while keep_searching == 1:
        if need_calculate_p == 1:
            # Step 7. Revise the potentials of the remaining data around the newly accepted center
            new_points, new_feature = np.shape(updated_data)
            rb_radii_vector = rb * np.ones([new_points, new_feature])
            dx_reduce = (updated_data - np.array([updated_data[updated_max_idx, :], ] * new_points)) / rb_radii_vector
            reduced_p = reduced_p - reduced_p[updated_max_idx] * np.exp(-4 * np.sum(dx_reduce ** 2, 1)).reshape(new_points, 1)
            # Remove the accepted center so we keep working on the remaining data only
            reduced_p = np.delete(reduced_p, (updated_max_idx), axis=0)
            updated_data = np.delete(updated_data, (updated_max_idx), axis=0)
            updated_max_p = np.max(reduced_p)
            updated_max_idx = np.argmax(reduced_p)
            updated_center = updated_data[updated_max_idx, :]
        # Step 6. Decide whether the candidate with the highest remaining potential becomes the next center
        if updated_max_p / p1 < epsilon_lower:
            keep_searching = 0
        elif updated_max_p / p1 > epsilon_upper:
            final_center = np.vstack([final_center, updated_center.reshape(1, -1)])
            need_calculate_p = 1
        else:
            # Grey zone: accept only if the candidate is far enough from the existing centers
            # relative to its potential (dmin/ra + P*/P1 >= 1)
            dist_vector = np.sqrt(np.sum((final_center - updated_center) ** 2, 1))
            dist_min = np.min(dist_vector)
            test = dist_min / ra + updated_max_p / p1
            if test >= 1:
                final_center = np.vstack([final_center, updated_center.reshape(1, -1)])
                need_calculate_p = 1
            else:
                # Reject this candidate: zero its potential and try the next-best point
                reduced_p[updated_max_idx] = 0
                updated_max_idx = np.argmax(reduced_p)
                updated_center = updated_data[updated_max_idx, :]
                updated_max_p = reduced_p[updated_max_idx, 0]
                need_calculate_p = 0
        print('iteration: ' + str(num_iteration) + ' with px/p1: ' + str(updated_max_p / p1))
        num_iteration += 1
    # Scale the centers back to the original data range
    original_center = scaler.inverse_transform(final_center)
    print('Found %d cluster center(s):' % len(original_center))
    print(original_center)
    return original_center
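# Optional cross-check (a sketch; `initial_potentials` is a hypothetical helper, not part of the tasks):
# the Step 2 potentials P_i = sum_j exp(-4*||x_i - x_j||^2 / ra^2) can also be computed in one shot
# with scipy.spatial.distance.cdist, which is handy for verifying the loop above on the scaled data.
from scipy.spatial.distance import cdist
def initial_potentials(x_scaled, ra=0.5):
    # pairwise squared Euclidean distances between all scaled points
    sq_dist = cdist(x_scaled, x_scaled, 'sqeuclidean')
    # same Gaussian-style potential as Step 2 of substractive_clustering
    return np.exp(-4.0 * sq_dist / ra ** 2).sum(axis=1)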
#=======================Task4.Obtain a clustering for the synthetic data set using substractive clustering algorithm=====================
centers=substractive_clustering(x,ra=0.5,rb_factor=1.25,epsilon_upper=0.5,epsilon_lower=0.15)
labels,_ = vq(x,centers)
fig = plt.figure(figsize=[12,5])
fig.suptitle('Subtractive Clustering with Default Parameters \n (ra=0.5, rb=1.25ra, e_up=0.5, e_low=0.15)')
ax = fig.add_subplot(1, 2, 1, projection='3d')
ax.scatter(centers[:, 0], centers[:, 1], centers[:, 2], c='r', marker='x', s=60, label='cluster centers')
ax.scatter(x[:, 0], x[:, 1], x[:, 2], c='grey', s=20, marker='+')
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])
ax.legend(loc=4)
ax.set_title('Cluster centers')
ax = fig.add_subplot(1, 2, 2, projection='3d')
ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=labels.astype(float), edgecolor='k')
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])
ax.set_title('Cluster assignments')
#=======================Task2.Test a number of parameter values==================================================
centers1=substractive_clustering(x,ra=0.5,rb_factor=1.5,epsilon_upper=0.5,epsilon_lower=0.15)
labels1,_ = vq(x,centers1)
centers2=substractive_clustering(x,ra=0.8,rb_factor=1.25,epsilon_upper=0.5,epsilon_lower=0.15)
labels2,_ = vq(x,centers2)
centers3=substractive_clustering(x,ra=0.5,rb_factor=1.25,epsilon_upper=0.5,epsilon_lower=0.08)
labels3,_ = vq(x,centers3)
centers4=substractive_clustering(x,ra=0.5,rb_factor=1.25,epsilon_upper=1,epsilon_lower=0.15)
labels4,_ = vq(x,centers4)
fig = plt.figure(figsize=[18,3])
fig.suptitle('Test subtractive clustering with different parameters')
ax = fig.add_subplot(1, 4, 1,projection='3d')
ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=labels1.astype(float), edgecolor='k')
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])
ax.set_title('rb=1.5ra')
ax = fig.add_subplot(1, 4, 2,projection='3d')
ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=labels2.astype(float), edgecolor='k')
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])
ax.set_title('ra=0.8')
ax = fig.add_subplot(1, 4, 3,projection='3d')
ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=labels3.astype(float), edgecolor='k')
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])
ax.set_title('e_low=0.08')
ax = fig.add_subplot(1, 4, 4,projection='3d')
ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=labels4.astype(float), edgecolor='k')
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])
ax.set_title('e_up=1')
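# The effect of each parameter setting can also be read off directly from how many centers each run
# produced (a small sketch using the variables defined above):
for name, c in [('rb=1.5ra', centers1), ('ra=0.8', centers2), ('e_low=0.08', centers3), ('e_up=1', centers4)]:
    print('%-10s -> %d center(s)' % (name, len(c)))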
#=======================Task5.Obtain a clustering for the synthetic data set using the K-means algorithm===================================
test_n=4
fig = plt.figure(figsize=[test_n*4,3])
fig.suptitle('Test k-means with different numbers of clusters')
for n in range(test_n):
    cluster_n = n + 2
    centroids, _ = kmeans(x, cluster_n)
    labels, _ = vq(x, centroids)
    ax = fig.add_subplot(1, test_n, n + 1, projection='3d')
    ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=labels.astype(float), edgecolor='k')
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    ax.set_zticklabels([])
    ax.set_title('Number of clusters: ' + str(cluster_n))
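# A possible follow-up (a sketch): scipy's kmeans already returns the mean distortion alongside the
# centroids, so printing it per cluster count gives a rough elbow check for choosing k.
for k in range(2, 2 + test_n):
    _, distortion = kmeans(x, k)
    print('k=%d, mean distortion=%.3f' % (k, distortion))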
#=======================Task6.Compare the clustering results from different algorithms
#1. print centers
kmean_c,_ = kmeans(x,3)
subclust_c=substractive_clustering(x,ra=0.5,rb_factor=1.25,epsilon_upper=0.5,epsilon_lower=0.15)
kmeans_labels,_ = vq(x,kmean_c)
subclust_labels,_=vq(x,subclust_c)
print(kmean_c)
print(subclust_c)
#2. plot
fig = plt.figure(figsize=[16,5])
fig.suptitle('Compare clustering results from different algorithms')
ax = fig.add_subplot(1, 3, 1,projection='3d')
ax.scatter(subclust_c[:, 0], subclust_c[:, 1], subclust_c[:, 2], c='r', marker='x', s=60, label='subtractive')
ax.scatter(kmean_c[:, 0], kmean_c[:, 1], kmean_c[:, 2], c='b', marker='o', s=60, label='k-means')
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])
ax.legend(loc=1)
ax.set_title('centers')
ax = fig.add_subplot(1, 3, 2,projection='3d')
ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=kmeans_labels.astype(float), edgecolor='k')
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])
ax.set_title('k-means')
ax = fig.add_subplot(1, 3, 3,projection='3d')
ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=subclust_labels.astype(float), edgecolor='k')
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])
ax.set_title('Subtractive clustering')
#3. Compute the within-cluster sum of squared distances
def sum_dist(data=x, center=kmean_c):
    """Return a dict with the sum of squared distances to the center, one entry per cluster."""
    labels, _ = vq(data, center)
    dist = {}
    for m in np.unique(labels):
        cluster_points = data[labels == m]
        this_center = center[m]
        # squared Euclidean distance from every point in the cluster to its center
        sq_dist = np.sum((cluster_points - this_center) ** 2, 1)
        dist['cluster' + str(m)] = np.sum(sq_dist)
    return dist
print('subtractive:', sum_dist(x, subclust_c))
print('k-means:', sum_dist(x, kmean_c))
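# One way to compare the two methods overall (a sketch): collapse the per-cluster values returned by
# sum_dist into a single within-cluster SSE per method.
print('total SSE, subtractive:', sum(sum_dist(x, subclust_c).values()))
print('total SSE, k-means:    ', sum(sum_dist(x, kmean_c).values()))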
# Tasks 7 and 8 are covered in the accompanying text and the previous steps