Project 2
import sklearn.preprocessing as skp
import numpy as np
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from scipy.cluster.vq import kmeans,vq
%matplotlib inline
#=======================Task1.Load the data set in and summarize the data=====================================================
file_path = r'C:\Users\Zhipeng\Desktop\UCD python online\project2\Project2_dataset(1).csv'  # adjust to the local path of the data set
x = np.genfromtxt(file_path, delimiter=',')
#summarize the data
df=pd.DataFrame(x)
df.columns=['x','y','z']
df.describe()
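# Quick sanity check (a sketch, not part of the original tasks): np.genfromtxt silently turns any
# unparseable field (e.g. a header row) into NaN, so it is worth confirming the frame is all numeric
# before clustering.
print('rows containing NaN:', int(df.isna().any(axis=1).sum()))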
#=======================Task3.Implement the Subtractive clustering algorithm in Python========================================
def substractive_clustering(x, ra=0.5, rb_factor=1.25, epsilon_upper=0.5, epsilon_lower=0.15):
    """Subtractive clustering on the rows of x; returns the cluster centers in the original scale."""
    n_points, n_feature = np.shape(x)
    rb = ra * rb_factor
    if n_feature == 1:
        print('This function currently only works for points with at least 2 features')
        return None
    # Step 1. Normalize each point into a unit hyperbox so every dimension is treated equally
    # (this is MinMaxScaler feature scaling)
    scaler = skp.MinMaxScaler()
    scaler.fit(x)
    x_scaled = scaler.transform(x)
    # Step 2. Compute the initial potential for each data point
    radii_vector = ra * np.ones([n_points, n_feature])
    p = np.zeros([n_points, 1])
    for n in range(n_points):
        point = x_scaled[n, :]  # use the scaled point so distances match the scaled data
        point_vector = np.array([point, ] * n_points)
        dx = (point_vector - x_scaled) / radii_vector
        p[n] = np.sum(np.exp(-4 * np.sum(dx ** 2, 1)))
    # Step 3. The first cluster center is the point with the highest potential
    p1 = np.max(p)
    c1_idx = np.argmax(p)
    center1 = x_scaled[c1_idx, :]
    # Steps 4-7
    # Accept the first center, then iteratively revise the potentials of the remaining data
    final_center = center1.reshape(1, -1)
    updated_data = x_scaled
    updated_max_idx = c1_idx
    reduced_p = p
    keep_searching = 1
    num_iteration = 1
    need_calculate_p = 1
    while keep_searching == 1:
        if need_calculate_p == 1:
            # Step 7. Revise the potentials of the remaining data around the newly accepted center
            new_points, new_feature = np.shape(updated_data)
            rb_radii_vector = rb * np.ones([new_points, new_feature])
            dx_reduce = (updated_data - np.array([updated_data[updated_max_idx, :], ] * new_points)) / rb_radii_vector
            reduced_p = reduced_p - reduced_p[updated_max_idx] * np.exp(-4 * np.sum(dx_reduce ** 2, 1)).reshape(new_points, 1)
            # Remove the accepted center so we keep working on the remaining data only
            reduced_p = np.delete(reduced_p, (updated_max_idx), axis=0)
            updated_data = np.delete(updated_data, (updated_max_idx), axis=0)
            updated_max_p = np.max(reduced_p)
            updated_max_idx = np.argmax(reduced_p)
            updated_center = updated_data[updated_max_idx, :]
        # Step 6. Decide whether the candidate with the highest remaining potential becomes the next center
        if updated_max_p / p1 < epsilon_lower:
            keep_searching = 0
        elif updated_max_p / p1 > epsilon_upper:
            final_center = np.vstack([final_center, updated_center.reshape(1, -1)])
            need_calculate_p = 1
        else:
            # Grey zone: accept only if the candidate is far enough from the existing centers
            # relative to its potential (dmin/ra + P*/P1 >= 1)
            dist_vector = np.sqrt(np.sum((final_center - updated_center) ** 2, 1))
            dist_min = np.min(dist_vector)
            test = dist_min / ra + updated_max_p / p1
            if test >= 1:
                final_center = np.vstack([final_center, updated_center.reshape(1, -1)])
                need_calculate_p = 1
            else:
                # Reject this candidate: zero its potential and try the next-best point
                reduced_p[updated_max_idx] = 0
                updated_max_idx = np.argmax(reduced_p)
                updated_center = updated_data[updated_max_idx, :]
                updated_max_p = reduced_p[updated_max_idx, 0]
                need_calculate_p = 0
        print('iteration: ' + str(num_iteration) + ' with px/p1: ' + str(updated_max_p / p1))
        num_iteration += 1
    # Scale the centers back to the original data range
    original_center = scaler.inverse_transform(final_center)
    print('Found %d cluster center(s):' % len(original_center))
    print(original_center)
    return original_center
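# Optional cross-check (a sketch; `initial_potentials` is a hypothetical helper, not part of the tasks):
# the Step 2 potentials P_i = sum_j exp(-4*||x_i - x_j||^2 / ra^2) can also be computed in one shot
# with scipy.spatial.distance.cdist, which is handy for verifying the loop above on the scaled data.
from scipy.spatial.distance import cdist
def initial_potentials(x_scaled, ra=0.5):
    # pairwise squared Euclidean distances between all scaled points
    sq_dist = cdist(x_scaled, x_scaled, 'sqeuclidean')
    # same Gaussian-style potential as Step 2 of substractive_clustering
    return np.exp(-4.0 * sq_dist / ra ** 2).sum(axis=1)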
#=======================Task4.Obtain a clustering for the synthetic data set using substractive clustering algorithm=====================
centers=substractive_clustering(x,ra=0.5,rb_factor=1.25,epsilon_upper=0.5,epsilon_lower=0.15)
labels,_ = vq(x,centers)
fig = plt.figure(figsize=[12,5])
fig.suptitle('Subtractive Clustering with Default Parameters \n (ra=0.5, rb=1.25ra, e_up=0.5, e_low=0.15)')
ax = fig.add_subplot(1, 2, 1, projection='3d')
ax.scatter(centers[:, 0], centers[:, 1], centers[:, 2], c='r', marker='x', s=60, label='cluster centers')
ax.scatter(x[:, 0], x[:, 1], x[:, 2], c='grey', s=20, marker='+')
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])
ax.legend(loc=4)
ax.set_title('Cluster centers')
ax = fig.add_subplot(1, 2, 2, projection='3d')
ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=labels.astype(float), edgecolor='k')
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])
ax.set_title('Cluster assignments')
#=======================Task2.Test a number of parameter values==================================================
centers1=substractive_clustering(x,ra=0.5,rb_factor=1.5,epsilon_upper=0.5,epsilon_lower=0.15)
labels1,_ = vq(x,centers1)
centers2=substractive_clustering(x,ra=0.8,rb_factor=1.25,epsilon_upper=0.5,epsilon_lower=0.15)
labels2,_ = vq(x,centers2)
centers3=substractive_clustering(x,ra=0.5,rb_factor=1.25,epsilon_upper=0.5,epsilon_lower=0.08)
labels3,_ = vq(x,centers3)
centers4=substractive_clustering(x,ra=0.5,rb_factor=1.25,epsilon_upper=1,epsilon_lower=0.15)
labels4,_ = vq(x,centers4)
fig = plt.figure(figsize=[18,3])
fig.suptitle('Test subtractive clustering with different parameters')
ax = fig.add_subplot(1, 4, 1,projection='3d')
ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=labels1.astype(float), edgecolor='k')
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])
ax.set_title('rb=1.5ra')
ax = fig.add_subplot(1, 4, 2,projection='3d')
ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=labels2.astype(float), edgecolor='k')
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])
ax.set_title('ra=0.8')
ax = fig.add_subplot(1, 4, 3,projection='3d')
ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=labels3.astype(float), edgecolor='k')
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])
ax.set_title('e_low=0.08')
ax = fig.add_subplot(1, 4, 4,projection='3d')
ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=labels4.astype(float), edgecolor='k')
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])
ax.set_title('e_up=1')
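# The effect of each parameter setting can also be read off directly from how many centers each run
# produced (a small sketch using the variables defined above):
for name, c in [('rb=1.5ra', centers1), ('ra=0.8', centers2), ('e_low=0.08', centers3), ('e_up=1', centers4)]:
    print('%-10s -> %d center(s)' % (name, len(c)))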
#=======================Task5.Obtain a clustering for the synthetic data set using the K-means algorithm===================================
test_n=4
fig = plt.figure(figsize=[test_n*4,3])
fig.suptitle('Test k-means with different numbers of clusters')
for n in range(test_n):
    cluster_n = n + 2
    centroids, _ = kmeans(x, cluster_n)
    labels, _ = vq(x, centroids)
    ax = fig.add_subplot(1, test_n, n + 1, projection='3d')
    ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=labels.astype(float), edgecolor='k')
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    ax.set_zticklabels([])
    ax.set_title('Number of clusters: ' + str(cluster_n))
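# A possible follow-up (a sketch): scipy's kmeans already returns the mean distortion alongside the
# centroids, so printing it per cluster count gives a rough elbow check for choosing k.
for k in range(2, 2 + test_n):
    _, distortion = kmeans(x, k)
    print('k=%d, mean distortion=%.3f' % (k, distortion))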
#=======================Task6.Compare the clustering results from different algorithms
#1. print centers
kmean_c,_ = kmeans(x,3)
subclust_c=substractive_clustering(x,ra=0.5,rb_factor=1.25,epsilon_upper=0.5,epsilon_lower=0.15)
kmeans_labels,_ = vq(x,kmean_c)
subclust_labels,_=vq(x,subclust_c)
print(kmean_c)
print(subclust_c)
#2. plot
fig = plt.figure(figsize=[16,5])
fig.suptitle('Compare clustering results from different algorithms')
ax = fig.add_subplot(1, 3, 1,projection='3d')
ax.scatter(subclust_c[:, 0], subclust_c[:, 1], subclust_c[:, 2], c='r', marker='x', s=60, label='subtractive')
ax.scatter(kmean_c[:, 0], kmean_c[:, 1], kmean_c[:, 2], c='b', marker='o', s=60, label='k-means')
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])
ax.legend(loc=1)
ax.set_title('centers')
ax = fig.add_subplot(1, 3, 2,projection='3d')
ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=kmeans_labels.astype(float), edgecolor='k')
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])
ax.set_title('k-means')
ax = fig.add_subplot(1, 3, 3,projection='3d')
ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=subclust_labels.astype(float), edgecolor='k')
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])
ax.set_title('Subtractive clustering')
#3. Compute the within-cluster sum of squared distances
def sum_dist(data=x, center=kmean_c):
    """Return a dict with the sum of squared distances to the center, one entry per cluster."""
    labels, _ = vq(data, center)
    dist = {}
    for m in np.unique(labels):
        cluster_points = data[labels == m]
        this_center = center[m]
        # squared Euclidean distance from every point in the cluster to its center
        sq_dist = np.sum((cluster_points - this_center) ** 2, 1)
        dist['cluster' + str(m)] = np.sum(sq_dist)
    return dist
print('subtractive:', sum_dist(x, subclust_c))
print('k-means:', sum_dist(x, kmean_c))
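# One way to compare the two methods overall (a sketch): collapse the per-cluster values returned by
# sum_dist into a single within-cluster SSE per method.
print('total SSE, subtractive:', sum(sum_dist(x, subclust_c).values()))
print('total SSE, k-means:    ', sum(sum_dist(x, kmean_c).values()))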
# Tasks 7 and 8 are covered in the accompanying text and the previous steps