Week8
Types of Data for Classification
Qualitative Data
Categorical or nominal data
Ordinal or Ranked Data
Quantitative Data
Discrete Data
Continuous measurements
Handling Categorical/Nominal Data: Dummy Encoding
# Dummy-encode (one-hot encode) a categorical variable with statsmodels.
# NOTE(review): `categorical` was deprecated in statsmodels 0.12 and is not
# available in recent releases; pandas.get_dummies is the suggested
# replacement -- confirm against the installed statsmodels version.
from statsmodels.tools import categorical
cat_encod=categorical(data, dictnames=False, drop=True)
# dictnames: if True, additionally return a dict mapping each dummy column
#            number to the category name it represents
# drop: if True, the original categorical variable is dropped from the
#       returned matrix, so only the dummy columns remain
Support Vector Machine (SVM)
https://oceandatamining.sciencesconf.org/data/program/OBIDAM14_Canu.pdf https://web.stanford.edu/~hastie/Papers/ESLII.pdf
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Hold out a fraction `f` (0 < f < 1) of the samples for testing.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=f)

# The regularization parameter is spelled with a capital `C`; a lowercase
# `c` is not an SVC argument and raises TypeError.
svc = SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', probability=False,
          tol=0.001, max_iter=-1, random_state=None)
svc.fit(x_train, y_train)
svc.predict(x_test)   # predictions on unseen data (generalization)
svc.predict(x_train)  # predictions on the training data (fit quality)
# C: penalty (regularization) parameter; larger values penalize
#    misclassified training samples more strongly
# kernel: 'rbf', 'sigmoid', 'poly' (also 'linear')
# degree: polynomial degree, only used by the 'poly' kernel
# gamma: kernel coefficient for 'rbf', 'poly' and 'sigmoid'
# probability: whether to enable probability estimates
# tol: tolerance of the stopping criterion
# max_iter: hard limit on solver iterations; -1 means no limit
# random_state: random seed to use
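Naive Bayes
A family of classification algorithms based on Bayes' theorem that share the "naive" assumption that the features are conditionally independent given the class.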
from sklearn.naive_bayes import GaussianNB

# `priors` must be passed by keyword: GaussianNB's constructor arguments are
# keyword-only in current scikit-learn, so GaussianNB(priors) is rejected.
# priors: prior probability of each class, shape (n_classes,); use None to
#         let the model estimate the priors from the training data.
NB = GaussianNB(priors=priors)
NB.fit(X, y)
NB.predict(X_dash)
Random Forest
from sklearn.ensemble import RandomForestClassifier

# Build a forest of 10 trees with a fixed seed and train it in one step;
# fit() returns the estimator itself, so RF is the fitted model.
RF = RandomForestClassifier(n_estimators=10, random_state=12).fit(x, y)
# Predict the class of each new sample.
RF.predict(x_dash)
Classification Metrics: Confusion Matrix
The diagonal elements of the matrix contain the number of correctly identified samples for each class; the off-diagonal elements count the misclassified samples.
# The package name is `sklearn`; the misspelled `sklean` cannot be imported.
from sklearn.metrics import confusion_matrix

# Entry (i, j) counts the samples whose actual class is i and whose
# predicted class is j.
CM = confusion_matrix(y_actual, y_predicted)
print(CM)
Important note: use a heatmap to plot the confusion matrix (CM).
# Draw the confusion matrix in the first cell of a 1x3 grid of subplots.
ax = plt.subplot(1, 3, 1)
# The matrix is transposed so that true labels run along the x axis and
# predicted labels along the y axis, matching the axis captions below.
sns.heatmap(CM.T, square=True, annot=True, fmt='d', cbar=False, ax=ax)
ax.set_xlabel('true label')
ax.set_ylabel('predicted label')
Exercise
RF
Import the dataset from the URL below and classify it with a decision tree and with a Random Forest (RF) using 5 trees, then compare the results of the two models on the testing data using confusion matrices.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from statsmodels.tools import categorical
# ------------------ Loading Dataset --------------------------#
# The URL is written as two adjacent string literals inside the call's
# parentheses so that it forms one valid string; a literal broken across
# lines is a syntax error. The file has no header row, hence header=None
# (all column access below is positional).
dataframe = pd.read_csv(
    "http://archive.ics.uci.edu/ml/machine-learning-"
    "databases/abalone/abalone.data",
    header=None,
)
# Keep only the last 1000 rows.
dataframe = dataframe.drop(dataframe.index[:-1000])
# Column 0 is the categorical attribute, columns 1..8 are numeric.
numericdf = dataframe[dataframe.columns[1:9]]
categordf = dataframe[dataframe.columns[0]]
# Dummy-encode the categorical column and keep the first two dummy columns
# (the remaining one is implied by the other two).
categordf_en = categorical(categordf.values, drop=True)
categordf_en = categordf_en[:, 0:2]
numeric_arr = np.asarray(numericdf.values)
categor_arr = np.asarray(categordf_en)
# The last numeric column is the class to predict.
Output = numeric_arr[:, 7]
# NOTE(review): only numeric columns 0..5 are used as features; column 6
# (shell weight in the UCI attribute list) is left out -- confirm whether
# 0:7 was intended.
Input_numeric = numeric_arr[:, 0:6]
Input_categor = categor_arr
Input = np.concatenate((Input_numeric, Input_categor), axis=1)
# The exercise asks for a comparison on testing data, so hold out a test
# set instead of scoring the models on the samples they were trained on.
Input_train, Input_test, Output_train, Output_test = train_test_split(
    Input, Output, test_size=0.25, random_state=12)
# Use one shared label set so both confusion matrices have the same shape
# and cell (i, j) refers to the same pair of classes in both plots.
class_labels = np.unique(Output)
#---------------------------------------------------------------#
RF = RandomForestClassifier(n_estimators=5, random_state=12)
RF.fit(Input_train, Output_train)
Z_RF = RF.predict(Input_test)
CM_RF = confusion_matrix(Output_test, Z_RF, labels=class_labels)
#---------------------------------------------------------------
DT = tree.DecisionTreeClassifier()
DT.fit(Input_train, Output_train)
Z_DT = DT.predict(Input_test)
CM_DT = confusion_matrix(Output_test, Z_DT, labels=class_labels)
#---------------------------------------------------------------
# Left panel: Random Forest; right panel: decision tree. Each matrix is
# transposed so true labels are on the x axis and predictions on the y axis.
for position, matrix in ((121, CM_RF), (122, CM_DT)):
    plt.subplot(position)
    sns.heatmap(matrix.T, square=True, annot=True, fmt='d', cbar=False)
    plt.xlabel('true label')
    plt.ylabel('predicted label')
plt.show()
Last updated
Was this helpful?