Skip to content

Latest commit

 

History

History
815 lines (591 loc) · 11.2 KB

SKL_Iris_Clustering.md

File metadata and controls

815 lines (591 loc) · 11.2 KB
import pandas as pd
import numpy as np
#etc

from sklearn import cluster
iris = pd.read_csv("https://raw.githubusercontent.com/PyDataWorkshop/datasets/master/iris-skl.csv")
iris.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
Sepal Length    150 non-null float64
Sepal Width     150 non-null float64
Petal Length    150 non-null float64
Petal Width     150 non-null float64
Species         150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.9+ KB

Column Names

iris.columns.tolist()
['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width', 'Species']
iris.columns[iris.columns.str.startswith('Pet')]
Index(['Petal Length', 'Petal Width'], dtype='object')
iris.columns[iris.columns.str.endswith('dth')].tolist()
['Sepal Width', 'Petal Width']
iris.columns = iris.columns.str.replace(' ','')
iris.head()
<style> .dataframe thead tr:only-child th { text-align: right; }
.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}
</style>
SepalLength SepalWidth PetalLength PetalWidth Species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
iris["Species"].head(5)
0    setosa
1    setosa
2    setosa
3    setosa
4    setosa
Name: Species, dtype: object
# Count the rows per species. The `axis` keyword of groupby is deprecated in
# recent pandas releases and was redundant here (a Series has only axis 0).
iris["Species"].groupby(iris["Species"]).count()
Species
setosa        50
versicolor    50
virginica     50
Name: Species, dtype: int64
iris.columns[iris.columns.str.startswith('Sep')]
Index(['SepalLength', 'SepalWidth'], dtype='object')
iris.columns[iris.columns.str.endswith('gth')]
Index(['SepalLength', 'PetalLength'], dtype='object')
# General pattern for renaming individual columns:
## df = df.rename(columns={'oldName1': 'newName1', 'oldName2': 'newName2'})
### Remove White Space from Column Names
# NOTE: the arguments below are the wrong way round for that goal -- the
# search pattern is the empty string and the replacement is a space, so a
# space is inserted around every character (see the output that follows).
# The result is not assigned, so iris.columns itself is left unchanged.
#print(iris.columns.tolist())
iris.columns.str.replace('',' ') 
#iris.columns.tolist()
Index([' S e p a l L e n g t h ', ' S e p a l W i d t h ',
       ' P e t a l L e n g t h ', ' P e t a l W i d t h ', ' S p e c i e s '],
      dtype='object')
iris.columns.str.replace(' ','') 
Index(['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Species'], dtype='object')
iris.columns = iris.columns.str.replace(' ','')
iris[1:4] #Rows
<style> .dataframe thead tr:only-child th { text-align: right; }
.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}
</style>
SepalLength SepalWidth PetalLength PetalWidth Species
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa

*** Not what we want! Python is zero-indexed, so `iris[1:4]` returns rows 1, 2 and 3 and skips the first row (row 0). ***

Integer-location indexing (`iloc`)

iris.iloc[:, [1]].head(5)
<style> .dataframe thead tr:only-child th { text-align: right; }
.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}
</style>
SepalWidth
0 3.5
1 3.0
2 3.2
3 3.1
4 3.6
iris.iloc[:,:4].head(5)

# All Rows
# First 4 Columns
<style> .dataframe thead tr:only-child th { text-align: right; }
.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}
</style>
SepalLength SepalWidth PetalLength PetalWidth
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
iris_feature = iris.iloc[:,:4]
iris_feature.head(5)
<style> .dataframe thead tr:only-child th { text-align: right; }
.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}
</style>
SepalLength SepalWidth PetalLength PetalWidth
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
iris_targ = iris["Species"]
iris_targ.head(6)
0    setosa
1    setosa
2    setosa
3    setosa
4    setosa
5    setosa
Name: Species, dtype: object
# Alternative approach: drop the first four (feature) columns instead of
# selecting "Species" by name. Note that this keeps a one-column DataFrame,
# whereas iris["Species"] (iris_targ) is a Series.
iris.drop(iris.columns[:4], axis=1).head(5)
<style> .dataframe thead tr:only-child th { text-align: right; }
.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}
</style>
Species
0 setosa
1 setosa
2 setosa
3 setosa
4 setosa
type(iris_targ)
pandas.core.series.Series
# Number of clusters to fit; 3 matches the number of species in the data.
k = 3
# random_state is not set, so the integer id given to each cluster is
# arbitrary and may differ from one run to the next.
kmeans = cluster.KMeans(n_clusters=k)
# Fit on the four numeric feature columns only (Species is not included).
kmeans.fit(iris_feature)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)
# Read the fitted results off the estimator: labels_ holds the id of the owning
# cluster for each observation, and cluster_centers_ holds the centroid locations.

labels = kmeans.labels_
centroids = kmeans.cluster_centers_
labels
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 2,
       0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,
       2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2], dtype=int32)
pd.crosstab(iris_targ,labels)
<style> .dataframe thead tr:only-child th { text-align: right; }
.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}
</style>
col_0 0 1 2
Species
setosa 0 50 0
versicolor 2 0 48
virginica 36 0 14
# Re-run k-means through the functional API, which returns the centroids, the
# per-observation labels and the final inertia as a tuple.
centroids,labels,inertia = cluster.k_means(iris_feature,n_clusters=k,n_init=25,algorithm = "elkan")
labels
type(pd.crosstab(iris_targ,labels))
df = pd.crosstab(iris_targ,labels)

# Cluster ids are arbitrary and can be permuted between runs, so a hard-coded
# column order such as [1, 2, 0] is only right for one particular run.
# Instead, order the columns by the cluster that dominates each species (row),
# which puts the best-matching counts on the diagonal.
dominant = df.idxmax(axis=1).tolist()
# Drop repeats (two species may share a dominant cluster) while keeping order,
# then append any cluster that was not picked so that no column is lost.
column_titles = list(dict.fromkeys(dominant))
column_titles += [c for c in df.columns if c not in column_titles]

df.reindex(columns=column_titles)
<style> .dataframe thead tr:only-child th { text-align: right; }
.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}
</style>
col_0 1 2 0
Species
setosa 50 0 0
versicolor 0 48 2
virginica 0 14 36