import pandas as pd
import numpy as np
#etc

from sklearn import cluster

iris = pd.read_csv("https://raw.githubusercontent.com/PyDataWorkshop/datasets/master/iris-skl.csv")

iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
Sepal Length    150 non-null float64
Sepal Width     150 non-null float64
Petal Length    150 non-null float64
Petal Width     150 non-null float64
Species         150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.9+ KB

Column Names

iris.columns.tolist()

['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width', 'Species']

iris.columns[iris.columns.str.startswith('Pet')]

Index(['Petal Length', 'Petal Width'], dtype='object')

iris.columns[iris.columns.str.endswith('dth')].tolist()

['Sepal Width', 'Petal Width']

iris.columns = iris.columns.str.replace(' ','')

iris.head()

.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}

</style>

	SepalLength	SepalWidth	PetalLength	PetalWidth	Species
0	5.1	3.5	1.4	0.2	setosa
1	4.9	3.0	1.4	0.2	setosa
2	4.7	3.2	1.3	0.2	setosa
3	4.6	3.1	1.5	0.2	setosa
4	5.0	3.6	1.4	0.2	setosa

iris["Species"].head(5)

0    setosa
1    setosa
2    setosa
3    setosa
4    setosa
Name: Species, dtype: object

# Count the observations in each species. The `axis` keyword is left out:
# 0 is the default, and recent pandas releases deprecate passing it to groupby.
iris["Species"].groupby(iris["Species"]).count()

Species
setosa        50
versicolor    50
virginica     50
Name: Species, dtype: int64

iris.columns[iris.columns.str.startswith('Sep')]

Index(['SepalLength', 'SepalWidth'], dtype='object')

iris.columns[iris.columns.str.endswith('gth')]

Index(['SepalLength', 'PetalLength'], dtype='object')

## df = df.rename(columns={'oldName1': 'newName1', 'oldName2': 'newName2'})

### Remove White Space from Column Names

#print(iris.columns.tolist())
# NOTE: with the arguments in this order (pattern '' -> replacement ' ') the
# call does not remove anything; it inserts a space around every character of
# each column name. The result is only displayed, not assigned, so
# iris.columns itself is unchanged. Swapping the arguments,
# str.replace(' ',''), is the form that strips the spaces.
iris.columns.str.replace('',' ') 
#iris.columns.tolist()

Index([' S e p a l L e n g t h ', ' S e p a l W i d t h ',
       ' P e t a l L e n g t h ', ' P e t a l W i d t h ', ' S p e c i e s '],
      dtype='object')

iris.columns.str.replace(' ','')

Index(['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Species'], dtype='object')

iris.columns = iris.columns.str.replace(' ','')

iris[1:4] #Rows

.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}

</style>

	SepalLength	SepalWidth	PetalLength	PetalWidth	Species
1	4.9	3.0	1.4	0.2	setosa
2	4.7	3.2	1.3	0.2	setosa
3	4.6	3.1	1.5	0.2	setosa

***Not what we want! Python is zero-indexed, so `iris[1:4]` returns the rows labelled 1 to 3 and skips the first row (index 0).***

integer location

iris.iloc[:, [1]].head(5)

.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}

</style>

	SepalWidth
0	3.5
1	3.0
2	3.2
3	3.1
4	3.6

iris.iloc[:,:4].head(5)

# All Rows
# First 4 Columns

.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}

</style>

	SepalLength	SepalWidth	PetalLength	PetalWidth
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

iris_feature = iris.iloc[:,:4]
iris_feature.head(5)

.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}

</style>

	SepalLength	SepalWidth	PetalLength	PetalWidth
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

iris_targ = iris["Species"]
iris_targ.head(6)

0    setosa
1    setosa
2    setosa
3    setosa
4    setosa
5    setosa
Name: Species, dtype: object

# alternative approach

iris.drop(iris.columns[:4], axis=1).head(5)

.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}

</style>

	Species
0	setosa
1	setosa
2	setosa
3	setosa
4	setosa

type(iris_targ)

pandas.core.series.Series

# Number of clusters to look for (one per iris species).
k = 3
# n_init is pinned to 10 restarts. Newer scikit-learn releases default to
# n_init='auto' (a single run when init='k-means++'), which would no longer
# match the estimator settings this walkthrough was recorded with.
kmeans = cluster.KMeans(n_clusters=k, n_init=10)

# Fit on the four numeric feature columns only; the species column is not used.
kmeans.fit(iris_feature)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

# Retrieve the cluster label assigned to each observation and the coordinates of the cluster centroids:

labels = kmeans.labels_
centroids = kmeans.cluster_centers_

labels

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 2,
       0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,
       2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2], dtype=int32)

pd.crosstab(iris_targ,labels)

.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}

</style>

col_0	0	1	2
Species
setosa	0	50	0
versicolor	2	0	48
virginica	36	0	14

# Functional interface to k-means: one call returns the centroids, the
# per-observation labels and the inertia. This rebinds `centroids` and `labels`.
centroids, labels, inertia = cluster.k_means(iris_feature, n_clusters=k, n_init=25, algorithm="elkan")

labels

type(pd.crosstab(iris_targ, labels))

# Contingency table: rows are the true species, columns are the cluster ids.
df = pd.crosstab(iris_targ, labels)

# K-means numbers its clusters arbitrarily and no random_state is set, so the
# ids can change from run to run. Instead of hard-coding an order such as
# [1, 2, 0], take each species' most frequent cluster (in row order) so the
# large counts line up on the diagonal whatever ids were assigned.
dominant = df.idxmax(axis=1).tolist()
# Drop repeats (two species may share a dominant cluster) and append any
# cluster that was nobody's first choice, so every column appears exactly once.
column_titles = list(dict.fromkeys(dominant)) + [c for c in df.columns if c not in dominant]

df.reindex(columns=column_titles)

.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}

</style>

col_0	1	2	0
Species
setosa	50	0	0
versicolor	0	48	2
virginica	0	14	36

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

SKL_Iris_Clustering.md

SKL_Iris_Clustering.md

Column Names

integer location

	SepalLength	SepalWidth	PetalLength	PetalWidth
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

	SepalLength	SepalWidth	PetalLength	PetalWidth
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

	SepalLength	SepalWidth	PetalLength	PetalWidth
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

	SepalLength	SepalWidth	PetalLength	PetalWidth
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

Files

SKL_Iris_Clustering.md

Latest commit

History

SKL_Iris_Clustering.md

File metadata and controls

Column Names

integer location

	SepalLength	SepalWidth	PetalLength	PetalWidth
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

	SepalLength	SepalWidth	PetalLength	PetalWidth
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2