import pandas as pd
import numpy as np
from sklearn import cluster
# Load the iris data set into a DataFrame.
# NOTE(review): the path argument is an empty string, so this call cannot
# locate a file as written — supply the CSV location before running.
iris = pd.read_csv("")
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
Sepal Length 150 non-null float64
Sepal Width 150 non-null float64
Petal Length 150 non-null float64
Petal Width 150 non-null float64
Species 150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.9+ KB
['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width', 'Species']
Index(['Petal Length', 'Petal Width'], dtype='object')
['Sepal Width', 'Petal Width']
# Strip the spaces out of every column label ("Sepal Length" -> "SepalLength").
iris.columns = [label.replace(' ', '') for label in iris.columns]
SepalLength |
SepalWidth |
PetalLength |
PetalWidth |
Species |
0 |
5.1 |
3.5 |
1.4 |
0.2 |
setosa |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
setosa |
2 |
4.7 |
3.2 |
1.3 |
0.2 |
setosa |
3 |
4.6 |
3.1 |
1.5 |
0.2 |
setosa |
4 |
5.0 |
3.6 |
1.4 |
0.2 |
setosa |
0 setosa
1 setosa
2 setosa
3 setosa
4 setosa
Name: Species, dtype: object
# Count the observations per species. The `axis` keyword is omitted: it is
# deprecated for groupby in recent pandas releases, and axis 0 is the only
# axis a Series has, so the result is unchanged.
iris["Species"].groupby(iris["Species"]).count()
setosa 50
versicolor 50
virginica 50
Name: Species, dtype: int64
Index(['SepalLength', 'SepalWidth'], dtype='object')
Index(['SepalLength', 'PetalLength'], dtype='object')
## df = df.rename(columns={'oldName1': 'newName1', 'oldName2': 'newName2'})
### Remove White Space from Column Names
# NOTE: the arguments are swapped in this call — it replaces the empty string
# with a space, which puts a space around every character of each label
# (see the output that follows). Nothing is assigned, so `iris` is unchanged.
iris.columns.str.replace('',' ')
Index([' S e p a l L e n g t h ', ' S e p a l W i d t h ',
' P e t a l L e n g t h ', ' P e t a l W i d t h ', ' S p e c i e s '],
# Preview only: returns the labels with spaces removed without assigning them
# back to `iris.columns`.
iris.columns.str.replace(' ','')
Index(['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Species'], dtype='object')
# Make the change permanent: rebuild the column labels with all spaces removed.
iris.columns = [name.replace(' ', '') for name in iris.columns]
SepalLength |
SepalWidth |
PetalLength |
PetalWidth |
Species |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
setosa |
2 |
4.7 |
3.2 |
1.3 |
0.2 |
setosa |
3 |
4.6 |
3.1 |
1.5 |
0.2 |
setosa |
*** Not what we want! Python is zero-indexed. ***
# First five rows of the column at position 1. Positions are zero-based, so
# this is the second column; the list [1] keeps the result a DataFrame.
iris.iloc[:5, [1]]
SepalWidth |
0 |
3.5 |
1 |
3.0 |
2 |
3.2 |
3 |
3.1 |
4 |
3.6 |
# All Rows
# First 4 Columns
SepalLength |
SepalWidth |
PetalLength |
PetalWidth |
0 |
5.1 |
3.5 |
1.4 |
0.2 |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
2 |
4.7 |
3.2 |
1.3 |
0.2 |
3 |
4.6 |
3.1 |
1.5 |
0.2 |
4 |
5.0 |
3.6 |
1.4 |
0.2 |
# Feature matrix: every row, columns at positions 0-3 (the four measurement
# columns), leaving out the Species label column.
iris_feature = iris.iloc[:,:4]
SepalLength |
SepalWidth |
PetalLength |
PetalWidth |
0 |
5.1 |
3.5 |
1.4 |
0.2 |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
2 |
4.7 |
3.2 |
1.3 |
0.2 |
3 |
4.6 |
3.1 |
1.5 |
0.2 |
4 |
5.0 |
3.6 |
1.4 |
0.2 |
# Target vector: the Species label of each observation, as a Series.
iris_targ = iris["Species"]
0 setosa
1 setosa
2 setosa
3 setosa
4 setosa
5 setosa
Name: Species, dtype: object
# Alternative way to isolate the label: drop the first four columns by name
# and show the first five remaining rows. `iris` itself is not modified.
iris.drop(columns=iris.columns[:4]).head(5)
Species |
0 |
setosa |
1 |
setosa |
2 |
setosa |
3 |
setosa |
4 |
setosa |
# Number of clusters to look for — one per iris species.
k = 3
kmeans = cluster.KMeans(n_clusters=k)
# Fit on the four numeric feature columns. Without this step the estimator has
# no labels_ / cluster_centers_ attributes to read afterwards.
kmeans.fit(iris_feature)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',
random_state=None, tol=0.0001, verbose=0)
# Read the results off the fitted estimator: `labels_` holds the index of the
# owning cluster for each observation in the data set, and `cluster_centers_`
# holds the location of each centroid. Both exist only after `fit` has run.
labels = kmeans.labels_
centroids = kmeans.cluster_centers_
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 2,
0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,
2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2], dtype=int32)
col_0 |
0 |
1 |
2 |
Species |
setosa |
0 |
50 |
0 |
versicolor |
2 |
0 |
48 |
virginica |
36 |
0 |
14 |
# Functional k-means API: one call returns the centroids, the per-row cluster
# labels and the final inertia, using 25 initialisations and the Elkan variant.
centroids, labels, inertia = cluster.k_means(
    iris_feature, n_clusters=k, n_init=25, algorithm="elkan"
)
# Cross-tabulate the true species against the assigned cluster labels.
df = pd.crosstab(iris_targ, labels)
# Cluster ids are arbitrary, so pick a column order that lines the clusters up
# with the species rows.
# NOTE(review): this order matches the run shown in the output and may need
# adjusting when the clustering is re-run.
column_titles = [1, 2, 0]
# Display the table with its columns in the chosen order.
df.reindex(columns=column_titles)
col_0 |
1 |
2 |
0 |
Species |
setosa |
50 |
0 |
0 |
versicolor |
0 |
48 |
2 |
virginica |
0 |
14 |
36 |