import pandas as pd
import numpy as np
#etc
from sklearn import cluster
# Load the iris data set (a CSV hosted in the PyDataWorkshop datasets repository) into a DataFrame.
iris = pd.read_csv("https://raw.githubusercontent.com/PyDataWorkshop/datasets/master/iris-skl.csv")
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
Sepal Length 150 non-null float64
Sepal Width 150 non-null float64
Petal Length 150 non-null float64
Petal Width 150 non-null float64
Species 150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.9+ KB
['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width', 'Species']
# Build a boolean mask over the column labels, then use it to pick the labels starting with "Pet".
starts_with_pet = iris.columns.str.startswith('Pet')
iris.columns[starts_with_pet]
Index(['Petal Length', 'Petal Width'], dtype='object')
# Same idea for a suffix; .tolist() turns the resulting Index into a plain Python list.
ends_with_dth = iris.columns.str.endswith('dth')
iris.columns[ends_with_dth].tolist()
['Sepal Width', 'Petal Width']
# Strip the spaces out of every column label (e.g. 'Sepal Length' -> 'SepalLength').
iris.columns = iris.columns.str.replace(' ','')
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
|
SepalLength |
SepalWidth |
PetalLength |
PetalWidth |
Species |
0 |
5.1 |
3.5 |
1.4 |
0.2 |
setosa |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
setosa |
2 |
4.7 |
3.2 |
1.3 |
0.2 |
setosa |
3 |
4.6 |
3.1 |
1.5 |
0.2 |
setosa |
4 |
5.0 |
3.6 |
1.4 |
0.2 |
setosa |
0 setosa
1 setosa
2 setosa
3 setosa
4 setosa
Name: Species, dtype: object
# Count the observations per species.  `axis=0` is already the default and the
# `axis` keyword of groupby is deprecated in recent pandas, so it is omitted.
iris["Species"].groupby(iris["Species"]).count()
Species
setosa 50
versicolor 50
virginica 50
Name: Species, dtype: int64
# Column labels that begin with "Sep", selected through a named boolean mask.
starts_with_sep = iris.columns.str.startswith('Sep')
iris.columns[starts_with_sep]
Index(['SepalLength', 'SepalWidth'], dtype='object')
# Column labels that end with "gth", selected through a named boolean mask.
ends_with_gth = iris.columns.str.endswith('gth')
iris.columns[ends_with_gth]
Index(['SepalLength', 'PetalLength'], dtype='object')
## Renaming individual columns: df = df.rename(columns={'oldName1': 'newName1', 'oldName2': 'newName2'})
### Remove White Space from Column Names
#print(iris.columns.tolist())
# Pitfall: the two arguments are swapped here.  An empty pattern matches at
# every position, so this call inserts a space around every character instead
# of removing spaces.  The result is only displayed, not assigned back.
iris.columns.str.replace('',' ')
#iris.columns.tolist()
Index([' S e p a l L e n g t h ', ' S e p a l W i d t h ',
' P e t a l L e n g t h ', ' P e t a l W i d t h ', ' S p e c i e s '],
dtype='object')
# Correct argument order: replace each space with nothing.
# This only previews the result; the DataFrame's labels are not changed by this line.
iris.columns.str.replace(' ','')
Index(['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Species'], dtype='object')
# Assign the cleaned labels back to the DataFrame so the change persists.
iris.columns = iris.columns.str.replace(' ','')
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
|
SepalLength |
SepalWidth |
PetalLength |
PetalWidth |
Species |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
setosa |
2 |
4.7 |
3.2 |
1.3 |
0.2 |
setosa |
3 |
4.6 |
3.1 |
1.5 |
0.2 |
setosa |
*** Not what we want! Python uses zero-based indexing, so the first row is row 0 -- which the output above leaves out. ***
# Position 1 is the second column; passing the list [1] keeps the result a DataFrame.
second_col = iris.iloc[:, [1]]
second_col.head(5)
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
|
SepalWidth |
0 |
3.5 |
1 |
3.0 |
2 |
3.2 |
3 |
3.1 |
4 |
3.6 |
# Row selector ":" keeps all rows; column selector ":4" keeps the first four columns.
# Only the top five rows are previewed.
first_four = iris.iloc[:, :4]
first_four.head(5)
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
|
SepalLength |
SepalWidth |
PetalLength |
PetalWidth |
0 |
5.1 |
3.5 |
1.4 |
0.2 |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
2 |
4.7 |
3.2 |
1.3 |
0.2 |
3 |
4.6 |
3.1 |
1.5 |
0.2 |
4 |
5.0 |
3.6 |
1.4 |
0.2 |
# Feature table: the four measurement columns, leaving out Species.
iris_feature = iris.iloc[:, 0:4]
iris_feature.head()  # head() shows five rows by default
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
|
SepalLength |
SepalWidth |
PetalLength |
PetalWidth |
0 |
5.1 |
3.5 |
1.4 |
0.2 |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
2 |
4.7 |
3.2 |
1.3 |
0.2 |
3 |
4.6 |
3.1 |
1.5 |
0.2 |
4 |
5.0 |
3.6 |
1.4 |
0.2 |
# Target labels: the Species column, selected by name.
iris_targ = iris["Species"]
iris_targ.head(n=6)
0 setosa
1 setosa
2 setosa
3 setosa
4 setosa
5 setosa
Name: Species, dtype: object
# Alternative route to the label column: drop the first four columns.
# Unlike selecting by name, the result stays a DataFrame.
leading_four = iris.columns[:4]
iris.drop(columns=leading_four).head(5)
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
|
Species |
0 |
setosa |
1 |
setosa |
2 |
setosa |
3 |
setosa |
4 |
setosa |
pandas.core.series.Series
# Number of clusters to look for (one per iris species).
k = 3
kmeans = cluster.KMeans(n_clusters=k)
# Fit the model on the four feature columns: labels_ and cluster_centers_
# only exist on a fitted estimator.  fit() returns the estimator itself.
kmeans.fit(iris_feature)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',
random_state=None, tol=0.0001, verbose=0)
# Read the results off the estimator: the cluster label assigned to each
# observation, and the coordinates of each cluster centre.
labels, centroids = kmeans.labels_, kmeans.cluster_centers_
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 2,
0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,
2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2], dtype=int32)
# Cross-tabulate the true species (rows) against the assigned cluster labels (columns).
pd.crosstab(index=iris_targ, columns=labels)
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
col_0 |
0 |
1 |
2 |
Species |
|
|
|
setosa |
0 |
50 |
0 |
versicolor |
2 |
0 |
48 |
virginica |
36 |
0 |
14 |
# Functional k-means API: one call returns the centroids, the per-observation
# labels and the inertia.
centroids, labels, inertia = cluster.k_means(
    iris_feature, n_clusters=k, n_init=25, algorithm="elkan"
)
# Build the species-vs-cluster table once and reuse it.
df = pd.crosstab(iris_targ, labels)
type(df)
# Cluster numbers are arbitrary and may differ between runs (no random_state is
# set), so derive the column order from the table instead of hard-coding it:
# take each species' dominant cluster in row order, then any leftover clusters.
dominant = df.idxmax(axis=1)
column_titles = list(dict.fromkeys(dominant))
column_titles += [c for c in df.columns if c not in column_titles]
df.reindex(columns=column_titles)
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
col_0 |
1 |
2 |
0 |
Species |
|
|
|
setosa |
50 |
0 |
0 |
versicolor |
0 |
48 |
2 |
virginica |
0 |
14 |
36 |