From dcbd1d85e785a8be53c82b16cee91b9161e77ecf Mon Sep 17 00:00:00 2001 From: dhrey Date: Mon, 16 Nov 2020 21:19:59 +0100 Subject: [PATCH] Updating the project-my assgnment --- .../DataPrep-checkpoint.ipynb | 342 +++ Data Prep/DataPrep.ipynb | 2 +- .../decision_trees-checkpoint.ipynb | 1763 ++++++++++++++ Decision Trees/decision_trees.ipynb | 208 +- .../Linear_Algebra-checkpoint.ipynb | 1142 +++++++++ Linear Algebra/Linear_Algebra.ipynb | 2 +- .../linear_regression-checkpoint.ipynb | 2131 +++++++++++++++++ Linear Regression/linear_regression.ipynb | 2 +- .../logistic_regression-checkpoint.ipynb | 2111 ++++++++++++++++ Logistic Regression/logistic_regression.ipynb | 6 +- .../probability-checkpoint.ipynb | 464 ++++ Probabilty/probability.ipynb | 2 +- ...se Nigeria Project 'name'-checkpoint.ipynb | 1231 ++++++++++ ...020 ML Course Nigeria Project 'name'.ipynb | 2090 +++++++++++++++- .../regularization-checkpoint.ipynb | 1307 ++++++++++ Regularization/regularization.ipynb | 2 +- .../unsupervised_learning-checkpoint.ipynb | 1019 ++++++++ .../unsupervised_learning.ipynb | 2 +- 18 files changed, 13577 insertions(+), 249 deletions(-) create mode 100644 Data Prep/.ipynb_checkpoints/DataPrep-checkpoint.ipynb create mode 100644 Decision Trees/.ipynb_checkpoints/decision_trees-checkpoint.ipynb create mode 100644 Linear Algebra/.ipynb_checkpoints/Linear_Algebra-checkpoint.ipynb create mode 100644 Linear Regression/.ipynb_checkpoints/linear_regression-checkpoint.ipynb create mode 100644 Logistic Regression/.ipynb_checkpoints/logistic_regression-checkpoint.ipynb create mode 100644 Probabilty/.ipynb_checkpoints/probability-checkpoint.ipynb create mode 100644 Project/.ipynb_checkpoints/09-11-2020 ML Course Nigeria Project 'name'-checkpoint.ipynb create mode 100644 Regularization/.ipynb_checkpoints/regularization-checkpoint.ipynb create mode 100644 Unsupervised Learning/.ipynb_checkpoints/unsupervised_learning-checkpoint.ipynb diff --git a/Data 
Prep/.ipynb_checkpoints/DataPrep-checkpoint.ipynb b/Data Prep/.ipynb_checkpoints/DataPrep-checkpoint.ipynb new file mode 100644 index 0000000..96abb28 --- /dev/null +++ b/Data Prep/.ipynb_checkpoints/DataPrep-checkpoint.ipynb @@ -0,0 +1,342 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + " \"Open\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " All rights reserved © Global AI Hub 2020 \n", + "![](img/logo.png)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Building ML Project" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](img/flow.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Steps of Project\n", + "\n", + "- Gathering Data\n", + "- Preparing the Data\n", + "- Choosing Models\n", + "- Training\n", + "- Evaluation\n", + "- Hyperparameter Tuning\n", + "- Prediction\n", + "- Model Selection" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1) Gathering Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the problem definition, we need to obtain data which will be appropriate for our case. The quality and quantity of data that you gather will directly determine how good our predictive model can be." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2) Preparing the Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Data preparation, where we load our data into a suitable place and prepare it for use in our machine learning training. This is also a good time to do any pertinent visualizations of your data, to help you see if there are any relevant relationships between different variables you can take advantage of, as well as show you if there are any data imbalances." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Exploratory Data Analysis (EDA)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Exploratory Data Analysis refers to the critical process of performing initial investigations on data so as to discover patterns,to spot anomalies,to test hypothesis and to check assumptions with the help of summary statistics and graphical representations." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](img/wordcloud.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](img/hist2.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](img/bar2.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Pre-Processing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Duplicate Values\n", + "In most cases, the duplicates are removed so as to not give that particular data object an advantage or bias, when running machine learning algorithms." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Imbalanced Data\n", + "An Imbalanced dataset is one where the number of instances of a class(es) are significantly higher than another class(es), thus leading to an imbalance and creating rarer class(es)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](img/imbalance.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Missing Values\n", + "\n", + "- Eliminate missing values\n", + "- Filling with mean or median\n", + "\n", + "`df.isnull().sum() ` \n", + "`df.dropna()`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](img/skew.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Outlier Detection\n", + "\n", + "- Standard Deviation\n", + "- Box Plots / IQR Calculation\n", + "- Isolation Forest\n", + "\n", + "\n", + "`from sklearn.ensemble import IsolationForest`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](img/stddev.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](img/IQR.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Feature Scaling\n", + "\n", + "- Standardization \n", + "$$ X_{new} = \\frac{X-\\mu}{\\sigma} $$ \n", + "- Normalization \n", + "$$X_{new} = \\frac{X-X_{min}}{X_{max} - X_{min}} $$ \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](img/stndr.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](img/norm.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Bucketing (Binning)\n", + "\n", + "Data binning, bucketing is a data pre-processing method used to minimize the effects of small observation errors (noisy data). The original data values are divided into small intervals known as bins and then they are replaced by a general value calculated for that bin. 
\n", + "\n", + "![](img/binning.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Feature Extraction\n", + "- Principal Component Analysis (PCA)\n", + "- Independent Component Analysis (ICA)\n", + "- Linear Discriminant Analysis (LDA)\n", + "- t-distributed Stochastic Neighbor Embedding (t-SNE)\n", + "\n", + "Example: \n", + "$$Profit = Revenue - Cost$$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Feature Encoding\n", + "Feature encoding is basically performing transformations on the data such that it can be easily accepted as input for machine learning algorithms while still retaining its original meaning.\n", + "\n", + "- **Nominal** : Any one-to-one mapping can be done which retains the meaning. For instance, a permutation of values like in One-Hot Encoding.\n", + "- **Ordinal** : An order-preserving change of values. The notion of small, medium and large can be represented equally well with the help of a new function. For example, we can encode these S, M and L sizes into {0, 1, 2} or maybe {1, 2, 3}." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](img/encode.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Train / Validation / Test Split" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "But before we start deciding the algorithm which should be used, it is always advised to split the dataset into 2 or sometimes 3 parts. Machine Learning algorithms, or any algorithms for that matter, have to be first trained on the data distribution available and then validated and tested, before they can be deployed to deal with real-world data. 
\n", + "\n", + "- 60 / 20 / 20\n", + "- 70 / 30\n", + "\n", + "`from sklearn.model_selection import train_test_split`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](img/split.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "#### Cross Validation\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](img/cross_valid.png)\n", + "`from sklearn.model_selection import cross_validate`\n", + "\n", + "https://scikit-learn.org/stable/modules/cross_validation.html" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Resources\n", + "\n", + "https://towardsdatascience.com/introduction-to-data-preprocessing-in-machine-learning-a9fa83a5dc9d \n", + "https://developers.google.com/machine-learning/data-prep \n", + "https://towardsdatascience.com/5-ways-to-detect-outliers-that-every-data-scientist-should-know-python-code-70a54335a623\n", + "https://machinelearningmastery.com/feature-selection-with-real-and-categorical-data/ \n", + "https://towardsdatascience.com/scale-standardize-or-normalize-with-scikit-learn-6ccc7d176a02 \n", + "https://towardsdatascience.com/the-5-feature-selection-algorithms-every-data-scientist-need-to-know-3a6b566efd2 \n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Data Prep/DataPrep.ipynb b/Data Prep/DataPrep.ipynb index 96abb28..ba27e1d 100644 --- a/Data Prep/DataPrep.ipynb +++ b/Data Prep/DataPrep.ipynb @@ -334,7 +334,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + 
"version": "3.7.9" } }, "nbformat": 4, diff --git a/Decision Trees/.ipynb_checkpoints/decision_trees-checkpoint.ipynb b/Decision Trees/.ipynb_checkpoints/decision_trees-checkpoint.ipynb new file mode 100644 index 0000000..13b5e38 --- /dev/null +++ b/Decision Trees/.ipynb_checkpoints/decision_trees-checkpoint.ipynb @@ -0,0 +1,1763 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + " \"Open\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](img/logo.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Decision Tree" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A decision tree is one of the supervised machine learning algorithms. This algorithm can be used for regression and classification problems — yet, is mostly used for classification problems. As the name goes, it uses a tree-like model of decisions." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](img/Decision_Tree.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Terminology\n", + "\n", + "**Root Node**: This attribute is used for dividing the data into two or more sets. The feature attribute in this node is selected based on Attribute Selection Techniques. \n", + "
\n", + "**Branch or Sub-Tree**: A part of the entire decision tree is called branch or sub-tree. \n", + "
\n", + "**Splitting**: Dividing a node into two or more sub-nodes based on if-else conditions. \n", + "
\n", + "**Decision Node**: A sub-node that is split into further sub-nodes is called a decision node. \n", + "
\n", + "**Leaf or Terminal Node**: This is the end of the decision tree where it cannot be split into further sub-nodes. \n", + "
\n", + "**Pruning:** Removing a sub-node from the tree is called pruning. \n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Splitting\n", + "\n", + "We have 2 main splitting criteria to make decision how we split our data:\n", + "- Gini Index\n", + "- Information Gain\n", + "\n", + "#### Gini Index\n", + "Gini Index, also known as Gini impurity, calculates the amount of probability of a specific feature that is classified incorrectly when selected randomly. If all the elements are linked with a single class then it can be called pure.\n", + "\n", + "Gini index varies between values 0 and 1, where 0 expresses the purity of classification, i.e. All the elements belong to a specified class or only one class exists there. And 1 indicates the random distribution of elements across various classes. The value of 0.5 of the Gini Index shows an equal distribution of elements over some classes.\n", + "\n", + "Node Gini score = $p^2 + q^2$ (a purity score equal to $1 -$ Gini impurity, so a higher value means a purer node) \n", + "Where: \n", + "$p$: probability for success \n", + "$q$: probability for failure ($1-p$)\n", + "\n", + "Weighted Gini score of each node of that split = $\\sum$ proportion of element in node $*$ Gini score" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](img/split.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](\\img/split.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Split on Gender**: \n", + "
\n", + "Calculate, Gini for sub-node Female = (0.2) * (0.2) + (0.8) * (0.8)=0.68 \n", + "Gini for sub-node Male = (0.65) * (0.65) + (0.35) * (0.35)=0.55 \n", + "Calculate weighted Gini for Split Gender = *(10/30)* * 0.68 + *(20/30)* * 0.55 = 0.59 \n", + "
\n", + "**Similar for Split on Class:** \n", + "Gini for sub-node Class IX = (0.43) * (0.43)+ (0.57) * (0.57) = 0.51 \n", + "Gini for sub-node Class X = (0.56) * (0.56) + (0.44) * (0.44) = 0.51 \n", + "Calculate weighted Gini for Split Class = (14/30) * 0.51 + (16/30) * 0.51 = 0.51 \n", + "\n", + "Above, you can see that Gini score for Split on Gender is higher than Split on Class, hence, the node split will take place on Gender." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Information Gain\n", + "\n", + "Information Gain is applied to quantify which feature provides maximal information about the classification based on the notion of entropy, i.e. by quantifying the size of uncertainty, disorder or impurity, in general, with the intention of decreasing the amount of entropy initiating from the top (root node) to bottom(leaves nodes). \n", + "\n", + "Less impure node requires less information to describe it. And, more impure node requires more information. Information theory is a measure to define this degree of disorganization in a system known as Entropy. If the sample is completely homogeneous, then the entropy is zero and if the sample is an equally divided (50% – 50%), it has entropy of one. \n", + "
\n", + "Information Gain = entropy (parent) - weighted average of entropy (children) \n", + "
\n", + "Entropy = $-p \\log_np - q \\log_nq$ \n", + "
\n", + "Where:\n", + "
\n", + "$p$: probability for success\n", + "
\n", + "$q$: probability for failure ($1-p$)\n", + "
\n", + "$n$: number of classes in label" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "Example:\n", + "
\n", + "Entropy for parent node = -(15/30) $\\log_2$ (15/30) – (15/30) $\\log_2$ (15/30) = 1. Here 1 shows that it is an impure node. \n", + "
\n", + "Entropy for Female node = -(2/10) $\\log_2$ (2/10) – (8/10) $\\log_2$ (8/10) = 0.72 and for Male node, -(13/20) $\\log_2$ (13/20) – (7/20) $\\log_2$ (7/20) = 0.93 \n", + "
\n", + "Entropy for split Gender = Weighted entropy of sub-nodes = (10/30) * 0.72 + (20/30) * 0.93 = 0.86 \n", + "
\n", + "Entropy for Class IX node, -(6/14) $\\log_2$ (6/14) – (8/14) $\\log_2$ (8/14) = 0.99 and for Class X node, -(9/16) $\\log_2$ (9/16) – (7/16) $\\log_2$ (7/16) = 0.99. \n", + "
\n", + "Entropy for split Class = (14/30) * 0.99 + (16/30) * 0.99 = 0.99 \n", + "\n", + "Above, you can see that entropy for Split on Gender is the lowest among all, so the tree will split on Gender. Since the entropy of the parent node is 1 in this example, the information gain of a split is **1 - Entropy** of that split. The lower the entropy, the better the split." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Advantages of Decision Trees\n", + "- Simple to understand, interpret, visualize.\n", + "- Decision tree algorithm implementation can be done without scaling the data as well.\n", + "- Decision trees implicitly perform variable screening or feature selection.\n", + "- Nonlinear relationships between parameters do not affect tree performance. \n", + "
\n", + "\n", + "### Disadvantages of Decision Trees\n", + "- Decision-tree learners can create over-complex trees that do not generalize the data well. This is called overfitting.\n", + "- Decision trees can be unstable because small variations in the data might result in a completely different tree being generated. This is called variance, which needs to be lowered by methods like bagging and boosting.\n", + "- Greedy algorithms cannot guarantee to return the globally optimal decision tree. This can be mitigated by training multiple trees, where the features and samples are randomly sampled with replacement.(Random Forests)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Code" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import seaborn as sns\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "plt.rcParams['figure.figsize'] = (20.0, 10.0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read Data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "data = pd.read_csv(\"datasets/iris.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies
015.13.51.40.2Iris-setosa
124.93.01.40.2Iris-setosa
234.73.21.30.2Iris-setosa
344.63.11.50.2Iris-setosa
455.03.61.40.2Iris-setosa
\n", + "
" + ], + "text/plain": [ + " Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species\n", + "0 1 5.1 3.5 1.4 0.2 Iris-setosa\n", + "1 2 4.9 3.0 1.4 0.2 Iris-setosa\n", + "2 3 4.7 3.2 1.3 0.2 Iris-setosa\n", + "3 4 4.6 3.1 1.5 0.2 Iris-setosa\n", + "4 5 5.0 3.6 1.4 0.2 Iris-setosa" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 150 entries, 0 to 149\n", + "Data columns (total 6 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Id 150 non-null int64 \n", + " 1 SepalLengthCm 150 non-null float64\n", + " 2 SepalWidthCm 150 non-null float64\n", + " 3 PetalLengthCm 150 non-null float64\n", + " 4 PetalWidthCm 150 non-null float64\n", + " 5 Species 150 non-null object \n", + "dtypes: float64(4), int64(1), object(1)\n", + "memory usage: 7.2+ KB\n" + ] + } + ], + "source": [ + "data.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Describe" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCm
count150.000000150.000000150.000000150.000000150.000000
mean75.5000005.8433333.0540003.7586671.198667
std43.4453680.8280660.4335941.7644200.763161
min1.0000004.3000002.0000001.0000000.100000
25%38.2500005.1000002.8000001.6000000.300000
50%75.5000005.8000003.0000004.3500001.300000
75%112.7500006.4000003.3000005.1000001.800000
max150.0000007.9000004.4000006.9000002.500000
\n", + "
" + ], + "text/plain": [ + " Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm\n", + "count 150.000000 150.000000 150.000000 150.000000 150.000000\n", + "mean 75.500000 5.843333 3.054000 3.758667 1.198667\n", + "std 43.445368 0.828066 0.433594 1.764420 0.763161\n", + "min 1.000000 4.300000 2.000000 1.000000 0.100000\n", + "25% 38.250000 5.100000 2.800000 1.600000 0.300000\n", + "50% 75.500000 5.800000 3.000000 4.350000 1.300000\n", + "75% 112.750000 6.400000 3.300000 5.100000 1.800000\n", + "max 150.000000 7.900000 4.400000 6.900000 2.500000" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 150 entries, 0 to 149\n", + "Data columns (total 6 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Id 150 non-null int64 \n", + " 1 SepalLengthCm 150 non-null float64\n", + " 2 SepalWidthCm 150 non-null float64\n", + " 3 PetalLengthCm 150 non-null float64\n", + " 4 PetalWidthCm 150 non-null float64\n", + " 5 Species 150 non-null object \n", + "dtypes: float64(4), int64(1), object(1)\n", + "memory usage: 7.2+ KB\n" + ] + } + ], + "source": [ + "data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Id 0\n", + "SepalLengthCm 0\n", + "SepalWidthCm 0\n", + "PetalLengthCm 0\n", + "PetalWidthCm 0\n", + "Species 0\n", + "dtype: int64" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCm
Species
Iris-setosa5050505050
Iris-versicolor5050505050
Iris-virginica5050505050
\n", + "
" + ], + "text/plain": [ + " Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm\n", + "Species \n", + "Iris-setosa 50 50 50 50 50\n", + "Iris-versicolor 50 50 50 50 50\n", + "Iris-virginica 50 50 50 50 50" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.groupby(by=\"Species\").count()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualize" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "plt.rcParams['figure.figsize'] = (20.0, 10.0)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.scatterplot(x=\"SepalLengthCm\", y=\"SepalWidthCm\", hue=\"Species\", data=data)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.distplot(data[data.Species == \"Iris-setosa\"].PetalLengthCm)\n", + "sns.distplot(data[data.Species == \"Iris-versicolor\"].PetalLengthCm,color=\"r\")\n", + "sns.distplot(data[data.Species == \"Iris-virginica\"].PetalLengthCm,color=\"g\")" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.pairplot(data, hue=\"Species\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import LabelEncoder\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import accuracy_score" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpeciesLabel
015.13.51.40.2Iris-setosa0
124.93.01.40.2Iris-setosa0
234.73.21.30.2Iris-setosa0
344.63.11.50.2Iris-setosa0
455.03.61.40.2Iris-setosa0
\n", + "
" + ], + "text/plain": [ + " Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species \\\n", + "0 1 5.1 3.5 1.4 0.2 Iris-setosa \n", + "1 2 4.9 3.0 1.4 0.2 Iris-setosa \n", + "2 3 4.7 3.2 1.3 0.2 Iris-setosa \n", + "3 4 4.6 3.1 1.5 0.2 Iris-setosa \n", + "4 5 5.0 3.6 1.4 0.2 Iris-setosa \n", + "\n", + " Label \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "label_encoder = LabelEncoder()\n", + "data[\"Label\"] = label_encoder.fit_transform(data[\"Species\"]) \n", + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2 50\n", + "1 50\n", + "0 50\n", + "Name: Label, dtype: int64" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data[\"Label\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "categories = list(label_encoder.inverse_transform([0, 1, 2]))\n", + "categories" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'DataFrame' object has no attribute 'Species'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mclases\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mlist\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mset\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mSpecies\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[0mdata\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"Id\"\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m\"Species\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minplace\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\generic.py\u001b[0m in \u001b[0;36m__getattr__\u001b[1;34m(self, name)\u001b[0m\n\u001b[0;32m 5271\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_info_axis\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_can_hold_identifiers_and_holds_name\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5272\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 5273\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 5274\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5275\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m__setattr__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0mname\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mstr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m->\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mAttributeError\u001b[0m: 'DataFrame' object has no attribute 'Species'" + ] + } + ], + "source": [ + "clases = list(set(data.Species))\n", + "data.drop([\"Id\",\"Species\"], axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmLabel
05.13.51.40.20
14.93.01.40.20
24.73.21.30.20
34.63.11.50.20
45.03.61.40.20
\n", + "
" + ], + "text/plain": [ + " SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Label\n", + "0 5.1 3.5 1.4 0.2 0\n", + "1 4.9 3.0 1.4 0.2 0\n", + "2 4.7 3.2 1.3 0.2 0\n", + "3 4.6 3.1 1.5 0.2 0\n", + "4 5.0 3.6 1.4 0.2 0" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "X, y = data.iloc[: , :-1], data.iloc[: , -1]" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=123)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy of train: 0.99\n", + "Accuracy of test: 0.96\n" + ] + } + ], + "source": [ + "from sklearn.tree import DecisionTreeClassifier\n", + "\n", + "clf = DecisionTreeClassifier(max_depth=4 , random_state=42)\n", + "clf.fit(X_train,y_train)\n", + "print(\"Accuracy of train:\",clf.score(X_train,y_train))\n", + "print(\"Accuracy of test:\",clf.score(X_test,y_test))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Bias for training = 1-0.99 = 0.01 \n", + "Bias for test = 1-0.96 = 0.04 \n", + "\n", + "Variance = test bias - training bias = 0.03 " + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\r\n", + "\r\n", + "\r\n", + "\r\n", + "\r\n", + "\r\n", + "Tree\r\n", + "\r\n", + "\r\n", + "0\r\n", + "\r\n", + "PetalLengthCm <= 2.45\r\n", + "gini = 0.662\r\n", + "samples = 100\r\n", + "value = [30, 39, 31]\r\n", + "class = Iris-versicolor\r\n", + "\r\n", + "\r\n", + "1\r\n", + "\r\n", + "gini = 0.0\r\n", + "samples = 
30\r\n", + "value = [30, 0, 0]\r\n", + "class = Iris-setosa\r\n", + "\r\n", + "\r\n", + "0->1\r\n", + "\r\n", + "\r\n", + "True\r\n", + "\r\n", + "\r\n", + "2\r\n", + "\r\n", + "PetalWidthCm <= 1.75\r\n", + "gini = 0.493\r\n", + "samples = 70\r\n", + "value = [0, 39, 31]\r\n", + "class = Iris-versicolor\r\n", + "\r\n", + "\r\n", + "0->2\r\n", + "\r\n", + "\r\n", + "False\r\n", + "\r\n", + "\r\n", + "3\r\n", + "\r\n", + "PetalLengthCm <= 5.35\r\n", + "gini = 0.136\r\n", + "samples = 41\r\n", + "value = [0, 38, 3]\r\n", + "class = Iris-versicolor\r\n", + "\r\n", + "\r\n", + "2->3\r\n", + "\r\n", + "\r\n", + "\r\n", + "\r\n", + "8\r\n", + "\r\n", + "PetalLengthCm <= 4.85\r\n", + "gini = 0.067\r\n", + "samples = 29\r\n", + "value = [0, 1, 28]\r\n", + "class = Iris-virginica\r\n", + "\r\n", + "\r\n", + "2->8\r\n", + "\r\n", + "\r\n", + "\r\n", + "\r\n", + "4\r\n", + "\r\n", + "SepalLengthCm <= 5.0\r\n", + "gini = 0.05\r\n", + "samples = 39\r\n", + "value = [0, 38, 1]\r\n", + "class = Iris-versicolor\r\n", + "\r\n", + "\r\n", + "3->4\r\n", + "\r\n", + "\r\n", + "\r\n", + "\r\n", + "7\r\n", + "\r\n", + "gini = 0.0\r\n", + "samples = 2\r\n", + "value = [0, 0, 2]\r\n", + "class = Iris-virginica\r\n", + "\r\n", + "\r\n", + "3->7\r\n", + "\r\n", + "\r\n", + "\r\n", + "\r\n", + "5\r\n", + "\r\n", + "gini = 0.5\r\n", + "samples = 2\r\n", + "value = [0, 1, 1]\r\n", + "class = Iris-versicolor\r\n", + "\r\n", + "\r\n", + "4->5\r\n", + "\r\n", + "\r\n", + "\r\n", + "\r\n", + "6\r\n", + "\r\n", + "gini = 0.0\r\n", + "samples = 37\r\n", + "value = [0, 37, 0]\r\n", + "class = Iris-versicolor\r\n", + "\r\n", + "\r\n", + "4->6\r\n", + "\r\n", + "\r\n", + "\r\n", + "\r\n", + "9\r\n", + "\r\n", + "SepalWidthCm <= 3.0\r\n", + "gini = 0.5\r\n", + "samples = 2\r\n", + "value = [0, 1, 1]\r\n", + "class = Iris-versicolor\r\n", + "\r\n", + "\r\n", + "8->9\r\n", + "\r\n", + "\r\n", + "\r\n", + "\r\n", + "12\r\n", + "\r\n", + "gini = 0.0\r\n", + "samples = 27\r\n", + "value = [0, 0, 27]\r\n", + 
"class = Iris-virginica\r\n", + "\r\n", + "\r\n", + "8->12\r\n", + "\r\n", + "\r\n", + "\r\n", + "\r\n", + "10\r\n", + "\r\n", + "gini = 0.0\r\n", + "samples = 1\r\n", + "value = [0, 0, 1]\r\n", + "class = Iris-virginica\r\n", + "\r\n", + "\r\n", + "9->10\r\n", + "\r\n", + "\r\n", + "\r\n", + "\r\n", + "11\r\n", + "\r\n", + "gini = 0.0\r\n", + "samples = 1\r\n", + "value = [0, 1, 0]\r\n", + "class = Iris-versicolor\r\n", + "\r\n", + "\r\n", + "9->11\r\n", + "\r\n", + "\r\n", + "\r\n", + "\r\n", + "\r\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Visualization\n", + "\n", + "import os\n", + "from sklearn.tree import export_graphviz\n", + "# We need to locate graphiz directory for visualization (after conda)\n", + "os.environ[\"PATH\"] += ';' + r'C:\\Users\\Dell\\Anaconda3\\Library\\bin\\graphviz'\n", + "\n", + "import graphviz\n", + "\n", + "dot_data = export_graphviz(clf, out_file=None,\n", + " feature_names=X.columns,\n", + " class_names=categories,\n", + " filled=True, rounded=True)\n", + "graph = graphviz.Source(dot_data)\n", + "graph" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "#Feature Importance\n", + "plt.figure(figsize=(12, 8))\n", + "importance = clf.feature_importances_\n", + "sns.barplot(x=importance, y=X.columns)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0.01455535, 0.01534213, 0.55508492, 0.4150176 ])" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "importance" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 20\n", + " 1 0.85 1.00 0.92 11\n", + " 2 1.00 0.89 0.94 19\n", + "\n", + " accuracy 0.96 50\n", + " macro avg 0.95 0.96 0.95 50\n", + "weighted avg 0.97 0.96 0.96 50\n", + "\n" + ] + } + ], + "source": [ + "from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report, f1_score\n", + "pred = clf.predict(X_test)\n", + "print(classification_report(y_test,pred))" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Precision = 0.9487179487179488\n", + "Recall = 0.9649122807017544\n", + "Accuracy = 0.96\n", + "F1 Score = 0.9537037037037036\n" + ] + } + ], + "source": [ + "print(\"Precision = {}\".format(precision_score(y_test, pred, average='macro')))\n", + "print(\"Recall = {}\".format(recall_score(y_test, pred, average='macro')))\n", + "print(\"Accuracy = {}\".format(accuracy_score(y_test, pred)))\n", + "print(\"F1 Score = {}\".format(f1_score(y_test, pred,average='macro')))" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + 
"text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.metrics import confusion_matrix\n", + "\n", + "cm = confusion_matrix(y_test, pred)\n", + "plt.figure(figsize=(12, 8))\n", + "ax =sns.heatmap(cm, square=True, annot=True, cbar=False)\n", + "ax.xaxis.set_ticklabels(categories, fontsize = 12)\n", + "ax.yaxis.set_ticklabels(categories, fontsize = 12, rotation=0)\n", + "ax.set_xlabel('Predicted Labels',fontsize = 15)\n", + "ax.set_ylabel('True Labels',fontsize = 15)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Enseble Learning\n", + "\n", + "In ensemble learning theory, we call weak learners (or base models) models that can be used as building blocks for designing more complex models by combining several of them. Most of the time, these basics models perform not so well by themselves either because they have a high bias or high variance. The idea of ensemble methods is to try reducing bias and/or variance of such weak learners by combining several of them together in order to create a strong learner (or ensemble model) that achieves better performances. \n", + " \n", + "Combining weak learners:\n", + "\n", + "**Bagging**: that often considers homogeneous weak learners, learns them independently from each other in parallel and combines them following some kind of deterministic averaging process \n", + "
\n", + "**Boosting**, that often considers homogeneous weak learners, learns them sequentially in a very adaptive way (a base model depends on the previous ones) and combines them following a deterministic strategy \n", + "
\n", + "**Stacking**, that often considers heterogeneous weak learners, learns them in parallel and combines them by training a meta-model to output a prediction based on the different weak models predictions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](img/ensemble1.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](img/ensemble2.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### XGBoost" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "XGBoost is an implementation of gradient boosted decision trees designed for speed and performance." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "import xgboost as xgb" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "dmatrix_train = xgb.DMatrix(data=X_train, label=y_train)\n", + "dmatrix_test = xgb.DMatrix(data=X_test, label=y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [], + "source": [ + "param = {'max_depth':3, \n", + " 'eta':1, \n", + " 'objective':'multi:softprob', \n", + " 'num_class':3}\n", + "\n", + "num_round = 5\n", + "model = xgb.train(param, dmatrix_train, num_round)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0.0046 , 0.9871078 , 0.00829213],\n", + " [0.00398089, 0.00646372, 0.9895555 ],\n", + " [0.00398902, 0.0044349 , 0.99157614],\n", + " [0.00387317, 0.9924493 , 0.00367761],\n", + " [0.9892699 , 0.00651816, 0.00421197],\n", + " [0.04196233, 0.5648941 , 0.39314362],\n", + " [0.0046 , 0.9871078 , 0.00829213],\n", + " [0.9892699 , 0.00651816, 0.00421197],\n", + " [0.9892699 , 0.00651816, 0.00421197],\n", + " [0.0038679 , 0.9910994 , 0.00503268]], dtype=float32)" + ] + }, + "execution_count": 65, + "metadata": {}, 
+ "output_type": "execute_result" + } + ], + "source": [ + "preds = model.predict(dmatrix_test)\n", + "preds[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [], + "source": [ + "best_preds = np.asarray([np.argmax(line) for line in preds])" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1, 2, 2, 1, 0, 1, 1, 0, 0, 1, 2, 0, 1, 2, 2, 2, 0, 0, 1, 0, 0, 1,\n", + " 0, 2, 0, 0, 0, 2, 2, 0, 2, 1, 0, 0, 1, 1, 2, 0, 0, 1, 1, 0, 2, 2,\n", + " 2, 2, 2, 1, 0, 0], dtype=int64)" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "best_preds" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Metrics" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Precision = 0.9285714285714285\n", + "Recall = 0.9473684210526315\n", + "Accuracy = 0.94\n" + ] + } + ], + "source": [ + "print(\"Precision = {}\".format(precision_score(y_test, best_preds, average='macro')))\n", + "print(\"Recall = {}\".format(recall_score(y_test, best_preds, average='macro')))\n", + "print(\"Accuracy = {}\".format(accuracy_score(y_test, best_preds)))" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.metrics import confusion_matrix\n", + "\n", + "plt.figure(figsize=(12, 8))\n", + "cm = confusion_matrix(y_test, best_preds)\n", + "ax = sns.heatmap(cm, square=True, annot=True, cbar=False)\n", + "ax.xaxis.set_ticklabels(categories, fontsize = 12)\n", + "ax.yaxis.set_ticklabels(categories, fontsize = 12, rotation=0)\n", + "ax.set_xlabel('Predicted Labels',fontsize = 15)\n", + "ax.set_ylabel('True Labels',fontsize = 15)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tuned: {'learning_rate': 0.001, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200}\n", + "Mean of the cv scores is 0.930000\n", + "Train Score 0.970000\n", + "Test Score 0.960000\n", + "Seconds used for refitting the best model on the train dataset: 0.157686\n" + ] + } + ], + "source": [ + "## Hyperparameter Tuning\n", + "\n", + "from xgboost.sklearn import XGBClassifier\n", + "from sklearn.model_selection import GridSearchCV, RandomizedSearchCV \n", + "\n", + "param_dict = {\n", + " 'max_depth':range(3,10,2),\n", + " 'min_child_weight':range(1,6,2),\n", + " 'learning_rate': [0.001,0.01,0.1,1],\n", + " 'n_estimators': [200,500,1000]\n", + " \n", + "}\n", + "\n", + "xgc = XGBClassifier(booster='gbtree', learning_rate =0.01, n_estimators=200, max_depth=5,\n", + " min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,\n", + " objective= 'multi:softprob', nthread=4, scale_pos_weight=1, seed=27)\n", + "\n", + "clf = GridSearchCV(xgc,param_dict,cv=2,n_jobs = -1).fit(X_train,y_train)\n", + "\n", + "print(\"Tuned: {}\".format(clf.best_params_)) \n", + "print(\"Mean of the cv scores is {:.6f}\".format(clf.best_score_))\n", + "print(\"Train Score {:.6f}\".format(clf.score(X_train,y_train)))\n", + "print(\"Test Score 
{:.6f}\".format(clf.score(X_test,y_test)))\n", + "print(\"Seconds used for refitting the best model on the train dataset: {:.6f}\".format(clf.refit_time_))" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAiwAAAHoCAYAAABq5rTWAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nO3deZwlZX3v8c93WJSdsDM4gIi4kWBUEBNjUBFEBbwmgohbojGJJpEb1+uKCkrcxSU3xKhXBARXBAEREBEEBRGQTWVYZJhh34Z9gN/9o6qhafrM9NDdc55mPu/Xq18zp+o5Vb9zpuacbz/PU1WpKiRJklo2a9gFSJIkLYmBRZIkNc/AIkmSmmdgkSRJzTOwSJKk5hlYJElS81YcdgGaGouuv9Tz0zUpq8z+q2GXIGk5d+89V2XQOntYJElS8wwskiSpeQYWSZLUPAOLJElqnoFFkiQ1z8AiSZKaZ2CRJEnNM7BIkqTmGVgkSVLzDCySJKl5BhZJktQ8A4skSWqegUWSJDXPwCJJkppnYJEkSc0zsEiSpOYZWCRJUvMMLJIkqXkGFkmS1DwDiyRJap6BRZIkNc/AIkmSmmdgkSRJzTOwSJKk5hlYJElS8wwskiSpeQYWSZLUPAOLJElqnoFFkiQ1z8AiSZKaZ2CRJEnNM7BIkqTmGVgkSVLzDCySJKl5BhZJktQ8A4skSWqegUWSJDXPwCJJkppnYJEkSc0zsEiSpOYZWCRJUvMMLJIkqXkGFkmS1DwDiyRJap6BRZIkNc/AIkmSmmdgkSRJzTOwSJKk5hlYJElS8wwskiSpeQYWSZLUPAOLJElqnoFFkiQ1z8AiSZKaZ2CRJEnNM7BIkqTmGVgkSVLzDCySJKl5BhZJktQ8A4skSWqegUWSJDXPwCJJkppnYJEkSc0zsEiSpOYZWCRJUvMMLJIkqXkGFkmS1LxHZWBJsneS44ddh4ZjwTXX8Xf/8m52ffWb2X3vf+TgI34AwC23LuRNb3svL9nzjbzpbe/lllsXDrlSzRQ777QDF5x/ChdfeCrveudbh12OZiCPoclLVQ27hkckyeXAm6rqhAZq2RfYsqpeM6waFl1/6cz8h5wG111/I9fdcCNPfdKW3H77Hezxxn/jwI9/gB8ccwJrrbkGb3rtHnzl4CO4deFC/v0tbxx2uc1YZfZfDbuEJs2aNYuLLvg5L37JXsybt4AzTj+G17z2LVx00R+GXZpmCI+hibv3nqsyaN2jroclyYrDrkHDtf566/DUJ20JwGqrrcoWm83hmutu4Kc/P53dd9kRgN132ZGTTjl9mGVqhthu2z9n7tzLueyyP7Jo0SKOOOJIdtt152GXpRnEY2hqzPjAkuQNSU5L8tkkNwL79stO7denX3dtkluSnJdk6wHbWi/J0UluTnJjkp8nmdWvm53ku0muS3JZkn/rl78YeC+wZ5Lbkpw7qv0P++1ckuQfRu1nuyRnJbk1yTVJPjNq3beTXN3XekqSp03bm7ccuGrBNVz0h7n82dOexA033cz6660DdKHmxptvGXJ1mglmb7IRV86b/8DjeVctYPbsjYZYkWYaj6GpMeMDS+/ZwKXABsD+Y9btBDwP2ApYG9gTuGHAdt4OzAPWBzakCyLVh5ajgHOBTYAXAvsk2bmqjgM+BhxeVatX1Tb9tg7rtzU
b+FvgY0le2K/7PPD5qloTeAJwxKgajgWe2L+Ws4FDBr3oJG/ug89ZX/nGYYOaLbfuuONO/vf79uPd//aPrL7aasMuRzNU8vAe6pk6lK7h8BiaGo+W4ZP5VfWF/u/3jjk4FgFrAE8GflVVFy1mO4uAjYHNquoS4OfQ9YgA61fVR/p2lyb5b+BVwI/HbiTJHOC5wMuq6i7gnCRfAV4LnNjvZ8sk61XV9cAZI8+tqq+O2s6+wE1J1qqqh3UHVNVBwEHgHJaxFt17L/u8bz9eutPzedEOfwnAun+yNtddfyPrr7cO111/I+usvdaQq9RMcNW8Bcx53OwHHj9uk41ZsOCaIVakmcZjaGo8WnpYrhy0oqpOAr4IfAm4JslBSdZMsmk/hHNbktv65p8ELgGOT3Jpkvf0yzcDZvdDRTcnuZmu92XDAbudDdxYVaNPQ7mCrncG4I10PT4XJzkzycsAkqyQ5IAkc5PcClzet19v4m+FqooPfvxzbLHZHF7/qlc8sHyH527Pkcd2c7SPPPYEnv9XzxlWiZpBzjzrHLbc8vFsvvkcVlppJfbYY3eOOtqTEDVxHkNT49HSw7LY3oWqOhA4MMkGdMMv76yqDwCrj2m3kG5Y6O393JGfJjmTLhBdVlVPnOD+5wPrJFljVGjZFLiq388fgL36oaZXAN9Jsm7/992BHenCylrATcDAWdN6uN+cdwFHHXciT3zC5vzN67vTB9/2j6/nTa/dg7d/4GN87+gfs/GG6/OZ/d435Eo1E9x33328bZ/3c8yPDmWFWbP4+v87nAsv/P2wy9IM4jE0NR4tgWWgJNvS9SSdDdwO3AXcN6Dty4CLgbnArX27+4BfAbcmeTdwIHAP8BRglao6E7gGeFGSWVV1f1VdmeQXwMeTvIOuN+WNwGv6/bwG+HFVXdf31tDvZw3gbro5NqvSzY3RUnrGNltz/mnHjrvufw48YBlXo0eDY487iWOPO2nYZWgG8xiavEfLkNDirAn8N11PxRV0YeBTA9o+ETgBuA04HfhyVZ1cVfcBuwJPBy4Drge+QtcDAvDt/s8bkpzd/30vYHO63pbvAx+qqp/0614MXNAPRX0eeFU/1+UbfY1XARcyam6LJEnLsxl74Tg9lJNuNVleOE7SsC1XF46TJEmPPgYWSZLUPAOLJElqnoFFkiQ1z8AiSZKaZ2CRJEnNM7BIkqTmGVgkSVLzDCySJKl5BhZJktQ8A4skSWqegUWSJDXPwCJJkppnYJEkSc0zsEiSpOYZWCRJUvMMLJIkqXkGFkmS1DwDiyRJap6BRZIkNc/AIkmSmmdgkSRJzTOwSJKk5hlYJElS8wwskiSpeQYWSZLUPAOLJElqnoFFkiQ1z8AiSZKaZ2CRJEnNM7BIkqTmGVgkSVLzDCySJKl5BhZJktQ8A4skSWqegUWSJDXPwCJJkppnYJEkSc0zsEiSpOYZWCRJUvMMLJIkqXkGFkmS1DwDiyRJap6BRZIkNc/AIkmSmmdgkSRJzTOwSJKk5hlYJElS8wwskiSpeQYWSZLUPAOLJElqnoFFkiQ1z8AiSZKaZ2CRJEnNM7BIkqTmGVgkSVLzDCySJKl5BhZJktQ8A4skSWqegUWSJDXPwCJJkppnYJEkSc0zsEiSpOYZWCRJUvMMLJIkqXmpqmHXoCmw4sqb+A+pSbl4y62HXYJmuCdfcv6wS9AMd+89V2XQOntYJElS8wwskiSpeQYWSZLUPAOLJElqnoFFkiQ1z8AiSZKaZ2CRJEnNM7BIkqTmGVgkSVLzDCySJKl5BhZJktQ8A4skSWqegUWSJDXPwCJJkppnYJEkSc0zsEiSpOYZWCRJUvMMLJIkqXkGFkmS1DwDiyRJap6BRZIkNc/AIkmSmmdgkSRJzTOwSJKk5hlYJElS8wwskiSpeQYWSZLUPAOLJElqnoFFkiQ1z8AiSZKaN6HAkmT7JLuMevwnSb6a5Iwk+ydZYfpKlCRJy7uJ9rB8GnjWqMefA3YDLgb
eCnxoiuuSJEl6wEQDy5OBswCSrAL8LfC2qnoD8C7g1dNSnSRJEhMPLI8F7uj//hfAysBR/eOLgNlTXJckSdIDJhpYfgfs2P/9VcAZVXVr/3hj4KapLkySJGnEihNstz9wWJI30AWUV4xa9yLgnCmuS5Ik6QETCixV9d0k2wDPAM6rqt+OWn0e8I3pKE6SJAkm3sNCVV1EN19l7PIvTGlFkiRJYwwMLElesDQbqqqTJl+OJEnSwy2uh+UEoIBMYDsFePE4SZI0LRYXWJ6yzKqQJElajIGBpap+tywLkSRJGmTCNz9MsmKSv0vypSQ/TPKEfvn/SvLE6StRkiQt7yZ688Mt6M4QOhDYBngpsFa/+kXAe6elOkmSJCbew3IgcAPweGAHHjoR92TgeVNalSRJ0igTvQ7LDsCrqur6JGPPBrqa7uq3kiRJ02KiPSyLgJUGrNsYuHXAOkmSpEmbaGA5AXhPktVHLaskKwJvBY6b8sokSZJ6Ex0SeifwC+D3dOGkgPcAW9NNvt1rWqqTJEligj0sVXU53dlBhwBPB64CnkQXXp5ZVVdNV4GSJElLc/PD6+h6WiRJkpapCQcWgCSr0V2yf2NgPnBxVd0+HYVJkiSNmOiF42Yl+TDdUNCvgCOBM4H5ST4yzqnOkiRJU2aiPSyfojsb6D+A7wHXAhsAfwO8G1gNePt0FChJkjTRwPJ64ANV9YlRy+YD5yS5DXgXBhZJkjRNJnodlgDnDFh3Dg+9VL8kSdKUmmhgOQx4w4B1bwAOn4piJEmSxjNwSCjJ3496eD7wviRn0024HZnD8nJgPeDj01mkJElavqWqxl+R3L8U26mq8kyhIVpx5U3G/4eUJujiLbcedgma4Z58yfnDLkEz3L33XDVwisniJt2uMg21SJIkLbWBgaWq7l6WhUiSJA2ytFe6XR94IvDYseuq6qSpKkqSJGm0CQWW/pL83wR2ZfApzM5hkSRJ02KipzXvDzwV2IkusOwF7EJ39+bLgb+ajuIkSZJg4oFlV2A/4Gf940ur6viqeh3wI+BfpqM4SZIkmHhg2Qi4vKruA+4A1h217ofAS6a6MEmSpBETDSzzeDCkXAK8eNS6PwfumsqiJEmSRpvoWUInAi8AfgAcCHwlyTbA3cCOwBenpzxJkqSJ97C8GzgAoKq+CuwN3AQU3Z2a3znZQpLsneT4yW5nukxFfUl2SDJvqmrSxOy80w5ccP4pXHzhqbzrnW8ddjmaITbY79/Z/OeHM+fI/3pg2Wo7/xVzfngQTzj/WB7ztCcOsTrNNH4OTd6EAktVLayq+aMeH1ZVf1NVLwG+DjxjIttJcnmSHQfs45Cq2mki2xmG1uvT+GbNmsWBn9+fl+36Gv50m+ez554v5ylP8YtGS3br949nwZvf95Bl9/zhcq7+t49w11m/HVJVmon8HJoaE+1hWZwXAKdPZgNJluoCdtMhnal4P6ZFEq9z8whst+2fM3fu5Vx22R9ZtGgRRxxxJLvtuvOwy9IMcNevz+e+WxY+ZNmiS69k0eV2kmrp+Dk0NYbyBZ3kDUlOS/LZJDcC+/bLTu3Xp193bZJbkpyXZNw7syW5KMnLRj1eMcn1SZ7RP94+yS+S3Jzk3CQ7jGp7cpL9k5xGd/bTFn0dlyZZmOSyJHuPqvnUUc99WpKfJLkxyTVJ3tsvf0ySzyWZ3/98LsljBtT+lL6Gm5NckGS3Ueu+nuQ/kxyT5Hbg+Y/4DV+Ozd5kI66c90DnIPOuWsDs2RsNsSJJyxs/h6bGMHsUng1cCmxAd2G60XYCngdsBawN7AncMGA7h9FdyG7EzsD1VXV2kk3orhOzH7AO8A7gu/0tBka8FngzsAZwHd2k4l2qag3gL4Bzxu4wyRrACcBxwGxgS7qJyQDvA7YHng5sA2wHvH+cbawEHAUc378H/wockuRJo5q9mu69WQM4dZxtvDnJWUnOuv/+2we8Pcu35OEXZh50h3JJmg5+Dk2NYQaW+VX1haq6t6r
uHLNuEd2X9JOBVNVFVbVgwHYOBXZLsmr/+NX9MoDXAMdU1TFVdX9V/QQ4i4deN+brVXVBVd0L3AvcD2ydZJWqWlBVF4yzz5cBV1fVp6vqrn6Ozy/7dXsDH6mqa6vqOuDDdKForO2B1YEDquqe/l5MR/PQ8HVkVZ3W1/6wU8er6qCqelZVPWvWrNUGvD3Lt6vmLWDO42Y/8Phxm2zMggXXDLEiScsbP4emxjADy5WDVvRf3l8EvgRck+SgJGsm2TTJbSM/fdtLgIuAXfvQshsPBpbNgFf2Qy43J7kZeC6w8Xh1VNXtdL05/wQsSPKjJE8ep8Q5wNwB5c8Grhj1+Ip+2Xjtrqyq+8e03WS82vTInHnWOWy55ePZfPM5rLTSSuyxx+4cdXSzJ6NJehTyc2hqDJzsmuRKutOWl2TVJTcZ12K3XVUHAgcm2QA4AnhnVX2ArldirJFhoVnAhX2Ige4L/+Cq+oeJ1lFVPwZ+nGQVuqGk/+bh90q6kof2hIw2ny4ojfTMbNovG6/dnCSzRoWWTYHfD6pNS+++++7jbfu8n2N+dCgrzJrF1//f4Vx44e+X/EQt9zb85HtYZbs/Y4W112Lzk77JDV88mPtvWcj673sLK6yzFhv/50e55+K5zB9zJpE0lp9DU2NxZ+ccwpC+MJNsSxc+zgZup7uS7n2Lecq36OZ6rMODvSvQ3WH6zCQ70805WYluKOaSqnrYVP8kG9LNrTkRuBO4bcB+jwY+k2Qf4D+BlYGn9sNChwHvT3Im3fv3wb6OsX7Zv7Z3Jfk08Jd092zadjGvU4/AscedxLHHnTTsMjTDXPPOA8ZdfvuJv1jGlejRwM+hyRsYWKrqPcuykDHWBD4LbEEXVn4MfGpQ46pakOR04K+BPUYtvzLJ7sAn6ILEfcCvgH8esKlZwNuBg+nCxjnAW8bZ38IkLwI+D3yI7oq/n6MLIfv19Z/XN/92v2zsNu7pzwr6MvB/gKuA11XVxYNepyRJy6s4U/nRYcWVN/EfUpNy8ZbjXjlAmrAnX3L+sEvQDHfvPVc9/JSqXrMXSpMkSRphYJEkSc0zsEiSpOYZWCRJUvOW6qaDSZ5Ad2fmOcA3q+raJHOAG6rqjukoUJIkaUKBpb+I2n/RXSwt/c/JwLV0p/POBd41PSVKkqTl3USHhD4NvIjusvdr0QWWET8CdpniuiRJkh4w0SGhVwJvr6pjk6wwZt1ldJeilyRJmhYT7WFZDRh0a8nV6O5wLEmSNC0mGlh+Dbx6wLpX0F2SXpIkaVpMdEjog3R3MF6X7t44BeyY5J/pgszzp6k+SZKkifWwVNVPgRcDGwBfpZt0ewDdKc4vqarTp61CSZK03JvwdViq6iRguyRrAesCN1XVTdNWmSRJUm+pLhwHUFW3ALdMQy2SJEnjmuiF476xpDZV9brJlyNJkvRwE+1heeI4y9YBtgCup7sWiyRJ0rSYUGCpqueMt7y/t9C3gY9MZVGSJEmjTepuzVU1F/g48KmpKUeSJOnhJhVYenfjpfklSdI0muik2y3GWbwy8BS6Hpazp7IoSZKk0SY66fYSuqvbjhXgt8Cbp6wiSZKkMSYaWHYZZ9ldwLx+HoskSdK0WWJgSfIYYGvg+Kr67fSXJEmS9FBLnHRbVXfTnba8zvSXI0mS9HATPUvo18A201mIJEnSIBOdw/I24FtJ7gCOAa5hzCTcqrp/imuTJEkCJh5Yft3/+V+LabPCJGuRJEka10QDy1sY/7RmSZKkaTcwsCR5HnB2Vd1WVf93GdYkSZL0EIubdPtT4KnLqhBJkqRBFhdYssyqkCRJWoypuPmhJEnStFrSpNuXJHnyRDZUVd+YgnokSZIeZkmB5YMT3E4BBhZJkjQtlhRYng+ctSwKkSRJGmRJgeXOqrp9mVQiSZI0gJNuJUlS8wwskiSpeQOHhKrKMCNJkppgKJEkSc0zsEiSpOYZWCRJUvMMLJIkqXkGFkmS1DwDiyRJap6BRZIkNc/AIkm
SmmdgkSRJzTOwSJKk5hlYJElS8wwskiSpeQYWSZLUPAOLJElqnoFFkiQ1z8AiSZKaZ2CRJEnNM7BIkqTmGVgkSVLzDCySJKl5BhZJktQ8A4skSWqegUWSJDXPwCJJkppnYJEkSc0zsEiSpOatOOwCJLXhJdfdMOwSNMMt/M7/HnYJehSzh0WSJDXPwCJJkppnYJEkSc0zsEiSpOYZWCRJUvMMLJIkqXkGFkmS1DwDiyRJap6BRZIkNc/AIkmSmmdgkSRJzTOwSJKk5hlYJElS8wwskiSpeQYWSZLUPAOLJElqnoFFkiQ1z8AiSZKaZ2CRJEnNM7BIkqTmGVgkSVLzDCySJKl5BhZJktQ8A4skSWqegUWSJDXPwCJJkppnYJEkSc0zsEiSpOYZWCRJUvMMLJIkqXkGFkmS1DwDiyRJap6BRZIkNc/AIkmSmmdgkSRJzTOwSJKk5hlYJElS8wwskiSpeQYWSZLUPAOLJElqnoFFkiQ1z8AiSZKaZ2CRJEnNM7BIkqTmGVgkSVLzDCySJKl5BhZJktQ8A4skSWqegUWSJDXPwCJJkppnYJEkSc0zsEiSpOYZWCRJUvMMLJIkqXkGFkmS1DwDiyRJap6BRZIkNc/AIkmSmmdgkSRJzTOwSJKk5hlYJElS8wwskiSpeQYWSZLUPAOLJElqnoFFkiQ1z8AiSZKaZ2CRJEnNM7BIkqTmrTjsAiYiyd7A66tqpync5qbAhcBaVXXfdD5/svvS5Oy80w585jMfYYVZs/jq1w7jE5/80rBL0gyy0ewN+cSXPsz6G6zL/fffz+EHf59vHPStYZelGeBDR/yMUy78I+usvgrffcffAvCub57I5dfeDMDCu+5hjceuzBH//jfDLHPGSFUNuwYAklwOvKmqThh2LTPRiitv0sY/ZGNmzZrFRRf8nBe/ZC/mzVvAGacfw2te+xYuuugPwy6tOVustfGwS2jS+huuy/obrseF5/2O1VZble+deDBved07mPv7y4ZdWnPO/Z9XDbuEpvz60gWsuvJKvP9bJz8QWEb79FFnsPpjV+YfX/SMIVTXplV2e0cGrWt+SCjJMu8FSqf590ZLtt22f87cuZdz2WV/ZNGiRRxxxJHstuvOwy5LM8h119zAhef9DoDbb7+Dub+/nA033mDIVWkmeOYWG7Pmqo8Zd11Vcfy5l/Lipz9hGVc1czX3pZzkDUlOS/LZJDcC+/bLTu3Xp193bZJbkpyXZOsB27ooyctGPV4xyfVJnpFk8yQ1EoiSnJxk/ySnAXcAWyR5fJJTkixMckKSLyX5Zt9+vOd/tK99YZLjk6w3oO06Sb6WZH6Sm5L8oF/+J0mOTnJdv/zoJI+btjd7OTB7k424ct78Bx7Pu2oBs2dvNMSKNJNtMmdjnvqnT+LcX58/7FI0w5192dWsu8YqbLb+WsMuZcZoLrD0ng1cCmwA7D9m3U7A84CtgLWBPYEbBmznMGCvUY93Bq6vqrMHtH8t8GZgDeAK4FDgV8C6wL79+sV5NfB3fd0rA+8Y0O5gYFXgaX3bz/bLZwFfAzYDNgXuBL44aGdJ3pzkrCRn3X//7UsobfmUPLx3sZVhUM0sq662Cl/42if42Ps/ze23+f9Nk3Pcb+bau7KUWp10O7+qvtD//d4xXzqL6ALFk4FfVdVFi9nOocBvkqxaVXfQBYpDF9P+61V1ATwwUXZb4IVVdQ9wapIfLqHur1XV7/vnHwHsNrZBko2BXYB1q+qmfvHPAKrqBuC7o9ruD/x00M6q6iDgIHAOyyBXzVvAnMfNfuDx4zbZmAULrhliRZqJVlxxBb7wtU9w1HeO4/gfDfwvKU3Ivffdz4nnX85hb3v5sEuZUVrtYbly0IqqOomu1+FLwDVJDkqyZpJNk9w28tO3vQS4CNg1yap0AWJxgWX0fmcDN/ZBZ4l19a4e9fc7gNXHaTOn3+5NY1ckWTXJfyW5IsmtwCnA2klWWMJ+NcCZZ53Dlls+ns03n8NKK63
EHnvszlFHHz/ssjTDfOxzH2Tu7y/ja//3kGGXokeBX/7hKh6/wVpsuPZ4XxEapNXAstjegqo6sKqeSTekshXwzqr6Y1WtPvIzqvnIsNDuwIV9iJnIfhcA6/RBZ8ScpXoV47uy3+7a46x7O/Ak4NlVtSbd0BfAwFnTWrz77ruPt+3zfo750aGcf97JfOc7R3Hhhb8fdlmaQZ757G14+Z4vZfvnbsuRPz2EI396CH+9418OuyzNAO855CRe/8UjueK6m9lpv0P5/q8uBuC4cxwOeiRaHRIaKMm2dEHrbOB24C5gcdc2+RbdPJh1WHzvykNU1RVJzqKb9Pt+4JnArsBRj7D0ke0uSHIs8OUkbwVuA55TVafQDXXdCdycZB3gQ5PZlzrHHncSxx530rDL0Az161+ey1brP2vYZWgGOmDvF4y7/KOv2mHZFvIo0WoPy+KsCfw3cBPdxNgbgE8NalxVC4DTgb8ADl/Kfe0NPKffx3798+9e+pIf5rV0c3EuBq4F9umXfw5YBbgeOAM4bgr2JUnSjNfMheNmgiSHAxdXVXM9H0661WR54ThNlheO02TN6AvHDVOSbZM8IcmsJC+mmwfzg2HXJUnS8mbGzWFZxjYCvkd3HZZ5wD9X1W+GW5IkScsfA8tiVNVRTHKSrSRJmjyHhCRJUvMMLJIkqXkGFkmS1DwDiyRJap6BRZIkNc/AIkmSmmdgkSRJzTOwSJKk5hlYJElS8wwskiSpeQYWSZLUPAOLJElqnoFFkiQ1z8AiSZKaZ2CRJEnNM7BIkqTmGVgkSVLzDCySJKl5BhZJktQ8A4skSWqegUWSJDXPwCJJkppnYJEkSc0zsEiSpOYZWCRJUvMMLJIkqXkGFkmS1DwDiyRJap6BRZIkNc/AIkmSmmdgkSRJzTOwSJKk5hlYJElS8wwskiSpeQYWSZLUPAOLJElqnoFFkiQ1z8AiSZKaZ2CRJEnNM7BIkqTmGVgkSVLzDCySJKl5BhZJktQ8A4skSWqegUWSJDXPwCJJkppnYJEkSc0zsEiSpOYZWCRJUvMMLJIkqXkGFkmS1DwDiyRJap6BRZIkNc/AIkmSmmdgkSRJzTOwSJKk5hlYJElS8wwskiSpeQYWSZLUPAOLJElqnoFFkiQ1z8AiSZKaZ2CRJEnNS1UNuwZpmUjy5qo6aNh1aObyGNJkeQw9cvawaHny5mEXoBnPY0iT5TH0CBlYJElS8wwskiSpeQYWLU8cN9ZkeQxpsjyGHiEn3UqSpObZwyJJkppnYFFTkuyd5Phh16H2tH5sTEV9SXZIMm+qatLiTccxlWTTJLclWWG6nz/Zfc00DglpmUtyOfCmqjqhgVr2BbasqtcMuxa1dWwMQ5IdgG9W1eOGXcujxfJ+TD2a2MOiZiRZcdg1qE0tHBvpNPuZubz8lj1Vht8KzEIAAA3JSURBVHFMtX4Mtc43TkOT5A1JTkvy2SQ3Avv2y07t16dfd22SW5Kcl2TrAdtaL8nRSW5OcmOSn498MCSZneS7Sa5LclmSf+uXvxh4L7Bn36167qj2P+y3c0mSfxi1n+2SnJXk1iTXJPnMqHXfTnJ1X+spSZ42bW/eo9wUHxsXJXnZqMcrJrk+yTP6x9sn+UV/7Jzb93KMtD05yf5JTgPuALbo67g0ycL+eNp7VM2njnru05L8pD+Orkny3n75Y5J8Lsn8/udzSR4zoPan9DXcnOSCJLuNWvf1JP+Z5JgktwPPf8Rv+HJgWR1TSTZPUiOBaMAx9Pj+M2JhkhOSfCnJN/v24z3/o33tC5Mcn2S9AW3XSfK1/ri6KckP+uV/ku7z8bp++dFJZlwvnoFFw/Zs4FJgA2D/Met2Ap4HbAWsDewJ3DBgO28H5gHrAxvSBZFKF1qOAs4FNgFeCOyTZOeqOg74GHB4Va1eVdv02zqs39Zs4G+BjyV5Yb/u88Dnq2pN4AnAEaNqOBZ4Yv9azgYOWbq3QmNM1bFxGLDXqMc7A9dX1dlJNgF+BOw
HrAO8A/hukvVHtX8t3dVJ1wCuAw4EdqmqNYC/AM4Zu8MkawAnAMfRHUdbAif2q98HbA88HdgG2A54/zjbWInu2D2+fw/+FTgkyZNGNXs13XuzBnDq2G3oYab9mBrQfvQxdAVwKPArYF1g33794rwa+Lu+7pXpjtPxHAysCjytb/vZfvks4GvAZsCmwJ3AF5ewz+YYWDRs86vqC1V1b1XdOWbdIrr/4E+mm291UVUtGLCdRcDGwGZVtaiqfl7dBK1tgfWr6iNVdU9VXQr8N/Cq8TaSZA7wXODdVXVXVZ0DfIUHP1AWAVsmWa+qbquqM0aeW1VfraqFVXU33YfQNknWWvq3RL2pOjYOBXZLsmr/+NX9MoDXAMdU1TFVdX9V/QQ4C3jJqOd/vaouqKp7gXuB+4Gtk6xSVQuq6oJx9vky4Oqq+nR/HC2sql/26/YGPlJV11bVdcCHGf8La3tgdeCA/tg9CTiah35RHllVp/W13zXg9etBy+KYGs/oY2hjus+lD/b/rqcCP1xC3V+rqt/3NR9BF3YfIsnGwC7AP1XVTf3n4M8AquqGqvpuVd1RVQvpwtpfL2GfzTGwaNiuHLSi/4D+IvAl4JokByVZMw/OjL8tyW19808ClwDH99317+mXbwbM7rvUb05yM13vy4YDdjsbuLH/Tz3iCrreGYA30v0GdnGSM0e6hZOskOSAJHOT3Apc3rdfb+JvhcaYkmOjqi4BLgJ27b9gduPBL5fNgFeOOT6eS/el8rA6qup2ut+8/wlYkORHSZ48TolzgLkDyp9Nd0yNuKJfNl67K6vq/jFtNxn1eOB7pHEti2NqSfsd+Yy5YyJ19a4e9fc76ILsWHP67d40dkWSVZP8V5Ir+s+nU4C1M8PmPRlYNGyLPU2tqg6sqmfSdXFuBbyzqv7YD+GsXlWr9+0WVtXbq2oLYFfg3/thnCuBy6pq7VE/a1TVyG/QY/c/H1in79IfsSlwVb+fP1TVXnTdrf8BfCfJanS/Ye0O7AisBWzePzdL/5aoNyXHRm+kC3934ML+Cwe64+PgMcfHalV1wKA6qurHVfUiulBzMV2P3VhX0g0Zjmc+XVAasWm/bLx2c/LQSZoPHIvj1aYlWhbH1JL2u4DuM2bVUcvmLNWrGN+V/XbXHmfd24EnAc/uh7Of1y+fUZ9PBhY1K8m2SZ7dj+XfDtwF3Deg7cuSbJkkwK19u/voxolvTfLuJKv0PSFbJ9m2f+o1wOYjXwpVdSXwC+DjSR6b5M/oelUO6ffzmiTr97/13txv4z66ruS76ca8V6WbG6NpsjTHRu9bdHMU/pmH/ib8Tbrfknfuj43HprsWyrgTEpNsmGS3PqTeDdw2YL9HAxsl2SfdJNs1kjy7X3cY8P4k6/eTJz/Y1zHWL/vX9q4kK6WbDLxr/1o0xabwmFqsqrqCbthx3yQrJ3kO3b/rpPTDV8cCX+4n2a6UZCSYrEE3b+XmJOsAH5rs/obBwKKWrUn32+tNdF3hNwCfGtD2iXSTHG8DTge+XFUnV9V9dB8GTwcuA66nm5MyMrfk2/2fNyQZmTC3F10PyXzg+8CH+rkNAC8GLui7hj8PvKqfO/CNvsargAuBB+a2aFoszbEx8mF+Ot0k2cNHLb+S7jfk99JNqL0SeCeDPxtn0f22Oh+4kW4ewFvG2d9C4EV0x97VwB948Cye/ei+sM4Dfks3QXu/cbZxD91Qwy50x+2XgddV1cWDXqcmZUqOqQnaG3hOv4/9+uffvfQlP8xr6ebiXAxcC+zTL/8csArdcXQG3WTwGccLx0mSNERJDgcurqoZ2fOxrNjDIknSMtQPPz0hyax014PaHfjBsOtq3dCvHilJ0nJmI+B7dNdhmQf8c1X9Zrgltc8hIUmS1DyHhCRJUvMMLJIkqXkGFkmS1DwDi6RHJMm+6e4UO/IzP91dsQdd4XWq9vudJCePqeP6pXj+yv1zHnY/lknU9C9JFjshMN2dgSvJeJdVX5p9XZ5k4PV
BlnJbleRfpmJb0nQzsEiajFvoLoD1HLo7yD4dOLG/Euyy8hW6u+VO1Mp0V/qcssAiafp5WrOkybh31B2rz0jyR+DndHc7/vbYxv3N1lbor+I6JapqHt2poZIexexhkTSVft3/uTlAkq8nOSvJy5NcQHd/lmf36zZN8q0kNya5I8mPkzxp9MaSzElyTJI7+6GQN43d4XhDQknW7e9OuyDJXUl+l2TkMuUjd+L+2qjhrJF6H5vkE0muTHJ3knOTvGTMth+T5Ivp7u58Y5LPAitN4j0bve0Dkvw23Z2B5yU5JMlGA9p+IMnVfdtDkqw1Zv06/XtwTf8e/GLU/YwG7f+5SX6e5Nb+55wkr5yK1yZNlj0skqbS5v2fV49Z9gngI3Q3m7ysvwHbqXT3Uvkn4A7gPcAJSbaqqjuTBDgSWI/uBpR3AR8G1qG7N8+4kqwCnEx3R+0P091XZcv+B+AFwEl093D5Ub9sQf/nd4Dt6IaM5gJ7AD9M8qyqOqdvcwDwJuB9dPeN+gdgqr7UN6C7ceZ8YH26+xadlORP+/tijdgLuKTf98Z07+9XRupI8hi6e2utTXdvpGvpbtJ3QpInVtXofx/656xJd9PGI+n+rQL8ab8NaegMLJImJcnI58gWdDfoW0j3ZTliXWDHUV/4JPkosBrw9Kq6sV92GnA58PfAl+hu+vfnwPZV9cu+za/pgsTAwAK8Dnga8IxR+zxp1Poz+z/njhrOIskLgZcCO1TVz/rFxyfZii6cvDLJunQB60NV9en+eT+mCy6TVlV/P6qeFehurjcP+EvglFFNVwFeWlW39W1vBw5O8pSqugh4DbA18LSq+kPf5gTgd3Qh6J3j7H4rupuC/kt/80aA46fidUlTwSEhSZOxLt3dYRfRfRluAezZ38l2xFWjw0pvR+AnwK1JVuxDz0K6IaVn9W22A64ZCSsAVXUFDw47DfIC4Dfj7HNJdqTrGTptpKa+rhNH1fSnwGPpeiFGarp/9OPJSLJLP3RzC3AvD87N2WpM05+MhJXe9+h6RLYd9Vp+TdebteKoUPmzUa9lrLl0dzs/NMnuSexZUVPsYZE0GbfQfTkW3Zf9/Hr4/T6uGed56wHbA3uOs+7E/s+N6IYyxroWWGMxNa3Lg0M8S2O9fp+Lxlk3MhwzMp9kbF3j1blUkmwL/BD4Pt2w07V07+sZdCFp4P76IbTb6IaH4MH3d7zXMne8/VfVTUl2ohsOOwKYleR44F+r6tJH9KKkKWRgkTQZ91bVWUtoM971SW6k+3L+6DjrRoYjrqab0zHWBsCdi9nfDTw4X2Vp3AhcBbx8MW1G5n5s0LcfXdNk/S/gOroeqgJIstmAtg/ZXz9vZ3UeDGo3AmfRzVsZ6+5BBVTV6cCL++3tCHwGOJQu/EhDZWCRNAwn0k1ovaCqBoWPM4EPJXn2qDksmwLPAE5bwrZfmeTPquq8cdaPnFI9ttfiRLr5HbdV1cUDtv1busm/u9NN5iXJrP7xZK0CLBrTQ7X3gLYvSrL6qGGhV9AFw5HweCKwE/DHqlrq3p/+3+SoJFsD/2dpny9NBwOLpGH4DN3E0JOSfIGuZ2ND4K+BU6vqMOAY4Fzg20neTRcUPsKSh1++AbyVbsLsvnRzax4PbFVV76mqe5JcBuyR5Px+u+fRzan5MfCTJP8BXACsSXeBucdW1f+pqhuSHAR8OMm9fZt/oOvdmKiXJ7lrzLIz+/3vk+RzwFHAX/Tv0XjuBH6U5JN0w0CfBL5fVSOTf79BNzn45P6quJfSDZVtB1xdVZ8du8EkL6Wb8PwD4I/AJsA/8tAJy9LQGFgkLXNVdX2S7YH9gc/SnTq7gO5U5/P6NpVkN+Ag4Kt0QeVjwIvo5mgM2vZdSV5ANw/kI3Sh43K6M5hG/BPwKbqzmR4DPL6qLk/yCuC9wD7ApnRDK+cAXxj13HfRXXflg8D9wDfpAtinJ/jyDx5n2d9V1df7YPavdCHodOBlwO/Haf8tuqGz/6ELSz9
k1PBP/x48v3/9H6YLg9cCv+rbjucSul6aj9ENOV1Hd5rzeyf4uqRplYfPj5MkSWqLpzVLkqTmGVgkSVLzDCySJKl5BhZJktQ8A4skSWqegUWSJDXPwCJJkppnYJEkSc0zsEiSpOb9fxMT9CP/3nNpAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(12, 8))\n", + "\n", + "xgb_pred = clf.predict(X_test)\n", + "cm = confusion_matrix(y_test, xgb_pred)\n", + "ax = sns.heatmap(cm, square=True, annot=True, cbar=False)\n", + "ax.xaxis.set_ticklabels(categories, fontsize = 12)\n", + "ax.yaxis.set_ticklabels(categories, fontsize = 12, rotation=0)\n", + "ax.set_xlabel('Predicted Labels',fontsize = 15)\n", + "ax.set_ylabel('True Labels',fontsize = 15)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Resources\n", + "\n", + "https://towardsai.net/p/programming/decision-trees-explained-with-a-practical-example-fe47872d3b53\n", + "https://towardsdatascience.com/decision-trees-in-machine-learning-641b9c4e8052 \n", + "https://towardsdatascience.com/visualizing-decision-trees-with-python-scikit-learn-graphviz-matplotlib-1c50b4aa68dc\n", + "https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/ \n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Decision Trees/decision_trees.ipynb b/Decision Trees/decision_trees.ipynb index 13b5e38..d1a0477 100644 --- a/Decision Trees/decision_trees.ipynb +++ b/Decision 
Trees/decision_trees.ipynb @@ -1090,205 +1090,19 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 1, "metadata": {}, "outputs": [ { - "data": { - "image/svg+xml": [ - "\r\n", - "\r\n", - "\r\n", - "\r\n", - "\r\n", - "\r\n", - "Tree\r\n", - "\r\n", - "\r\n", - "0\r\n", - "\r\n", - "PetalLengthCm <= 2.45\r\n", - "gini = 0.662\r\n", - "samples = 100\r\n", - "value = [30, 39, 31]\r\n", - "class = Iris-versicolor\r\n", - "\r\n", - "\r\n", - "1\r\n", - "\r\n", - "gini = 0.0\r\n", - "samples = 30\r\n", - "value = [30, 0, 0]\r\n", - "class = Iris-setosa\r\n", - "\r\n", - "\r\n", - "0->1\r\n", - "\r\n", - "\r\n", - "True\r\n", - "\r\n", - "\r\n", - "2\r\n", - "\r\n", - "PetalWidthCm <= 1.75\r\n", - "gini = 0.493\r\n", - "samples = 70\r\n", - "value = [0, 39, 31]\r\n", - "class = Iris-versicolor\r\n", - "\r\n", - "\r\n", - "0->2\r\n", - "\r\n", - "\r\n", - "False\r\n", - "\r\n", - "\r\n", - "3\r\n", - "\r\n", - "PetalLengthCm <= 5.35\r\n", - "gini = 0.136\r\n", - "samples = 41\r\n", - "value = [0, 38, 3]\r\n", - "class = Iris-versicolor\r\n", - "\r\n", - "\r\n", - "2->3\r\n", - "\r\n", - "\r\n", - "\r\n", - "\r\n", - "8\r\n", - "\r\n", - "PetalLengthCm <= 4.85\r\n", - "gini = 0.067\r\n", - "samples = 29\r\n", - "value = [0, 1, 28]\r\n", - "class = Iris-virginica\r\n", - "\r\n", - "\r\n", - "2->8\r\n", - "\r\n", - "\r\n", - "\r\n", - "\r\n", - "4\r\n", - "\r\n", - "SepalLengthCm <= 5.0\r\n", - "gini = 0.05\r\n", - "samples = 39\r\n", - "value = [0, 38, 1]\r\n", - "class = Iris-versicolor\r\n", - "\r\n", - "\r\n", - "3->4\r\n", - "\r\n", - "\r\n", - "\r\n", - "\r\n", - "7\r\n", - "\r\n", - "gini = 0.0\r\n", - "samples = 2\r\n", - "value = [0, 0, 2]\r\n", - "class = Iris-virginica\r\n", - "\r\n", - "\r\n", - "3->7\r\n", - "\r\n", - "\r\n", - "\r\n", - "\r\n", - "5\r\n", - "\r\n", - "gini = 0.5\r\n", - "samples = 2\r\n", - "value = [0, 1, 1]\r\n", - "class = Iris-versicolor\r\n", - "\r\n", - "\r\n", - "4->5\r\n", - "\r\n", - "\r\n", - "\r\n", - "\r\n", 
- "6\r\n", - "\r\n", - "gini = 0.0\r\n", - "samples = 37\r\n", - "value = [0, 37, 0]\r\n", - "class = Iris-versicolor\r\n", - "\r\n", - "\r\n", - "4->6\r\n", - "\r\n", - "\r\n", - "\r\n", - "\r\n", - "9\r\n", - "\r\n", - "SepalWidthCm <= 3.0\r\n", - "gini = 0.5\r\n", - "samples = 2\r\n", - "value = [0, 1, 1]\r\n", - "class = Iris-versicolor\r\n", - "\r\n", - "\r\n", - "8->9\r\n", - "\r\n", - "\r\n", - "\r\n", - "\r\n", - "12\r\n", - "\r\n", - "gini = 0.0\r\n", - "samples = 27\r\n", - "value = [0, 0, 27]\r\n", - "class = Iris-virginica\r\n", - "\r\n", - "\r\n", - "8->12\r\n", - "\r\n", - "\r\n", - "\r\n", - "\r\n", - "10\r\n", - "\r\n", - "gini = 0.0\r\n", - "samples = 1\r\n", - "value = [0, 0, 1]\r\n", - "class = Iris-virginica\r\n", - "\r\n", - "\r\n", - "9->10\r\n", - "\r\n", - "\r\n", - "\r\n", - "\r\n", - "11\r\n", - "\r\n", - "gini = 0.0\r\n", - "samples = 1\r\n", - "value = [0, 1, 0]\r\n", - "class = Iris-versicolor\r\n", - "\r\n", - "\r\n", - "9->11\r\n", - "\r\n", - "\r\n", - "\r\n", - "\r\n", - "\r\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" + "ename": "ModuleNotFoundError", + "evalue": "No module named 'graphviz'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m----------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 6\u001b[0m \u001b[0mos\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0menviron\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"PATH\"\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[1;34m';'\u001b[0m \u001b[1;33m+\u001b[0m \u001b[1;34mr'C:\\Users\\Dell\\Anaconda3\\Library\\bin\\graphviz'\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 8\u001b[1;33m \u001b[1;32mimport\u001b[0m 
\u001b[0mgraphviz\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 9\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 10\u001b[0m dot_data = export_graphviz(clf, out_file=None,\n", + "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'graphviz'" + ] } ], "source": [ @@ -1742,7 +1556,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.9" }, "toc": { "base_numbering": 1, diff --git a/Linear Algebra/.ipynb_checkpoints/Linear_Algebra-checkpoint.ipynb b/Linear Algebra/.ipynb_checkpoints/Linear_Algebra-checkpoint.ipynb new file mode 100644 index 0000000..f618fe2 --- /dev/null +++ b/Linear Algebra/.ipynb_checkpoints/Linear_Algebra-checkpoint.ipynb @@ -0,0 +1,1142 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + " \"Open\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bf15tCVlEoqh" + }, + "source": [ + "![](img/logo.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VrIbtuS4Eoqj" + }, + "source": [ + "
All rights reserved ©️ Global AI Hub 2020
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "q8svL2FUEoqj" + }, + "source": [ + "# Linear Algebra Review" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9CvXqHtHEoqk" + }, + "source": [ + "![](img/linear.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CiRWYak3Eoqk" + }, + "source": [ + "- **Scalar:** Any single numerical value.\n", + "- **Vector:** An array of numbers(data) is a vector. \n", + "- **Matrix:** A matrix is a 2-D array of shape $(m×n)$ with m rows and n columns.\n", + "- **Tensor:** Generally, an n-dimensional array where n>2 is called a Tensor. But a matrix or a vector is also a valid tensor." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "PNp4ZvkmEoql" + }, + "outputs": [], + "source": [ + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "2" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ljn54A0PEoqo" + }, + "source": [ + "### Creating Vector" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "id": "cViwMQZBEoqp", + "outputId": "c4e9d558-5d85-4c91-f551-9fa46d1b24ca" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[1, 2, 3, 4, 5]])" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "arr_1 = np.array([1,2,3,4,5],ndmin=2)\n", + "arr_1" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 70 + }, + "id": "Ao_Wzw91Eoqs", + "outputId": "55d3af02-eb22-4f57-80b9-17922a09ebbf" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Type: \n", + "Shape: (1, 5)\n", + "Dimension: 2\n" + ] + } + ], + "source": [ + "print(f\"Type: {type(arr_1)}\")\n", + "print(f\"Shape: 
{arr_1.shape}\")\n", + "print(f\"Dimension: {arr_1.ndim}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hu32zcsKEoqv" + }, + "source": [ + "### Creating Matrix" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 52 + }, + "id": "Az4Es5XOEoqw", + "outputId": "38c31156-fb98-49e8-939a-1381b33e8ad4" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[1, 2, 3, 4],\n", + " [5, 6, 7, 8]])" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "arr_2 = np.array([[1,2,3,4],\n", + " [5,6,7,8]]) \n", + "arr_2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 70 + }, + "id": "WiraowZLEoqz", + "outputId": "96989ebd-1b60-46c9-f256-abecfc65a4de" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Type: \n", + "Shape: (2, 4)\n", + "Dimension: 2\n" + ] + } + ], + "source": [ + "print(f\"Type: {type(arr_2)}\")\n", + "print(f\"Shape: {arr_2.shape}\")\n", + "print(f\"Dimension: {arr_2.ndim}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nXHAX7y5Eoq1" + }, + "source": [ + "# Addition and Scalar Multiplication " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gW4wg1S9Eoq2" + }, + "source": [ + "## Addition" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qkhyl2G9Eoq2" + }, + "source": [ + " Two matrices may be added or subtracted only if they have the same dimension ( $ m_1$ x $n_1$ = $m_2$ x $n_2$ ); that is, they must have the same number of rows and columns. 
Addition or subtraction is accomplished by adding or subtracting corresponding elements.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 248 + }, + "id": "K3J25dgJEoq3", + "outputId": "e2aaaeb4-d64e-49d0-e39e-227ff22b5e47" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Matrice_1: \n", + "[[1 2 3 4]\n", + " [5 6 7 8]\n", + " [9 8 6 5]] \n", + "\n", + "Shape of Matrice 1: (3, 4)\n", + "\n", + "Matrice_2: \n", + "[[-1 4 3 5]\n", + " [ 1 4 7 9]\n", + " [-6 5 11 -4]] \n", + "\n", + "Shape of Matrice 2: (3, 4)\n" + ] + } + ], + "source": [ + "matrice_1 = np.array([[1, 2, 3, 4], \n", + " [5, 6, 7, 8], \n", + " [9, 8, 6, 5]])\n", + "\n", + "matrice_2 = np.array([[-1, 4, 3, 5],\n", + " [1, 4, 7, 9],\n", + " [-6, 5, 11, -4]])\n", + "\n", + "print(f\"Matrice_1: \\n{matrice_1}\",\"\\n\")\n", + "print(f\"Shape of Matrice 1: {matrice_1.shape}\")\n", + "print(f\"\\nMatrice_2: \\n{matrice_2}\",\"\\n\")\n", + "print(f\"Shape of Matrice 2: {matrice_2.shape}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FoEH4I5-Eoq6" + }, + "source": [ + "### Adding two matrices" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 70 + }, + "id": "bvT8Xe-JEoq7", + "outputId": "db7f66c5-2de6-4007-cd5a-ee0f816e7e8e" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 0, 6, 6, 9],\n", + " [ 6, 10, 14, 17],\n", + " [ 3, 13, 17, 1]])" + ] + }, + "execution_count": 23, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "matrice_1 + matrice_2" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XV0SqgErEoq9" + }, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pVDK5TrSEoq9" + }, + "source": [ + "## Multiplication" + ] + }, + { + "cell_type": 
"markdown", + "metadata": { + "id": "tHBac_SaEoq-" + }, + "source": [ + "### Scalar Multiplication" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0QL9SKeaEoq-" + }, + "source": [ + " The term scalar multiplication refers to the product of a real number and a matrix. In scalar multiplication, each entry in the matrix is multiplied by the given scalar." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 70 + }, + "id": "K5fgYct3Eoq-", + "outputId": "dd24ce18-272e-466d-e60a-df2157fff632" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ -1, 8, 9, 20],\n", + " [ 5, 24, 49, 72],\n", + " [-54, 40, 66, -20]])" + ] + }, + "execution_count": 28, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "matrice_1 * matrice_2" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KN_oC5TqEorC" + }, + "source": [ + "### Matrix-Vector Multiplication " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vffZaAAIEorC" + }, + "source": [ + " To multiply a matrix \"M\" by a vector \"v\", we need to view the vector as a column matrix. We define the matrix-vector product only for the case when the number of columns in M equals the number of rows in v. So, if M is an m×n matrix (i.e., with n columns), then the product $M.v$ is defined for $n$ × $1$ column vectors v. If we let $M.v=r$, then $r$ is an $m$ x $1$ column vector. 
\n", + "\n", + " $$ (m\\;,\\;n)\\;\\;.\\;(n\\;,\\;1) = (m\\;,\\;1) $$" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 52 + }, + "id": "SDRAq70xEorC", + "outputId": "a843336c-b7a2-4b51-f991-f351523becda" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape of matrix M = (3, 3) \n", + " Shape of vector v = (3,)\n" + ] + } + ], + "source": [ + "M = np.array([[ 6, 1 ,3], \n", + " [ -1, 1 ,1], \n", + " [ 1, 3 ,2]])\n", + "\n", + "#Rank 1 array\n", + "v = np.array([1, 2, 3])\n", + "\n", + "print(f\"Shape of matrix M = {M.shape}\",\"\\n\",f\"Shape of vector v = {v.shape}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "-1*1 + 2*1 + 3*1" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tyLcx6rBEorF" + }, + "source": [ + "#### Option 1:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "id": "16x8wmcuEorF", + "outputId": "4869b4ab-8ea8-48e1-8c89-5e98dfaaf7d8" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([17, 4, 13])" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "M.dot(v)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hUZ5F8EIEorI" + }, + "source": [ + "#### Option 2:" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "id": "T-arc6Q8EorI", + "outputId": "19336582-670d-40ee-d6c9-6084076ce85a" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([17, 4, 13])" + ] + }, + "execution_count": 25, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "np.dot(M,v)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zVo3plssEorK" + }, + "source": [ + "### Matrix-Matrix Multiplication " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IEzl_dSIEorK" + }, + "source": [ + "Matrix-Matrix multiplication, the number of columns in the first matrix must be equal to the number of rows in the second matrix. The resulting matrix, known as the matrix product, has the number of rows of the first and the number of columns of the second matrix.\n", + "\n", + "$$ n_1 = m_2$$\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 52 + }, + "id": "bXA0BKPLEorL", + "outputId": "4356a204-5158-4dfa-9466-16ca06eecf6e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape of matrix C = (3, 4) \n", + " Shape of vector D = (4, 3)\n", + "[[-1 4 3]\n", + " [ 5 1 4]\n", + " [ 7 9 -6]\n", + " [ 5 11 -4]]\n" + ] + } + ], + "source": [ + "C = np.array([[1, 2, 3, 4],\n", + " [5, 6, 7, 8],\n", + " [9, 8, 6, 5]])\n", + "\n", + "D = np.array([[-1, 4, 3, 5],\n", + " [1, 4, 7, 9],\n", + " [-6, 5, 11, -4]]).reshape(4,3)\n", + "\n", + "print(f\"Shape of matrix C = {C.shape}\",\"\\n\",f\"Shape of vector D = {D.shape}\")\n", + "print(D)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 70 + }, + "id": "6kzkltUSEorN", + "outputId": "72e657cf-af21-41d0-9663-ba949eb080a3" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 50, 77, -23],\n", + " [114, 177, -35],\n", + " [ 98, 153, 3]])" + ] + }, + "execution_count": 50, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "C.dot(D)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": 
"https://localhost:8080/", + "height": 70 + }, + "id": "-FTm0NADEorP", + "outputId": "66bc24b1-e2da-4278-b132-8ea4dc344700" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 50, 77, -23],\n", + " [114, 177, -35],\n", + " [ 98, 153, 3]])" + ] + }, + "execution_count": 51, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "np.dot(C,D)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[46, 46, 43, 43],\n", + " [46, 48, 46, 48],\n", + " [-2, 20, 48, 70],\n", + " [24, 44, 68, 88]])" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.dot(D,C)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ncrKTCv7EorV" + }, + "source": [ + "## Matrix Multiplication Properties" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hRgUcHmQEorV" + }, + "source": [ + "1. Matrix multiplication is not commutative: in general $AB \\neq BA$\n", + "\n", + "2. Associative property of multiplication $(AB)C = A(BC)$\n", + "\n", + "3. Distributive properties $A(B+C) = AB+AC$\n", + "\n", + "4. Multiplicative identity property $ IA =A\\, \\& \\, AI=A$\n", + "\n", + "5. Multiplicative property of zero $ 0A =0 \\, \\& \\, A0=0$\n", + "\n", + "6. 
Dimension property" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "w2anp9PQEorW" + }, + "source": [ + "# Inverse and Transpose" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IHCTLSUpEorW" + }, + "source": [ + "## Inverse" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Wp-bJiKnEorX" + }, + "source": [ + "In linear algebra, an n-by-n square matrix A is called invertible (also nonsingular or nondegenerate), if there exists an n-by-n square matrix B such that\n", + "\n", + "$ AB=BA=I_n $ where $I_n$ denotes the n-by-n identity matrix and the multiplication used is ordinary matrix multiplication. If this is the case, then the matrix B is uniquely determined by A, and is called the (multiplicative) inverse of A, denoted by $A^{-1}$.\n", + "\n", + " $$A\\;.\\; A^{-1} = I $$\n", + "Where: \n", + "$I$: Identity Matrix \n", + "Shape A: $ (n,n)$ " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 70 + }, + "id": "pvHq_cv6RBCn", + "outputId": "ee88e630-ed95-42fa-fc79-1344db14b219" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[1., 0., 0.],\n", + " [0., 1., 0.],\n", + " [0., 0., 1.]])" + ] + }, + "execution_count": 80, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "#Example of identity matrix with 3 x 3 dimension\n", + "np.identity(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "8SvZ5NFnEorX" + }, + "outputs": [], + "source": [ + "x = np.array([[6, 9],\n", + " [12, 17]])\n", + "\n", + "y = np.array([[8, 5],\n", + " [1, 2]])" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(2, 2)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 
11, + "metadata": { + "id": "oeaaSv9SEorZ" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[-2.83333333, 1.5 ],\n", + " [ 2. , -1. ]])" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x_inv = np.linalg.inv(x)\n", + "x_inv" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 52 + }, + "id": "F-nZz--VP7T4", + "outputId": "a28c2018-5ae0-4b4d-a294-0a05443e11fd" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[1., 0.],\n", + " [0., 1.]])" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x.dot(x_inv)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BTOGBPgeEord" + }, + "source": [ + "## Transpose" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "22WuaRLnEord" + }, + "source": [ + "In linear algebra, the transpose of a matrix is an operator which flips a matrix over its diagonal; that is, it switches the row and column indices of the matrix $A$ by producing another matrix, often denoted by $A^T$(among other notations).\n", + "\n", + "If matrix A's shape is $(n,m)$, then shape of $A^T$ will be shape of $(m,n)$" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 52 + }, + "id": "5WK1OEQDEord", + "outputId": "54db8416-1fa2-4eda-e64e-8f4f8f55a9eb" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 6, 9],\n", + " [12, 17]])" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 52 + }, + "id": "989rBTm7Eorf", + "outputId": "c6762853-0c18-43a7-9e09-b8c6eea471a9" + }, + "outputs": [ + { + "data": 
{ + "text/plain": [ + "array([[ 6, 12],\n", + " [ 9, 17]])" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x_trans = x.T\n", + "x_trans" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[1, 5, 9],\n", + " [2, 6, 8],\n", + " [3, 7, 6],\n", + " [4, 8, 5]])" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "C.T" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 159 + }, + "id": "0nfN6Dq8Eori", + "outputId": "645541ae-1d0b-42d6-b63d-17e6c8db32aa" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Matrice: \n", + "[[6 1 2]\n", + " [5 2 3]\n", + " [3 5 9]\n", + " [1 7 1]\n", + " [5 2 6]]\n", + "\n", + "Shape: (5, 3)\n" + ] + } + ], + "source": [ + "A = np.random.randint(1, 10, size=(5, 3))\n", + "\n", + "print(f\"Matrice: \\n{A}\")\n", + "print(f\"\\nShape: {A.shape}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 159 + }, + "id": "s5CBacjkSOGj", + "outputId": "196467e0-b123-4f59-9d9e-2885aff1e180" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Matrice: \n", + "[[5 5 1]\n", + " [7 7 3]\n", + " [4 1 9]\n", + " [9 8 5]\n", + " [9 7 9]]\n", + "\n", + "Shape: (5, 3)\n" + ] + } + ], + "source": [ + "#Seed\n", + "np.random.seed(5)\n", + "\n", + "A = np.random.randint(1, 10, size=(5, 3))\n", + "\n", + "print(f\"Matrice: \\n{A}\")\n", + "print(f\"\\nShape: {A.shape}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[62, 85, 20],\n", + " [45, 88, 71],\n", + " [52, 33, 98],\n", + " [53, 60, 68],\n", + " [78, 46, 
67]])" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.random.seed()\n", + "\n", + "A = np.random.randint(1, 10, size=(5, 3))\n", + "\n", + "B = np.random.randint(17, 100, size=(5, 3))\n", + "\n", + "A + B" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 124 + }, + "id": "3196zyn2Eork", + "outputId": "4cee206e-b4f0-4fd3-ee9a-26591c8c55f4" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Matrice: \n", + "[[4 5 9 4 1]\n", + " [9 8 8 1 3]\n", + " [5 2 4 4 9]]\n", + "\n", + "Shape: (3, 5)\n" + ] + } + ], + "source": [ + "A_t = A.T\n", + "\n", + "print(f\"Matrice: \\n{A_t}\")\n", + "print(f\"\\nShape: {A_t.shape}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": { + "id": "Gg7E0F_7Eorm" + }, + "outputs": [], + "source": [ + "np.random.seed()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zBORL4KGEoro" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "Linear_Algebra.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Linear Algebra/Linear_Algebra.ipynb b/Linear Algebra/Linear_Algebra.ipynb index f618fe2..fad87ce 100644 --- a/Linear Algebra/Linear_Algebra.ipynb +++ b/Linear Algebra/Linear_Algebra.ipynb @@ -1134,7 +1134,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.9" } }, "nbformat": 
4, diff --git a/Linear Regression/.ipynb_checkpoints/linear_regression-checkpoint.ipynb b/Linear Regression/.ipynb_checkpoints/linear_regression-checkpoint.ipynb new file mode 100644 index 0000000..31ec2ef --- /dev/null +++ b/Linear Regression/.ipynb_checkpoints/linear_regression-checkpoint.ipynb @@ -0,0 +1,2131 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + " \"Open\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "PIBzTWnCk94D" + }, + "source": [ + " All rights reserved © Global AI Hub 2020 \n", + "![](img/logo.png)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "xAFYL8FOC3EX" + }, + "source": [ + "# What is Regression?" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ObZajM_LDfze" + }, + "source": [ + "Regression analysis is one of the most important fields in statistics and machine learning. There are many regression methods available. Linear regression is one of them." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "tp0RCvw-D0jq" + }, + "source": [ + "Regression searches for relationships among variables. For example, you can observe several employees of some company and try to understand how their salaries depend on the features, such as experience, level of education, role, city they work in, and so on." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "t7TUcDz4HyB4" + }, + "source": [ + "# Linear Regression" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "pcJ2U0sUIAzn" + }, + "source": [ + "Linear regression is probably one of the most important and widely used regression techniques. It’s among the simplest regression methods. One of its main advantages is the ease of interpreting results." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Fh3LdXkTI2Ns" + }, + "source": [ + "When implementing linear regression of some dependent variable 𝑦 on the set of independent variables 𝐱 = (𝑥₁, …, 𝑥ᵣ), where 𝑟 is the number of predictors, you assume a linear relationship between 𝑦 and 𝐱: 𝑦 = 𝛽₀ + 𝛽₁𝑥₁ + ⋯ + 𝛽ᵣ𝑥ᵣ + 𝜀. This equation is the regression equation. 𝛽₀, 𝛽₁, …, 𝛽ᵣ are the regression coefficients, and 𝜀 is the random error." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "gBea82xsJZuh" + }, + "source": [ + "## Simple Linear Regression" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "CDXdTosaJfFJ" + }, + "source": [ + "Simple or single-variate linear regression is the simplest case of linear regression with a single independent variable, 𝐱 = x. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "HJMakg8FJ0oi" + }, + "source": [ + "![image.png]()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "DesNbbSCK7iA" + }, + "source": [ + "The estimated regression function (red line) has the equation:\n", + "$$f(x) = b_0 + b_1x$$ \n", + "\n", + "Our goal is to calculate the optimal values of the predicted weights 𝑏₀ and 𝑏₁ that minimize the sum of squared residuals (SSR) and determine the estimated regression function." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "cBlpRHffLQTp" + }, + "source": [ + "## Multiple Linear Regression" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "JTXLFFmILXvG" + }, + "source": [ + "Multiple or multivariate linear regression is a case of linear regression with two or more independent variables.\n", + "\n", + "If there are just two independent variables, the estimated regression function is: \n", + "$$f(x_1,x_2) = b_0 + b_1x_1 + b_2x_2$$\n", + "It represents a regression plane in a three-dimensional space. The goal of regression is to determine the values of the weights 𝑏₀, 𝑏₁, and 𝑏₂ such that this plane is as close as possible to the actual responses and yield the minimal SSR." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](img/lin1.png)\n", + "\n", + "

Plane

" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "nM-uW1kShW0b" + }, + "source": [ + "## Polynomial Regression" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "SrKnG8HMhZE3" + }, + "source": [ + "You can regard polynomial regression as a generalized case of linear regression. You assume the polynomial dependence between the output and inputs and, consequently, the polynomial estimated regression function.\n", + "\n", + "In other words, in addition to linear terms like 𝑏₁𝑥₁, your regression function 𝑓 can include non-linear terms such as 𝑏₂𝑥₁², 𝑏₃𝑥₁³, or even 𝑏₄𝑥₁𝑥₂, 𝑏₅𝑥₁²𝑥₂, and so on.\n", + "\n", + "\n", + "The simplest example of polynomial regression has a single independent variable, and the estimated regression function is a polynomial of degree 2:\n", + "$$ f(x) = b_o + b_1x + b_2x^2$$\n", + "\n", + "- Kernel Trick\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Regression Assumptions \n", + "\n", + "### Linearity\n", + "\n", + "Linearity means that the relationship between the independent and dependent variables can be described by a straight line: on a scatter plot, the points should fall roughly along a straight line.\n", + "\n", + "### No Heteroskedasticity\n", + "The presence of non-constant variance in the error terms results in heteroskedasticity \n", + "\n", + "**Residual Data**: The residual data of the simple linear regression model is the difference between the observed data of the dependent variable y and the fitted values ŷ. \n", + "` plt.scatter(pred, (pred - actual), c='b')`\n", + "![](img/residual-plot.png)\n", + "\n", + "### No Multi-Colinearity\n", + "Multicollinearity refers to a situation in which two or more explanatory variables in a multiple regression model are highly linearly related. \n", + " \n", + "How to identify this:\n", + "- Using Pearson-correlation.
\n", + "\n", + "How to fix this: \n", + "- Drop one of the two variables \n", + "- Create a function to create a new independent variable using the correlated features and drop the correlated features." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "QrisOngVj_Gm" + }, + "source": [ + "## Simple Linear Regression With scikit-learn" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "dwT8MuDxmi8v" + }, + "source": [ + "### Import the Packages" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "LQUftsJ6C0e1" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "from sklearn.linear_model import LinearRegression" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "8FrcERYtmqOW" + }, + "source": [ + "### Provide Data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 141 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 946, + "status": "ok", + "timestamp": 1600455328617, + "user": { + "displayName": "Adem Tekin", + "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gjji3ec_q6_sWOPCSacyEQOavzHeGZCtJpnBwD9=s64", + "userId": "01835850446278926637" + }, + "user_tz": -180 + }, + "id": "FqECs48CC0Tg", + "outputId": "8a823499-d4ad-4da9-c550-76783a1486d3" + }, + "outputs": [], + "source": [ + "from sklearn.datasets import make_regression\n", + "X, y = make_regression(n_samples=100, n_features=1, noise=5,random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": 
"execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.scatter(X, y)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "7lgOFL7xmw6I" + }, + "source": [ + "### Create a model & fit" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "c9pvWB2xC0HP" + }, + "outputs": [], + "source": [ + "model = LinearRegression()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 907, + "status": "ok", + "timestamp": 1600455412483, + "user": { + "displayName": "Adem Tekin", + "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gjji3ec_q6_sWOPCSacyEQOavzHeGZCtJpnBwD9=s64", + "userId": "01835850446278926637" + }, + "user_tz": -180 + }, + "id": "Z6ZSZku6Cz7p", + "outputId": "8eb7e5e0-3468-466b-d940-a3ac54703ce6" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.fit(X, y)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "sooWWsmqm6RP" + }, + "source": [ + "### Get Results" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 1004, + "status": "ok", + "timestamp": 1600455441876, + "user": { + "displayName": "Adem Tekin", + "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gjji3ec_q6_sWOPCSacyEQOavzHeGZCtJpnBwD9=s64", + "userId": "01835850446278926637" + }, + "user_tz": -180 + }, + "id": "mFxmTaseCzu_", + "outputId": "ff14a536-3cf6-4608-f92c-a607bef255d3" + 
}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Coefficient of determination (R2): 0.9872919587211264\n" + ] + } + ], + "source": [ + "r_sq = model.score(X, y)\n", + "print('Coefficient of determination (R2):', r_sq)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 52 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 882, + "status": "ok", + "timestamp": 1600455495680, + "user": { + "displayName": "Adem Tekin", + "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gjji3ec_q6_sWOPCSacyEQOavzHeGZCtJpnBwD9=s64", + "userId": "01835850446278926637" + }, + "user_tz": -180 + }, + "id": "obl0Qm53Czfx", + "outputId": "f55ae184-bb4e-4869-8cf7-fe9c9ddf289b" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "intercept: 0.5825576602634852\n", + "slope: [43.08913515]\n" + ] + } + ], + "source": [ + "print('intercept:', model.intercept_)\n", + "print('slope:', model.coef_)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "L3CUuxm0nBb3" + }, + "source": [ + "### Predict" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 52 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 917, + "status": "ok", + "timestamp": 1600455553379, + "user": { + "displayName": "Adem Tekin", + "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gjji3ec_q6_sWOPCSacyEQOavzHeGZCtJpnBwD9=s64", + "userId": "01835850446278926637" + }, + "user_tz": -180 + }, + "id": "b8Jb11mTCzMV", + "outputId": "e54c5f07-4136-49cb-e731-c8066fe192ea" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "predicted response:\n", + "[ 4.07106126e+01 4.33334055e+00 -4.49932915e+01 1.41232034e+01\n", + " -2.00646458e+01 2.84908966e+01 -1.93857099e+01 2.39610007e+01\n", + " 
2.69391599e+01 4.38239223e+01 3.59976258e+01 6.68552231e+01\n", + " 3.24024439e+01 -8.88291400e+00 -3.42547325e+01 4.53645016e+00\n", + " -8.38579332e+01 2.26988075e+01 4.50074354e+01 -1.12299993e+02\n", + " 2.19855409e+01 4.76554530e+00 -1.94853346e+01 4.00264432e+01\n", + " 6.80016996e+01 6.37360956e+01 -2.52984440e+01 -9.61155381e-01\n", + " -2.53444599e+01 -5.09609512e+01 1.59702295e+01 1.67710603e+01\n", + " 1.18312036e+01 -2.22443514e+01 -3.85434145e+01 9.06513806e+00\n", + " -1.23014105e+01 1.61651412e+01 3.60253065e+01 -1.19862737e+01\n", + " 1.61547817e+01 -2.85854259e+01 6.62085970e+01 -2.17492578e+01\n", + " -9.50690875e+00 -1.63130435e+01 -1.23917402e+01 -2.72150946e+01\n", + " 1.47481587e+01 9.58230933e+00 -1.35361209e+01 8.02892085e-01\n", + " -9.50620132e+00 6.42637339e+01 -1.60103048e+01 1.53887726e+01\n", + " 1.48564124e+01 -7.41732708e+00 -9.52559904e+00 -8.50600679e+01\n", + " 1.45470556e+01 -9.14594787e+00 -4.70884596e+01 -2.96683030e+01\n", + " -6.24790357e+01 4.23206326e+01 -7.37426600e+01 -8.18590334e+01\n", + " -2.36459257e+01 3.36506564e+01 3.55935926e+01 -6.02723874e+01\n", + " -6.31256762e+01 -5.20225394e+01 -3.55785996e+01 6.86294721e+01\n", + " 4.26179535e+01 1.10085027e+01 1.33421243e+01 -2.28744231e+01\n", + " -7.53853179e+01 5.36211612e+00 7.96666869e+00 -5.66478305e+01\n", + " 4.61330401e+01 -4.40062681e+00 -1.96466876e+01 9.73919333e-04\n", + " -4.30594594e+01 -5.37513150e+00 -4.90127602e+01 5.90217676e+01\n", + " 3.49228960e+00 -2.10377194e+01 -1.92659686e+01 8.03956227e+01\n", + " -6.08086095e+01 -3.04349067e+01 -2.52029620e+00 -1.27411362e+01]\n" + ] + } + ], + "source": [ + "X_test, y_test = make_regression(n_samples=100, n_features=1, noise=20 ,random_state=42)\n", + "y_pred = model.predict(X_test)\n", + "print('predicted response:', y_pred, sep='\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + 
"execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.scatter(X, y, color=\"red\")\n", + "plt.scatter(X_test, y_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9872919587211264" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.score(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Coefficient of determination (R2) of test: 0.8460442632197438\n" + ] + } + ], + "source": [ + "r_t = model.score(X_test, y_test)\n", + "print('Coefficient of determination (R2) of test:', r_t)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "icAi86X1nJLc" + }, + "source": [ + "## Multiple Linear Regression With scikit-learn" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "H0xTkmm_nVJ9" + }, + "source": [ + "### Provide Data" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 176 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 780, + "status": "ok", + "timestamp": 1600456172877, + "user": { + "displayName": "Adem Tekin", + "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gjji3ec_q6_sWOPCSacyEQOavzHeGZCtJpnBwD9=s64", + "userId": "01835850446278926637" + }, + "user_tz": -180 + }, + "id": "Diw806hRl1A_", + "outputId": "c65a73ab-25ae-49d7-d252-8eca3500a059" + }, + "outputs": [], + "source": [ + "from sklearn.datasets import make_regression\n", + "X_m, y_m = make_regression(n_samples=100, n_features=2, noise=20,random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"array([[ 0.05820872, -1.1429703 ],\n", + " [ 0.58685709, 2.19045563],\n", + " [ 0.47323762, -0.07282891],\n", + " [ 0.73846658, 0.17136828]])" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_m[1:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "otpUMvJZn1kM" + }, + "source": [ + "### Create a model & fit" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "3CHiKIZZl1K6" + }, + "outputs": [], + "source": [ + "model = LinearRegression().fit(X_m, y_m)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "kLOWx40on-L0" + }, + "source": [ + "### Get Results" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 70 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 910, + "status": "ok", + "timestamp": 1600456293410, + "user": { + "displayName": "Adem Tekin", + "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gjji3ec_q6_sWOPCSacyEQOavzHeGZCtJpnBwD9=s64", + "userId": "01835850446278926637" + }, + "user_tz": -180 + }, + "id": "iBt3pM-hl1S4", + "outputId": "77ac4fff-59d2-4598-c567-1bcbbf13861f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "coefficient of determination: 0.959604849170989\n", + "intercept: 0.43271616892190323\n", + "slope: [84.26785337 74.15664645]\n" + ] + } + ], + "source": [ + "r_sq = model.score(X_m, y_m)\n", + "print('coefficient of determination:', r_sq)\n", + "print('intercept:', model.intercept_)\n", + "print('slope:', model.coef_)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "TY6mePg5oSBE" + }, + "source": [ + "### Predict" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "colab": { + "base_uri": 
"https://localhost:8080/", + "height": 70 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 1542, + "status": "ok", + "timestamp": 1600456327996, + "user": { + "displayName": "Adem Tekin", + "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gjji3ec_q6_sWOPCSacyEQOavzHeGZCtJpnBwD9=s64", + "userId": "01835850446278926637" + }, + "user_tz": -180 + }, + "id": "6ViFtCWfl1a5", + "outputId": "27a58f97-7a0f-4896-a3e5-a275700519ef", + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "predicted response:\n", + "[ -53.2618421 -81.47203332 182.21191983 -185.91510539 -48.19449499\n", + " -30.58967254 -142.85219232 -89.69922222 -68.68904094 -149.21585472\n", + " -41.44798528 90.09462956 -73.17883242 -174.86416876 -114.62008988\n", + " -36.66153161 11.33844031 111.89967811 25.61613614 176.80558272]\n" + ] + } + ], + "source": [ + "Xm_test, ym_test = make_regression(n_samples=20, n_features=2, noise=20, random_state=42)\n", + "\n", + "ym_pred = model.predict(Xm_test)\n", + "print('predicted response:', ym_pred, sep='\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Coefficient of determination (R2) of test: 0.9587826719061527\n" + ] + } + ], + "source": [ + "print('Coefficient of determination (R2) of test:', model.score(Xm_test,ym_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9587826719061527" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.metrics import r2_score\n", + "r2_score(ym_test, ym_pred)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "0t2bMe0DruXG" + }, + "source": [ + "## Cost Function" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": 
"h0d9Gg4Sr_JD" + }, + "source": [ + "It is a function that measures the performance of a Machine Learning model for given data. Cost Function quantifies the error between predicted values and expected values and presents it in the form of a single real number." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "3KrPoOTJjDpS" + }, + "source": [ + "$$MSE = \\frac{1}{m}\\sum_{n=1}^{m} (y_p-y)^{2}$$" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "b-CqFQ0cmijD" + }, + "source": [ + "#### Mean Square Error (MSE)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import mean_squared_error" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "19.513562605309748" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mean_squared_error(y, y_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "19.513562605309748" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.mean((y - y_pred)**2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Mean Square Error (MAE)" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3.5522465657069584" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.metrics import mean_absolute_error\n", + "mean_absolute_error(y,y_pred)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Adjusted ($R^2$)\n", + "\n", + "Adjusted R2 also indicates how well terms fit a curve or line, but adjusts for the number of 
terms in a model. If you add more and more useless variables to a model, adjusted r-squared will decrease. If you add more useful variables, adjusted r-squared will increase.\n", + "Adjusted $R^2$ will always be less than or equal to $R^2$." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "def adj_r2 (X,y,model):\n", + " r_squared = model.score(X,y)\n", + " return(1 - (1-r_squared)*(len(y)-1)/(len(y)-X.shape[1]-1))" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9539335744833471" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adj_r2(Xm_test, ym_test, model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Real Life Example" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTAT
00.0063218.02.310.00.5386.57565.24.09001.0296.015.3396.904.98
10.027310.07.070.00.4696.42178.94.96712.0242.017.8396.909.14
20.027290.07.070.00.4697.18561.14.96712.0242.017.8392.834.03
30.032370.02.180.00.4586.99845.86.06223.0222.018.7394.632.94
40.069050.02.180.00.4587.14754.26.06223.0222.018.7396.905.33
\n", + "
" + ], + "text/plain": [ + " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX \\\n", + "0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 \n", + "1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 \n", + "2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 \n", + "3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 \n", + "4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 \n", + "\n", + " PTRATIO B LSTAT \n", + "0 15.3 396.90 4.98 \n", + "1 17.8 396.90 9.14 \n", + "2 17.8 392.83 4.03 \n", + "3 18.7 394.63 2.94 \n", + "4 18.7 396.90 5.33 " + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.datasets import load_boston\n", + "import pandas as pd\n", + "\n", + "Xb,yb =load_boston(return_X_y=True)\n", + "\n", + "df_boston = pd.DataFrame(Xb,columns = load_boston().feature_names)\n", + "df_boston.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 506 entries, 0 to 505\n", + "Data columns (total 13 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 CRIM 506 non-null float64\n", + " 1 ZN 506 non-null float64\n", + " 2 INDUS 506 non-null float64\n", + " 3 CHAS 506 non-null float64\n", + " 4 NOX 506 non-null float64\n", + " 5 RM 506 non-null float64\n", + " 6 AGE 506 non-null float64\n", + " 7 DIS 506 non-null float64\n", + " 8 RAD 506 non-null float64\n", + " 9 TAX 506 non-null float64\n", + " 10 PTRATIO 506 non-null float64\n", + " 11 B 506 non-null float64\n", + " 12 LSTAT 506 non-null float64\n", + "dtypes: float64(13)\n", + "memory usage: 51.5 KB\n" + ] + } + ], + "source": [ + "df_boston.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTAT
count506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000
mean3.61352411.36363611.1367790.0691700.5546956.28463468.5749013.7950439.549407408.23715418.455534356.67403212.653063
std8.60154523.3224536.8603530.2539940.1158780.70261728.1488612.1057108.707259168.5371162.16494691.2948647.141062
min0.0063200.0000000.4600000.0000000.3850003.5610002.9000001.1296001.000000187.00000012.6000000.3200001.730000
25%0.0820450.0000005.1900000.0000000.4490005.88550045.0250002.1001754.000000279.00000017.400000375.3775006.950000
50%0.2565100.0000009.6900000.0000000.5380006.20850077.5000003.2074505.000000330.00000019.050000391.44000011.360000
75%3.67708312.50000018.1000000.0000000.6240006.62350094.0750005.18842524.000000666.00000020.200000396.22500016.955000
max88.976200100.00000027.7400001.0000000.8710008.780000100.00000012.12650024.000000711.00000022.000000396.90000037.970000
\n", + "
" + ], + "text/plain": [ + " CRIM ZN INDUS CHAS NOX RM \\\n", + "count 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 \n", + "mean 3.613524 11.363636 11.136779 0.069170 0.554695 6.284634 \n", + "std 8.601545 23.322453 6.860353 0.253994 0.115878 0.702617 \n", + "min 0.006320 0.000000 0.460000 0.000000 0.385000 3.561000 \n", + "25% 0.082045 0.000000 5.190000 0.000000 0.449000 5.885500 \n", + "50% 0.256510 0.000000 9.690000 0.000000 0.538000 6.208500 \n", + "75% 3.677083 12.500000 18.100000 0.000000 0.624000 6.623500 \n", + "max 88.976200 100.000000 27.740000 1.000000 0.871000 8.780000 \n", + "\n", + " AGE DIS RAD TAX PTRATIO B \\\n", + "count 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 \n", + "mean 68.574901 3.795043 9.549407 408.237154 18.455534 356.674032 \n", + "std 28.148861 2.105710 8.707259 168.537116 2.164946 91.294864 \n", + "min 2.900000 1.129600 1.000000 187.000000 12.600000 0.320000 \n", + "25% 45.025000 2.100175 4.000000 279.000000 17.400000 375.377500 \n", + "50% 77.500000 3.207450 5.000000 330.000000 19.050000 391.440000 \n", + "75% 94.075000 5.188425 24.000000 666.000000 20.200000 396.225000 \n", + "max 100.000000 12.126500 24.000000 711.000000 22.000000 396.900000 \n", + "\n", + " LSTAT \n", + "count 506.000000 \n", + "mean 12.653063 \n", + "std 7.141062 \n", + "min 1.730000 \n", + "25% 6.950000 \n", + "50% 11.360000 \n", + "75% 16.955000 \n", + "max 37.970000 " + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_boston.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "CRIM 0\n", + "ZN 0\n", + "INDUS 0\n", + "CHAS 0\n", + "NOX 0\n", + "RM 0\n", + "AGE 0\n", + "DIS 0\n", + "RAD 0\n", + "TAX 0\n", + "PTRATIO 0\n", + "B 0\n", + "LSTAT 0\n", + "dtype: int64" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "df_boston.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Correlation of the features\n", + "import seaborn as sns\n", + "corr = df_boston.corr()\n", + "\n", + "plt.figure(figsize=(14, 14))\n", + "ax = sns.heatmap(\n", + " corr, \n", + " vmin=-1, vmax=1, center=0,\n", + " cmap=sns.diverging_palette(20, 220, n=200),\n", + " square=True, annot = True\n", + ")\n", + "ax.set_xticklabels(\n", + " ax.get_xticklabels(),\n", + " rotation=45,\n", + " horizontalalignment='right'\n", + ")\n", + "ax.set_ylim(len(corr)+0.5, -0.5);" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "X_train, X_test, y_train, y_test = train_test_split(Xb,yb, test_size=0.3, random_state=42)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`train_test_split` splits arrays or matrices into random train and test subsets. That means that everytime you run it without specifying `random_state`, you will get a different result. Thus, your project will not be reproducible." + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Score of the train set 0.7434997532004697\n", + "Score of the test set 0.711226005748496\n" + ] + } + ], + "source": [ + "modelb = LinearRegression(normalize=False)\n", + "modelb.fit(X_train,y_train)\n", + "\n", + "print(\"Score of the train set\",modelb.score(X_train,y_train))\n", + "print(\"Score of the test set\",modelb.score(X_test,y_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Adj. R2 of the train set 0.7336923908228405\n", + "Adj. R2 of the test set 0.6840226584639341\n" + ] + } + ], + "source": [ + "print(\"Adj. 
R2 of the train set\",adj_r2(X_train,y_train,modelb))\n", + "print(\"Adj. R2 of the test set\",adj_r2(X_test,y_test,modelb))" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Feature CRIM Score: -0.13347010285294553\n", + "Feature ZN Score: 0.03580891359323298\n", + "Feature INDUS Score: 0.04952264522005613\n", + "Feature CHAS Score: 3.119835116285376\n", + "Feature NOX Score: -15.417060895306665\n", + "Feature RM Score: 4.057199231645383\n", + "Feature AGE Score: -0.010820835184928124\n", + "Feature DIS Score: -1.3859982431608822\n", + "Feature RAD Score: 0.24272733982225028\n", + "Feature TAX Score: -0.0087022343656624\n", + "Feature PTRATIO Score: -0.9106852081102877\n", + "Feature B Score: 0.011794115892572747\n", + "Feature LSTAT Score: -0.5471133128239566\n" + ] + } + ], + "source": [ + "importance = modelb.coef_\n", + "for i in range(len(importance)):\n", + " print(\"Feature\", df_boston.columns[i], \"Score:\", importance[i])" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "# Dropping colerated features\n", + "new_df = df_boston.drop([\"AGE\",\"INDUS\"],axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Score of the train set 0.7428288884259309\n", + "Score of the test set 0.7149039348907364\n" + ] + } + ], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(new_df,yb, test_size=0.3, random_state=42)\n", + "modelb = LinearRegression(normalize=True)\n", + "modelb.fit(X_train,y_train)\n", + "print(\"Score of the train set\",modelb.score(X_train,y_train))\n", + "print(\"Score of the test set\",modelb.score(X_test,y_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "Score of the train set 0.7345573029659462\n", + "Score of the test set 0.6925035297750086\n" + ] + } + ], + "source": [ + "print(\"Score of the train set\",adj_r2(X_train,y_train,modelb))\n", + "print(\"Score of the test set\",adj_r2(X_test,y_test,modelb))" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0.41978194, 0.28482986, 1.2879095 , ..., 1.45900038, 0.44105193,\n", + " 1.0755623 ],\n", + " [0.41733926, 0.48772236, 0.59338101, ..., 0.30309415, 0.44105193,\n", + " 0.49243937],\n", + " [0.41734159, 0.48772236, 0.59338101, ..., 0.30309415, 0.39642699,\n", + " 1.2087274 ],\n", + " ...,\n", + " [0.41344658, 0.48772236, 0.11573841, ..., 1.17646583, 0.44105193,\n", + " 0.98304761],\n", + " [0.40776407, 0.48772236, 0.11573841, ..., 1.17646583, 0.4032249 ,\n", + " 0.86530163],\n", + " [0.41500016, 0.48772236, 0.11573841, ..., 1.17646583, 0.44105193,\n", + " 0.66905833]])" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Scaling\n", + "from scipy import stats\n", + "import numpy as np\n", + "z = np.abs(stats.zscore(df_boston))\n", + "z" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "100" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(np.where(z > 3)[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexCRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTAT
000.0063218.02.310.00.5386.57565.24.09001.0296.015.3396.904.98
110.027310.07.070.00.4696.42178.94.96712.0242.017.8396.909.14
220.027290.07.070.00.4697.18561.14.96712.0242.017.8392.834.03
330.032370.02.180.00.4586.99845.86.06223.0222.018.7394.632.94
440.069050.02.180.00.4587.14754.26.06223.0222.018.7396.905.33
.............................................
4105010.062630.011.930.00.5736.59369.12.47861.0273.021.0391.999.67
4115020.045270.011.930.00.5736.12076.72.28751.0273.021.0396.909.08
4125030.060760.011.930.00.5736.97691.02.16751.0273.021.0396.905.64
4135040.109590.011.930.00.5736.79489.32.38891.0273.021.0393.456.48
4145050.047410.011.930.00.5736.03080.82.50501.0273.021.0396.907.88
\n", + "

415 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " index CRIM ZN INDUS CHAS NOX RM AGE DIS RAD \\\n", + "0 0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 \n", + "1 1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 \n", + "2 2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 \n", + "3 3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 \n", + "4 4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 \n", + ".. ... ... ... ... ... ... ... ... ... ... \n", + "410 501 0.06263 0.0 11.93 0.0 0.573 6.593 69.1 2.4786 1.0 \n", + "411 502 0.04527 0.0 11.93 0.0 0.573 6.120 76.7 2.2875 1.0 \n", + "412 503 0.06076 0.0 11.93 0.0 0.573 6.976 91.0 2.1675 1.0 \n", + "413 504 0.10959 0.0 11.93 0.0 0.573 6.794 89.3 2.3889 1.0 \n", + "414 505 0.04741 0.0 11.93 0.0 0.573 6.030 80.8 2.5050 1.0 \n", + "\n", + " TAX PTRATIO B LSTAT \n", + "0 296.0 15.3 396.90 4.98 \n", + "1 242.0 17.8 396.90 9.14 \n", + "2 242.0 17.8 392.83 4.03 \n", + "3 222.0 18.7 394.63 2.94 \n", + "4 222.0 18.7 396.90 5.33 \n", + ".. ... ... ... ... \n", + "410 273.0 21.0 391.99 9.67 \n", + "411 273.0 21.0 396.90 9.08 \n", + "412 273.0 21.0 396.90 5.64 \n", + "413 273.0 21.0 393.45 6.48 \n", + "414 273.0 21.0 396.90 7.88 \n", + "\n", + "[415 rows x 14 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "415" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "outliers = list(set(np.where(z > 3)[0]))\n", + "new_df = df_boston.drop(outliers,axis = 0).reset_index(drop = False)\n", + "display(new_df)\n", + "\n", + "y_new = yb[list(new_df[\"index\"])]\n", + "len(y_new)" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[-0.50009668, 0.42727822, -1.25466328, ..., -1.48379449,\n", + " 0.43991388, -1.1282211 ],\n", + " [-0.49580542, -0.48858069, -0.55367539, ..., -0.312853 ,\n", + " 0.43991388, -0.48493501],\n", + " [-0.49580951, -0.48858069, 
-0.55367539, ..., -0.312853 ,\n", + " 0.36091635, -1.27512537],\n", + " ...,\n", + " [-0.4889668 , -0.48858069, 0.16203913, ..., 1.18595212,\n", + " 0.43991388, -1.02616129],\n", + " [-0.47898384, -0.48858069, 0.16203913, ..., 1.18595212,\n", + " 0.37295037, -0.89626698],\n", + " [-0.49169611, -0.48858069, 0.16203913, ..., 1.18595212,\n", + " 0.43991388, -0.67977647]])" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_new = new_df.drop('index', axis = 1)\n", + "\n", + "from sklearn.preprocessing import StandardScaler, MinMaxScaler\n", + "X_scaled = StandardScaler().fit_transform(X_new)\n", + "X_scaled" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Score of the train set 0.7598701100873175\n", + "Score of the test set 0.6739782514175501\n" + ] + } + ], + "source": [ + "# Outliers removed & scaling\n", + "X_train, X_test, y_train, y_test = train_test_split(X_scaled,y_new, test_size=0.3, random_state=42)\n", + "modelb = LinearRegression(normalize=False)\n", + "modelb.fit(X_train,y_train)\n", + "print(\"Score of the train set\",modelb.score(X_train,y_train))\n", + "print(\"Score of the test set\",modelb.score(X_test,y_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Score of the train set 0.7434997532004697\n", + "Score of the test set 0.711226005748496\n" + ] + } + ], + "source": [ + "# With outliers & not scaled\n", + "X_train, X_test, y_train, y_test = train_test_split(Xb,yb, test_size=0.3, random_state=42)\n", + "modelb = LinearRegression(normalize=False)\n", + "modelb.fit(X_train,y_train)\n", + "print(\"Score of the train set\",modelb.score(X_train,y_train))\n", + "print(\"Score of the test set\",modelb.score(X_test,y_test))" + ] + }, + { + "cell_type": "code", + 
"execution_count": 71, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Score of the train set 0.7434997532004697\n", + "Score of the test set 0.7112260057484925\n" + ] + } + ], + "source": [ + "# With outlier but scaling\n", + "X_alls = StandardScaler().fit_transform(Xb)\n", + "X_train, X_test, y_train, y_test = train_test_split(X_alls,yb, test_size=0.3, random_state=42)\n", + "\n", + "modelb = LinearRegression(normalize=False)\n", + "modelb.fit(X_train,y_train)\n", + "print(\"Score of the train set\",modelb.score(X_train,y_train))\n", + "print(\"Score of the test set\",modelb.score(X_test,y_test))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Resources\n", + "https://towardsdatascience.com/statistics-supporting-linear-models-bfc24fb9781f \n", + " " + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "ML Zemin.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Linear Regression/linear_regression.ipynb b/Linear Regression/linear_regression.ipynb index 31ec2ef..ab92f1e 100644 --- a/Linear Regression/linear_regression.ipynb +++ b/Linear Regression/linear_regression.ipynb @@ -2123,7 +2123,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.9" } }, "nbformat": 4, diff --git a/Logistic Regression/.ipynb_checkpoints/logistic_regression-checkpoint.ipynb b/Logistic Regression/.ipynb_checkpoints/logistic_regression-checkpoint.ipynb new file mode 100644 index 0000000..33fd463 --- /dev/null +++ b/Logistic
Regression/.ipynb_checkpoints/logistic_regression-checkpoint.ipynb @@ -0,0 +1,2111 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + " \"Open\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "O1lN5imPlfWb" + }, + "source": [ + " All rights reserved © Global AI Hub 2020 \n", + "![](img/logo.png)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "4CTfJC29leb8" + }, + "source": [ + "# Logistic Regression " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "_iq3Lo2Eleb9" + }, + "source": [ + "Logistic regression is a fundamental classification technique. It belongs to the group of linear classifiers and is somewhat similar to polynomial and linear regression. Logistic regression is fast and relatively uncomplicated, and it’s convenient for you to interpret the results. Although it’s essentially a method for binary classification, it can also be applied to multiclass problems." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "GA8bmkyAleb-" + }, + "source": [ + "You’ll need an understanding of the sigmoid function and the natural logarithm function to understand what logistic regression is and how it works.\n", + "\n", + "This image shows the sigmoid function (or S-shaped curve) of some variable 𝑥:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "s97qP9XJleb-" + }, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "8euP2T8zleb_" + }, + "source": [ + "The sigmoid function has values very close to either 0 or 1 across most of its domain. This fact makes it suitable for application in classification methods." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "mWzlA4vqleb_" + }, + "source": [ + "## Single-Variate Logistic Regression" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "pzzWSR2glecA" + }, + "source": [ + "Single-variate logistic regression is the most straightforward case of logistic regression. There is only one independent variable (or feature), which is 𝐱 = 𝑥. This figure illustrates single-variate logistic regression:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "CfiHv1p1lecB" + }, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "WdQx45HKlecB" + }, + "source": [ + "Here, you have a given set of input-output (or 𝑥-𝑦) pairs, represented by green circles. These are your observations. Remember that 𝑦 can only be 0 or 1. For example, the leftmost green circle has the input 𝑥 = 0 and the actual output 𝑦 = 0. The rightmost observation has 𝑥 = 9 and 𝑦 = 1.\n", + "\n", + "Logistic regression finds the weights 𝑏₀ and 𝑏₁ that maximize the log-likelihood function (LLF). These weights define the logit 𝑓(𝑥) = 𝑏₀ + 𝑏₁𝑥, which is the dashed black line. They also define the predicted probability $p(x) = 1 / (1 + \\exp(-f(x)))$, shown here as the full black line. In this case, the threshold 𝑝(𝑥) = 0.5 and 𝑓(𝑥) = 0 corresponds to the value of 𝑥 slightly higher than 3. This value is the limit between the inputs with the predicted outputs of 0 and 1."
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "GXp6HofvlecF" + }, + "source": [ + "## Logistic Regression in Python" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "ooCLpj5WlecF" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import classification_report, confusion_matrix" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "ts_LzMz3lecJ" + }, + "outputs": [], + "source": [ + "x = np.arange(10).reshape(-1, 1)\n", + "y = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0],\n", + " [1],\n", + " [2],\n", + " [3],\n", + " [4],\n", + " [5],\n", + " [6],\n", + " [7],\n", + " [8],\n", + " [9]])" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "ISWMaQmKlecL" + }, + "outputs": [], + "source": [ + "model = LogisticRegression(random_state = 42)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "kEvfhhlvlecO", + "outputId": "afc1edc9-773d-48ed-b22c-cb8b6d757828" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", + " intercept_scaling=1, l1_ratio=None, max_iter=100,\n", + " multi_class='auto', n_jobs=None, penalty='l2',\n", + " random_state=42, solver='lbfgs', tol=0.0001, verbose=0,\n", + " warm_start=False)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "model.fit(x, y)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "P1KY8ygglecS" + }, + "source": [ + "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", + " intercept_scaling=1, l1_ratio=None, max_iter=100,\n", + " multi_class='warn', n_jobs=None, penalty='l2',\n", + " random_state=0, solver='liblinear', tol=0.0001, verbose=0,\n", + " warm_start=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "TWyHYujKleca", + "outputId": "6efac8d8-9adf-4ac5-e982-fae7942ad8b4" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Classes: [0 1]\n", + "Intercept: [-4.12617727]\n", + "Coef: [[1.18109091]]\n" + ] + } + ], + "source": [ + "print(\"Classes: \", model.classes_)\n", + "print(\"Intercept: \",model.intercept_)\n", + "print(\"Coef: \",model.coef_)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Probability: [[0.98411203 0.01588797]\n", + " [0.95003074 0.04996926]\n", + " [0.85370936 0.14629064]\n", + " [0.64173546 0.35826454]\n", + " [0.35475873 0.64524127]\n", + " [0.1443924 0.8556076 ]\n", + " [0.04924876 0.95075124]\n", + " [0.01565079 0.98434921]\n", + " [0.00485659 0.99514341]\n", + " [0.00149573 0.99850427]]\n" + ] + } + ], + "source": [ + "print(\"Probability: \",model.predict_proba(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "WD6Inaewlecl", + "outputId": "ecd200d5-70e5-498d-fa31-06c8d9b0cc26" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.predict(x)" + ] + }, + { + "cell_type": 
"code", + "execution_count": 8, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "inYjtdS9lecq", + "outputId": "c8a25a25-2ab6-445f-af43-a6c298943ac8" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[4, 0],\n", + " [0, 6]], dtype=int64)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "confusion_matrix(y, model.predict(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "U3CLyPvTlect", + "outputId": "b41cb437-bf7b-4296-a2fa-6777aaa496f3" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAVQAAAD8CAYAAAAoqlyCAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAORklEQVR4nO3dfYydZZnH8d91hqktAqK8tZ1WixQL+FK7KSUE/0BFWoWCxBWsEbKEOL6nJRuhIsmuWUmMfyCQEM1RsEQBmdQlutB2NVZSicC23RS2nWmhtEDPdBCNb9BlYeaca//ooTnWmWfOaa+Z+5x7vp/mSeec5+1OOvn1up/7ee7H3F0AgKNXSt0AAMgFgQoAQQhUAAhCoAJAEAIVAIIQqAAQhEAFgDGY2YlmttbMdprZgJmdX7T9MZPVMADoQLdL2uDu/2hm0yQdW7SxcWM/APw9MztB0pOS3ulNBuWEV6h/ufYiEht/56R7B1I3AW1o5PVBO9pjDP9hT9OZM+2UMz4nqbfhq7K7l+s/v1PS7yX90MwWStoqaaW7HxjreFxDBTBluXvZ3Rc3LOWG1cdI+gdJ33X3RZIOSFpddDwCFUBeatXml2IVSRV3f6L+ea0OBuyYGJQCkJfqSMhh3P1FM9tnZgvcfZekD0vqL9qHQAWQFfda5OG+Iune+gj/HknXFm1MoALISy0uUN19m6TFzW5PoALIS2yF2hICFUBexh9smjAEKoC8UKECQAwPGuU/EgQqgLwEDkq1ikAFkBe6/AAQhEEpAAhChQoAQRiUAoAgDEoBQAx3rqECQAyuoQJAELr8ABCEChUAglSHk52aQAWQF7r8ABCELj8ABKFCBYAgBCoAxHAGpQAgCNdQASAIXX4ACEKFCgBBqFABIAgVKgAEGWGCaQCIQYUKAEECr6Ga2XOSXpZUlTTi7ouLtidQAeQlvkL9oLv/oZkNCVQAeUk4yl9KdmYAmAhea35p4miSfmFmW82sd7yNqVAB5KWFUf56SDYGZdndyw2fL3D3/WZ2qqRfmtlOd9801vEIVAB5cW9hUy9LKhes31//+yUze1DSEkljBipdfgB5qdWaXwqY2ZvN7Pg3fpZ0saTtRftQoQLIS9yg1GmSHjQz6WBW3ufuG4p2IFAB5CXotil33yNpYSv7EKgA8lKtJjs1gQogL8w2BQBBCFQACM
LkKAAQw2vN34cajUAFkBe6/AAQhFF+AAhChQoAQZi+bwqwko771+/p2JXfTN0StJGlF1+oHds3aWf/o7rhq19K3Zw8uDe/BCNQJ8m0j1yh6tALqZuBNlIqlXTH7bfo0uWf0XsXflBXXfVxnX32mamb1fmCJkc5EuMGqpmdZWY3mtkdZnZ7/eezw1uSMXvryepeeJ5e37QudVPQRpacu0jPPvuc9u59QcPDw+rr+5kuW740dbM6X82bX4IVBqqZ3SjpJ5JM0n9J2lz/+X4zWx3emkzNWPFFvdr3/Qn5B0Tnmt0zU/sq+w99rgwOafbsmQlblIlqtfkl2HiDUtdJere7Dzd+aWa3Stoh6Vuj7dQ4C/Zt55+lf1rQE9DUznTMwvNUe/nPqj3/jLoWtDRxDTJXnxbub/gEXNebaryNR/lrkmZLev6w72fV142qcRbsv1x70ZT+Dek68z3qfv/56n7fEql7mmz6sZrRu1qvlkf9vwhTyGBlSHPnzD70eU7PLA0N/S5hizLRxk9KrZL0KzN7RtK++ndvlzRf0pcnsmG5eG3tXXpt7V2SpK4FC/WmZZ8kTCFJ2rxlm+bPP13z5s3V4OCLuvLKy3X1NYz0H7V2fZbf3TeY2bt08D0qPTp4/bQiabO7p3scAchAtVrVylU3a93D96mrVNKaex5Qf//TqZvV+dq4QpW71yQ9PgltyV5115P6311Ppm4G2sj6DRu1fsPG1M3IywiPngJAjHbt8gNAx2nnLj8AdJJ2vm0KADoLFSoABCFQASAIE0wDQAzeKQUAUQhUAAjCKD8ABKFCBYAgwYFqZl2StkgadPdLi7YlUAFkxavhXf6VkgYknTDehrxTCkBeAl+BYmZzJF0i6QfNnJpABZAVr3nTi5n1mtmWhqX3sMPdJukGFUyo34guP4C8tHANtfHtIoczs0slveTuW83swmaOR6ACyEvcJdQLJF1mZh+TNF3SCWb2Y3f/zFg70OUHkBUfqTW9FB7H/WvuPsfd50n6lKSNRWEqUaECyE26+/oJVAB5mYhn+d39EUmPjLcdgQogL1SoABCD2aYAIAoVKgDE8JF05yZQAWQl4VukCVQAmSFQASAGFSoABCFQASCIVy3ZuQlUAFmhQgWAIF6jQgWAEFSoABDEnQoVAEJQoQJAkBqj/AAQg0EpAAhCoAJAEE83HSqBCiAvVKgAEITbpgAgSJVRfgCIQYUKAEG4hgoAQRjlB4AgVKgAEKRaKyU7N4EKICt0+QEgSC1olN/MpkvaJOlNOpiVa939X4r2IVABZCXwtqnXJH3I3V8xs25Jj5rZend/fKwdCFQAWYnq8ru7S3ql/rG7vhQefcID9aR7Byb6FOhAr+7/TeomIFOtdPnNrFdSb8NXZXcvN6zvkrRV0nxJd7r7E0XHo0IFkJVWRvnr4VkuWF+V9H4zO1HSg2b2HnffPtb26e4vAIAJ4C0sTR/T/c+SHpG0rGg7AhVAVmpuTS9FzOyUemUqM5sh6SJJO4v2ocsPICuBo/yzJN1Tv45aktTn7g8V7UCgAshK1EtP3f0pSYta2YdABZAVF8/yA0CIEeZDBYAYVKgAECTqGuqRIFABZIUKFQCCUKECQJAqFSoAxEj4BhQCFUBealSoABAj4RtQCFQAeWFQCgCC1IwuPwCEqCY8N4EKICuM8gNAEEb5ASAIo/wAEIQuPwAE4bYpAAhSpUIFgBhUqAAQhEAFgCAJXylFoALICxUqAATh0VMACMJ9qAAQhC4/AARJGailhOcGgHDewlLEzOaa2a/NbMDMdpjZyvHOTYUKICuB11BHJP2zu/+3mR0vaauZ/dLd+8fagUAFkJWoUX53H5I0VP/5ZTMbkNQjacxApcsPICs1edOLmfWa2ZaGpXe0Y5rZPEmLJD1RdG4qVABZaWVQyt3LkspF25jZcZJ+KmmVu/+1aFsCFUBWIieYNrNuHQzTe93938fbnkAFkJWo26bMzCTdJWnA3W9tZh8CFUBWRi
ysRr1A0tWS/sfMttW/u8nd1421A4EKICtRceruj0qtvfGPQAWQFR49BYAgtYTvPSVQAWSF10gDQBC6/AAQpEqXHwBiUKECQBCnQgWAGEwwPQUsvfhC7di+STv7H9UNX/1S6uagTfz15Vd0/de/qeUrPqvln+7Vtu0DqZvU8VqZbSoaFeokKJVKuuP2W7TsYytUqQzp8cfW6T8e+oUGBp5J3TQk9q3bvqcLzlus79xys4aHh/Xq/72WukkdL+VtU1Sok2DJuYv07LPPae/eFzQ8PKy+vp/psuVLUzcLib1y4IC2Prldn6j/LnR3d+uE449L3KrONyJveolGhToJZvfM1L7K/kOfK4NDWnLuooQtQjuoDL6ot574Ft18y63atXuPzllwplav+ryOnTE9ddM6WspBqSOuUM3s2oJ1h2bBrtUOHOkpsnFwFrC/5Z6yY4J2MFKtauDp3brqiku0ds2dmjFjuu76UV/qZnW8WgtLtKPp8n9jrBXuXnb3xe6+uFR681GcIg+DlSHNnTP70Oc5PbM0NPS7hC1CO5h56sk67ZST9b53nyVJuvjCD6j/6d2JW9X5vIU/0Qq7/Gb21FirJJ0W3ppMbd6yTfPnn6558+ZqcPBFXXnl5br6Gkb6p7qTT3qbZp56ivY+X9Hp75ijx7du0xnz3p66WR2vnW/sP03SUkl/Oux7k/TbCWlRhqrVqlauulnrHr5PXaWS1tzzgPr7n07dLLSBm67/gm78xrc1PDKsubNn6d9uuj51kzpeNeHltPEC9SFJx7n7tsNXmNkjE9KiTK3fsFHrN2xM3Qy0mbPedYb67r4jdTOy0rbT97n7dQXrPh3fHAA4Ojx6CgBB2vkaKgB0lLbt8gNAp6HLDwBB2nmUHwA6Cl1+AAjCoBQABOEaKgAEocsPAEFSzuTGBNMAslKVN72Mx8zuNrOXzGx7M+cmUAFkJfidUmskLWv23HT5AWQlssvv7pvMbF6z2xOoALKSclCKLj+ArLQyY3/j65rqS+/RnJsKFUBWWnn01N3LkspR5yZQAWSFLj8ABIkc5Tez+yU9JmmBmVXMbMxJ9yUqVACZCR7lX9HK9gQqgKzw6CkABGFyFAAIUvV0E/gRqACyknJyFAIVQFa4hgoAQbiGCgBBanT5ASAGFSoABGGUHwCC0OUHgCB0+QEgCBUqAAShQgWAIFWvJjs3gQogKzx6CgBBePQUAIJQoQJAEEb5ASAIo/wAEIRHTwEgCNdQASAI11ABIAgVKgAE4T5UAAhChQoAQRjlB4AgDEoBQJCUXf5SsjMDwATwFv6Mx8yWmdkuM9ttZqvH254KFUBWoipUM+uSdKekj0iqSNpsZj939/6x9iFQAWQl8BrqEkm73X2PJJnZTyRdLildoI68PmgTfY5OYWa97l5O3Q60F34vYrWSOWbWK6m34atyw79Fj6R9Desqks4rOh7XUCdX7/ibYAri9yIRdy+7++KGpfE/ttGCubD8JVABYHQVSXMbPs+RtL9oBwIVAEa3WdKZZna6mU2T9ClJPy/agUGpycV1MoyG34s25O4jZvZlSf8pqUvS3e6+o2gfS3kTLADkhC4/AAQhUAEgCIE6SVp9hA35M7O7zewlM9ueui2IQaBOgoZH2D4q6RxJK8zsnLStQhtYI2lZ6kYgDoE6OQ49wubur0t64xE2TGHuvknSH1O3A3EI1Mkx2iNsPYnaAmCCEKiTo+VH2AB0HgJ1crT8CBuAzkOgTo6WH2ED0HkI1Eng7iOS3niEbUBS33iPsCF/Zna/pMckLTCzipldl7pNODo8egoAQahQASAIgQoAQQhUAAhCoAJAEAIVAIIQqAAQhEAFgCD/D+E3VzFVyTbSAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import seaborn as sns\n", + "\n", + "cm = confusion_matrix(y, model.predict(x))\n", + "sns.heatmap(cm, annot=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "hsUoY8_flecv", + "outputId": "dcc208a4-87be-42df-80c5-93fc0dc646f5" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 4\n", + " 1 1.00 1.00 1.00 6\n", + "\n", + " accuracy 1.00 10\n", + " macro avg 1.00 1.00 1.00 10\n", + "weighted avg 1.00 1.00 1.00 10\n", + "\n" + ] + } + ], + "source": [ + "print(classification_report(y, model.predict(x)))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "3wYZj-pllecy", + "outputId": "8d198813-a002-4cc4-ba2b-56f232758d81" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,\n", + " intercept_scaling=1, l1_ratio=None, max_iter=100,\n", + " multi_class='auto', n_jobs=None, penalty='l2',\n", + " random_state=0, solver='liblinear', tol=0.0001, verbose=0,\n", + " warm_start=False)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = LogisticRegression(solver='liblinear', C=0.5, random_state=0)\n", + "model.fit(x, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "2c1MUas8lec1", + "outputId": "d993f8fc-2808-47fb-bc1b-a66546e1523f" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([-0.61167085])" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.intercept_" + ] + }, + { + "cell_type": "code", + 
"execution_count": 15, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "5MAptsgclec3", + "outputId": "501c5009-bbce-4f07-d150-ebcef755ead0" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0.41299976]])" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.coef_" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Kl20AVxHlec5", + "outputId": "de7bd512-6213-4da1-ffd1-d443b89acb0c" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0.64832185, 0.35167815],\n", + " [0.54950505, 0.45049495],\n", + " [0.44662201, 0.55337799],\n", + " [0.34811656, 0.65188344],\n", + " [0.26108668, 0.73891332],\n", + " [0.18948992, 0.81051008],\n", + " [0.13396721, 0.86603279],\n", + " [0.09284959, 0.90715041],\n", + " [0.06342763, 0.93657237],\n", + " [0.04288806, 0.95711194]])" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.predict_proba(x)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "30IDSy4Ilec7", + "outputId": "e6e56ded-0da2-407c-eef9-435d1fae557b" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1])" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.predict(x)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "iMBDTGVJlec9", + "outputId": "4d13e887-6896-4b03-f4f0-1713f63bb735" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.score(x, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "colab": {}, + "colab_type": 
"code", + "id": "1XZwGMEZlec_", + "outputId": "b59aa2f0-687e-4e25-93e0-9eea939c51a1" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[2, 2],\n", + " [0, 6]], dtype=int64)" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "confusion_matrix(y, model.predict(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAVQAAAD8CAYAAAAoqlyCAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAANx0lEQVR4nO3dfYxc1X3G8edZY2JeDIQajLEdTHEAQZXU1eKoIpVoq4ATAqiiSkPSVEppto2aCqOqCSVIUVpcRalEXiRUNEoIUQokFi0JpeAEBZBLC8R266SLlxdjSNi1XZe0KNgiYXfm1z88oBH13p3Bv9k7c/z9WFd4Zu7Lkbg8/M4999zriBAA4PCN1N0AACgFgQoASQhUAEhCoAJAEgIVAJIQqACQhEAFgFnYPsn2XbaftD1h+9er1j9qvhoGAEPoS5I2RcTv2j5a0rFVK5sb+wHg/7N9oqTtkn45ugzKvleoG874MIkNoCuf/vHtPtx9TL+4q+vMOfqUs/5Y0ljHV42IaLT/fqak/5b0NdvvlLRN0jURcWC2/XENFcARKyIaETHasTQ6fj5K0q9J+ruIWCPpgKTrqvZHoAIoS6vZ/VJtUtJkRDze/nyXDgbsrBiUAlCW5kzKbiJir+0XbJ8TEU9J+m1JO6q2IVABFCWilbm7P5N0e3uEf5ekj1atTKACKEsrL1AjYruk0W7XJ1ABlCW3Qu0JgQqgLHMPNvUNgQqgLFSoAJAjkkb53wwCFUBZEgelekWgAigLXX4ASMKgFAAkoUIFgCQMSgFAEgalACBHBNdQASAH11ABIAldfgBIQoUKAEma07UdmkAFUBa6/ACQhC4/ACShQgWAJAQqAOQIBqUAIAnXUAEgCV1+AEhChQoASahQASAJFSoAJJnhAdMAkIMKFQCSJF5Dtf28pJclNSXNRMRo1foEKoCy5FeovxkRL3azIoEKoCw1jvKP1HZkAOiHaHW/dLE3Sd+zvc322FwrU6ECKEsPo/ztkOwMykZENDo+vzsipmyfKukB209GxObZ9kegAihLRA+rRkNSo+L3qfY/99m+W9JaSbMGKl1+AGVptbpfKtg+zvbi1/4u6WJJ41XbUKECKEveoNRSSXfblg5m5R0RsalqAwIVQFmSbpuKiF2S3tnLNgQqgLI0m7UdmkAFUBaeNgUASQhUAEjCw1EAIEe0ur8PNRuBCqAsdPkBIAmj/ACQhAoVAJIQqGVbvOxkXf6Fj+u4JSdKEfqPOx7Ulq99t+5moWacF33Sw8NRshGo8yCaLX3/xtu1d/x5HX3cIv3hvTfquUfG9eIzU3U3DTXivOiTQa5QbZ8r6QpJy9tfTUm6JyIm+tmwkuzf95L273tJkvTqgZ/rpzt3a/HSt/IfzhGO86JParxtqvLxfbY/Jembk
izpB+3Fku60fV3/m1eeE1cs0dLzz9DU9mfrbgoGCOdFomaz+yXZXBXq1ZLOj4jpzi9t3yTpCUmfO9RGnU/BvuLktbrg+NUJTR1+C499i668Zb0e+Ktv6NX9r9TdHAwIzotcMcDvlGpJOv0Q3y9r/3ZIEdGIiNGIGCVMDxo5aoGuvGW9xr/9r3pq09a6m4MBwXnRB63ofkk2V4W6XtL3bT8j6YX2d2+TtFrSJ9JbU7BLP/8x/XTnlH7wlfvrbgoGCOdFHwzqXP6I2GT7bB18j0rnoNSWiKhvOsKQWTF6tt5x5W/ovyZ+oj+6728kSQ/97bf07EM/rLllqBPnRZ8M8lz+iGhJemwe2lKsya1Pa8MZH667GRgwnBd9MsPUUwDIMahdfgAYOoPc5QeAYVLnbVMEKoCyUKECQBICFQCS8IBpAMjBO6UAIAuBCgBJGOUHgCRUqACQJDlQbS+QtFXSVES8v2pdAhVAUaKZ3uW/RtKEpBPmWnGu56ECwHBJfB6q7RWSLpX0lW4OTaACKEq0ouvF9pjtrR3L2Bt290VJn1TFA/U70eUHUJYerqFGRENS41C/2X6/pH0Rsc32Rd3sj0AFUJa8S6gXSrrc9vskLZJ0gu2/j4jfn20DuvwAihIzra6Xyv1E/GVErIiIVZI+KOnBqjCVqFABlKa++/oJVABl6cdc/oh4WNLDc61HoAIoCxUqAOTgaVMAkIUKFQByxEx9xyZQARSlxrdIE6gACkOgAkAOKlQASEKgAkCSaLq2YxOoAIpChQoASaJFhQoAKahQASBJBBUqAKSgQgWAJC1G+QEgB4NSAJCEQAWAJFHf41AJVABloUIFgCTcNgUASZqM8gNADipUAEjCNVQASMIoPwAkoUIFgCTN1khtxyZQARSFLj8AJGkljfLbXiRps6S36GBW3hURn6nahkAFUJTE26Z+Iem3ImK/7YWSHrF9f0Q8NtsGBCqAomR1+SMiJO1vf1zYXir33vdA/cyeh/t9CAyhV3b/S91NQKF66fLbHpM01vFVIyIaHb8vkLRN0mpJN0fE41X7o0IFUJReRvnb4dmo+L0p6VdtnyTpbtu/EhHjs61f3/0FANAH0cPS9T4jXpL0kKR1VesRqACK0gp3vVSxfUq7MpXtYyS9R9KTVdvQ5QdQlMRR/mWSvt6+jjoiaWNE3Fu1AYEKoChZLz2NiB9JWtPLNgQqgKKEmMsPAClmeB4qAOSgQgWAJFnXUN8MAhVAUahQASAJFSoAJGlSoQJAjhrfgEKgAihLiwoVAHLU+AYUAhVAWRiUAoAkLdPlB4AUzRqPTaACKAqj/ACQhFF+AEjCKD8AJKHLDwBJuG0KAJI0qVABIAcVKgAkIVABIEmNr5QiUAGUhQoVAJIw9RQAknAfKgAkocsPAEnqDNSRGo8NAOmih6WK7ZW2H7K9w/YTtq+Z69hUqACKkngNdUbSn0fEv9teLGmb7QciYsdsGxCoAIqSNcofEXsk7Wn//WXbE5KWS5o1UOnyAyhKS9H1YnvM9taOZexQ+7S9StIaSY9XHZsKFUBRehmUioiGpEbVOraPl/QPktZHxM+q1iVQARQl8wHTthfqYJjeHhH/ONf6BCqAomTdNmXbkr4qaSIibupmGwIVQFFmnFajXijpI5L+0/b29nfXR8R9s21AoAIoSlacRsQjUm9v/CNQARSFqacAkKRV43tPCVQAReE10gCQhC4/ACRp0uUHgBxUqACQJKhQASAHD5g+Alxy8UV6YnyzntzxiD75F39ad3MwIH728n5d++kbddlVH9NlHxrT9vGJups09Hp52lQ2KtR5MDIyoi9/aYPWve8qTU7u0WOP3qd/uvd7mph4pu6moWaf++ItuvBdo/rChhs0PT2tV37+i7qbNPTqvG2KCnUerL1gjZ599nk999xPND09rY0bv6PLL7uk7mahZi/vP6BtPxzXle1zYeHChTph8fE1t2r4zSi6XrJRoc6D05efphcmd7/+eXJqj
9ZesKbGFmEQTO3eq7eedKJu2HCTntq5S+ed83Zdt/5PdOwxi+pu2lCrc1DqTVeotj9a8dvrT8FutQ682UMARZtpNjXx9E793u9cqrtuu1nHHLNIX/3GxrqbNfRaPSzZDqfL/9nZfoiIRkSMRsToyMhxh3GIMuye2quVK05//fOK5cu0e/feGluEQXDaqUu09JQlesf550qSLr7o3drx9M6aWzX8ooc/2Sq7/LZ/NNtPkpamt6ZQW7Zu1+rVZ2rVqpWamtqrD3zgCn3kDxjpP9It+aWTddqpp+i5H0/qzDNW6LFt23XWqrfV3ayhN8g39i+VdImk/33D95b0b31pUYGazaauWX+D7vvnO7RgZES3ff1b2rHj6bqbhQFw/bUf16c++3lNz0xr5enL9NfXX1t3k4ZeMwb3xv57JR0fEdvf+IPth/vRoFLdv+lB3b/pwbqbgQFz7tlnaeOtX667GUUZ2Mf3RcTVFb99KL85AHB4mHoKAEkG+RoqAAyVge3yA8CwocsPAEkGeZQfAIYKXX4ASMKgFAAk4RoqACShyw8ASaLGQSkeMA2gKE1F18tcbN9qe5/t8W6OTaACKEryO6Vuk7Su22PT5QdQlMwuf0Rstr2q2/UJVABFqXNQii4/gKL08sT+ztc1tZexwzk2FSqAovQy9TQiGpIaWccmUAEUhS4/ACTJHOW3faekRyWdY3vS9qwP3ZeoUAEUJnmU/6pe1idQARSFqacAkISHowBAkmbU9wA/AhVAUep8OAqBCqAoXEMFgCRcQwWAJC26/ACQgwoVAJIwyg8ASejyA0ASuvwAkIQKFQCSUKECQJJmNGs7NoEKoChMPQWAJEw9BYAkVKgAkIRRfgBIwig/ACRh6ikAJOEaKgAk4RoqACShQgWAJNyHCgBJqFABIAmj/ACQhEEpAEhSZ5d/pLYjA0AfRA9/5mJ7ne2nbO+0fd1c61OhAihKVoVqe4GkmyW9R9KkpC2274mIHbNtQ6ACKEriNdS1knZGxC5Jsv1NSVdIqi9QZ16dcr+PMSxsj0VEo+52YLBwXuTqJXNsj0ka6/iq0fHvYrmkFzp+m5T0rqr9cQ11fo3NvQqOQJwXNYmIRkSMdiyH9T82AhUADm1K0sqOzyva382KQAWAQ9si6e22z7R9tKQPSrqnagMGpeYX18lwKJwXAygiZmx/QtJ3JS2QdGtEPFG1jeu8CRYASkKXHwCSEKgAkIRAnSe9TmFD+Wzfanuf7fG624IcBOo86JjC9l5J50m6yvZ59bYKA+A2SevqbgTyEKjz4/UpbBHxqqTXprDhCBYRmyX9T93tQB4CdX4cagrb8praAqBPCFQASEKgzo+ep7ABGD4E6vzoeQobgOFDoM6DiJiR9NoUtglJG+eawoby2b5T0qOSzrE9afvqutuEw8PUUwBIQoUKAEkIVABIQqACQBICFQCSEKgAkIRABYAkBCoAJPk/g1NZeUJXgJQAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.heatmap(confusion_matrix(y, model.predict(x)), annot=True)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "hAo4kysIledB", + "outputId": "86fcee8c-d72c-43e3-a655-4b1e2cc4c6d4" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 1.00 0.50 0.67 4\n", + " 1 0.75 1.00 0.86 6\n", + "\n", + " accuracy 0.80 10\n", + " macro avg 0.88 0.75 0.76 10\n", + "weighted avg 0.85 0.80 0.78 10\n", + "\n" + ] + } + ], + "source": [ + "print(classification_report(y, model.predict(x)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Real Life Example" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mean radiusmean texturemean perimetermean areamean smoothnessmean compactnessmean concavitymean concave pointsmean symmetrymean fractal dimension...worst radiusworst textureworst perimeterworst areaworst smoothnessworst compactnessworst concavityworst concave pointsworst symmetryworst fractal dimension
017.9910.38122.801001.00.118400.277600.30010.147100.24190.07871...25.3817.33184.602019.00.16220.66560.71190.26540.46010.11890
120.5717.77132.901326.00.084740.078640.08690.070170.18120.05667...24.9923.41158.801956.00.12380.18660.24160.18600.27500.08902
219.6921.25130.001203.00.109600.159900.19740.127900.20690.05999...23.5725.53152.501709.00.14440.42450.45040.24300.36130.08758
311.4220.3877.58386.10.142500.283900.24140.105200.25970.09744...14.9126.5098.87567.70.20980.86630.68690.25750.66380.17300
420.2914.34135.101297.00.100300.132800.19800.104300.18090.05883...22.5416.67152.201575.00.13740.20500.40000.16250.23640.07678
\n", + "

5 rows × 30 columns

\n", + "
" + ], + "text/plain": [ + " mean radius mean texture mean perimeter mean area mean smoothness \\\n", + "0 17.99 10.38 122.80 1001.0 0.11840 \n", + "1 20.57 17.77 132.90 1326.0 0.08474 \n", + "2 19.69 21.25 130.00 1203.0 0.10960 \n", + "3 11.42 20.38 77.58 386.1 0.14250 \n", + "4 20.29 14.34 135.10 1297.0 0.10030 \n", + "\n", + " mean compactness mean concavity mean concave points mean symmetry \\\n", + "0 0.27760 0.3001 0.14710 0.2419 \n", + "1 0.07864 0.0869 0.07017 0.1812 \n", + "2 0.15990 0.1974 0.12790 0.2069 \n", + "3 0.28390 0.2414 0.10520 0.2597 \n", + "4 0.13280 0.1980 0.10430 0.1809 \n", + "\n", + " mean fractal dimension ... worst radius worst texture worst perimeter \\\n", + "0 0.07871 ... 25.38 17.33 184.60 \n", + "1 0.05667 ... 24.99 23.41 158.80 \n", + "2 0.05999 ... 23.57 25.53 152.50 \n", + "3 0.09744 ... 14.91 26.50 98.87 \n", + "4 0.05883 ... 22.54 16.67 152.20 \n", + "\n", + " worst area worst smoothness worst compactness worst concavity \\\n", + "0 2019.0 0.1622 0.6656 0.7119 \n", + "1 1956.0 0.1238 0.1866 0.2416 \n", + "2 1709.0 0.1444 0.4245 0.4504 \n", + "3 567.7 0.2098 0.8663 0.6869 \n", + "4 1575.0 0.1374 0.2050 0.4000 \n", + "\n", + " worst concave points worst symmetry worst fractal dimension \n", + "0 0.2654 0.4601 0.11890 \n", + "1 0.1860 0.2750 0.08902 \n", + "2 0.2430 0.3613 0.08758 \n", + "3 0.2575 0.6638 0.17300 \n", + "4 0.1625 0.2364 0.07678 \n", + "\n", + "[5 rows x 30 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.datasets import load_breast_cancer\n", + "import pandas as pd\n", + "X,y = load_breast_cancer(return_X_y=True)\n", + "df = pd.DataFrame(X,columns = load_breast_cancer().feature_names)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 569 entries, 0 to 568\n", + "Data columns (total 30 
columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 mean radius 569 non-null float64\n", + " 1 mean texture 569 non-null float64\n", + " 2 mean perimeter 569 non-null float64\n", + " 3 mean area 569 non-null float64\n", + " 4 mean smoothness 569 non-null float64\n", + " 5 mean compactness 569 non-null float64\n", + " 6 mean concavity 569 non-null float64\n", + " 7 mean concave points 569 non-null float64\n", + " 8 mean symmetry 569 non-null float64\n", + " 9 mean fractal dimension 569 non-null float64\n", + " 10 radius error 569 non-null float64\n", + " 11 texture error 569 non-null float64\n", + " 12 perimeter error 569 non-null float64\n", + " 13 area error 569 non-null float64\n", + " 14 smoothness error 569 non-null float64\n", + " 15 compactness error 569 non-null float64\n", + " 16 concavity error 569 non-null float64\n", + " 17 concave points error 569 non-null float64\n", + " 18 symmetry error 569 non-null float64\n", + " 19 fractal dimension error 569 non-null float64\n", + " 20 worst radius 569 non-null float64\n", + " 21 worst texture 569 non-null float64\n", + " 22 worst perimeter 569 non-null float64\n", + " 23 worst area 569 non-null float64\n", + " 24 worst smoothness 569 non-null float64\n", + " 25 worst compactness 569 non-null float64\n", + " 26 worst concavity 569 non-null float64\n", + " 27 worst concave points 569 non-null float64\n", + " 28 worst symmetry 569 non-null float64\n", + " 29 worst fractal dimension 569 non-null float64\n", + "dtypes: float64(30)\n", + "memory usage: 133.5 KB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
mean radiusmean texturemean perimetermean areamean smoothnessmean compactnessmean concavitymean concave pointsmean symmetrymean fractal dimension...worst radiusworst textureworst perimeterworst areaworst smoothnessworst compactnessworst concavityworst concave pointsworst symmetryworst fractal dimension
count569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000...569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000
mean14.12729219.28964991.969033654.8891040.0963600.1043410.0887990.0489190.1811620.062798...16.26919025.677223107.261213880.5831280.1323690.2542650.2721880.1146060.2900760.083946
std3.5240494.30103624.298981351.9141290.0140640.0528130.0797200.0388030.0274140.007060...4.8332426.14625833.602542569.3569930.0228320.1573360.2086240.0657320.0618670.018061
min6.9810009.71000043.790000143.5000000.0526300.0193800.0000000.0000000.1060000.049960...7.93000012.02000050.410000185.2000000.0711700.0272900.0000000.0000000.1565000.055040
25%11.70000016.17000075.170000420.3000000.0863700.0649200.0295600.0203100.1619000.057700...13.01000021.08000084.110000515.3000000.1166000.1472000.1145000.0649300.2504000.071460
50%13.37000018.84000086.240000551.1000000.0958700.0926300.0615400.0335000.1792000.061540...14.97000025.41000097.660000686.5000000.1313000.2119000.2267000.0999300.2822000.080040
75%15.78000021.800000104.100000782.7000000.1053000.1304000.1307000.0740000.1957000.066120...18.79000029.720000125.4000001084.0000000.1460000.3391000.3829000.1614000.3179000.092080
max28.11000039.280000188.5000002501.0000000.1634000.3454000.4268000.2012000.3040000.097440...36.04000049.540000251.2000004254.0000000.2226001.0580001.2520000.2910000.6638000.207500
\n", + "

8 rows × 30 columns

\n", + "
" + ], + "text/plain": [ + " mean radius mean texture mean perimeter mean area \\\n", + "count 569.000000 569.000000 569.000000 569.000000 \n", + "mean 14.127292 19.289649 91.969033 654.889104 \n", + "std 3.524049 4.301036 24.298981 351.914129 \n", + "min 6.981000 9.710000 43.790000 143.500000 \n", + "25% 11.700000 16.170000 75.170000 420.300000 \n", + "50% 13.370000 18.840000 86.240000 551.100000 \n", + "75% 15.780000 21.800000 104.100000 782.700000 \n", + "max 28.110000 39.280000 188.500000 2501.000000 \n", + "\n", + " mean smoothness mean compactness mean concavity mean concave points \\\n", + "count 569.000000 569.000000 569.000000 569.000000 \n", + "mean 0.096360 0.104341 0.088799 0.048919 \n", + "std 0.014064 0.052813 0.079720 0.038803 \n", + "min 0.052630 0.019380 0.000000 0.000000 \n", + "25% 0.086370 0.064920 0.029560 0.020310 \n", + "50% 0.095870 0.092630 0.061540 0.033500 \n", + "75% 0.105300 0.130400 0.130700 0.074000 \n", + "max 0.163400 0.345400 0.426800 0.201200 \n", + "\n", + " mean symmetry mean fractal dimension ... worst radius \\\n", + "count 569.000000 569.000000 ... 569.000000 \n", + "mean 0.181162 0.062798 ... 16.269190 \n", + "std 0.027414 0.007060 ... 4.833242 \n", + "min 0.106000 0.049960 ... 7.930000 \n", + "25% 0.161900 0.057700 ... 13.010000 \n", + "50% 0.179200 0.061540 ... 14.970000 \n", + "75% 0.195700 0.066120 ... 18.790000 \n", + "max 0.304000 0.097440 ... 
36.040000 \n", + "\n", + " worst texture worst perimeter worst area worst smoothness \\\n", + "count 569.000000 569.000000 569.000000 569.000000 \n", + "mean 25.677223 107.261213 880.583128 0.132369 \n", + "std 6.146258 33.602542 569.356993 0.022832 \n", + "min 12.020000 50.410000 185.200000 0.071170 \n", + "25% 21.080000 84.110000 515.300000 0.116600 \n", + "50% 25.410000 97.660000 686.500000 0.131300 \n", + "75% 29.720000 125.400000 1084.000000 0.146000 \n", + "max 49.540000 251.200000 4254.000000 0.222600 \n", + "\n", + " worst compactness worst concavity worst concave points \\\n", + "count 569.000000 569.000000 569.000000 \n", + "mean 0.254265 0.272188 0.114606 \n", + "std 0.157336 0.208624 0.065732 \n", + "min 0.027290 0.000000 0.000000 \n", + "25% 0.147200 0.114500 0.064930 \n", + "50% 0.211900 0.226700 0.099930 \n", + "75% 0.339100 0.382900 0.161400 \n", + "max 1.058000 1.252000 0.291000 \n", + "\n", + " worst symmetry worst fractal dimension \n", + "count 569.000000 569.000000 \n", + "mean 0.290076 0.083946 \n", + "std 0.061867 0.018061 \n", + "min 0.156500 0.055040 \n", + "25% 0.250400 0.071460 \n", + "50% 0.282200 0.080040 \n", + "75% 0.317900 0.092080 \n", + "max 0.663800 0.207500 \n", + "\n", + "[8 rows x 30 columns]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import seaborn as sns\n", + "plt.figure(figsize=(16, 8))\n", + "sns.distplot(df[\"mean area\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[1.09706398, 2.07333501, 1.26993369, ..., 2.29607613, 2.75062224,\n", + " 1.93701461],\n", + " [1.82982061, 0.35363241, 1.68595471, ..., 1.0870843 , 0.24388967,\n", + " 0.28118999],\n", + " [1.57988811, 0.45618695, 1.56650313, ..., 1.95500035, 1.152255 ,\n", + " 0.20139121],\n", + " ...,\n", + " [0.70228425, 2.0455738 , 0.67267578, ..., 0.41406869, 1.10454895,\n", + " 0.31840916],\n", + " [1.83834103, 2.33645719, 1.98252415, ..., 2.28998549, 1.91908301,\n", + " 2.21963528],\n", + " [1.80840125, 1.22179204, 1.81438851, ..., 1.74506282, 0.04813821,\n", + " 0.75120669]])" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from scipy import stats\n", + "import numpy as np\n", + "z = np.abs(stats.zscore(df))\n", + "z" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "74" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "outliers = list(set(np.where(z > 3)[0]))\n", + "\n", + "len(outliers)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexmean radiusmean texturemean perimetermean areamean smoothnessmean compactnessmean concavitymean concave pointsmean symmetry...worst radiusworst textureworst perimeterworst areaworst smoothnessworst compactnessworst concavityworst concave pointsworst symmetryworst fractal dimension
0120.5717.77132.901326.00.084740.078640.086900.070170.1812...24.9923.41158.81956.00.12380.18660.24160.18600.27500.08902
1219.6921.25130.001203.00.109600.159900.197400.127900.2069...23.5725.53152.51709.00.14440.42450.45040.24300.36130.08758
2420.2914.34135.101297.00.100300.132800.198000.104300.1809...22.5416.67152.21575.00.13740.20500.40000.16250.23640.07678
3512.4515.7082.57477.10.127800.170000.157800.080890.2087...15.4723.75103.4741.60.17910.52490.53550.17410.39850.12440
4618.2519.98119.601040.00.094630.109000.112700.074000.1794...22.8827.66153.21606.00.14420.25760.37840.19320.30630.08368
..................................................................
49056014.0527.1591.38600.40.099290.112600.044620.043040.1537...15.3033.17100.2706.70.12410.22640.13260.10480.22500.08321
49156320.9225.09143.001347.00.109900.223600.317400.147400.2149...24.2929.41179.11819.00.14070.41860.65990.25420.29290.09873
49256421.5622.39142.001479.00.111000.115900.243900.138900.1726...25.4526.40166.12027.00.14100.21130.41070.22160.20600.07115
49356520.1328.25131.201261.00.097800.103400.144000.097910.1752...23.6938.25155.01731.00.11660.19220.32150.16280.25720.06637
49456616.6028.08108.30858.10.084550.102300.092510.053020.1590...18.9834.12126.71124.00.11390.30940.34030.14180.22180.07820
\n", + "

495 rows × 31 columns

\n", + "
" + ], + "text/plain": [ + " index mean radius mean texture mean perimeter mean area \\\n", + "0 1 20.57 17.77 132.90 1326.0 \n", + "1 2 19.69 21.25 130.00 1203.0 \n", + "2 4 20.29 14.34 135.10 1297.0 \n", + "3 5 12.45 15.70 82.57 477.1 \n", + "4 6 18.25 19.98 119.60 1040.0 \n", + ".. ... ... ... ... ... \n", + "490 560 14.05 27.15 91.38 600.4 \n", + "491 563 20.92 25.09 143.00 1347.0 \n", + "492 564 21.56 22.39 142.00 1479.0 \n", + "493 565 20.13 28.25 131.20 1261.0 \n", + "494 566 16.60 28.08 108.30 858.1 \n", + "\n", + " mean smoothness mean compactness mean concavity mean concave points \\\n", + "0 0.08474 0.07864 0.08690 0.07017 \n", + "1 0.10960 0.15990 0.19740 0.12790 \n", + "2 0.10030 0.13280 0.19800 0.10430 \n", + "3 0.12780 0.17000 0.15780 0.08089 \n", + "4 0.09463 0.10900 0.11270 0.07400 \n", + ".. ... ... ... ... \n", + "490 0.09929 0.11260 0.04462 0.04304 \n", + "491 0.10990 0.22360 0.31740 0.14740 \n", + "492 0.11100 0.11590 0.24390 0.13890 \n", + "493 0.09780 0.10340 0.14400 0.09791 \n", + "494 0.08455 0.10230 0.09251 0.05302 \n", + "\n", + " mean symmetry ... worst radius worst texture worst perimeter \\\n", + "0 0.1812 ... 24.99 23.41 158.8 \n", + "1 0.2069 ... 23.57 25.53 152.5 \n", + "2 0.1809 ... 22.54 16.67 152.2 \n", + "3 0.2087 ... 15.47 23.75 103.4 \n", + "4 0.1794 ... 22.88 27.66 153.2 \n", + ".. ... ... ... ... ... \n", + "490 0.1537 ... 15.30 33.17 100.2 \n", + "491 0.2149 ... 24.29 29.41 179.1 \n", + "492 0.1726 ... 25.45 26.40 166.1 \n", + "493 0.1752 ... 23.69 38.25 155.0 \n", + "494 0.1590 ... 18.98 34.12 126.7 \n", + "\n", + " worst area worst smoothness worst compactness worst concavity \\\n", + "0 1956.0 0.1238 0.1866 0.2416 \n", + "1 1709.0 0.1444 0.4245 0.4504 \n", + "2 1575.0 0.1374 0.2050 0.4000 \n", + "3 741.6 0.1791 0.5249 0.5355 \n", + "4 1606.0 0.1442 0.2576 0.3784 \n", + ".. ... ... ... ... 
\n", + "490 706.7 0.1241 0.2264 0.1326 \n", + "491 1819.0 0.1407 0.4186 0.6599 \n", + "492 2027.0 0.1410 0.2113 0.4107 \n", + "493 1731.0 0.1166 0.1922 0.3215 \n", + "494 1124.0 0.1139 0.3094 0.3403 \n", + "\n", + " worst concave points worst symmetry worst fractal dimension \n", + "0 0.1860 0.2750 0.08902 \n", + "1 0.2430 0.3613 0.08758 \n", + "2 0.1625 0.2364 0.07678 \n", + "3 0.1741 0.3985 0.12440 \n", + "4 0.1932 0.3063 0.08368 \n", + ".. ... ... ... \n", + "490 0.1048 0.2250 0.08321 \n", + "491 0.2542 0.2929 0.09873 \n", + "492 0.2216 0.2060 0.07115 \n", + "493 0.1628 0.2572 0.06637 \n", + "494 0.1418 0.2218 0.07820 \n", + "\n", + "[495 rows x 31 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "495" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_df = df.drop(outliers,axis = 0).reset_index(drop = False)\n", + "display(new_df)\n", + "\n", + "y_new = y[list(new_df[\"index\"])]\n", + "len(y_new)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 2.20446536, -0.28261218, 2.06835079, ..., 1.36512913,\n", + " -0.17708262, 0.52461532],\n", + " [ 1.91497191, 0.5956785 , 1.92875368, ..., 2.34544773,\n", + " 1.56196975, 0.42458997],\n", + " [ 2.11235381, -1.14828374, 2.17425205, ..., 0.9609627 ,\n", + " -0.95492065, -0.32560022],\n", + " ...,\n", + " [ 2.53014549, 0.88339441, 2.50639691, ..., 1.97739829,\n", + " -1.56751847, -0.71667159],\n", + " [ 2.05971864, 2.36235515, 1.986518 , ..., 0.96612227,\n", + " -0.53577477, -1.04870021],\n", + " [ 0.89845514, 2.31945015, 0.88418216, ..., 0.60495226,\n", + " -1.24912881, -0.22696411]])" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_new = new_df.drop('index', axis = 1)\n", + "\n", + "from sklearn.preprocessing import StandardScaler, 
MinMaxScaler\n", + "X_scaled = StandardScaler().fit_transform(X_new)\n", + "X_scaled" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", + " intercept_scaling=1, l1_ratio=None, max_iter=100,\n", + " multi_class='auto', n_jobs=-1, penalty='l2', random_state=42,\n", + " solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cv[\"estimator\"][0]" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mean training accuracy: 0.9739630184907546\n", + "Test accuracy: 0.9798657718120806\n" + ] + } + ], + "source": [ + "from sklearn.model_selection import train_test_split, cross_validate\n", + "\n", + "#Scaling and outlier removed\n", + "X_train, X_test, y_train, y_test = train_test_split(X_scaled,y_new, test_size=0.3, random_state=42)\n", + "\n", + "models = LogisticRegression(random_state=42, n_jobs=-1)\n", + "cv = cross_validate(models,X_train,y_train,cv = 3, n_jobs=-1, return_estimator=True)\n", + "\n", + "print(\"Mean training accuracy: {}\".format(np.mean(cv['test_score'])))\n", + "print(\"Test accuracy: {}\".format(cv[\"estimator\"][0].score(X_test,y_test)))" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.metrics import confusion_matrix\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "pred = cv[\"estimator\"][0].predict(X_test)\n", + "\n", + "cm = confusion_matrix(y_test, pred)\n", + "plt.figure(figsize=(12, 8))\n", + "ax =sns.heatmap(cm, square=True, annot=True, cbar=False)\n", + "ax.xaxis.set_ticklabels([\"False\",\"True\"], fontsize = 12)\n", + "ax.yaxis.set_ticklabels([\"False\",\"True\"], fontsize = 12, rotation=0)\n", + "ax.set_xlabel('Predicted Labels',fontsize = 15)\n", + "ax.set_ylabel('True Labels',fontsize = 15)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mean training accuracy: 0.944691273638642\n", + "Test accuracy: 0.9707602339181286\n" + ] + } + ], + "source": [ + "# Without any preprocess \n", + "X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)\n", + "\n", + "models = LogisticRegression(random_state=42,n_jobs=-1)\n", + "cv = cross_validate(models,X_train,y_train,cv = 3, n_jobs=-1, return_estimator=True)\n", + "\n", + "print(\"Mean training accuracy: {}\".format(np.mean(cv['test_score'])))\n", + "print(\"Test accuracy: {}\".format(cv[\"estimator\"][0].score(X_test,y_test)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluation Metrics" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](img/acc.png)\n", + "\n", + "F1-score = $F_1 = 2 * \\frac{precision\\,*\\,recall}{precision\\, +\\, recall}$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Find all evaluation metrics in sklearn library by clicking [here.](https://scikit-learn.org/stable/modules/model_evaluation.html)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + 
"metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.98 0.94 0.96 63\n", + " 1 0.96 0.99 0.98 108\n", + "\n", + " accuracy 0.97 171\n", + " macro avg 0.97 0.96 0.97 171\n", + "weighted avg 0.97 0.97 0.97 171\n", + "\n" + ] + } + ], + "source": [ + "from sklearn.metrics import classification_report\n", + "print(classification_report(y_test, cv[\"estimator\"][0].predict(X_test)))" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.9707602339181286\n", + "Precision: 0.963963963963964\n", + "Recall: 0.9907407407407407\n", + "F1 Score: 0.9771689497716894\n" + ] + } + ], + "source": [ + "from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score\n", + "\n", + "final_model = cv[\"estimator\"][0]\n", + "\n", + "y_pred = final_model.predict(X_test)\n", + "\n", + "print(\"Accuracy:\",accuracy_score(y_test,y_pred))\n", + "print(\"Precision:\",precision_score(y_test,y_pred))\n", + "print(\"Recall:\",recall_score(y_test,y_pred))\n", + "print(\"F1 Score:\",f1_score(y_test,y_pred))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**When to use Accuracy Metric** \n", + "When there are roughly equal number of samples belonging to each class. \n", + "
\n", + "**When to use Precision** \n", + "Precision is a good measure to use when the cost of a False Positive is high, for instance in email spam detection. In email spam detection, a false positive means that an email that is non-spam (actual negative) has been identified as spam (predicted spam). The email user might lose important emails if the precision is not high for the spam detection model. \n", + "
\n", + "**When to use Recall** \n", + "Recall is a good measure to use when the cost of a False Negative is high, for instance in fraud detection or sick patient detection. If a fraudulent transaction (Actual Positive) is predicted as non-fraudulent (Predicted Negative), the consequence can be very bad for the bank. \n", + "
\n", + "**When to use F1 Score** \n", + "F1 Score might be a better measure to use if we need to seek a balance between Precision and Recall AND there is an uneven class distribution (large number of Actual Negatives)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### ROC (Receiver Operating Characteristic) & AUC (Area Under the Curve)\n", + "The AUC - ROC curve is a performance measurement for classification problems at various threshold settings. ROC is a probability curve and AUC represents the degree or measure of separability. It tells how capable the model is of distinguishing between classes. The higher the AUC, the better the model is at predicting 0s as 0s and 1s as 1s. \n", + "\n", + "**True Positive Rate (TPR)** = $TP\\,/\\,(TP+FN)$ \n", + "**False Positive Rate (FPR)** = $FP\\,/\\,(FP+TN)$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](img/roc.png)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.metrics import roc_curve, auc\n", + "\n", + "y_pred_prop = final_model.predict_proba(X_test)[:,1]\n", + "\n", + "fpr_log, tpr_log, _ = roc_curve(y_test, y_pred_prop)\n", + "roc_auc_log = auc(fpr_log, tpr_log)\n", + "\n", + "sns.set_style(\"white\")\n", + "plt.figure(figsize=(10, 7))\n", + "plt.plot(fpr_log, tpr_log, color='darkorange',\n", + " label='ROC curve (area = %0.2f)' % roc_auc_log)\n", + "plt.plot([0, 1], [0, 1], color='navy', linestyle='--')\n", + "plt.xlim([0.0, 1.0])\n", + "plt.ylim([0.0, 1.05])\n", + "plt.xlabel('False Positive Rate',fontsize=18,labelpad =10)\n", + "plt.ylabel('True Positive Rate',fontsize=18)\n", + "plt.title('Receiver Operating Characteristic',fontsize=22).set_position([.5, 1.02])\n", + "plt.legend(loc=\"lower right\",fontsize=13)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Resources\n", + "https://www.analyticsvidhya.com/blog/2020/10/how-to-choose-evaluation-metrics-for-classification-model/ \n", + "https://www.kaggle.com/vipulgandhi/how-to-choose-right-metric-for-evaluating-ml-model\n" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [ + "MFiCsRxRledE" + ], + "include_colab_link": true, + "name": "logisticRegression.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Logistic Regression/logistic_regression.ipynb b/Logistic Regression/logistic_regression.ipynb index 33fd463..bffbdee 100644 --- a/Logistic Regression/logistic_regression.ipynb +++ b/Logistic 
Regression/logistic_regression.ipynb @@ -674,7 +674,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -886,7 +886,7 @@ "[5 rows x 30 columns]" ] }, - "execution_count": 20, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } @@ -2103,7 +2103,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.9" } }, "nbformat": 4, diff --git a/Probabilty/.ipynb_checkpoints/probability-checkpoint.ipynb b/Probabilty/.ipynb_checkpoints/probability-checkpoint.ipynb new file mode 100644 index 0000000..50011b3 --- /dev/null +++ b/Probabilty/.ipynb_checkpoints/probability-checkpoint.ipynb @@ -0,0 +1,464 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + " \"Open\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ii6SZ-Urv_QC" + }, + "source": [ + "![](img/logo.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XT3n8LW2v_QD" + }, + "source": [ + "
All rights reserved ©️ Global AI Hub 2020
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AtuZFrxtv_QE" + }, + "source": [ + "# Probability Review\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gxybYIfjv_QE" + }, + "source": [ + "Probability is the branch of mathematics concerning numerical descriptions of how likely an event is to occur, or how likely it is that a proposition is true. The probability of an event is a number between 0 and 1, where, roughly speaking, 0 indicates impossibility of the event and 1 indicates certainty. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BXHonzRLv_QF" + }, + "source": [ + "## Random Variables" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aWcCllckv_QF" + }, + "source": [ + "A random variable is a variable that can take on different values randomly. We typically denote the random variable itself with a lowercase letter in plain typeface, and the values it can take on with lowercase script letters. For example, x1 and x2 are both possible values that the random variable x can take on. For vector-valued variables, we would write the random variable as x and one of its values as x. On its own, a random variable is just a description of the states that are possible; it must be coupled with a probability distribution that specifies how likely each of these states are. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kGyrELttv_QF" + }, + "source": [ + "Random variables may be discrete or continuous. A discrete random variable is one that has a finite or countably infinite number of states. Note that these states are not necessarily the integers; they can also just be named states that are not considered to have any numerical value. A continuous random variable is associated with a real value." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "94UfigFgv_QG" + }, + "source": [ + "## Probability Distributions" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Pvg9bB81v_QG" + }, + "source": [ + "A probability distribution is a description of how likely a random variable or set of random variables is to take on each of its possible states. The way we describe probability distributions depends on whether the variables are discrete or continuous." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Go2MuuxPv_QH" + }, + "source": [ + "![Prob](img/1.png) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Prob](img/discont.png)\n", + "
Discrete vs Continuous
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iIgRi-sFv_QI" + }, + "source": [ + "### Discrete Variables and Probability Mass Functions" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sG9RAchlv_QI" + }, + "source": [ + "A probability distribution over discrete variables can be explained using a probability mass function (PMF). We typically express probability mass functions with a capital P. Usually, we associate each random variable with a different probability mass function, and the reader must determine which PMF to use based on the identity of the random variable instead of the name of the function; P (x) is generally not the same as P (y)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Prob](img/dis2.png)\n", + "
Probability Mass Function
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jNTwrtSNv_QJ" + }, + "source": [ + "### Continuous Variables and Probability Density Functions" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ms0drg0hv_QJ" + }, + "source": [ + "When working with continuous random variables, we describe probability distributions using a probability density function (PDF) rather than a probability mass function. To be a probability density function, a function p must satisfy the following properties:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tdrk1T9_v_QK" + }, + "source": [ + "1. The domain of p must be the set of all possible states of x\n", + "2. ∀x ∈ x, p(x) ≥ 0. Note that we do not require $p(x)\\leq1$\n", + "3. $\\int p(x)\\,dx = 1$, where the integral is taken over the entire domain of x" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Prob](img/cont1.jpg)\n", + "
Probability Density Function
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Prob](img/overview-prob-distr.png)\n", + "
Types of Probability Distributions
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TBwKOZJjv_QK" + }, + "source": [ + "## Marginal & Conditional Prob." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hqkMsDRxv_QK" + }, + "source": [ + "### Marginal Probability" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The probability of one event in the presence of all (or a subset of) outcomes of the other random variable is called the marginal probability or the marginal distribution. The marginal probability of one random variable in the presence of additional random variables is referred to as the marginal probability distribution." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "k_k47V_Lv_QL" + }, + "source": [ + "

$P(X=A) = \\sum_{i} P(X=A, Y=y_i)$ for all Y

" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](img/mar_prob.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nikQKlFov_QM" + }, + "source": [ + "### Conditional Probability" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nDnE6ng2v_QM" + }, + "source": [ + "The probability of one event given the occurrence of another event is called the *conditional probability*. The conditional probability of one random variable given one or more other random variables is referred to as the conditional probability distribution." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2qGGdCbOv_QN" + }, + "source": [ + "

P(A given B) = P(A | B)

" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](img/cond.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WnMS5VHZv_QN" + }, + "source": [ + "## Independence" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PspPmXhIv_QN" + }, + "source": [ + "If one variable is not dependent on a second variable, this is called *independence* or *statistical independence*. For example, we may be interested in the joint probability of independent events A and B, which is the same as the probability of A and the probability of B.\n", + "\n", + "Probabilities are combined using multiplication, therefore the joint probability of independent events is calculated as the probability of event A multiplied by the probability of event B" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "J_bfibLxv_QO" + }, + "source": [ + "

P(A and B) = P(A) * P(B)

" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "i-Ey-Okqv_QO" + }, + "source": [ + "## Expectation, Variance, Covariance" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "v3-ybw0pv_QP" + }, + "source": [ + "### Expectation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QCdKv7l7v_QP" + }, + "source": [ + "The expectation, or expected value, of some function f(x) with respect to a probability distribution P(x) is the average, or mean value, that f takes on when x is drawn from P. For discrete variables this can be computed with a summation:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NkTPlaYlv_QP" + }, + "source": [ + "

$\\mathrm{E[f(x)]}=\\sum_{x}P(x)f(x)$

" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Q1uvuv-Uv_QQ" + }, + "source": [ + "while for continuous variables, it is computed with an integral:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5m7Y6DYNv_QQ" + }, + "source": [ + "

$\\mathrm{E[f(x)]}=\\int_a^bp(x)f(x)dx$

\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NIP-7_H_v_QR" + }, + "source": [ + "### Variance" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Qtk6rkZgv_QR" + }, + "source": [ + "The variance gives a measure of how much the values of a function of a random variable x vary as we sample different values of x from its probability distribution. It measures how far a set of numbers is spread out from their average value:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JLSSJ6OGv_QR" + }, + "source": [ + "

$\\mathrm{Var[x]}= \\mathrm{E[(x-\\mathrm{E[x]})^2]}$

" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FyVcGE3sv_QS" + }, + "source": [ + "When the variance is low, the values of f(x) cluster near their expected value. The square root of the variance is known as the *standard deviation*." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vsOWKwInv_QS" + }, + "source": [ + "### Covariance" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hi96QNs-v_QT" + }, + "source": [ + "In probability, covariance is a measure of the joint variability of two random variables. It describes how the two variables change together.\n", + "\n", + "It is denoted as the function cov(X, Y), where X and Y are the two random variables being considered." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OABR2Q74v_QT" + }, + "source": [ + "

$\\mathrm{Cov[x,y]}=\\mathrm{E[(x-\\mathrm{E[x]})(y-\\mathrm{E[y]})]}$

" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "O6qt5FFMv_QU" + }, + "source": [ + "## Bayes' Rule " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "47SaOhn_v_QU" + }, + "source": [ + "In probability theory and statistics, Bayes's theorem (alternatively Bayes's law or Bayes's rule) describes the probability of an event, based on prior knowledge of conditions that might be related to the event. For example, if the risk of developing health problems is known to increase with age, Bayes's theorem allows the risk to an individual of a known age to be assessed more accurately than simply assuming that the individual is typical of the population as a whole." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "saCaM_Vfv_QU" + }, + "source": [ + "We often find ourselves in a situation where we know P(y | x) and need to know P(x | y). Fortunately, if we also know P(x), we can compute the desired quantity using Bayes’ rule:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](img/bayes.jpeg)" + ] + } + ], + "metadata": { + "colab": { + "name": "probability.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Probabilty/probability.ipynb b/Probabilty/probability.ipynb index 50011b3..28fb0e3 100644 --- a/Probabilty/probability.ipynb +++ b/Probabilty/probability.ipynb @@ -456,7 +456,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.9" } }, "nbformat": 4, diff --git a/Project/.ipynb_checkpoints/09-11-2020 ML Course Nigeria Project 
'name'-checkpoint.ipynb b/Project/.ipynb_checkpoints/09-11-2020 ML Course Nigeria Project 'name'-checkpoint.ipynb new file mode 100644 index 0000000..9fbf733 --- /dev/null +++ b/Project/.ipynb_checkpoints/09-11-2020 ML Course Nigeria Project 'name'-checkpoint.ipynb @@ -0,0 +1,1231 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Project\n", + "\n", + "In this project, our aim is to build a model for predicting churn. Churn is the percentage of customers that stopped using your company's product or service during a certain time frame. Thus, in the given dataset, our label will be the `Churn` column.\n", + "\n", + "## Steps\n", + "- Read the `churn.csv` file and describe it.\n", + "- Make at least 4 different analyses in the Exploratory Data Analysis section.\n", + "- Pre-process the dataset to get ready for ML application. (Check for missing data and handle it; do we need scaling, feature extraction, etc.?)\n", + "- Define appropriate evaluation metric for our case (classification).\n", + "- Train and evaluate Logistic Regression, Decision Trees and one other appropriate algorithm which you can choose from scikit-learn library.\n", + "- Is there any overfitting and underfitting? Interpret your results and try to overcome if there is any problem in a new section.\n", + "- Create a confusion matrix for each algorithm and display Accuracy, Recall, Precision and F1-Score values.\n", + "- Analyse and compare results of 3 algorithms.\n", + "- Select best performing model based on evaluation metric you chose on test dataset.\n", + "\n", + "\n", + "Good luck :)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

Your Name

" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import seaborn as sns\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ChurnAccountWeeksContractRenewalDataPlanDataUsageCustServCallsDayMinsDayCallsMonthlyChargeOverageFeeRoamMins
00128112.71265.111089.09.8710.0
10107113.71161.612382.09.7813.7
20137100.00243.411452.06.0612.2
3084000.02299.47157.03.106.6
4075000.03166.711341.07.4210.1
\n", + "
" + ], + "text/plain": [ + " Churn AccountWeeks ContractRenewal DataPlan DataUsage CustServCalls \\\n", + "0 0 128 1 1 2.7 1 \n", + "1 0 107 1 1 3.7 1 \n", + "2 0 137 1 0 0.0 0 \n", + "3 0 84 0 0 0.0 2 \n", + "4 0 75 0 0 0.0 3 \n", + "\n", + " DayMins DayCalls MonthlyCharge OverageFee RoamMins \n", + "0 265.1 110 89.0 9.87 10.0 \n", + "1 161.6 123 82.0 9.78 13.7 \n", + "2 243.4 114 52.0 6.06 12.2 \n", + "3 299.4 71 57.0 3.10 6.6 \n", + "4 166.7 113 41.0 7.42 10.1 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Read csv\n", + "data = pd.read_csv(\"churn.csv\")\n", + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Describe our data for each feature and use .info() for get information about our dataset\n", + "# Analys missing values" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(3333, 11)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 3333 entries, 0 to 3332\n", + "Data columns (total 11 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Churn 3333 non-null int64 \n", + " 1 AccountWeeks 3333 non-null int64 \n", + " 2 ContractRenewal 3333 non-null int64 \n", + " 3 DataPlan 3333 non-null int64 \n", + " 4 DataUsage 3333 non-null float64\n", + " 5 CustServCalls 3333 non-null int64 \n", + " 6 DayMins 3333 non-null float64\n", + " 7 DayCalls 3333 non-null int64 \n", + " 8 MonthlyCharge 3333 non-null float64\n", + " 9 OverageFee 3333 non-null float64\n", + " 10 RoamMins 3333 non-null float64\n", + "dtypes: float64(5), int64(6)\n", + "memory usage: 
286.6 KB\n" + ] + } + ], + "source": [ + "data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Churn 0\n", + "AccountWeeks 0\n", + "ContractRenewal 0\n", + "DataPlan 0\n", + "DataUsage 0\n", + "CustServCalls 0\n", + "DayMins 0\n", + "DayCalls 0\n", + "MonthlyCharge 0\n", + "OverageFee 0\n", + "RoamMins 0\n", + "dtype: int64" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Churn 0\n", + "AccountWeeks 0\n", + "ContractRenewal 0\n", + "DataPlan 0\n", + "DataUsage 0\n", + "CustServCalls 0\n", + "DayMins 0\n", + "DayCalls 0\n", + "MonthlyCharge 0\n", + "OverageFee 0\n", + "RoamMins 0\n", + "dtype: int64" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.isin(['?']).sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From the above, the data has 3333 rows and 11 columns, and there are no missing values." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ChurnAccountWeeksContractRenewalDataPlanDataUsageCustServCallsDayMinsDayCallsMonthlyChargeOverageFeeRoamMins
count3333.0000003333.0000003333.0000003333.0000003333.0000003333.0000003333.0000003333.0000003333.0000003333.0000003333.000000
mean0.144914101.0648060.9030900.2766280.8164751.562856179.775098100.43564456.30516110.05148810.237294
std0.35206739.8221060.2958790.4473981.2726681.31549154.46738920.06908416.4260322.5357122.791840
min0.0000001.0000000.0000000.0000000.0000000.0000000.0000000.00000014.0000000.0000000.000000
25%0.00000074.0000001.0000000.0000000.0000001.000000143.70000087.00000045.0000008.3300008.500000
50%0.000000101.0000001.0000000.0000000.0000001.000000179.400000101.00000053.50000010.07000010.300000
75%0.000000127.0000001.0000001.0000001.7800002.000000216.400000114.00000066.20000011.77000012.100000
max1.000000243.0000001.0000001.0000005.4000009.000000350.800000165.000000111.30000018.19000020.000000
\n", + "
" + ], + "text/plain": [ + " Churn AccountWeeks ContractRenewal DataPlan DataUsage \\\n", + "count 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 \n", + "mean 0.144914 101.064806 0.903090 0.276628 0.816475 \n", + "std 0.352067 39.822106 0.295879 0.447398 1.272668 \n", + "min 0.000000 1.000000 0.000000 0.000000 0.000000 \n", + "25% 0.000000 74.000000 1.000000 0.000000 0.000000 \n", + "50% 0.000000 101.000000 1.000000 0.000000 0.000000 \n", + "75% 0.000000 127.000000 1.000000 1.000000 1.780000 \n", + "max 1.000000 243.000000 1.000000 1.000000 5.400000 \n", + "\n", + " CustServCalls DayMins DayCalls MonthlyCharge OverageFee \\\n", + "count 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 \n", + "mean 1.562856 179.775098 100.435644 56.305161 10.051488 \n", + "std 1.315491 54.467389 20.069084 16.426032 2.535712 \n", + "min 0.000000 0.000000 0.000000 14.000000 0.000000 \n", + "25% 1.000000 143.700000 87.000000 45.000000 8.330000 \n", + "50% 1.000000 179.400000 101.000000 53.500000 10.070000 \n", + "75% 2.000000 216.400000 114.000000 66.200000 11.770000 \n", + "max 9.000000 350.800000 165.000000 111.300000 18.190000 \n", + "\n", + " RoamMins \n", + "count 3333.000000 \n", + "mean 10.237294 \n", + "std 2.791840 \n", + "min 0.000000 \n", + "25% 8.500000 \n", + "50% 10.300000 \n", + "75% 12.100000 \n", + "max 20.000000 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Exploratory Data Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYsAAAEGCAYAAACUzrmNAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAP60lEQVR4nO3dcayd9V3H8fdnMBm6ESEUVtrOsqVTCyqEayXyh0yi1CWmbHNLMRuNErsQZkaymMD+ENQ0WSLbHHPDdBmDmm2k2YZUBSfD6VxkY7dLs9JiXR0Id630spmARtF2X/84T8NZe3p/p7c959z2vl/JyXnO93l+z/lecssnz/P8nuemqpAkaS6vmHQDkqSFz7CQJDUZFpKkJsNCktRkWEiSms6cdAOjcv7559fKlSsn3YYknVK2b9/+fFUtObJ+2obFypUrmZ6ennQbknRKSfJvg+qehpIkNRkWkqQmw0KS1GRYSJKaDAtJUpNhIUlqMiwkSU2GhSSpybCQJDWdtndwn6grfm/LpFvQArT9j2+YdAvSRHhkIUlqMiwkSU2GhSSpybCQJDUZFpKkJsNCktRkWEiSmgwLSVKTYSFJajIsJElNhoUkqcmwkCQ1GRaSpCbDQpLUZFhIkpoMC0lSk2EhSWoyLCRJTYaFJKnJsJAkNRkWkqSmkYVFkhVJvpzkySS7kry3q9+R5LtJdnSvN/eNuS3J3iR7klzbV78iyc5u3V1JMqq+JUlHO3OE+z4IvK+qvpnkNcD2JI906z5cVXf2b5xkNbAeuAS4CPhSkjdW1SHgbmAj8DXgIWAt8PAIe5ck9RnZkUVV7a+qb3bLLwJPAsvmGLIOuL+qXqqqp4C9wJokS4FzquqxqipgC3DdqPqWJB1tLNcskqwELge+3pXek+RbSe5Jcm5XWwY82zdspqst65aPrA/6no1JppNMz87OnswfQZIWtZGHRZJXA58HbqmqF+idUnoDcBmwH/jg4U0HDK856kcXqzZX1VRVTS1ZsuREW5ckdUYaFkleSS8oPl1VXwCoqueq6lBV/QD4BLCm23wGWNE3fDmwr6svH1CXJI3JKGdDBfgk8GRVfaivvrRvs7cAT3TL24D1Sc5KcjGwCni8qvYDLya5stvnDcCDo+pbknS0Uc6Gugp4F7AzyY6u9n7g+iSX0TuV9DTwboCq2pVkK7Cb3kyqm7uZUAA3AfcCZ9ObBeVMKEkao5GFRVV9lcHXGx6aY8wmYNOA+jRw6cnrTpJ0PLyDW5LUZFhIkpoMC0lSk2EhSWoyLCRJTYaFJKnJsJAkNRkWkqQmw0KS1GRYSJKaDAtJUpNhIUlqMiwkSU2GhSSpybCQJDUZFpKkJsNCktRkWEiSmgwLSVKTYSFJajIsJElNhoUkqcmwkCQ1GRaSpCbDQpLUZFhIkpoMC0lS08jCIsmKJF9O8mSSXUne29XPS/JIkm937+f2jbktyd4ke5Jc21e/IsnObt1dSTKqviVJRxvlkcVB4H1V9dPAlcDNSVYDtwKPVtUq4NHuM9269cAlwFrg40nO6PZ1N7ARWNW91o6wb0nSEUYWFlW1v6q+2S2/CDwJLAPWAfd1m90HXNctrwPur6qXquopYC+wJslS4JyqeqyqCtjSN0aSNAZjuWaRZCVwOfB14MKq2g+9QAEu6DZbBjzbN2ymqy3rlo+sD/qejUmmk0zPzs6e1J9BkhazkYdFklcDnwduqaoX5tp0QK3mqB9drNpcVVNVNbVkyZLjb1aSNNBIwyLJK+kFxaer6gtd+bnu1BLd+4GuPgOs6Bu+HNjX1ZcPqEuSxmSUs6ECfBJ4sqo+1LdqG7ChW94APNhXX5/krCQX07uQ/Xh3qurFJFd2+7yhb4wkaQzOHOG+rwLeBexMsqOrvR/4ALA1yY3AM8DbAapqV5KtwG56M6lurqpD3bibgHuBs4GHu5ckaUxGFhZV9VUGX28AuOYYYzYBmwbUp4FLT153kqTj4R3ckqQmw0KS1GRYSJK
aDAtJUpNhIUlqMiwkSU2GhSSpybCQJDUZFpKkJsNCktRkWEiSmgwLSVKTYSFJajIsJElNhoUkqcmwkCQ1GRaSpCbDQpLUZFhIkpoMC0lS01BhkeTRYWqSpNPTmXOtTPIq4EeB85OcC6RbdQ5w0Yh7kyQtEHOGBfBu4BZ6wbCdl8PiBeBjo2tLkrSQzBkWVfUR4CNJfreqPjqmniRJC0zryAKAqvpokl8EVvaPqaotI+pLkrSADBUWSf4ceAOwAzjUlQswLCRpERgqLIApYHVV1SibkSQtTMPeZ/EE8NpRNiJJWriGDYvzgd1Jvphk2+HXXAOS3JPkQJIn+mp3JPlukh3d6819625LsjfJniTX9tWvSLKzW3dXkhz5XZKk0Rr2NNQd89j3vcCfcvR1jQ9X1Z39hSSrgfXAJfSm6X4pyRur6hBwN7AR+BrwELAWeHge/UiS5mnY2VD/cLw7rqqvJFk55ObrgPur6iXgqSR7gTVJngbOqarHAJJsAa7DsJCksRr2cR8vJnmhe/1PkkNJXpjnd74nybe601TndrVlwLN928x0tWXd8pH1Y/W5Mcl0kunZ2dl5tidJOtJQYVFVr6mqc7rXq4C30TvFdLzupjcF9zJgP/DBrj7oOkTNUT9Wn5uraqqqppYsWTKP9iRJg8zrqbNV9RfAL89j3HNVdaiqfgB8AljTrZoBVvRtuhzY19WXD6hLksZo2Jvy3tr38RX07rs47nsukiytqv3dx7fQm5ILsA34TJIP0bvAvQp4vKoOdafArgS+DtwA+NgRSRqzYWdD/Xrf8kHgaXoXpY8pyWeBq+k9sXYGuB24Osll9ILmaXoPKqSqdiXZCuzu9n9zNxMK4CZ6M6vOpndh24vbkjRmw86G+q3j3XFVXT+g/Mk5tt8EbBpQnwYuPd7vlySdPMPOhlqe5IHuJrvnknw+yfL2SEnS6WDYC9yfondd4SJ6U1f/sqtJkhaBYcNiSVV9qqoOdq97AeemStIiMWxYPJ/knUnO6F7vBL43ysYkSQvHsGHx28A7gH+ndzPdbwDHfdFbknRqGnbq7B8BG6rqPwCSnAfcSS9EJEmnuWGPLH72cFAAVNX3gctH05IkaaEZNixe0ffQv8NHFsMelUiSTnHD/g//g8A/Jfkcvbuv38GAG+gkSaenYe/g3pJkmt7DAwO8tap2j7QzSdKCMfSppC4cDAhJWoTm9YhySdLiYlhIkpoMC0lSk2EhSWoyLCRJTYaFJKnJsJAkNRkWkqQmw0KS1GRYSJKaDAtJUpNhIUlqMiwkSU2GhSSpybCQJDUZFpKkppGFRZJ7khxI8kRf7bwkjyT5dvfe/3e9b0uyN8meJNf21a9IsrNbd1eSjKpnSdJgozyyuBdYe0TtVuDRqloFPNp9JslqYD1wSTfm40nO6MbcDWwEVnWvI/cpSRqxkYVFVX0F+P4R5XXAfd3yfcB1ffX7q+qlqnoK2AusSbIUOKeqHquqArb0jZEkjcm4r1lcWFX7Abr3C7r6MuDZvu1mutqybvnI+kBJNiaZTjI9Ozt7UhuXpMVsoVzgHnQdouaoD1RVm6tqqqqmlixZctKak6TFbtxh8Vx3aonu/UBXnwFW9G23HNjX1ZcPqEuSxmjcYbEN2NAtbwAe7KuvT3JWkovpXch+vDtV9WKSK7tZUDf0jZEkjcmZo9pxks8CVwPnJ5kBbgc+AGxNciPwDPB2gKralWQrsBs4CNxcVYe6Xd1Eb2bV2cDD3UuSNEYjC4uquv4Yq645xvabgE0D6tPApSexNUnScVooF7glSQuYYSFJajIsJElNhoUkqcmwkCQ1GRaSpCbDQpLUZFhIkpoMC0lSk2EhSWoyLCRJTYaFJKnJsJAkNRkWkqQmw0KS1GRYSJKaDAtJUpNhIUlqMiwkSU2GhSSpybCQJDUZFpKkJsNCktRkWEiSmgwLSVKTYSFJajIsJElNEwmLJE8n2ZlkR5LprnZekkeSfLt7P7dv+9uS7E2yJ8m
1k+hZkhazSR5ZvKmqLquqqe7zrcCjVbUKeLT7TJLVwHrgEmAt8PEkZ0yiYUlarBbSaah1wH3d8n3AdX31+6vqpap6CtgLrBl/e5K0eE0qLAr42yTbk2zsahdW1X6A7v2Crr4MeLZv7ExXO0qSjUmmk0zPzs6OqHVJWnzOnND3XlVV+5JcADyS5J/n2DYDajVow6raDGwGmJqaGriNJOn4TSQsqmpf934gyQP0Tis9l2RpVe1PshQ40G0+A6zoG74c2DfWhqUF5pk//JlJt6AF6HW/v3Nk+x77aagkP5bkNYeXgV8FngC2ARu6zTYAD3bL24D1Sc5KcjGwCnh8vF1L0uI2iSOLC4EHkhz+/s9U1d8k+QawNcmNwDPA2wGqaleSrcBu4CBwc1UdmkDfkrRojT0squo7wM8NqH8PuOYYYzYBm0bcmiTpGBbS1FlJ0gJlWEiSmgwLSVKTYSFJajIsJElNhoUkqcmwkCQ1GRaSpCbDQpLUZFhIkpoMC0lSk2EhSWoyLCRJTYaFJKnJsJAkNRkWkqQmw0KS1GRYSJKaDAtJUpNhIUlqMiwkSU2GhSSpybCQJDUZFpKkJsNCktRkWEiSmgwLSVKTYSFJajplwiLJ2iR7kuxNcuuk+5GkxeSUCIskZwAfA34NWA1cn2T1ZLuSpMXjlAgLYA2wt6q+U1X/C9wPrJtwT5K0aJw56QaGtAx4tu/zDPALR26UZCOwsfv4n0n2jKG3xeB84PlJN7EQ5M4Nk25BR/P387DbczL28hODiqdKWAz6L1BHFao2A5tH387ikmS6qqYm3Yc0iL+f43GqnIaaAVb0fV4O7JtQL5K06JwqYfENYFWSi5P8CLAe2DbhniRp0TglTkNV1cEk7wG+CJwB3FNVuybc1mLiqT0tZP5+jkGqjjr1L0nSDzlVTkNJkibIsJAkNRkWmpOPWdFCleSeJAeSPDHpXhYDw0LH5GNWtMDdC6yddBOLhWGhufiYFS1YVfUV4PuT7mOxMCw0l0GPWVk2oV4kTZBhobkM9ZgVSac/w0Jz8TErkgDDQnPzMSuSAMNCc6iqg8Dhx6w8CWz1MStaKJJ8FngM+MkkM0lunHRPpzMf9yFJavLIQpLUZFhIkpoMC0lSk2EhSWoyLCRJTYaFdAKSvDbJ/Un+NcnuJA8l2Zjkrybdm3QyGRbSPCUJ8ADw91X1hqpaDbwfuPAE93tK/LljLS7+Ukrz9ybg/6rqzw4XqmpHkh8HrknyOeBSYDvwzqqqJE8DU1X1fJIp4M6qujrJHcBFwErg+ST/ArwOeH33/idVddf4fjTph3lkIc3f4SAY5HLgFnp/B+T1wFVD7O8KYF1V/Wb3+aeAa+k9Kv72JK88oW6lE2BYSKPxeFXNVNUPgB30jhhatlXVf/d9/uuqeqmqngcOcIKnt6QTYVhI87eL3tHAIC/1LR/i5VO+B3n5392rjhjzX0PuQxo7w0Kav78DzkryO4cLSX4e+KU5xjzNywHzttG1Jp1choU0T9V7CudbgF/pps7uAu5g7r/58QfAR5L8I72jBemU4FNnJUlNHllIkpoMC0lSk2EhSWoyLCRJTYaFJKnJsJAkNRkWkqSm/wf03QODNr6OSgAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Our label Distribution (countplot)\n", + "sns.countplot(x='Churn', data=data)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Example EDA\n" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# list() accepts at most one iterable argument, so the feature names are written as a list literal\n", + "name = ['AccountWeeks', 'ContractRenewal', 'DataPlan', 'DataUsage', 'CustServCalls', 'DayMins', 'DayCalls', 'MonthlyCharge', 'OverageFee', 'RoamMins']\n", + "data.hist(figsize=(20, 25))\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ChurnAccountWeeksContractRenewalDataPlanCustServCallsDayMinsDayCallsMonthlyChargeOverageFeeRoamMins
Churn1.0000000.016541-0.259852-0.1021480.2087500.2051510.0184590.0723130.0928120.068239
AccountWeeks0.0165411.000000-0.0247350.002918-0.0037960.0062160.0384700.012581-0.0067490.009514
ContractRenewal-0.259852-0.0247351.000000-0.0060060.024522-0.049396-0.003755-0.047291-0.019105-0.045871
DataPlan-0.1021480.002918-0.0060061.000000-0.017824-0.001684-0.0110860.7374900.021526-0.001318
CustServCalls0.208750-0.0037960.024522-0.0178241.000000-0.013423-0.018942-0.028017-0.012964-0.009640
DayMins0.2051510.006216-0.049396-0.001684-0.0134231.0000000.0067500.5679680.007038-0.010155
DayCalls0.0184590.038470-0.003755-0.011086-0.0189420.0067501.000000-0.007963-0.0214490.021565
MonthlyCharge0.0723130.012581-0.0472910.737490-0.0280170.567968-0.0079631.0000000.2817660.117433
OverageFee0.092812-0.006749-0.0191050.021526-0.0129640.007038-0.0214490.2817661.000000-0.011023
RoamMins0.0682390.009514-0.045871-0.001318-0.009640-0.0101550.0215650.117433-0.0110231.000000
\n", + "
" + ], + "text/plain": [ + " Churn AccountWeeks ContractRenewal DataPlan \\\n", + "Churn 1.000000 0.016541 -0.259852 -0.102148 \n", + "AccountWeeks 0.016541 1.000000 -0.024735 0.002918 \n", + "ContractRenewal -0.259852 -0.024735 1.000000 -0.006006 \n", + "DataPlan -0.102148 0.002918 -0.006006 1.000000 \n", + "CustServCalls 0.208750 -0.003796 0.024522 -0.017824 \n", + "DayMins 0.205151 0.006216 -0.049396 -0.001684 \n", + "DayCalls 0.018459 0.038470 -0.003755 -0.011086 \n", + "MonthlyCharge 0.072313 0.012581 -0.047291 0.737490 \n", + "OverageFee 0.092812 -0.006749 -0.019105 0.021526 \n", + "RoamMins 0.068239 0.009514 -0.045871 -0.001318 \n", + "\n", + " CustServCalls DayMins DayCalls MonthlyCharge OverageFee \\\n", + "Churn 0.208750 0.205151 0.018459 0.072313 0.092812 \n", + "AccountWeeks -0.003796 0.006216 0.038470 0.012581 -0.006749 \n", + "ContractRenewal 0.024522 -0.049396 -0.003755 -0.047291 -0.019105 \n", + "DataPlan -0.017824 -0.001684 -0.011086 0.737490 0.021526 \n", + "CustServCalls 1.000000 -0.013423 -0.018942 -0.028017 -0.012964 \n", + "DayMins -0.013423 1.000000 0.006750 0.567968 0.007038 \n", + "DayCalls -0.018942 0.006750 1.000000 -0.007963 -0.021449 \n", + "MonthlyCharge -0.028017 0.567968 -0.007963 1.000000 0.281766 \n", + "OverageFee -0.012964 0.007038 -0.021449 0.281766 1.000000 \n", + "RoamMins -0.009640 -0.010155 0.021565 0.117433 -0.011023 \n", + "\n", + " RoamMins \n", + "Churn 0.068239 \n", + "AccountWeeks 0.009514 \n", + "ContractRenewal -0.045871 \n", + "DataPlan -0.001318 \n", + "CustServCalls -0.009640 \n", + "DayMins -0.010155 \n", + "DayCalls 0.021565 \n", + "MonthlyCharge 0.117433 \n", + "OverageFee -0.011023 \n", + "RoamMins 1.000000 " + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.corr()" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 48, + "metadata": 
{}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "corr = data.corr()\n", + "sns.heatmap(corr)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "columns = np.full((corr.shape[0],), True, dtype=bool)\n", + "for i in range(corr.shape[0]):\n", + " for j in range(i+1, corr.shape[0]):\n", + " if corr.iloc[i,j] >= 0.9:\n", + " if columns[j]:\n", + " columns[j] = False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAD7CAYAAABkO19ZAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAVb0lEQVR4nO3dcWycd33H8fdnKQ2hAZpQcjJJtITJsCWzKNSK2Dqhy8JIaKc5+yOSUZmcKZL3R2AweRrO+AP2h6UwLWhIayd5lM1bWS2PUsWiFWvIOCGk0tCUtGkSshgSEjdeAoW2GKqAw3d/3BO4JLbvznfnB//u85Ki57nf/X7P8/vqHn/85PFzd4oIzMwsXb+R9wTMzKy1HPRmZolz0JuZJc5Bb2aWOAe9mVniHPRmZomrKegl/ZWkE5Kel/SwpNdKWi3pkKQz2XJVRf99kiYknZa0vXXTNzOzalTtPnpJa4GvA5si4lVJY8DjwCbghxGxX9IgsCoiPiZpE/AwsAV4C/AV4G0RcbWVhZiZ2exuqaPfCkk/B14HXAT2AcXs+RGgBHwM6AFGI+IKcFbSBOXQf3Kujd9xxx2xYcOGuif/k5/8hNtuu63ucSlw7a693bj2m2s/evToDyLizdXGVw36iHhB0j8A54FXgSci4glJhYiYyvpMSVqTDVkLfKNiE5NZ25w2bNjA008/XW0qNymVShSLxbrHpcC1F/OeRi5cezHvaeRirtolfa+W8VWDPrv23gNsBF4C/kvSB+cbMkvbTdeHJPUD/QCFQoFSqVTDdK83PT29oHEpcO2lvKeRC9deynsauWi09lou3bwXOBsR3weQ9EXg94FLkjqys/kO4HLWfxJYXzF+HeVLPdeJiGFgGKC7uzsW8pvav+GLeU8jF669mPc0cuHaiwseX8tdN+eBd0t6nSQB24BTwDjQl/XpAw5m6+NAr6TlkjYCncCRBc/QzMwaUss1+qckfQF4BpgBvkX5THwlMCZpD+VfBruy/ieyO3NOZv33+o4bM7P81HTXTUR8AvjEDc1XKJ/dz9Z/CBhqbGpmZtYMfmesmVniHPRmZolz0JuZJc5Bb2aWuFo/AsEsVxsGH/vl+kDXDLsrHrfauf33Ltq+zFrBZ/RmZolz
0JuZJc5Bb2aWOAe9mVniHPRmZolz0JuZJc5Bb2aWOAe9mVni/IYpq8uGRXyjkpk1h8/ozcwS56A3M0ucg97MLHEOejOzxFUNeklvl3Ss4t8rkj4qabWkQ5LOZMtVFWP2SZqQdFrS9taWYGZm86ka9BFxOiLujIg7gbuAnwKPAoPA4YjoBA5nj5G0CegFNgM7gAckLWvN9M3MrJp6L91sA74TEd8DeoCRrH0E2Jmt9wCjEXElIs4CE8CWJszVzMwWoN6g7wUeztYLETEFkC3XZO1rgQsVYyazNjMzy4EioraO0q3ARWBzRFyS9FJE3F7x/I8iYpWk+4EnI+KhrP1B4PGIeOSG7fUD/QCFQuGu0dHRuic/PT3NypUr6x6XgrxqP/7Cy4u+zxsVVsClVxdvf11r37h4O6vCx7xrr7R169ajEdFdbXw974x9P/BMRFzKHl+S1BERU5I6gMtZ+ySwvmLcOsq/IK4TEcPAMEB3d3cUi8U6plJWKpVYyLgU5FX7Yn6F31wGumY4cHzx3tR97r7iou2rGh/zxbynkYtGa6/n0s0H+NVlG4BxoC9b7wMOVrT3SlouaSPQCRxZ8AzNzKwhNZ0WSXod8EfAX1Q07wfGJO0BzgO7ACLihKQx4CQwA+yNiKtNnbWZmdWspqCPiJ8Cb7qh7UXKd+HM1n8IGGp4dmZm1jC/M9bMLHEOejOzxDnozcwS56A3M0ucg97MLHEOejOzxDnozcwS56A3M0ucg97MLHEOejOzxDnozcwS56A3M0ucg97MLHEOejOzxDnozcwS56A3M0ucg97MLHEOejOzxNUU9JJul/QFSd+WdErS70laLemQpDPZclVF/32SJiSdlrS9ddM3M7Nqaj2j/wzw5Yj4beAdwClgEDgcEZ3A4ewxkjYBvcBmYAfwgKRlzZ64mZnVpmrQS3oD8B7gQYCI+FlEvAT0ACNZtxFgZ7beA4xGxJWIOAtMAFuaO20zM6tVLWf0bwW+D/yrpG9J+qyk24BCREwBZMs1Wf+1wIWK8ZNZm5mZ5UARMX8HqRv4BnB3RDwl6TPAK8CHI+L2in4/iohVku4HnoyIh7L2B4HHI+KRG7bbD/QDFAqFu0ZHR+ue/PT0NCtXrqx7XAryqv34Cy8v+j5vVFgBl15dvP11rX3j4u2sCh/zrr3S1q1bj0ZEd7Xxt9Swj0lgMiKeyh5/gfL1+EuSOiJiSlIHcLmi//qK8euAizduNCKGgWGA7u7uKBaLNUzleqVSiYWMS0Fete8efGzR93mjga4ZDhyv5dBtjnP3FRdtX9X4mC/mPY1cNFp71Us3EfF/wAVJb8+atgEngXGgL2vrAw5m6+NAr6TlkjYCncCRBc/QzMwaUutp0YeBz0u6Ffgu8OeUf0mMSdoDnAd2AUTECUljlH8ZzAB7I+Jq02duZmY1qSnoI+IYMNt1oG1z9B8ChhY+LTMzaxa/M9bMLHEOejOzxC3erQtmS9SGnO40Orf/3lz2a+nxGb2ZWeIc9GZmiXPQm5klzkFvZpY4B72ZWeIc9GZmiXPQm5klzkFvZpY4B72ZWeIc9GZmiXPQm5klzkFvZpY4B72ZWeIc9GZmiXPQm5klrqagl3RO0nFJxyQ9nbWtlnRI0plsuaqi/z5JE5JOS9reqsmbmVl19ZzRb42IOyPi2nfHDgKHI6ITOJw9RtImoBfYDOwAHpC0rIlzNjOzOjRy6aYHGMnWR4CdFe2jEXElIs4CE8CWBvZjZmYNqDXoA3hC0lFJ/VlbISKmALLlmqx9LXChYuxk1mZmZjlQRFTvJL0lIi5KWgMcAj4MjEfE7RV9fhQRqyTdDzwZEQ9l7Q8Cj0fEIzdssx/oBygUCneNjo7WPfnp6WlWrlxZ97il7vgLL1NYAZdezXsm+WiX2rvWvvGmtnY95sG1z1b71q1bj1ZcTp9TTV8OHhEXs+VlSY9SvhRzSVJHRExJ6gAuZ90ngfUVw9cBF2fZ5jAwDNDd3R3FYrGWqVyn
VCqxkHFL3e7BxxjomuHA8fb8bvd2qf3cfcWb2tr1mAfX3kjtVS/dSLpN0uuvrQPvA54HxoG+rFsfcDBbHwd6JS2XtBHoBI4seIZmZtaQWk6LCsCjkq71/8+I+LKkbwJjkvYA54FdABFxQtIYcBKYAfZGxNWWzN7MzKqqGvQR8V3gHbO0vwhsm2PMEDDU8OzMzKxhfmesmVniHPRmZolz0JuZJc5Bb2aWOAe9mVniHPRmZolz0JuZJc5Bb2aWOAe9mVniHPRmZolz0JuZJc5Bb2aWOAe9mVniHPRmZolz0JuZJc5Bb2aWOAe9mVniHPRmZomrOeglLZP0LUlfyh6vlnRI0plsuaqi7z5JE5JOS9reiombmVlt6jmj/whwquLxIHA4IjqBw9ljJG0CeoHNwA7gAUnLmjNdMzOrV01BL2kdcC/w2YrmHmAkWx8Bdla0j0bElYg4C0wAW5oyWzMzq1utZ/T/CPwN8IuKtkJETAFkyzVZ+1rgQkW/yazNzMxycEu1DpL+GLgcEUclFWvYpmZpi1m22w/0AxQKBUqlUg2bvt709PSCxi11A10zFFaUl+2oXWqf7dhu12MeXHsjtVcNeuBu4E8k3QO8FniDpIeAS5I6ImJKUgdwOes/CayvGL8OuHjjRiNiGBgG6O7ujmKxWPfkS6USCxm31O0efIyBrhkOHK/l5UtPu9R+7r7iTW3tesyDa2+k9qqXbiJiX0Ssi4gNlP/I+j8R8UFgHOjLuvUBB7P1caBX0nJJG4FO4MiCZ2hmZg1p5LRoPzAmaQ9wHtgFEBEnJI0BJ4EZYG9EXG14pmZmtiB1BX1ElIBStv4isG2OfkPAUINzMzOzJvA7Y83MEuegNzNLnIPezCxxDnozs8Q56M3MEuegNzNLnIPezCxxDnozs8Q56M3MEuegNzNLnIPezCxxDnozs8Q56M3MEuegNzNLnIPezCxxDnozs8Q56M3MEuegNzNLXNWgl/RaSUckPSvphKS/y9pXSzok6Uy2XFUxZp+kCUmnJW1vZQFmZja/Ws7orwB/GBHvAO4Edkh6NzAIHI6ITuBw9hhJm4BeYDOwA3hA0rIWzN3MzGpQNeijbDp7+JrsXwA9wEjWPgLszNZ7gNGIuBIRZ4EJYEszJ21mZrWr6Rq9pGWSjgGXgUMR8RRQiIgpgGy5Juu+FrhQMXwyazMzsxzcUkuniLgK3CnpduBRSb87T3fNtombOkn9QD9AoVCgVCrVMpXrTE9PL2jcUjfQNUNhRXnZjtql9tmO7XY95sG1N1J7TUF/TUS8JKlE+dr7JUkdETElqYPy2T6Uz+DXVwxbB1ycZVvDwDBAd3d3FIvFuidfKpVYyLilbvfgYwx0zXDgeF0vXzLapfZz9xVvamvXYx5ceyO113LXzZuzM3kkrQDeC3wbGAf6sm59wMFsfRzolbRc0kagEziy4BmamVlDajkt6gBGsjtnfgMYi4gvSXoSGJO0BzgP7AKIiBOSxoCTwAywN7v0Y2ZmOaga9BHxHPDOWdpfBLbNMWYIGGp4dmZtbMPgYze1DXTNsHuW9mY7t//elu/DFo/fGWtmljgHvZlZ4hz0ZmaJc9CbmSXOQW9mljgHvZlZ4hz0ZmaJc9CbmSXOQW9mljgHvZlZ4hz0ZmaJc9CbmSXOQW9mljgHvZlZ4hz0ZmaJS//72Fpots8LNzP7deMzejOzxDnozcwSV8uXg6+X9FVJpySdkPSRrH21pEOSzmTLVRVj9kmakHRa0vZWFmBmZvOr5Yx+BhiIiN8B3g3slbQJGAQOR0QncDh7TPZcL7AZ2AE8kH2xuJmZ5aBq0EfEVEQ8k63/GDgFrAV6gJGs2wiwM1vvAUYj4kpEnAUmgC1NnreZmdWormv0kjYA7wSeAgoRMQXlXwbAmqzbWuBCxbDJrM3MzHJQ8+2VklYCjwAfjYhXJM3ZdZa2mGV7/UA/QKFQoFQq1TqVX5qenl7QuGYZ6JrJbd+FFfnuP0+uvfW15/lz
NZe8f97z1GjtNQW9pNdQDvnPR8QXs+ZLkjoiYkpSB3A5a58E1lcMXwdcvHGbETEMDAN0d3dHsVise/KlUomFjGuW3TneRz/QNcOB4+35NgjX3vraz91XbPk+6pX3z3ueGq29lrtuBDwInIqIT1c8NQ70Zet9wMGK9l5JyyVtBDqBIwueoZmZNaSWU4O7gT8Djks6lrX9LbAfGJO0BzgP7AKIiBOSxoCTlO/Y2RsRV5s9cTMzq03VoI+IrzP7dXeAbXOMGQKGGpiXmZk1id8Za2aWOAe9mVniHPRmZolz0JuZJc5Bb2aWOAe9mVniHPRmZolz0JuZJc5Bb2aWOAe9mVniHPRmZolz0JuZJc5Bb2aWOAe9mVniHPRmZolz0JuZJc5Bb2aWOAe9mVniavly8M9Juizp+Yq21ZIOSTqTLVdVPLdP0oSk05K2t2riZmZWm1q+HPzfgH8C/r2ibRA4HBH7JQ1mjz8maRPQC2wG3gJ8RdLb/OXgZkvLhsHHctnvuf335rLf1FU9o4+IrwE/vKG5BxjJ1keAnRXtoxFxJSLOAhPAluZM1czMFmKh1+gLETEFkC3XZO1rgQsV/SazNjMzy0ktl27qoVnaYtaOUj/QD1AoFCiVSnXvbHp6mlKpxPEXXq57bDMMdOWyWwAKK2Cgaya/CeTItadb+3w5cO3nvR01WvtCg/6SpI6ImJLUAVzO2ieB9RX91gEXZ9tARAwDwwDd3d1RLBbrnkSpVKJYLLI7p+uJeRromuHA8Wb/nl4aXHu6tZ+7rzjnc9d+3ttRo7Uv9NLNONCXrfcBByvaeyUtl7QR6ASOLHh2ZmbWsKqnBpIeBorAHZImgU8A+4ExSXuA88AugIg4IWkMOAnMAHt9x42ZWb6qBn1EfGCOp7bN0X8IGGpkUmZm1jx+Z6yZWeIc9GZmiXPQm5klzkFvZpY4B72ZWeIc9GZmiXPQm5klzkFvZpY4B72ZWeIc9GZmiXPQm5klzkFvZpY4B72ZWeIc9GZmiUv3q2rMbMnZMM+3xQ10zbTs2+TO7b+3Jdv9deEzejOzxDnozcwS56A3M0tcy4Je0g5JpyVNSBps1X7MzGx+LQl6ScuA+4H3A5uAD0ja1Ip9mZnZ/Fp1180WYCIivgsgaRToAU62aH9mZgs2390+rbYYd/y06tLNWuBCxePJrM3MzBZZq87oNUtbXNdB6gf6s4fTkk4vYD93AD9YwLgl7y9du2tvM6nWrk/V1G2u2n+zlsGtCvpJYH3F43XAxcoOETEMDDeyE0lPR0R3I9tYqly7a283rn3htbfq0s03gU5JGyXdCvQC4y3al5mZzaMlZ/QRMSPpQ8B/A8uAz0XEiVbsy8zM5teyz7qJiMeBx1u1/UxDl36WONfenlx7e2rsMndEVO9lZmZLlj8CwcwscUs26NvtIxYknZN0XNIxSU9nbaslHZJ0JluuynuezSDpc5IuS3q+om3OWiXty46D05K25zPr5pij9k9KeiF77Y9JuqfiuSRql7Re0lclnZJ0QtJHsvbkX/d5am/e6x4RS+4f5T/wfgd4K3Ar8CywKe95tbjmc8AdN7T9PTCYrQ8Cn8p7nk2q9T3Au4Dnq9VK+SM2ngWWAxuz42JZ3jU0ufZPAn89S99kagc6gHdl668H/jerL/nXfZ7am/a6L9Uz+l9+xEJE/Ay49hEL7aYHGMnWR4Cd+U2leSLia8APb2ieq9YeYDQirkTEWWCC8vGxJM1R+1ySqT0ipiLimWz9x8Apyu+mT/51n6f2udRd+1IN+nb8iIUAnpB0NHtXMUAhIqagfLAAa3KbXevNVWu7HAsfkvRcdmnn2uWLJGuXtAF4J/AUbfa631A7NOl1X6pBX/UjFhJ0d0S8i/Ingu6V9J68J/Rroh2OhX8Gfgu4E5gCDmTtydUuaSXwCPDRiHhlvq6ztKVWe9Ne96Ua9FU/YiE1EXExW14GHqX8X7VLkjoAsuXl/GbYcnPVmvyxEBGXIuJqRPwC
+Bd+9d/0pGqX9BrKQff5iPhi1twWr/tstTfzdV+qQd9WH7Eg6TZJr7+2DrwPeJ5yzX1Ztz7gYD4zXBRz1ToO9EpaLmkj0AkcyWF+LXMt6DJ/Svm1h4RqlyTgQeBURHy64qnkX/e5am/q6573X5wb+Ev1PZT/Ov0d4ON5z6fFtb6V8l/ZnwVOXKsXeBNwGDiTLVfnPdcm1fsw5f+q/pzy2cue+WoFPp4dB6eB9+c9/xbU/h/AceC57Ie8I7XagT+gfPnhOeBY9u+ednjd56m9aa+73xlrZpa4pXrpxszMauSgNzNLnIPezCxxDnozs8Q56M3MEuegNzNLnIPezCxxDnozs8T9P404lzspWHLQAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "data['AccountWeeks'].hist()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ChurnAccountWeeksContractRenewalDataPlanDataUsageCustServCallsDayMinsDayCallsMonthlyChargeOverageFeeRoamMins
Churn1.0000000.016541-0.259852-0.102148-0.0871950.2087500.2051510.0184590.0723130.0928120.068239
AccountWeeks0.0165411.000000-0.0247350.0029180.014391-0.0037960.0062160.0384700.012581-0.0067490.009514
ContractRenewal-0.259852-0.0247351.000000-0.006006-0.0192230.024522-0.049396-0.003755-0.047291-0.019105-0.045871
DataPlan-0.1021480.002918-0.0060061.0000000.945982-0.017824-0.001684-0.0110860.7374900.021526-0.001318
DataUsage-0.0871950.014391-0.0192230.9459821.000000-0.0217230.003176-0.0079620.7816600.0196370.162746
CustServCalls0.208750-0.0037960.024522-0.017824-0.0217231.000000-0.013423-0.018942-0.028017-0.012964-0.009640
DayMins0.2051510.006216-0.049396-0.0016840.003176-0.0134231.0000000.0067500.5679680.007038-0.010155
DayCalls0.0184590.038470-0.003755-0.011086-0.007962-0.0189420.0067501.000000-0.007963-0.0214490.021565
MonthlyCharge0.0723130.012581-0.0472910.7374900.781660-0.0280170.567968-0.0079631.0000000.2817660.117433
OverageFee0.092812-0.006749-0.0191050.0215260.019637-0.0129640.007038-0.0214490.2817661.000000-0.011023
RoamMins0.0682390.009514-0.045871-0.0013180.162746-0.009640-0.0101550.0215650.117433-0.0110231.000000
\n", + "
" + ], + "text/plain": [ + " Churn AccountWeeks ContractRenewal DataPlan DataUsage \\\n", + "Churn 1.000000 0.016541 -0.259852 -0.102148 -0.087195 \n", + "AccountWeeks 0.016541 1.000000 -0.024735 0.002918 0.014391 \n", + "ContractRenewal -0.259852 -0.024735 1.000000 -0.006006 -0.019223 \n", + "DataPlan -0.102148 0.002918 -0.006006 1.000000 0.945982 \n", + "DataUsage -0.087195 0.014391 -0.019223 0.945982 1.000000 \n", + "CustServCalls 0.208750 -0.003796 0.024522 -0.017824 -0.021723 \n", + "DayMins 0.205151 0.006216 -0.049396 -0.001684 0.003176 \n", + "DayCalls 0.018459 0.038470 -0.003755 -0.011086 -0.007962 \n", + "MonthlyCharge 0.072313 0.012581 -0.047291 0.737490 0.781660 \n", + "OverageFee 0.092812 -0.006749 -0.019105 0.021526 0.019637 \n", + "RoamMins 0.068239 0.009514 -0.045871 -0.001318 0.162746 \n", + "\n", + " CustServCalls DayMins DayCalls MonthlyCharge OverageFee \\\n", + "Churn 0.208750 0.205151 0.018459 0.072313 0.092812 \n", + "AccountWeeks -0.003796 0.006216 0.038470 0.012581 -0.006749 \n", + "ContractRenewal 0.024522 -0.049396 -0.003755 -0.047291 -0.019105 \n", + "DataPlan -0.017824 -0.001684 -0.011086 0.737490 0.021526 \n", + "DataUsage -0.021723 0.003176 -0.007962 0.781660 0.019637 \n", + "CustServCalls 1.000000 -0.013423 -0.018942 -0.028017 -0.012964 \n", + "DayMins -0.013423 1.000000 0.006750 0.567968 0.007038 \n", + "DayCalls -0.018942 0.006750 1.000000 -0.007963 -0.021449 \n", + "MonthlyCharge -0.028017 0.567968 -0.007963 1.000000 0.281766 \n", + "OverageFee -0.012964 0.007038 -0.021449 0.281766 1.000000 \n", + "RoamMins -0.009640 -0.010155 0.021565 0.117433 -0.011023 \n", + "\n", + " RoamMins \n", + "Churn 0.068239 \n", + "AccountWeeks 0.009514 \n", + "ContractRenewal -0.045871 \n", + "DataPlan -0.001318 \n", + "DataUsage 0.162746 \n", + "CustServCalls -0.009640 \n", + "DayMins -0.010155 \n", + "DayCalls 0.021565 \n", + "MonthlyCharge 0.117433 \n", + "OverageFee -0.011023 \n", + "RoamMins 1.000000 " + ] + }, + "execution_count": 17, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.corr()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preprocessing\n", + "\n", + "- Are there any duplicated values?\n", + "- Do we need to do feature scaling?\n", + "- Do we need to generate new features?\n", + "- Split Train and Test dataset. (0.7/0.3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ML Application\n", + "\n", + "- Define models.\n", + "- Fit models.\n", + "- Evaluate models for both train and test dataset.\n", + "- Generate Confusion Matrix and scores of Accuracy, Recall, Precision and F1-Score.\n", + "- Analyse occurrence of overfitting and underfitting. If there is any of them, try to overcome it within a different section." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Evaluation\n", + "\n", + "- Select the best performing model and write your comments about why choose this model.\n", + "- Analyse results and make comment about how you can improve model." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Project/09-11-2020 ML Course Nigeria Project 'name'.ipynb b/Project/09-11-2020 ML Course Nigeria Project 'name'.ipynb index 1856d7e..3db6828 100644 --- a/Project/09-11-2020 ML Course Nigeria Project 'name'.ipynb +++ b/Project/09-11-2020 ML Course Nigeria Project 'name'.ipynb @@ -39,19 +39,28 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 189, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import seaborn as sns\n", "import numpy as np\n", - "import matplotlib.pyplot as plt" + "import matplotlib.pyplot as plt\n", + "\n", + "from sklearn.preprocessing import LabelEncoder\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.metrics import confusion_matrix, classification_report\n", + "from sklearn.preprocessing import StandardScaler, MinMaxScaler\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn import svm" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 132, "metadata": {}, "outputs": [ { @@ -179,7 +188,7 @@ "4 166.7 113 41.0 7.42 10.1 " ] }, - "execution_count": 5, + "execution_count": 132, "metadata": {}, "output_type": "execute_result" } @@ -192,7 +201,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 133, "metadata": {}, "outputs": [], "source": [ @@ -201,62 +210,350 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 134, "metadata": {}, + 
"outputs": [ + { + "data": { + "text/plain": [ + "(3333, 11)" + ] + }, + "execution_count": 134, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Exploratory Data Analysis" + "data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 135, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 3333 entries, 0 to 3332\n", + "Data columns (total 11 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Churn 3333 non-null int64 \n", + " 1 AccountWeeks 3333 non-null int64 \n", + " 2 ContractRenewal 3333 non-null int64 \n", + " 3 DataPlan 3333 non-null int64 \n", + " 4 DataUsage 3333 non-null float64\n", + " 5 CustServCalls 3333 non-null int64 \n", + " 6 DayMins 3333 non-null float64\n", + " 7 DayCalls 3333 non-null int64 \n", + " 8 MonthlyCharge 3333 non-null float64\n", + " 9 OverageFee 3333 non-null float64\n", + " 10 RoamMins 3333 non-null float64\n", + "dtypes: float64(5), int64(6)\n", + "memory usage: 286.6 KB\n" + ] + } + ], + "source": [ + "data.info()" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 136, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "Churn 0\n", + "AccountWeeks 0\n", + "ContractRenewal 0\n", + "DataPlan 0\n", + "DataUsage 0\n", + "CustServCalls 0\n", + "DayMins 0\n", + "DayCalls 0\n", + "MonthlyCharge 0\n", + "OverageFee 0\n", + "RoamMins 0\n", + "dtype: int64" ] }, - "execution_count": 7, + "execution_count": 136, "metadata": {}, "output_type": "execute_result" - }, + } + ], + "source": [ + "data.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 137, + "metadata": {}, + "outputs": [ { "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYsAAAEGCAYAAACUzrmNAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAPvklEQVR4nO3df6zddX3H8edL6o9saixpQWw7S0xdVvcD9A7JWDKdkV/JUn9MA4tSGVn9AxZNzBL0j8E0JGZDjTrCUmMFjEqIyuxMI9ZOp25Te2saoFTCHTJ6bUev1oCbylZ874/zveHQ3ns/p5eee245z0fyzfl+39/P93vel1x48f15U1VIkrSQZ426AUnS8mdYSJKaDAtJUpNhIUlqMiwkSU0rRt3AMKxatarWr18/6jYk6ZSyZ8+eH1fV6rnWPSPDYv369UxOTo66DUk6pST5z/nWeRpKktRkWEiSmgwLSVKTYSFJajIsJElNhoUkqcmwkCQ1GRaSpCbDQpLU9Ix8gvtkeNVf3TbqFrQM7fm7K0bdgjQSHllIkpoMC0lSk2EhSWoyLCRJTYaFJKnJsJAkNRkWkqQmw0KS1GRYSJKaDAtJUpNhIUlqMiwkSU2GhSSpybCQJDUZFpKkJsNCktRkWEiSmgwLSVKTYSFJajIsJElNhoUkqWloYZFkXZKvJ9mfZF+Sd3X165P8KMnebrq0b5v3JplKcn+Si/rqF3e1qSTXDqtnSdLcVgxx30eB91TV95O8ANiTZGe37iNVdWP/4CQbgcuAVwAvAb6W5OXd6puA1wPTwO4k26vqviH2LknqM7SwqKpDwKFu/mdJ9gNrFthkE3B7VT0O/DDJFHBet26qqh4ESHJ7N9awkKQlsiTXLJKsB84FvtuVrklyd5JtSVZ2tTXAgb7NprvafPVjv2NLkskkkzMzMyf5J5Ck8Tb0sEjyfOALwLur6jHgZuBlwDn0jjw+NDt0js1rgfpTC1Vbq2qiqiZWr159UnqXJPUM85oFSZ5NLyg+U1VfBKiqR/rWfwL4crc4Dazr23wtcLCbn68uSVoCw7wbKsAngf1V9eG++ll9w94I3NvNbwcuS/LcJGcDG4DvAbuBDUnOTvIcehfBtw+rb0nS8YZ5ZHEB8HbgniR7u9r7gMuTnEPvVNJDwDsBqmpfkjvoXbg+ClxdVU8AJLkGuAs4DdhWVfuG2Lck6RjDvBvq28x9vWHHAtvcANwwR33HQttJkobLJ7glSU2GhSSpybCQJDUZFpKkJsNCktRkWEiSmgwLSVKTYSFJajIsJElNhoUkqcmwkCQ1GRaSpCbDQpLUZFhIkpoMC0lSk2EhSWoyLCRJTYaFJKnJsJAkNRkWkqQmw0KS1GRYSJKaDAtJUpNhIUlqMiwkSU2GhSSpybCQJDUNLSySrEvy9ST7k+xL8q6ufnqSnUke6D5XdvUk+ViSqSR3J3ll3742d+MfSLJ5WD1LkuY2zCOLo8B7quq3gPOBq5NsBK4FdlXVBmBXtwxwCbChm7YAN0MvXIDrgFcD5wHXzQaMJGlpDC0squpQVX2/m/8ZsB9YA2wCbu2G3Qq8oZvfBNxWPd8BXpTkLOAiYGdVHamqnwI7gYuH1bck6XhLcs0iyXrgXOC7wJlVdQh6gQKc0Q1bAxzo22y6q81XP/Y7tiSZTDI5MzNzsn8ESRprQw+LJM8HvgC8u6oeW2joHLVaoP7UQtXWqpqoqonVq1cvrllJ0pyGGhZJnk0vKD5TVV/syo90p5foPg939WlgXd/ma4GDC9QlSUtkmHdDBfgksL+qPty3ajswe0fTZuBLffUruruizgce7U5T3QVcmGRld2H7wq4mSVoiK4a47wuAtwP3JNnb1d4HfBC4I8lVwMPAW7p1O4BLgSng58CVAFV1JMkHgN3duPdX1ZEh9i1JOsbQwqKqvs3c1xsAXjfH+AKunmdf24BtJ687SdKJ8AluSVKTYSFJajIsJElNhoU
kqcmwkCQ1GRaSpCbDQpLUZFhIkpoMC0lSk2EhSWoyLCRJTYaFJKnJsJAkNRkWkqQmw0KS1GRYSJKaDAtJUpNhIUlqMiwkSU2GhSSpaaCwSLJrkJok6ZlpxUIrkzwP+DVgVZKVQLpVLwReMuTeJEnLxIJhAbwTeDe9YNjDk2HxGHDTEPuSJC0jC4ZFVX0U+GiSv6yqjy9RT5KkZaZ1ZAFAVX08yR8A6/u3qarbhtSXJGkZGSgsknwaeBmwF3iiKxdgWEjSGBgoLIAJYGNV1TCbkSQtT4M+Z3Ev8OJhNiJJWr4GDYtVwH1J7kqyfXZaaIMk25IcTnJvX+36JD9KsrebLu1b994kU0nuT3JRX/3irjaV5NoT/QElSU/foKehrl/Evm8B/p7jr2t8pKpu7C8k2QhcBryC3m26X0vy8m71TcDrgWlgd5LtVXXfIvqRJC3SoHdD/cuJ7riqvplk/YDDNwG3V9XjwA+TTAHndeumqupBgCS3d2MNC0laQoO+7uNnSR7rpl8meSLJY4v8zmuS3N2dplrZ1dYAB/rGTHe1+epz9bglyWSSyZmZmUW2Jkmay0BhUVUvqKoXdtPzgDfTO8V0om6mdwvuOcAh4ENdPXOMrQXqc/W4taomqmpi9erVi2hNkjSfRb11tqr+EfjjRWz3SFU9UVW/Aj7Bk6eapoF1fUPXAgcXqEuSltCgD+W9qW/xWfSeuzjhZy6SnFVVh7rFN9K7JRdgO/DZJB+md4F7A/A9ekcWG5KcDfyI3kXwPzvR75UkPT2D3g31J33zR4GH6F1onleSzwGvoffG2mngOuA1Sc6hFzQP0XtRIVW1L8kd9C5cHwWurqonuv1cA9wFnAZsq6p9A/YsSTpJBr0b6soT3XFVXT5H+ZMLjL8BuGGO+g5gx4l+vyTp5Bn0bqi1Se7sHrJ7JMkXkqwddnOSpOVh0Avcn6J3XeEl9G5d/aeuJkkaA4OGxeqq+lRVHe2mWwDvT5WkMTFoWPw4yduSnNZNbwN+MszGJEnLx6Bh8efAW4H/ovcw3Z8CJ3zRW5J0ahr01tkPAJur6qcASU4HbqQXIpKkZ7hBjyx+dzYoAKrqCHDucFqSJC03g4bFs/pe+jd7ZDHoUYkk6RQ36H/wPwT8W5LP03v6+q3M8QCdJOmZadAnuG9LMknv5YEB3uQfIJKk8THwqaQuHAwISRpDi3pFuSRpvBgWkqQmw0KS1GRYSJKaDAtJUpNhIUlqMiwkSU2GhSSpybCQJDUZFpKkJsNCktRkWEiSmgwLSVKTYSFJajIsJElNhoUkqWloYZFkW5LDSe7tq52eZGeSB7rPlV09ST6WZCrJ3Ule2bfN5m78A0k2D6tfSdL8hnlkcQtw8TG1a4FdVbUB2NUtA1wCbOimLcDN0AsX4Drg1cB5wHWzASNJWjpDC4uq+iZw5JjyJuDWbv5W4A199duq5zvAi5KcBVwE7KyqI1X1U2AnxweQJGnIlvqaxZlVdQig+zyjq68BDvSNm+5q89WPk2RLkskkkzMzMye9cUkaZ8vlAnfmqNUC9eOLVVuraqKqJlavXn1Sm5OkcbfUYfFId3qJ7vNwV58G1vWNWwscXKAuSVpCSx0W24HZO5o2A1/qq1/R3RV1PvBod5rqLuDCJCu7C9sXdjVJ0hJaMawdJ/kc8BpgVZJpenc1fRC4I8lVwMPAW7rhO4BLgSng58CVAFV1JMkHgN3duPdX1bEXzSVJQza0sKiqy+dZ9bo5xhZw9Tz72QZsO4mtSZJO0HK5wC1JWsYMC0lSk2EhSWoyLCRJTYaFJKnJsJAkNRkWkqQmw0KS1GRYSJKaDAtJUpNhIUlqMiwkSU2GhSSpybCQJDUZFpKkJsNCktRkWEiSmgwLSVKTYSFJajIsJElNhoUkqcmwkCQ1GRaSpCbDQpLUZFhIkpoMC0lSk2EhSWoaSVgkeSjJPUn2Jpnsaqcn2Znkge5zZVdPko8lmUpyd5JXjqJnSRpnozyyeG1VnVN
VE93ytcCuqtoA7OqWAS4BNnTTFuDmJe9UksbccjoNtQm4tZu/FXhDX/226vkO8KIkZ42iQUkaV6MKiwK+mmRPki1d7cyqOgTQfZ7R1dcAB/q2ne5qT5FkS5LJJJMzMzNDbF2Sxs+KEX3vBVV1MMkZwM4kP1hgbOao1XGFqq3AVoCJiYnj1kuSFm8kYVFVB7vPw0nuBM4DHklyVlUd6k4zHe6GTwPr+jZfCxxc0oalZebh9//OqFvQMvQbf33P0Pa95Kehkvx6khfMzgMXAvcC24HN3bDNwJe6+e3AFd1dUecDj86erpIkLY1RHFmcCdyZZPb7P1tVX0myG7gjyVXAw8BbuvE7gEuBKeDnwJVL37IkjbclD4uqehD4vTnqPwFeN0e9gKuXoDVJ0jyW062zkqRlyrCQJDUZFpKkJsNCktRkWEiSmgwLSVKTYSFJajIsJElNhoUkqcmwkCQ1GRaSpCbDQpLUZFhIkpoMC0lSk2EhSWoyLCRJTYaFJKnJsJAkNRkWkqQmw0KS1GRYSJKaDAtJUpNhIUlqMiwkSU2GhSSpybCQJDUZFpKkJsNCktR0yoRFkouT3J9kKsm1o+5HksbJKREWSU4DbgIuATYClyfZONquJGl8nBJhAZwHTFXVg1X1v8DtwKYR9yRJY2PFqBsY0BrgQN/yNPDq/gFJtgBbusX/TnL/EvU2DlYBPx51E8tBbtw86hZ0PH8/Z12Xp7uHl8634lQJi7n+CdRTFqq2AluXpp3xkmSyqiZG3Yc0F38/l8apchpqGljXt7wWODiiXiRp7JwqYbEb2JDk7CTPAS4Dto+4J0kaG6fEaaiqOprkGuAu4DRgW1XtG3Fb48TTe1rO/P1cAqmq9ihJ0lg7VU5DSZJGyLCQJDUZFlqQr1nRcpRkW5LDSe4ddS/jwrDQvHzNipaxW4CLR93EODEstBBfs6Jlqaq+CRwZdR/jxLDQQuZ6zcqaEfUiaYQMCy2k+ZoVSePBsNBCfM2KJMCw0MJ8zYokwLDQAqrqKDD7mpX9wB2+ZkXLQZLPAf8O/GaS6SRXjbqnZzpf9yFJavLIQpLUZFhIkpoMC0lSk2EhSWoyLCRJTYaFtEhJXpzk9iT/keS+JDuSbEny5VH3Jp1shoW0CEkC3Al8o6peVlUbgfcBZz7N/Z4Sf+pY48ewkBbntcD/VdU/zBaqai/wLeD5ST6f5AdJPtMFC0keSrKqm59I8o1u/vokW5N8FbgtyTuSfDHJV5I8kORvl/ynk47h/8VIi/PbwJ551p0LvILee7T+FbgA+HZjf68C/rCqfpHkHcA53X4eB+5P8vGqOrDQDqRh8shCOvm+V1XTVfUrYC+wfoBttlfVL/qWd1XVo1X1S+A+4KVD6FMamGEhLc4+ekcDc3m8b/4JnjyCP8qT/84975ht/mfAfUgjYVhIi/PPwHOT/MVsIcnvA3+0wDYP8WTAvHl4rUknn2EhLUL13sD5RuD13a2z+4DrWfjvffwN8NEk36J3tCCdMnzrrCSpySMLSVKTYSFJajIsJElNhoUkqcmwkCQ1GRaSpCbDQpLU9P+BS+lzMeDnBAAAAABJRU5ErkJggg==\n", "text/plain": [ - "
" + "Churn 0\n", + "AccountWeeks 0\n", + "ContractRenewal 0\n", + "DataPlan 0\n", + "DataUsage 0\n", + "CustServCalls 0\n", + "DayMins 0\n", + "DayCalls 0\n", + "MonthlyCharge 0\n", + "OverageFee 0\n", + "RoamMins 0\n", + "dtype: int64" ] }, - "metadata": { - "needs_background": "light" + "execution_count": 137, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.isin(['?']).sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From the above, the data have 3333 rows and 11 column and it is revealed that there is no missing value in our data" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ChurnAccountWeeksContractRenewalDataPlanDataUsageCustServCallsDayMinsDayCallsMonthlyChargeOverageFeeRoamMins
count3333.0000003333.0000003333.0000003333.0000003333.0000003333.0000003333.0000003333.0000003333.0000003333.0000003333.000000
mean0.144914101.0648060.9030900.2766280.8164751.562856179.775098100.43564456.30516110.05148810.237294
std0.35206739.8221060.2958790.4473981.2726681.31549154.46738920.06908416.4260322.5357122.791840
min0.0000001.0000000.0000000.0000000.0000000.0000000.0000000.00000014.0000000.0000000.000000
25%0.00000074.0000001.0000000.0000000.0000001.000000143.70000087.00000045.0000008.3300008.500000
50%0.000000101.0000001.0000000.0000000.0000001.000000179.400000101.00000053.50000010.07000010.300000
75%0.000000127.0000001.0000001.0000001.7800002.000000216.400000114.00000066.20000011.77000012.100000
max1.000000243.0000001.0000001.0000005.4000009.000000350.800000165.000000111.30000018.19000020.000000
\n", + "
" + ], + "text/plain": [ + " Churn AccountWeeks ContractRenewal DataPlan DataUsage \\\n", + "count 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 \n", + "mean 0.144914 101.064806 0.903090 0.276628 0.816475 \n", + "std 0.352067 39.822106 0.295879 0.447398 1.272668 \n", + "min 0.000000 1.000000 0.000000 0.000000 0.000000 \n", + "25% 0.000000 74.000000 1.000000 0.000000 0.000000 \n", + "50% 0.000000 101.000000 1.000000 0.000000 0.000000 \n", + "75% 0.000000 127.000000 1.000000 1.000000 1.780000 \n", + "max 1.000000 243.000000 1.000000 1.000000 5.400000 \n", + "\n", + " CustServCalls DayMins DayCalls MonthlyCharge OverageFee \\\n", + "count 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 \n", + "mean 1.562856 179.775098 100.435644 56.305161 10.051488 \n", + "std 1.315491 54.467389 20.069084 16.426032 2.535712 \n", + "min 0.000000 0.000000 0.000000 14.000000 0.000000 \n", + "25% 1.000000 143.700000 87.000000 45.000000 8.330000 \n", + "50% 1.000000 179.400000 101.000000 53.500000 10.070000 \n", + "75% 2.000000 216.400000 114.000000 66.200000 11.770000 \n", + "max 9.000000 350.800000 165.000000 111.300000 18.190000 \n", + "\n", + " RoamMins \n", + "count 3333.000000 \n", + "mean 10.237294 \n", + "std 2.791840 \n", + "min 0.000000 \n", + "25% 8.500000 \n", + "50% 10.300000 \n", + "75% 12.100000 \n", + "max 20.000000 " + ] }, - "output_type": "display_data" + "execution_count": 138, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "# Our label Distribution (countplot)\n" + "data.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Exploratory Data Analysis" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 139, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 3, + "execution_count": 139, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "\n", + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYsAAAEGCAYAAACUzrmNAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAP60lEQVR4nO3dcayd9V3H8fdnMBm6ESEUVtrOsqVTCyqEayXyh0yi1CWmbHNLMRuNErsQZkaymMD+ENQ0WSLbHHPDdBmDmm2k2YZUBSfD6VxkY7dLs9JiXR0Id630spmARtF2X/84T8NZe3p/p7c959z2vl/JyXnO93l+z/lecssnz/P8nuemqpAkaS6vmHQDkqSFz7CQJDUZFpKkJsNCktRkWEiSms6cdAOjcv7559fKlSsn3YYknVK2b9/+fFUtObJ+2obFypUrmZ6ennQbknRKSfJvg+qehpIkNRkWkqQmw0KS1GRYSJKaDAtJUpNhIUlqMiwkSU2GhSSpybCQJDWdtndwn6grfm/LpFvQArT9j2+YdAvSRHhkIUlqMiwkSU2GhSSpybCQJDUZFpKkJsNCktRkWEiSmgwLSVKTYSFJajIsJElNhoUkqcmwkCQ1GRaSpCbDQpLUZFhIkpoMC0lSk2EhSWoyLCRJTYaFJKnJsJAkNRkWkqSmkYVFkhVJvpzkySS7kry3q9+R5LtJdnSvN/eNuS3J3iR7klzbV78iyc5u3V1JMqq+JUlHO3OE+z4IvK+qvpnkNcD2JI906z5cVXf2b5xkNbAeuAS4CPhSkjdW1SHgbmAj8DXgIWAt8PAIe5ck9RnZkUVV7a+qb3bLLwJPAsvmGLIOuL+qXqqqp4C9wJokS4FzquqxqipgC3DdqPqWJB1tLNcskqwELge+3pXek+RbSe5Jcm5XWwY82zdspqst65aPrA/6no1JppNMz87OnswfQZIWtZGHRZJXA58HbqmqF+idUnoDcBmwH/jg4U0HDK856kcXqzZX1VRVTS1ZsuREW5ckdUYaFkleSS8oPl1VXwCoqueq6lBV/QD4BLCm23wGWNE3fDmwr6svH1CXJI3JKGdDBfgk8GRVfaivvrRvs7cAT3TL24D1Sc5KcjGwCni8qvYDLya5stvnDcCDo+pbknS0Uc6Gugp4F7AzyY6u9n7g+iSX0TuV9DTwboCq2pVkK7Cb3kyqm7uZUAA3AfcCZ9ObBeVMKEkao5GFRVV9lcHXGx6aY8wmYNOA+jRw6cnrTpJ0PLyDW5LUZFhIkpoMC0lSk2EhSWoyLCRJTYaFJKnJsJAkNRkWkqQmw0KS1GRYSJKaDAtJUpNhIUlqMiwkSU2GhSSpybCQJDUZFpKkJsNCktRkWEiSmgwLSVKTYSFJajIsJElNhoUkqcmwkCQ1GRaSpCbDQpLUZFhIkpoMC0lS08jCIsmKJF9O8mSSXUne29XPS/JIkm937+f2jbktyd4ke5Jc21e/IsnObt1dSTKqviVJRxvlkcVB4H1V9dPAlcDNSVYDtwKPVtUq4NHuM9269cAlwFrg40nO6PZ1N7ARWNW91o6wb0nSEUYWFlW1v6q+2S2/CDwJLAPWAfd1m90HXNctrwPur6qXquopYC+wJslS4JyqeqyqCtjSN0aSNAZjuWaRZCVwOfB14MKq2g+9QAEu6DZbBjzbN2ymqy3rlo+sD/qejUmmk0zPzs6e1J9BkhazkYdFklcDnwduqaoX5tp0QK3mqB9drNpcVVNVNbVkyZLjb1aSNNBIwyLJK+kFxaer6gtd+bnu1BLd+4GuPgOs6Bu+HNjX1ZcPqEuSxmSUs6ECfBJ4sqo+1LdqG7ChW94APNhXX5/krCQX07uQ/Xh3qurFJFd2+7yhb4wkaQzOHOG+rwLeBexMsqOrvR/4ALA1yY3AM8DbAapqV5KtwG56M6lurqpD3bibgHuBs4GHu5ckaUxGFhZV9VUGX28AuOYYYzYBmwbUp4FLT153kqTj4R3ckqQmw0KS1GRYSJK
aDAtJUpNhIUlqMiwkSU2GhSSpybCQJDUZFpKkJsNCktRkWEiSmgwLSVKTYSFJajIsJElNhoUkqcmwkCQ1GRaSpCbDQpLUZFhIkpoMC0lS01BhkeTRYWqSpNPTmXOtTPIq4EeB85OcC6RbdQ5w0Yh7kyQtEHOGBfBu4BZ6wbCdl8PiBeBjo2tLkrSQzBkWVfUR4CNJfreqPjqmniRJC0zryAKAqvpokl8EVvaPqaotI+pLkrSADBUWSf4ceAOwAzjUlQswLCRpERgqLIApYHVV1SibkSQtTMPeZ/EE8NpRNiJJWriGDYvzgd1Jvphk2+HXXAOS3JPkQJIn+mp3JPlukh3d6819625LsjfJniTX9tWvSLKzW3dXkhz5XZKk0Rr2NNQd89j3vcCfcvR1jQ9X1Z39hSSrgfXAJfSm6X4pyRur6hBwN7AR+BrwELAWeHge/UiS5mnY2VD/cLw7rqqvJFk55ObrgPur6iXgqSR7gTVJngbOqarHAJJsAa7DsJCksRr2cR8vJnmhe/1PkkNJXpjnd74nybe601TndrVlwLN928x0tWXd8pH1Y/W5Mcl0kunZ2dl5tidJOtJQYVFVr6mqc7rXq4C30TvFdLzupjcF9zJgP/DBrj7oOkTNUT9Wn5uraqqqppYsWTKP9iRJg8zrqbNV9RfAL89j3HNVdaiqfgB8AljTrZoBVvRtuhzY19WXD6hLksZo2Jvy3tr38RX07rs47nsukiytqv3dx7fQm5ILsA34TJIP0bvAvQp4vKoOdafArgS+DtwA+NgRSRqzYWdD/Xrf8kHgaXoXpY8pyWeBq+k9sXYGuB24Osll9ILmaXoPKqSqdiXZCuzu9n9zNxMK4CZ6M6vOpndh24vbkjRmw86G+q3j3XFVXT+g/Mk5tt8EbBpQnwYuPd7vlySdPMPOhlqe5IHuJrvnknw+yfL2SEnS6WDYC9yfondd4SJ6U1f/sqtJkhaBYcNiSVV9qqoOdq97AeemStIiMWxYPJ/knUnO6F7vBL43ysYkSQvHsGHx28A7gH+ndzPdbwDHfdFbknRqGnbq7B8BG6rqPwCSnAfcSS9EJEmnuWGPLH72cFAAVNX3gctH05IkaaEZNixe0ffQv8NHFsMelUiSTnHD/g//g8A/Jfkcvbuv38GAG+gkSaenYe/g3pJkmt7DAwO8tap2j7QzSdKCMfSppC4cDAhJWoTm9YhySdLiYlhIkpoMC0lSk2EhSWoyLCRJTYaFJKnJsJAkNRkWkqQmw0KS1GRYSJKaDAtJUpNhIUlqMiwkSU2GhSSpybCQJDUZFpKkppGFRZJ7khxI8kRf7bwkjyT5dvfe/3e9b0uyN8meJNf21a9IsrNbd1eSjKpnSdJgozyyuBdYe0TtVuDRqloFPNp9JslqYD1wSTfm40nO6MbcDWwEVnWvI/cpSRqxkYVFVX0F+P4R5XXAfd3yfcB1ffX7q+qlqnoK2AusSbIUOKeqHquqArb0jZEkjcm4r1lcWFX7Abr3C7r6MuDZvu1mutqybvnI+kBJNiaZTjI9Ozt7UhuXpMVsoVzgHnQdouaoD1RVm6tqqqqmlixZctKak6TFbtxh8Vx3aonu/UBXnwFW9G23HNjX1ZcPqEuSxmjcYbEN2NAtbwAe7KuvT3JWkovpXch+vDtV9WKSK7tZUDf0jZEkjcmZo9pxks8CVwPnJ5kBbgc+AGxNciPwDPB2gKralWQrsBs4CNxcVYe6Xd1Eb2bV2cDD3UuSNEYjC4uquv4Yq645xvabgE0D6tPApSexNUnScVooF7glSQuYYSFJajIsJElNhoUkqcmwkCQ1GRaSpCbDQpLUZFhIkpoMC0lSk2EhSWoyLCRJTYaFJKnJsJAkNRkWkqQmw0KS1GRYSJKaDAtJUpNhIUlqMiwkSU2GhSSpybCQJDUZFpKkJsNCktRkWEiSmgwLSVKTYSFJajIsJElNEwmLJE8n2ZlkR5LprnZekkeSfLt7P7dv+9uS7E2yJ8m
1k+hZkhazSR5ZvKmqLquqqe7zrcCjVbUKeLT7TJLVwHrgEmAt8PEkZ0yiYUlarBbSaah1wH3d8n3AdX31+6vqpap6CtgLrBl/e5K0eE0qLAr42yTbk2zsahdW1X6A7v2Crr4MeLZv7ExXO0qSjUmmk0zPzs6OqHVJWnzOnND3XlVV+5JcADyS5J/n2DYDajVow6raDGwGmJqaGriNJOn4TSQsqmpf934gyQP0Tis9l2RpVe1PshQ40G0+A6zoG74c2DfWhqUF5pk//JlJt6AF6HW/v3Nk+x77aagkP5bkNYeXgV8FngC2ARu6zTYAD3bL24D1Sc5KcjGwCnh8vF1L0uI2iSOLC4EHkhz+/s9U1d8k+QawNcmNwDPA2wGqaleSrcBu4CBwc1UdmkDfkrRojT0squo7wM8NqH8PuOYYYzYBm0bcmiTpGBbS1FlJ0gJlWEiSmgwLSVKTYSFJajIsJElNhoUkqcmwkCQ1GRaSpCbDQpLUZFhIkpoMC0lSk2EhSWoyLCRJTYaFJKnJsJAkNRkWkqQmw0KS1GRYSJKaDAtJUpNhIUlqMiwkSU2GhSSpybCQJDUZFpKkJsNCktRkWEiSmgwLSVKTYSFJajplwiLJ2iR7kuxNcuuk+5GkxeSUCIskZwAfA34NWA1cn2T1ZLuSpMXjlAgLYA2wt6q+U1X/C9wPrJtwT5K0aJw56QaGtAx4tu/zDPALR26UZCOwsfv4n0n2jKG3xeB84PlJN7EQ5M4Nk25BR/P387DbczL28hODiqdKWAz6L1BHFao2A5tH387ikmS6qqYm3Yc0iL+f43GqnIaaAVb0fV4O7JtQL5K06JwqYfENYFWSi5P8CLAe2DbhniRp0TglTkNV1cEk7wG+CJwB3FNVuybc1mLiqT0tZP5+jkGqjjr1L0nSDzlVTkNJkibIsJAkNRkWmpOPWdFCleSeJAeSPDHpXhYDw0LH5GNWtMDdC6yddBOLhWGhufiYFS1YVfUV4PuT7mOxMCw0l0GPWVk2oV4kTZBhobkM9ZgVSac/w0Jz8TErkgDDQnPzMSuSAMNCc6iqg8Dhx6w8CWz1MStaKJJ8FngM+MkkM0lunHRPpzMf9yFJavLIQpLUZFhIkpoMC0lSk2EhSWoyLCRJTYaFdAKSvDbJ/Un+NcnuJA8l2Zjkrybdm3QyGRbSPCUJ8ADw91X1hqpaDbwfuPAE93tK/LljLS7+Ukrz9ybg/6rqzw4XqmpHkh8HrknyOeBSYDvwzqqqJE8DU1X1fJIp4M6qujrJHcBFwErg+ST/ArwOeH33/idVddf4fjTph3lkIc3f4SAY5HLgFnp/B+T1wFVD7O8KYF1V/Wb3+aeAa+k9Kv72JK88oW6lE2BYSKPxeFXNVNUPgB30jhhatlXVf/d9/uuqeqmqngcOcIKnt6QTYVhI87eL3tHAIC/1LR/i5VO+B3n5392rjhjzX0PuQxo7w0Kav78DzkryO4cLSX4e+KU5xjzNywHzttG1Jp1choU0T9V7CudbgF/pps7uAu5g7r/58QfAR5L8I72jBemU4FNnJUlNHllIkpoMC0lSk2EhSWoyLCRJTYaFJKnJsJAkNRkWkqSm/wf03QODNr6OSgAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] @@ -268,43 +565,1750 @@ } ], "source": [ - "# Example EDA\n" + "# Our label Distribution (countplot)\n", + "sns.countplot(x='Churn', data=data)" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 140, "metadata": {}, + "outputs": [], "source": [ - "# Preprocessing\n", - "\n", - "- Are there any duplicated values?\n", - "- Do we need to do feature scaling?\n", - "- Do we need to generate new features?\n", - "- Split Train and Test dataset. (0.7/0.3)" + "# Example EDA\n" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 141, "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], "source": [ - "# ML Application\n", - "\n", - "- Define models.\n", - "- Fit models.\n", - "- Evaluate models for both train and test dataset.\n", - "- Generate Confusion Matrix and scores of Accuracy, Recall, Precision and F1-Score.\n", - "- Analyse occurrence of overfitting and underfitting. If there is any of them, try to overcome it within a different section." + "#fig = plt.figure(figsize = (30, 25))\n", + "#name = list('AccountWeeks', 'ContractRenewal', \"DataPlan\", \"DataUsage\", 'CustServCalls', 'DayMins', 'DayCalls', 'MonthlyCharge', 'OverageFee', 'RoamMins')\n", + "plt.figure(figsize=(60, 25))\n", + "data.hist()\n", + "plt.show()" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 142, "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ChurnAccountWeeksContractRenewalDataPlanDataUsageCustServCallsDayMinsDayCallsMonthlyChargeOverageFeeRoamMins
Churn1.0000000.016541-0.259852-0.102148-0.0871950.2087500.2051510.0184590.0723130.0928120.068239
AccountWeeks0.0165411.000000-0.0247350.0029180.014391-0.0037960.0062160.0384700.012581-0.0067490.009514
ContractRenewal-0.259852-0.0247351.000000-0.006006-0.0192230.024522-0.049396-0.003755-0.047291-0.019105-0.045871
DataPlan-0.1021480.002918-0.0060061.0000000.945982-0.017824-0.001684-0.0110860.7374900.021526-0.001318
DataUsage-0.0871950.014391-0.0192230.9459821.000000-0.0217230.003176-0.0079620.7816600.0196370.162746
CustServCalls0.208750-0.0037960.024522-0.017824-0.0217231.000000-0.013423-0.018942-0.028017-0.012964-0.009640
DayMins0.2051510.006216-0.049396-0.0016840.003176-0.0134231.0000000.0067500.5679680.007038-0.010155
DayCalls0.0184590.038470-0.003755-0.011086-0.007962-0.0189420.0067501.000000-0.007963-0.0214490.021565
MonthlyCharge0.0723130.012581-0.0472910.7374900.781660-0.0280170.567968-0.0079631.0000000.2817660.117433
OverageFee0.092812-0.006749-0.0191050.0215260.019637-0.0129640.007038-0.0214490.2817661.000000-0.011023
RoamMins0.0682390.009514-0.045871-0.0013180.162746-0.009640-0.0101550.0215650.117433-0.0110231.000000
\n", + "
" + ], + "text/plain": [ + " Churn AccountWeeks ContractRenewal DataPlan DataUsage \\\n", + "Churn 1.000000 0.016541 -0.259852 -0.102148 -0.087195 \n", + "AccountWeeks 0.016541 1.000000 -0.024735 0.002918 0.014391 \n", + "ContractRenewal -0.259852 -0.024735 1.000000 -0.006006 -0.019223 \n", + "DataPlan -0.102148 0.002918 -0.006006 1.000000 0.945982 \n", + "DataUsage -0.087195 0.014391 -0.019223 0.945982 1.000000 \n", + "CustServCalls 0.208750 -0.003796 0.024522 -0.017824 -0.021723 \n", + "DayMins 0.205151 0.006216 -0.049396 -0.001684 0.003176 \n", + "DayCalls 0.018459 0.038470 -0.003755 -0.011086 -0.007962 \n", + "MonthlyCharge 0.072313 0.012581 -0.047291 0.737490 0.781660 \n", + "OverageFee 0.092812 -0.006749 -0.019105 0.021526 0.019637 \n", + "RoamMins 0.068239 0.009514 -0.045871 -0.001318 0.162746 \n", + "\n", + " CustServCalls DayMins DayCalls MonthlyCharge OverageFee \\\n", + "Churn 0.208750 0.205151 0.018459 0.072313 0.092812 \n", + "AccountWeeks -0.003796 0.006216 0.038470 0.012581 -0.006749 \n", + "ContractRenewal 0.024522 -0.049396 -0.003755 -0.047291 -0.019105 \n", + "DataPlan -0.017824 -0.001684 -0.011086 0.737490 0.021526 \n", + "DataUsage -0.021723 0.003176 -0.007962 0.781660 0.019637 \n", + "CustServCalls 1.000000 -0.013423 -0.018942 -0.028017 -0.012964 \n", + "DayMins -0.013423 1.000000 0.006750 0.567968 0.007038 \n", + "DayCalls -0.018942 0.006750 1.000000 -0.007963 -0.021449 \n", + "MonthlyCharge -0.028017 0.567968 -0.007963 1.000000 0.281766 \n", + "OverageFee -0.012964 0.007038 -0.021449 0.281766 1.000000 \n", + "RoamMins -0.009640 -0.010155 0.021565 0.117433 -0.011023 \n", + "\n", + " RoamMins \n", + "Churn 0.068239 \n", + "AccountWeeks 0.009514 \n", + "ContractRenewal -0.045871 \n", + "DataPlan -0.001318 \n", + "DataUsage 0.162746 \n", + "CustServCalls -0.009640 \n", + "DayMins -0.010155 \n", + "DayCalls 0.021565 \n", + "MonthlyCharge 0.117433 \n", + "OverageFee -0.011023 \n", + "RoamMins 1.000000 " + ] + }, + "execution_count": 142, + 
"metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Evaluation\n", - "\n", - "- Select the best performing model and write your comments about why choose this model.\n", - "- Analyse results and make comment about how you can improve model." + "data.corr()" ] + }, + { + "cell_type": "code", + "execution_count": 143, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 143, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "corr = data.corr()\n", + "sns.heatmap(corr)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 144, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 144, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAD7CAYAAABkO19ZAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAVb0lEQVR4nO3dcWycd33H8fdnKQ2hAZpQcjJJtITJsCWzKNSK2Dqhy8JIaKc5+yOSUZmcKZL3R2AweRrO+AP2h6UwLWhIayd5lM1bWS2PUsWiFWvIOCGk0tCUtGkSshgSEjdeAoW2GKqAw3d/3BO4JLbvznfnB//u85Ki57nf/X7P8/vqHn/85PFzd4oIzMwsXb+R9wTMzKy1HPRmZolz0JuZJc5Bb2aWOAe9mVniHPRmZomrKegl/ZWkE5Kel/SwpNdKWi3pkKQz2XJVRf99kiYknZa0vXXTNzOzalTtPnpJa4GvA5si4lVJY8DjwCbghxGxX9IgsCoiPiZpE/AwsAV4C/AV4G0RcbWVhZiZ2exuqaPfCkk/B14HXAT2AcXs+RGgBHwM6AFGI+IKcFbSBOXQf3Kujd9xxx2xYcOGuif/k5/8hNtuu63ucSlw7a693bj2m2s/evToDyLizdXGVw36iHhB0j8A54FXgSci4glJhYiYyvpMSVqTDVkLfKNiE5NZ25w2bNjA008/XW0qNymVShSLxbrHpcC1F/OeRi5cezHvaeRirtolfa+W8VWDPrv23gNsBF4C/kvSB+cbMkvbTdeHJPUD/QCFQoFSqVTDdK83PT29oHEpcO2lvKeRC9deynsauWi09lou3bwXOBsR3weQ9EXg94FLkjqys/kO4HLWfxJYXzF+HeVLPdeJiGFgGKC7uzsW8pvav+GLeU8jF669mPc0cuHaiwseX8tdN+eBd0t6nSQB24BTwDjQl/XpAw5m6+NAr6TlkjYCncCRBc/QzMwaUss1+qckfQF4BpgBvkX5THwlMCZpD+VfBruy/ieyO3NOZv33+o4bM7P81HTXTUR8AvjEDc1XKJ/dz9Z/CBhqbGpmZtYMfmesmVniHPRmZolz0JuZJc5Bb2aWuFo/AsEsVxsGH/vl+kDXDLsrHrfauf33Ltq+zFrBZ/RmZolz0JuZJc5Bb2aWOAe9mVniHPRmZolz0JuZJc5Bb2aWOAe9mVni/IYpq8uGRXyjkpk1h8/ozcwS56A3M0ucg97MLHEOejOzxFUNeklvl3Ss4t8rkj4qabWkQ5LOZMtVFWP2SZqQdFrS9taWYGZm86ka9BFxOiLujIg7gbuAnwKPAoPA4YjoBA5nj5G0CegFNgM7gAckLWvN9M3MrJp6L91sA74TEd8DeoCRrH0E2Jmt9wCjEXElIs4CE8CWJszVzMwWoN6g7wUeztYLETEFkC3XZO1rgQsVYyazNjMzy4EioraO0q3ARWBzRFyS9FJE3F7x/I8iYpWk+4En
I+KhrP1B4PGIeOSG7fUD/QCFQuGu0dHRuic/PT3NypUr6x6XgrxqP/7Cy4u+zxsVVsClVxdvf11r37h4O6vCx7xrr7R169ajEdFdbXw974x9P/BMRFzKHl+S1BERU5I6gMtZ+ySwvmLcOsq/IK4TEcPAMEB3d3cUi8U6plJWKpVYyLgU5FX7Yn6F31wGumY4cHzx3tR97r7iou2rGh/zxbynkYtGa6/n0s0H+NVlG4BxoC9b7wMOVrT3SlouaSPQCRxZ8AzNzKwhNZ0WSXod8EfAX1Q07wfGJO0BzgO7ACLihKQx4CQwA+yNiKtNnbWZmdWspqCPiJ8Cb7qh7UXKd+HM1n8IGGp4dmZm1jC/M9bMLHEOejOzxDnozcwS56A3M0ucg97MLHEOejOzxDnozcwS56A3M0ucg97MLHEOejOzxDnozcwS56A3M0ucg97MLHEOejOzxDnozcwS56A3M0ucg97MLHEOejOzxNUU9JJul/QFSd+WdErS70laLemQpDPZclVF/32SJiSdlrS9ddM3M7Nqaj2j/wzw5Yj4beAdwClgEDgcEZ3A4ewxkjYBvcBmYAfwgKRlzZ64mZnVpmrQS3oD8B7gQYCI+FlEvAT0ACNZtxFgZ7beA4xGxJWIOAtMAFuaO20zM6tVLWf0bwW+D/yrpG9J+qyk24BCREwBZMs1Wf+1wIWK8ZNZm5mZ5UARMX8HqRv4BnB3RDwl6TPAK8CHI+L2in4/iohVku4HnoyIh7L2B4HHI+KRG7bbD/QDFAqFu0ZHR+ue/PT0NCtXrqx7XAryqv34Cy8v+j5vVFgBl15dvP11rX3j4u2sCh/zrr3S1q1bj0ZEd7Xxt9Swj0lgMiKeyh5/gfL1+EuSOiJiSlIHcLmi//qK8euAizduNCKGgWGA7u7uKBaLNUzleqVSiYWMS0Fete8efGzR93mjga4ZDhyv5dBtjnP3FRdtX9X4mC/mPY1cNFp71Us3EfF/wAVJb8+atgEngXGgL2vrAw5m6+NAr6TlkjYCncCRBc/QzMwaUutp0YeBz0u6Ffgu8OeUf0mMSdoDnAd2AUTECUljlH8ZzAB7I+Jq02duZmY1qSnoI+IYMNt1oG1z9B8ChhY+LTMzaxa/M9bMLHEOejOzxC3erQtmS9SGnO40Orf/3lz2a+nxGb2ZWeIc9GZmiXPQm5klzkFvZpY4B72ZWeIc9GZmiXPQm5klzkFvZpY4B72ZWeIc9GZmiXPQm5klzkFvZpY4B72ZWeIc9GZmiXPQm5klrqagl3RO0nFJxyQ9nbWtlnRI0plsuaqi/z5JE5JOS9reqsmbmVl19ZzRb42IOyPi2nfHDgKHI6ITOJw9RtImoBfYDOwAHpC0rIlzNjOzOjRy6aYHGMnWR4CdFe2jEXElIs4CE8CWBvZjZmYNqDXoA3hC0lFJ/VlbISKmALLlmqx9LXChYuxk1mZmZjlQRFTvJL0lIi5KWgMcAj4MjEfE7RV9fhQRqyTdDzwZEQ9l7Q8Cj0fEIzdssx/oBygUCneNjo7WPfnp6WlWrlxZ97il7vgLL1NYAZdezXsm+WiX2rvWvvGmtnY95sG1z1b71q1bj1ZcTp9TTV8OHhEXs+VlSY9SvhRzSVJHRExJ6gAuZ90ngfUVw9cBF2fZ5jAwDNDd3R3FYrGWqVynVCqxkHFL3e7BxxjomuHA8fb8bvd2qf3cfcWb2tr1mAfX3kjtVS/dSLpN0uuvrQPvA54HxoG+rFsfcDBbHwd6JS2XtBHoBI4seIZmZtaQWk6LCsCjkq71/8+I+LKkbwJjkvYA54FdABFxQtIYcBKYAfZGxNWWzN7MzKqqGvQR8V3gHbO0vwhsm2PMEDDU8OzMzKxhfmesmVniHPRmZolz0JuZJc5Bb2aWOAe9mVniHPRmZolz0JuZJc5Bb2aWOAe9mVniHPRmZolz0JuZJc5Bb2aWOAe9mVniHPRmZolz0JuZJc5Bb2aWOAe9mVniHPRmZomrOeglLZP0
LUlfyh6vlnRI0plsuaqi7z5JE5JOS9reiombmVlt6jmj/whwquLxIHA4IjqBw9ljJG0CeoHNwA7gAUnLmjNdMzOrV01BL2kdcC/w2YrmHmAkWx8Bdla0j0bElYg4C0wAW5oyWzMzq1utZ/T/CPwN8IuKtkJETAFkyzVZ+1rgQkW/yazNzMxycEu1DpL+GLgcEUclFWvYpmZpi1m22w/0AxQKBUqlUg2bvt709PSCxi11A10zFFaUl+2oXWqf7dhu12MeXHsjtVcNeuBu4E8k3QO8FniDpIeAS5I6ImJKUgdwOes/CayvGL8OuHjjRiNiGBgG6O7ujmKxWPfkS6USCxm31O0efIyBrhkOHK/l5UtPu9R+7r7iTW3tesyDa2+k9qqXbiJiX0Ssi4gNlP/I+j8R8UFgHOjLuvUBB7P1caBX0nJJG4FO4MiCZ2hmZg1p5LRoPzAmaQ9wHtgFEBEnJI0BJ4EZYG9EXG14pmZmtiB1BX1ElIBStv4isG2OfkPAUINzMzOzJvA7Y83MEuegNzNLnIPezCxxDnozs8Q56M3MEuegNzNLnIPezCxxDnozs8Q56M3MEuegNzNLnIPezCxxDnozs8Q56M3MEuegNzNLnIPezCxxDnozs8Q56M3MEuegNzNLXNWgl/RaSUckPSvphKS/y9pXSzok6Uy2XFUxZp+kCUmnJW1vZQFmZja/Ws7orwB/GBHvAO4Edkh6NzAIHI6ITuBw9hhJm4BeYDOwA3hA0rIWzN3MzGpQNeijbDp7+JrsXwA9wEjWPgLszNZ7gNGIuBIRZ4EJYEszJ21mZrWr6Rq9pGWSjgGXgUMR8RRQiIgpgGy5Juu+FrhQMXwyazMzsxzcUkuniLgK3CnpduBRSb87T3fNtombOkn9QD9AoVCgVCrVMpXrTE9PL2jcUjfQNUNhRXnZjtql9tmO7XY95sG1N1J7TUF/TUS8JKlE+dr7JUkdETElqYPy2T6Uz+DXVwxbB1ycZVvDwDBAd3d3FIvFuidfKpVYyLilbvfgYwx0zXDgeF0vXzLapfZz9xVvamvXYx5ceyO113LXzZuzM3kkrQDeC3wbGAf6sm59wMFsfRzolbRc0kagEziy4BmamVlDajkt6gBGsjtnfgMYi4gvSXoSGJO0BzgP7AKIiBOSxoCTwAywN7v0Y2ZmOaga9BHxHPDOWdpfBLbNMWYIGGp4dmZtbMPgYze1DXTNsHuW9mY7t//elu/DFo/fGWtmljgHvZlZ4hz0ZmaJc9CbmSXOQW9mljgHvZlZ4hz0ZmaJc9CbmSXOQW9mljgHvZlZ4hz0ZmaJc9CbmSXOQW9mljgHvZlZ4hz0ZmaJS//72Fpots8LNzP7deMzejOzxDnozcwSV8uXg6+X9FVJpySdkPSRrH21pEOSzmTLVRVj9kmakHRa0vZWFmBmZvOr5Yx+BhiIiN8B3g3slbQJGAQOR0QncDh7TPZcL7AZ2AE8kH2xuJmZ5aBq0EfEVEQ8k63/GDgFrAV6gJGs2wiwM1vvAUYj4kpEnAUmgC1NnreZmdWormv0kjYA7wSeAgoRMQXlXwbAmqzbWuBCxbDJrM3MzHJQ8+2VklYCjwAfjYhXJM3ZdZa2mGV7/UA/QKFQoFQq1TqVX5qenl7QuGYZ6JrJbd+FFfnuP0+uvfW15/lzNZe8f97z1GjtNQW9pNdQDvnPR8QXs+ZLkjoiYkpSB3A5a58E1lcMXwdcvHGbETEMDAN0d3dHsVise/KlUomFjGuW3TneRz/QNcOB4+35NgjX3vraz91XbPk+6pX3z3ueGq29lrtuBDwInIqIT1c8NQ70Zet9wMGK9l5JyyVtBDqBIwueoZmZNaSWU4O7gT8Djks6lrX9LbAfGJO0BzgP7AKIiBOSxoCTlO/Y2RsRV5s9cTMzq03VoI+IrzP7dXeAbXOMGQKGGpiXmZk1id8Za2aWOAe9mVniHPRmZolz0JuZJc5Bb2aWOAe9mVniHPRmZolz0JuZJc5B
b2aWOAe9mVniHPRmZolz0JuZJc5Bb2aWOAe9mVniHPRmZolz0JuZJc5Bb2aWOAe9mVniavly8M9Juizp+Yq21ZIOSTqTLVdVPLdP0oSk05K2t2riZmZWm1q+HPzfgH8C/r2ibRA4HBH7JQ1mjz8maRPQC2wG3gJ8RdLb/OXgZkvLhsHHctnvuf335rLf1FU9o4+IrwE/vKG5BxjJ1keAnRXtoxFxJSLOAhPAluZM1czMFmKh1+gLETEFkC3XZO1rgQsV/SazNjMzy0ktl27qoVnaYtaOUj/QD1AoFCiVSnXvbHp6mlKpxPEXXq57bDMMdOWyWwAKK2Cgaya/CeTItadb+3w5cO3nvR01WvtCg/6SpI6ImJLUAVzO2ieB9RX91gEXZ9tARAwDwwDd3d1RLBbrnkSpVKJYLLI7p+uJeRromuHA8Wb/nl4aXHu6tZ+7rzjnc9d+3ttRo7Uv9NLNONCXrfcBByvaeyUtl7QR6ASOLHh2ZmbWsKqnBpIeBorAHZImgU8A+4ExSXuA88AugIg4IWkMOAnMAHt9x42ZWb6qBn1EfGCOp7bN0X8IGGpkUmZm1jx+Z6yZWeIc9GZmiXPQm5klzkFvZpY4B72ZWeIc9GZmiXPQm5klzkFvZpY4B72ZWeIc9GZmiXPQm5klzkFvZpY4B72ZWeIc9GZmiUv3q2rMbMnZMM+3xQ10zbTs2+TO7b+3Jdv9deEzejOzxDnozcwS56A3M0tcy4Je0g5JpyVNSBps1X7MzGx+LQl6ScuA+4H3A5uAD0ja1Ip9mZnZ/Fp1180WYCIivgsgaRToAU62aH9mZgs2390+rbYYd/y06tLNWuBCxePJrM3MzBZZq87oNUtbXNdB6gf6s4fTkk4vYD93AD9YwLgl7y9du2tvM6nWrk/V1G2u2n+zlsGtCvpJYH3F43XAxcoOETEMDDeyE0lPR0R3I9tYqly7a283rn3htbfq0s03gU5JGyXdCvQC4y3al5mZzaMlZ/QRMSPpQ8B/A8uAz0XEiVbsy8zM5teyz7qJiMeBx1u1/UxDl36WONfenlx7e2rsMndEVO9lZmZLlj8CwcwscUs26NvtIxYknZN0XNIxSU9nbaslHZJ0JluuynuezSDpc5IuS3q+om3OWiXty46D05K25zPr5pij9k9KeiF77Y9JuqfiuSRql7Re0lclnZJ0QtJHsvbkX/d5am/e6x4RS+4f5T/wfgd4K3Ar8CywKe95tbjmc8AdN7T9PTCYrQ8Cn8p7nk2q9T3Au4Dnq9VK+SM2ngWWAxuz42JZ3jU0ufZPAn89S99kagc6gHdl668H/jerL/nXfZ7am/a6L9Uz+l9+xEJE/Ay49hEL7aYHGMnWR4Cd+U2leSLia8APb2ieq9YeYDQirkTEWWCC8vGxJM1R+1ySqT0ipiLimWz9x8Apyu+mT/51n6f2udRd+1IN+nb8iIUAnpB0NHtXMUAhIqagfLAAa3KbXevNVWu7HAsfkvRcdmnn2uWLJGuXtAF4J/AUbfa631A7NOl1X6pBX/UjFhJ0d0S8i/Ingu6V9J68J/Rroh2OhX8Gfgu4E5gCDmTtydUuaSXwCPDRiHhlvq6ztKVWe9Ne96Ua9FU/YiE1EXExW14GHqX8X7VLkjoAsuXl/GbYcnPVmvyxEBGXIuJqRPwC+Bd+9d/0pGqX9BrKQff5iPhi1twWr/tstTfzdV+qQd9WH7Eg6TZJr7+2DrwPeJ5yzX1Ztz7gYD4zXBRz1ToO9EpaLmkj0AkcyWF+LXMt6DJ/Svm1h4RqlyTgQeBURHy64qnkX/e5am/q6573X5wb+Ev1PZT/Ov0d4ON5z6fFtb6V8l/ZnwVOXKsXeBNwGDiTLVfnPdcm1fsw5f+q/pzy2cue+WoFPp4dB6eB9+c9/xbU/h/AceC57Ie8I7XagT+gfPnhOeBY9u+ednjd56m9aa+73xlrZpa4pXrpxszMauSgNzNLnIPezCxxDnozs8Q56M3MEuegNzNL
nIPezCxxDnozs8T9P404lzspWHLQAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "data['AccountWeeks'].hist()" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ChurnAccountWeeksContractRenewalDataPlanDataUsageCustServCallsDayMinsDayCallsMonthlyChargeOverageFeeRoamMins
Churn1.0000000.016541-0.259852-0.102148-0.0871950.2087500.2051510.0184590.0723130.0928120.068239
AccountWeeks0.0165411.000000-0.0247350.0029180.014391-0.0037960.0062160.0384700.012581-0.0067490.009514
ContractRenewal-0.259852-0.0247351.000000-0.006006-0.0192230.024522-0.049396-0.003755-0.047291-0.019105-0.045871
DataPlan-0.1021480.002918-0.0060061.0000000.945982-0.017824-0.001684-0.0110860.7374900.021526-0.001318
DataUsage-0.0871950.014391-0.0192230.9459821.000000-0.0217230.003176-0.0079620.7816600.0196370.162746
CustServCalls0.208750-0.0037960.024522-0.017824-0.0217231.000000-0.013423-0.018942-0.028017-0.012964-0.009640
DayMins0.2051510.006216-0.049396-0.0016840.003176-0.0134231.0000000.0067500.5679680.007038-0.010155
DayCalls0.0184590.038470-0.003755-0.011086-0.007962-0.0189420.0067501.000000-0.007963-0.0214490.021565
MonthlyCharge0.0723130.012581-0.0472910.7374900.781660-0.0280170.567968-0.0079631.0000000.2817660.117433
OverageFee0.092812-0.006749-0.0191050.0215260.019637-0.0129640.007038-0.0214490.2817661.000000-0.011023
RoamMins0.0682390.009514-0.045871-0.0013180.162746-0.009640-0.0101550.0215650.117433-0.0110231.000000
\n", + "
" + ], + "text/plain": [ + " Churn AccountWeeks ContractRenewal DataPlan DataUsage \\\n", + "Churn 1.000000 0.016541 -0.259852 -0.102148 -0.087195 \n", + "AccountWeeks 0.016541 1.000000 -0.024735 0.002918 0.014391 \n", + "ContractRenewal -0.259852 -0.024735 1.000000 -0.006006 -0.019223 \n", + "DataPlan -0.102148 0.002918 -0.006006 1.000000 0.945982 \n", + "DataUsage -0.087195 0.014391 -0.019223 0.945982 1.000000 \n", + "CustServCalls 0.208750 -0.003796 0.024522 -0.017824 -0.021723 \n", + "DayMins 0.205151 0.006216 -0.049396 -0.001684 0.003176 \n", + "DayCalls 0.018459 0.038470 -0.003755 -0.011086 -0.007962 \n", + "MonthlyCharge 0.072313 0.012581 -0.047291 0.737490 0.781660 \n", + "OverageFee 0.092812 -0.006749 -0.019105 0.021526 0.019637 \n", + "RoamMins 0.068239 0.009514 -0.045871 -0.001318 0.162746 \n", + "\n", + " CustServCalls DayMins DayCalls MonthlyCharge OverageFee \\\n", + "Churn 0.208750 0.205151 0.018459 0.072313 0.092812 \n", + "AccountWeeks -0.003796 0.006216 0.038470 0.012581 -0.006749 \n", + "ContractRenewal 0.024522 -0.049396 -0.003755 -0.047291 -0.019105 \n", + "DataPlan -0.017824 -0.001684 -0.011086 0.737490 0.021526 \n", + "DataUsage -0.021723 0.003176 -0.007962 0.781660 0.019637 \n", + "CustServCalls 1.000000 -0.013423 -0.018942 -0.028017 -0.012964 \n", + "DayMins -0.013423 1.000000 0.006750 0.567968 0.007038 \n", + "DayCalls -0.018942 0.006750 1.000000 -0.007963 -0.021449 \n", + "MonthlyCharge -0.028017 0.567968 -0.007963 1.000000 0.281766 \n", + "OverageFee -0.012964 0.007038 -0.021449 0.281766 1.000000 \n", + "RoamMins -0.009640 -0.010155 0.021565 0.117433 -0.011023 \n", + "\n", + " RoamMins \n", + "Churn 0.068239 \n", + "AccountWeeks 0.009514 \n", + "ContractRenewal -0.045871 \n", + "DataPlan -0.001318 \n", + "DataUsage 0.162746 \n", + "CustServCalls -0.009640 \n", + "DayMins -0.010155 \n", + "DayCalls 0.021565 \n", + "MonthlyCharge 0.117433 \n", + "OverageFee -0.011023 \n", + "RoamMins 1.000000 " + ] + }, + "execution_count": 145, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.corr()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preprocessing\n", + "\n", + "- Are there any duplicated values?\n", + "- Do we need to do feature scaling?\n", + "- Do we need to generate new features?\n", + "- Split Train and Test dataset. (0.7/0.3)" + ] + }, + { + "cell_type": "code", + "execution_count": 147, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 2850\n", + "1 483\n", + "Name: Churn, dtype: int64" + ] + }, + "execution_count": 147, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data[\"Churn\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 148, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0.41167182, 0.67648946, 0.32758048, ..., 1.99072703, 0.0715836 ,\n", + " 0.08500823],\n", + " [0.41167182, 0.14906505, 0.32758048, ..., 1.56451025, 0.10708191,\n", + " 1.24048169],\n", + " [0.41167182, 0.9025285 , 0.32758048, ..., 0.26213309, 1.57434567,\n", + " 0.70312091],\n", + " ...,\n", + " [0.41167182, 1.83505538, 0.32758048, ..., 0.01858065, 1.73094204,\n", + " 1.3837779 ],\n", + " [0.41167182, 2.08295458, 3.05268496, ..., 0.38390932, 0.81704825,\n", + " 1.87621082],\n", + " [0.41167182, 0.67974475, 0.32758048, ..., 2.66049626, 1.28129669,\n", + " 1.24048169]])" + ] + }, + "execution_count": 148, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from scipy import stats\n", + "import numpy as np\n", + "z = np.abs(stats.zscore(data))\n", + "z" + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "414" + ] + }, + "execution_count": 149, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "outliers = list(set(np.where(z > 3)[0]))\n", + "\n", + "len(outliers)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 152, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import classification_report, confusion_matrix\n", + "from sklearn.model_selection import train_test_split, cross_validate" + ] + }, + { + "cell_type": "code", + "execution_count": 154, + "metadata": {}, + "outputs": [], + "source": [ + "y = data.Churn" + ] + }, + { + "cell_type": "code", + "execution_count": 155, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 0\n", + "1 0\n", + "2 0\n", + "3 0\n", + "4 0\n", + " ..\n", + "3328 0\n", + "3329 0\n", + "3330 0\n", + "3331 0\n", + "3332 0\n", + "Name: Churn, Length: 3333, dtype: int64" + ] + }, + "execution_count": 155, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y" + ] + }, + { + "cell_type": "code", + "execution_count": 157, + "metadata": {}, + "outputs": [], + "source": [ + "X=data.drop('Churn',axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 159, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AccountWeeksContractRenewalDataPlanDataUsageCustServCallsDayMinsDayCallsMonthlyChargeOverageFeeRoamMins
0128112.701265.111089.09.8710.0
1107113.701161.612382.09.7813.7
2137100.000243.411452.06.0612.2
384000.002299.47157.03.106.6
475000.003166.711341.07.4210.1
.................................
3328192112.672156.27771.710.789.9
332968100.343231.15756.47.679.6
333028100.002180.810956.014.4414.1
3331184000.002213.810550.07.985.0
333274113.700234.4113100.013.3013.7
\n", + "

3333 rows × 10 columns

\n", + "
" + ], + "text/plain": [ + " AccountWeeks ContractRenewal DataPlan DataUsage CustServCalls \\\n", + "0 128 1 1 2.70 1 \n", + "1 107 1 1 3.70 1 \n", + "2 137 1 0 0.00 0 \n", + "3 84 0 0 0.00 2 \n", + "4 75 0 0 0.00 3 \n", + "... ... ... ... ... ... \n", + "3328 192 1 1 2.67 2 \n", + "3329 68 1 0 0.34 3 \n", + "3330 28 1 0 0.00 2 \n", + "3331 184 0 0 0.00 2 \n", + "3332 74 1 1 3.70 0 \n", + "\n", + " DayMins DayCalls MonthlyCharge OverageFee RoamMins \n", + "0 265.1 110 89.0 9.87 10.0 \n", + "1 161.6 123 82.0 9.78 13.7 \n", + "2 243.4 114 52.0 6.06 12.2 \n", + "3 299.4 71 57.0 3.10 6.6 \n", + "4 166.7 113 41.0 7.42 10.1 \n", + "... ... ... ... ... ... \n", + "3328 156.2 77 71.7 10.78 9.9 \n", + "3329 231.1 57 56.4 7.67 9.6 \n", + "3330 180.8 109 56.0 14.44 14.1 \n", + "3331 213.8 105 50.0 7.98 5.0 \n", + "3332 234.4 113 100.0 13.30 13.7 \n", + "\n", + "[3333 rows x 10 columns]" + ] + }, + "execution_count": 159, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X" + ] + }, + { + "cell_type": "code", + "execution_count": 161, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 163, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexChurnAccountWeeksContractRenewalDataPlanDataUsageCustServCallsDayMinsDayCallsMonthlyChargeOverageFeeRoamMins
000128112.701265.111089.09.8710.0
110107113.701161.612382.09.7813.7
220137100.000243.411452.06.0612.2
360121112.033218.28887.317.437.5
480117100.191184.59763.917.588.7
.......................................
29143327079100.002134.79840.09.4911.8
291533280192112.672156.27771.710.789.9
29163329068100.343231.15756.47.679.6
29173330028100.002180.810956.014.4414.1
29183332074113.700234.4113100.013.3013.7
\n", + "

2919 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " index Churn AccountWeeks ContractRenewal DataPlan DataUsage \\\n", + "0 0 0 128 1 1 2.70 \n", + "1 1 0 107 1 1 3.70 \n", + "2 2 0 137 1 0 0.00 \n", + "3 6 0 121 1 1 2.03 \n", + "4 8 0 117 1 0 0.19 \n", + "... ... ... ... ... ... ... \n", + "2914 3327 0 79 1 0 0.00 \n", + "2915 3328 0 192 1 1 2.67 \n", + "2916 3329 0 68 1 0 0.34 \n", + "2917 3330 0 28 1 0 0.00 \n", + "2918 3332 0 74 1 1 3.70 \n", + "\n", + " CustServCalls DayMins DayCalls MonthlyCharge OverageFee RoamMins \n", + "0 1 265.1 110 89.0 9.87 10.0 \n", + "1 1 161.6 123 82.0 9.78 13.7 \n", + "2 0 243.4 114 52.0 6.06 12.2 \n", + "3 3 218.2 88 87.3 17.43 7.5 \n", + "4 1 184.5 97 63.9 17.58 8.7 \n", + "... ... ... ... ... ... ... \n", + "2914 2 134.7 98 40.0 9.49 11.8 \n", + "2915 2 156.2 77 71.7 10.78 9.9 \n", + "2916 3 231.1 57 56.4 7.67 9.6 \n", + "2917 2 180.8 109 56.0 14.44 14.1 \n", + "2918 0 234.4 113 100.0 13.30 13.7 \n", + "\n", + "[2919 rows x 12 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "2919" + ] + }, + "execution_count": 163, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "new_df = data.drop(outliers,axis = 0).reset_index(drop = False)\n", + "display(new_df)\n", + "\n", + "y_new = y[list(new_df[\"index\"])]\n", + "len(y_new)" + ] + }, + { + "cell_type": "code", + "execution_count": 164, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[-0.35150616, 0.69196206, 0. , ..., 2.05929645,\n", + " -0.068431 , -0.0941079 ],\n", + " [-0.35150616, 0.1624696 , 0. , ..., 1.62161451,\n", + " -0.10450144, 1.29308134],\n", + " [-0.35150616, 0.9188874 , 0. , ..., -0.25416527,\n", + " -1.59541298, 0.73070732],\n", + " ...,\n", + " [-0.35150616, -0.82087355, 0. , ..., 0.0209491 ,\n", + " -0.95015288, -0.2440743 ],\n", + " [-0.35150616, -1.82943062, 0. , ..., -0.0040613 ,\n", + " 1.76314581, 1.44304774],\n", + " [-0.35150616, -0.66958999, 0. 
, ..., 2.74708237,\n", + " 1.30625356, 1.29308134]])" + ] + }, + "execution_count": 164, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_new = new_df.drop('index', axis = 1)\n", + "\n", + "X_scaled = StandardScaler().fit_transform(X_new)\n", + "X_scaled" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[-0.35150616, 0.69196206, 0. , ..., 2.05929645,\n", + " -0.068431 , -0.0941079 ],\n", + " [-0.35150616, 0.1624696 , 0. , ..., 1.62161451,\n", + " -0.10450144, 1.29308134],\n", + " [-0.35150616, 0.9188874 , 0. , ..., -0.25416527,\n", + " -1.59541298, 0.73070732],\n", + " ...,\n", + " [-0.35150616, -0.82087355, 0. , ..., 0.0209491 ,\n", + " -0.95015288, -0.2440743 ],\n", + " [-0.35150616, -1.82943062, 0. , ..., -0.0040613 ,\n", + " 1.76314581, 1.44304774],\n", + " [-0.35150616, -0.66958999, 0. , ..., 2.74708237,\n", + " 1.30625356, 1.29308134]])" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = new_data.drop('index', axis = 1)\n", + "\n", + "data = StandardScaler().fit_transform(data)\n", + "data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ML Application\n", + "\n", + "- Define models.\n", + "- Fit models.\n", + "- Evaluate models for both train and test dataset.\n", + "- Generate Confusion Matrix and scores of Accuracy, Recall, Precision and F1-Score.\n", + "- Analyse occurrence of overfitting and underfitting. If there is any of them, try to overcome it within a different section." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Logistic Regression" + ] + }, + { + "cell_type": "code", + "execution_count": 166, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mean training accuracy: 0.8551219232453166\n", + "Test accuracy: 0.848\n" + ] + } + ], + "source": [ + "models = LogisticRegression(random_state=42, n_jobs=-1)\n", + "cv = cross_validate(models,X_train,y_train,cv = 3, n_jobs=-1, return_estimator=True)\n", + "\n", + "print(\"Mean training accuracy: {}\".format(np.mean(cv['test_score'])))\n", + "print(\"Test accuracy: {}\".format(cv[\"estimator\"][0].score(X_test,y_test)))" + ] + }, + { + "cell_type": "code", + "execution_count": 209, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[830, 27],\n", + " [125, 18]], dtype=int64)" + ] + }, + "execution_count": 209, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pred_prob1 = cv[\"estimator\"][0].predict(X_test)\n", + "\n", + "cm = confusion_matrix(y_test, pred_prob1)\n", + "cm" + ] + }, + { + "cell_type": "code", + "execution_count": 210, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(12, 8))\n", + "ax =sns.heatmap(cm, square=True, annot=True, cbar=False)\n", + "ax.xaxis.set_ticklabels([\"False\",\"True\"], fontsize = 12)\n", + "ax.yaxis.set_ticklabels([\"False\",\"True\"], fontsize = 12, rotation=0)\n", + "ax.set_xlabel('Predicted Labels',fontsize = 15)\n", + "ax.set_ylabel('True Labels',fontsize = 15)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 211, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.87 0.97 0.92 857\n", + " 1 0.40 0.13 0.19 143\n", + "\n", + " accuracy 0.85 1000\n", + " macro avg 0.63 0.55 0.55 1000\n", + "weighted avg 0.80 0.85 0.81 1000\n", + "\n" + ] + } + ], + "source": [ + "print(classification_report(y_test,pred_prob1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Decision Tree" + ] + }, + { + "cell_type": "code", + "execution_count": 181, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy of train: 0.9309901414487785\n", + "Accuracy of test: 0.92\n" + ] + } + ], + "source": [ + "clf = DecisionTreeClassifier(max_depth=4 , random_state=42)\n", + "clf.fit(X_train,y_train)\n", + "print(\"Accuracy of train:\",clf.score(X_train,y_train))\n", + "print(\"Accuracy of test:\",clf.score(X_test,y_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 212, + "metadata": {}, + "outputs": [], + "source": [ + "pred_prob2 = clf.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 214, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.86 1.00 0.92 857\n", + " 1 1.00 0.01 0.01 143\n", + "\n", + " accuracy 0.86 1000\n", + " macro avg 0.93 0.50 
0.47 1000\n", + "weighted avg 0.88 0.86 0.79 1000\n", + "\n" + ] + } + ], + "source": [ + "print(classification_report(y_test,pred_prob2))" + ] + }, + { + "cell_type": "code", + "execution_count": 218, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[857, 0],\n", + " [142, 1]], dtype=int64)" + ] + }, + "execution_count": 218, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm = confusion_matrix(y_test, pred_prob2)\n", + "cm" + ] + }, + { + "cell_type": "code", + "execution_count": 219, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(12, 8))\n", + "ax =sns.heatmap(cm, square=True, annot=True, cbar=False)\n", + "ax.xaxis.set_ticklabels([\"False\",\"True\"], fontsize = 12)\n", + "ax.yaxis.set_ticklabels([\"False\",\"True\"], fontsize = 12, rotation=0)\n", + "ax.set_xlabel('Predicted Labels',fontsize = 15)\n", + "ax.set_ylabel('True Labels',fontsize = 15)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### SVM - Support Vector Machine" + ] + }, + { + "cell_type": "code", + "execution_count": 232, + "metadata": {}, + "outputs": [], + "source": [ + "clf_svm = svm.SVC(random_state=24)" + ] + }, + { + "cell_type": "code", + "execution_count": 235, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy of train: 0.8551221603086155\n", + "Accuracy of test: 0.858\n" + ] + } + ], + "source": [ + "clf_svm.fit(X_train,y_train)  # score the SVM itself below; `clf` is the decision tree from the previous section\n", + "print(\"Accuracy of train:\",clf_svm.score(X_train,y_train))\n", + "print(\"Accuracy of test:\",clf_svm.score(X_test,y_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 239, + "metadata": {}, + "outputs": [], + "source": [ + "pred_prob3 = clf_svm.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 240, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[857, 0],\n", + " [142, 1]], dtype=int64)" + ] + }, + "execution_count": 240, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm = confusion_matrix(y_test, pred_prob3)\n", + "cm" + ] + }, + { + "cell_type": "code", + "execution_count": 241, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.86 1.00 0.92 857\n", + " 1 1.00 0.01 0.01 143\n", + "\n", + " accuracy 0.86 1000\n", + " macro avg 0.93 0.50 0.47 1000\n", 
+ "weighted avg 0.88 0.86 0.79 1000\n", + "\n" + ] + } + ], + "source": [ + "print(classification_report(y_test,pred_prob3))" + ] + }, + { + "cell_type": "code", + "execution_count": 204, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(12, 8))\n", + "ax =sns.heatmap(cm, square=True, annot=True, cbar=False)\n", + "ax.xaxis.set_ticklabels([\"False\",\"True\"], fontsize = 12)\n", + "ax.yaxis.set_ticklabels([\"False\",\"True\"], fontsize = 12, rotation=0)\n", + "ax.set_xlabel('Predicted Labels',fontsize = 15)\n", + "ax.set_ylabel('True Labels',fontsize = 15)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Evaluation\n", + "\n", + "- Select the best performing model and write your comments about why choose this model.\n", + "- Analyse results and make comment about how you can improve model." + ] + }, + { + "cell_type": "code", + "execution_count": 244, + "metadata": {}, + "outputs": [ + { + "ename": "IndexError", + "evalue": "too many indices for array: array is 1-dimensional, but 2 were indexed", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mIndexError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;31m# roc curve for models\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 4\u001b[1;33m \u001b[0mfpr1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtpr1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mthresh1\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mroc_curve\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my_test\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpred_prob1\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0mpos_label\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 5\u001b[0m \u001b[0mfpr2\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtpr2\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mthresh2\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mroc_curve\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my_test\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpred_prob2\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpos_label\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6\u001b[0m \u001b[0mfpr3\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtpr3\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mthresh3\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mroc_curve\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my_test\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpred_prob3\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpos_label\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m3\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mIndexError\u001b[0m: too many indices for array: array is 1-dimensional, but 2 were indexed" + ] + } + ], + "source": [ + "from sklearn.metrics import roc_curve\n", + "# roc curve for models: ROC/AUC need continuous scores, not the hard 0/1 labels that predict() stored in pred_prob1..3 (SVC without probability=True only offers decision_function)\n", + "score1, score2, score3 = cv[\"estimator\"][0].predict_proba(X_test)[:,1], clf.predict_proba(X_test)[:,1], clf_svm.decision_function(X_test)\n", + "fpr1, tpr1, thresh1 = roc_curve(y_test, score1, pos_label=1)\n", + "fpr2, tpr2, thresh2 = roc_curve(y_test, score2, pos_label=1)\n", + "fpr3, tpr3, thresh3 = roc_curve(y_test, score3, pos_label=1)\n", + "\n", + "# roc curve for tpr = fpr \n", + "random_probs = [0 for i in range(len(y_test))]\n", + "p_fpr, p_tpr, _ = roc_curve(y_test, random_probs, pos_label=1)\n", + "\n", + "\n", + "\n", + "from sklearn.metrics import roc_auc_score\n", + "\n", + "# auc 
scores\n", + "auc_score1 = roc_auc_score(y_test, score1)\n", + "auc_score2 = roc_auc_score(y_test, score2)\n", + "auc_score3 = roc_auc_score(y_test, score3)\n", + "\n", + "print(auc_score1, auc_score2, auc_score3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -323,7 +2327,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.9" } }, "nbformat": 4, diff --git a/Regularization/.ipynb_checkpoints/regularization-checkpoint.ipynb b/Regularization/.ipynb_checkpoints/regularization-checkpoint.ipynb new file mode 100644 index 0000000..7b59912 --- /dev/null +++ b/Regularization/.ipynb_checkpoints/regularization-checkpoint.ipynb @@ -0,0 +1,1307 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + " \"Open\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "eGMjJYeplnfS" + }, + "source": [ + " All rights reserved © Global AI Hub 2020 \n", + " \n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "20RFH5fJlmoU" + }, + "source": [ + "# Regularization" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "NhnitxlJlmoW" + }, + "source": [ + "## What is Regularization?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](img/1.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "L-et45HalmoY" + }, + "source": [ + "As we move towards the right in this image, our model tries to learn too well the details and the noise from the training data, which ultimately results in poor performance on the unseen data." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Cj-EgOdTlmoZ" + }, + "source": [ + "In other words, while going towards the right, the complexity of the model increases such that the training error reduces but the testing error doesn’t. This is shown in the image below." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "XTxBAXp9lmoa" + }, + "source": [ + "![](img/2.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "_fbx2bb-lmob" + }, + "source": [ + "Regularization is a technique which makes slight modifications to the learning algorithm such that the model generalizes better. This in turn improves the model’s performance on the unseen data as well." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ZoBOI3x8lmod" + }, + "source": [ + "## Underfitting and Overfitting" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "RXHl79wolmoe" + }, + "source": [ + "Before diving further let’s understand the important terms:\n", + "\n", + "Bias – Assumptions made by a model to make a function easier to learn.\n", + "\n", + "Variance – If you train your data on training data and obtain a very low error, upon changing the data and then training the same previous model you experience high error, this is variance." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "H5FVNnzhlmof" + }, + "source": [ + "#### Underfitting" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "0dJ-prYHlmog" + }, + "source": [ + "A statistical model or a machine learning algorithm is said to have underfitting when it cannot capture the underlying trend of the data. Underfitting destroys the accuracy of our machine learning model. Its occurrence simply means that our model or the algorithm does not fit the data well enough. 
It usually happens when we have too little data to build an accurate model, or when we try to fit a linear model to non-linear data. In such cases the rules learned by the machine learning model are too simple to capture the structure of the data and therefore the model will probably make a lot of wrong predictions. Underfitting can be avoided by using more data and by giving the model more capacity or more informative features.\n", + "\n", + "Underfitting – High bias and low variance\n", + "\n", + "Techniques to reduce underfitting :\n", + "1. Increase model complexity\n", + "2. Increase number of features, performing feature engineering\n", + "3. Remove noise from the data.\n", + "4. Increase the number of epochs or increase the duration of training to get better results." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "D0Z1D2C3lmoj" + }, + "source": [ + "#### Overfitting" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "c06krbaqlmol" + }, + "source": [ + "A statistical model is said to be overfitted when it fits the training data too closely. When a model is too flexible for the amount of data it is given, or is trained for too long, it starts learning from the noise and inaccurate data entries in our data set. Then the model does not categorize new data correctly, because it has memorized too many details and noise. Overfitting is most common with non-parametric and non-linear methods, because these types of machine learning algorithms have more freedom in building the model based on the dataset and can therefore build unrealistic models. A solution to avoid overfitting is using a linear algorithm if we have linear data, or limiting parameters like the maximal depth if we are using decision trees.\n", + "\n", + "Overfitting – High variance and low bias\n", + "\n", + "Techniques to reduce overfitting :\n", + "1. Increase training data.\n", + "2. Reduce model complexity.\n", + "3. 
Early stopping during the training phase (have an eye over the loss over the training period as soon as loss begins to increase stop training).\n", + "4. Ridge Regularization and Lasso Regularization\n", + "5. Use dropout for neural networks to tackle overfitting." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "sZUR9dp2lmom" + }, + "source": [ + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "qKBncR2llmop" + }, + "source": [ + "### Ridge Regression" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "FYymr4AOlmoq" + }, + "source": [ + "Ridge regression is also called L2 regularization. It adds a constraint that is a linear function of the squared coefficients." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "5xo2joEXlmos" + }, + "source": [ + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "CEveA8_8lmot" + }, + "source": [ + "To minimize the regularized loss function, we need to choose λ to minimize the sum of the area of the circle and the area of the ellipsoid chosen by the tangency.\n", + "\n", + "Note that when λ tends to zero, the regularized loss function becomes the OLS loss function.\n", + "\n", + "When λ tends to infinity, we get an intercept-only model (because in this case, the ridge regression coefficients tend to zero). Now we have smaller variance but larger bias.\n", + "\n", + "A critique of ridge regression is that all the variables tend to end up in the model. The model only shrinks the coefficients." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "D1rYoxO7lmou" + }, + "source": [ + "### Lasso " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Z50XcFW7lmov" + }, + "source": [ + "Lasso is also known as L1 regularization. 
It penalizes the model by the absolute weight coefficients." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Y6T9pjK7lmox" + }, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "RXBwjdnKlmoz" + }, + "source": [ + "Lasso works in the following way: it forces the sum of the absolute value of the coefficients to be less than a constant, which forces some of the coefficients to be zero and results in a simpler model. This is because comparing to the L2 regularization, the ellipsoid has the tendency to touch the diamond-shaped constraint on the corner.\n", + "\n", + "Lasso performs better than ridge regression in the sense that it helps a lot with feature selection." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Q2Uf_S23lmo6" + }, + "outputs": [], + "source": [ + "# Import packages and observe dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "AryxEyFAlmpA" + }, + "outputs": [], + "source": [ + "#Import numerical libraries\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "#Import graphical plotting libraries\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "#Import Linear Regression Machine Learning Libraries\n", + "from sklearn import preprocessing\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "from sklearn.linear_model import LinearRegression, Ridge, Lasso\n", + "from sklearn.metrics import r2_score" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "OMmycw9glmpE", + "outputId": "678dea27-1b77-4ed8-a15a-00099d94ec03" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mpgcylindersdisplacementhorsepowerweightaccelerationmodel_yearoriginname
018.08307.0130350412.0701chevrolet chevelle malibu
115.08350.0165369311.5701buick skylark 320
218.08318.0150343611.0701plymouth satellite
316.08304.0150343312.0701amc rebel sst
417.08302.0140344910.5701ford torino
\n", + "
" + ], + "text/plain": [ + " mpg cylinders displacement horsepower weight acceleration model_year \\\n", + "0 18.0 8 307.0 130 3504 12.0 70 \n", + "1 15.0 8 350.0 165 3693 11.5 70 \n", + "2 18.0 8 318.0 150 3436 11.0 70 \n", + "3 16.0 8 304.0 150 3433 12.0 70 \n", + "4 17.0 8 302.0 140 3449 10.5 70 \n", + "\n", + " origin name \n", + "0 1 chevrolet chevelle malibu \n", + "1 1 buick skylark 320 \n", + "2 1 plymouth satellite \n", + "3 1 amc rebel sst \n", + "4 1 ford torino " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_raw = pd.read_csv('datasets/reg.csv')\n", + "data_raw.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 398 entries, 0 to 397\n", + "Data columns (total 9 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 mpg 398 non-null float64\n", + " 1 cylinders 398 non-null int64 \n", + " 2 displacement 398 non-null float64\n", + " 3 horsepower 398 non-null object \n", + " 4 weight 398 non-null int64 \n", + " 5 acceleration 398 non-null float64\n", + " 6 model_year 398 non-null int64 \n", + " 7 origin 398 non-null int64 \n", + " 8 name 398 non-null object \n", + "dtypes: float64(3), int64(4), object(2)\n", + "memory usage: 28.1+ KB\n" + ] + } + ], + "source": [ + "data_raw.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "82mm8sMrlmpJ" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mpgcylindersdisplacementhorsepowerweightaccelerationmodel_yearorigin
018.08307.0130350412.0701
115.08350.0165369311.5701
218.08318.0150343611.0701
316.08304.0150343312.0701
417.08302.0140344910.5701
\n", + "
" + ], + "text/plain": [ + " mpg cylinders displacement horsepower weight acceleration model_year \\\n", + "0 18.0 8 307.0 130 3504 12.0 70 \n", + "1 15.0 8 350.0 165 3693 11.5 70 \n", + "2 18.0 8 318.0 150 3436 11.0 70 \n", + "3 16.0 8 304.0 150 3433 12.0 70 \n", + "4 17.0 8 302.0 140 3449 10.5 70 \n", + "\n", + " origin \n", + "0 1 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Drop car name\n", + "#Replace origin into 1,2,3.. dont forget get_dummies\n", + "#Replace ? with nan\n", + "#Replace all nan with median\n", + "\n", + "data = data_raw.drop(['name'], axis = 1)\n", + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "mpg 0\n", + "cylinders 0\n", + "displacement 0\n", + "horsepower 0\n", + "weight 0\n", + "acceleration 0\n", + "model_year 0\n", + "origin 0\n", + "dtype: int64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "mpg 0\n", + "cylinders 0\n", + "displacement 0\n", + "horsepower 6\n", + "weight 0\n", + "acceleration 0\n", + "model_year 0\n", + "origin 0\n", + "dtype: int64" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.isin(['?']).sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mpgcylindersdisplacementhorsepowerweightaccelerationmodel_yearorigin_americaorigin_asiaorigin_europe
018.08307.0130350412.070100
115.08350.0165369311.570100
218.08318.0150343611.070100
316.08304.0150343312.070100
417.08302.0140344910.570100
\n", + "
" + ], + "text/plain": [ + " mpg cylinders displacement horsepower weight acceleration model_year \\\n", + "0 18.0 8 307.0 130 3504 12.0 70 \n", + "1 15.0 8 350.0 165 3693 11.5 70 \n", + "2 18.0 8 318.0 150 3436 11.0 70 \n", + "3 16.0 8 304.0 150 3433 12.0 70 \n", + "4 17.0 8 302.0 140 3449 10.5 70 \n", + "\n", + " origin_america origin_asia origin_europe \n", + "0 1 0 0 \n", + "1 1 0 0 \n", + "2 1 0 0 \n", + "3 1 0 0 \n", + "4 1 0 0 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data['origin'] = data['origin'].replace({1: 'america', 2: 'europe', 3: 'asia'})\n", + "data = pd.get_dummies(data, columns = ['origin'])\n", + "data = data.replace('?', np.nan)\n", + "data = data.apply(lambda x: x.fillna(x.median()), axis = 0)\n", + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 143, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "PEzoE7HLlmpR" + }, + "outputs": [], + "source": [ + "# Model building" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "WnV4zqeQlmpW" + }, + "outputs": [], + "source": [ + "X = data.drop(['mpg'], axis = 1) # independent variable\n", + "y = data[['mpg']] #dependent variable" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "JxfdOeeOlmpa", + "outputId": "03846650-c3b3-4736-f4b9-234237103363" + }, + "outputs": [], + "source": [ + "#Scaling the data\n", + "\n", + "X_s = preprocessing.scale(X)\n", + "X_s = pd.DataFrame(X_s, columns = X.columns) #converting scaled data into dataframe\n", + "\n", + "y_s = preprocessing.scale(y)\n", + "y_s = pd.DataFrame(y_s, columns = y.columns) #ideally train, test data should be in columns" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cylindersdisplacementhorsepowerweightaccelerationmodel_yearorigin_americaorigin_asiaorigin_europe
01.4981911.0906040.6731180.630870-1.295498-1.6274260.773559-0.497643-0.461968
11.4981911.5035141.5899580.854333-1.477038-1.6274260.773559-0.497643-0.461968
21.4981911.1962321.1970270.550470-1.658577-1.6274260.773559-0.497643-0.461968
31.4981911.0617961.1970270.546923-1.295498-1.6274260.773559-0.497643-0.461968
41.4981911.0425910.9350720.565841-1.840117-1.6274260.773559-0.497643-0.461968
..............................
393-0.856321-0.513026-0.479482-0.2133240.0115861.6219830.773559-0.497643-0.461968
394-0.856321-0.925936-1.370127-0.9936713.2792961.621983-1.292726-0.4976432.164651
395-0.856321-0.561039-0.531873-0.798585-1.4407301.6219830.773559-0.497643-0.461968
396-0.856321-0.705077-0.662850-0.4084111.1008221.6219830.773559-0.497643-0.461968
397-0.856321-0.714680-0.584264-0.2960881.3912851.6219830.773559-0.497643-0.461968
\n", + "

398 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " cylinders displacement horsepower weight acceleration model_year \\\n", + "0 1.498191 1.090604 0.673118 0.630870 -1.295498 -1.627426 \n", + "1 1.498191 1.503514 1.589958 0.854333 -1.477038 -1.627426 \n", + "2 1.498191 1.196232 1.197027 0.550470 -1.658577 -1.627426 \n", + "3 1.498191 1.061796 1.197027 0.546923 -1.295498 -1.627426 \n", + "4 1.498191 1.042591 0.935072 0.565841 -1.840117 -1.627426 \n", + ".. ... ... ... ... ... ... \n", + "393 -0.856321 -0.513026 -0.479482 -0.213324 0.011586 1.621983 \n", + "394 -0.856321 -0.925936 -1.370127 -0.993671 3.279296 1.621983 \n", + "395 -0.856321 -0.561039 -0.531873 -0.798585 -1.440730 1.621983 \n", + "396 -0.856321 -0.705077 -0.662850 -0.408411 1.100822 1.621983 \n", + "397 -0.856321 -0.714680 -0.584264 -0.296088 1.391285 1.621983 \n", + "\n", + " origin_america origin_asia origin_europe \n", + "0 0.773559 -0.497643 -0.461968 \n", + "1 0.773559 -0.497643 -0.461968 \n", + "2 0.773559 -0.497643 -0.461968 \n", + "3 0.773559 -0.497643 -0.461968 \n", + "4 0.773559 -0.497643 -0.461968 \n", + ".. ... ... ... 
\n", + "393 0.773559 -0.497643 -0.461968 \n", + "394 -1.292726 -0.497643 2.164651 \n", + "395 0.773559 -0.497643 -0.461968 \n", + "396 0.773559 -0.497643 -0.461968 \n", + "397 0.773559 -0.497643 -0.461968 \n", + "\n", + "[398 rows x 9 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_s" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "8P5JzDBMlmpe", + "outputId": "ed4436cb-8aa2-4d85-fc94-9ff4cf61378b" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(278, 9)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Split into train, test set\n", + "\n", + "X_train, X_test, y_train,y_test = train_test_split(X_s, y_s, test_size = 0.30, random_state = 102)\n", + "X_train.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 147, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "QuEtPChAlmph" + }, + "outputs": [], + "source": [ + "# Simple Linear Model" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "6oJNDtoIlmpl", + "outputId": "857b5f78-64fe-4263-bd4f-fb0c1488177f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Regression model coef: [[ 0.00786436 0.21673705 -0.11482021 -0.64460528 0.02689514 0.3781176\n", + " -0.12143029 0.10030822 0.04927016]]\n" + ] + } + ], + "source": [ + "#Fit simple linear model and find coefficients\n", + "regression_model = LinearRegression()\n", + "regression_model.fit(X_train, y_train)\n", + "\n", + "print(f'Regression model coef: {regression_model.coef_}')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "xquEQhq9lmpo" + }, + "outputs": [], + "source": [ + "# Regularized Ridge Regression" + ] + }, + { + 
"cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "5tgMytDTlmpr", + "outputId": "068228d7-f34a-4335-99b5-3f4c8edc7466" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ridge model coef: [[ 0.00947064 0.20687242 -0.11623737 -0.6366149 0.02528395 0.37704409\n", + " -0.12085268 0.10023402 0.04861364]]\n" + ] + } + ], + "source": [ + "#alpha factor here is lambda (penalty term) which helps to reduce the magnitude of coeff\n", + "\n", + "ridge_model = Ridge(alpha = 0.3)\n", + "ridge_model.fit(X_train, y_train)\n", + "\n", + "print(f'Ridge model coef: {ridge_model.coef_}')\n", + "#As the data has 9 feature columns, 9 coefficients appear here " + ] + }, + { + "cell_type": "code", + "execution_count": 151, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "O3y8eJ0slmpv" + }, + "outputs": [], + "source": [ + "# Regularized Lasso Regression" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "5F1M2L_ylmpx", + "outputId": "f987a2b2-a9b5-4816-d6f7-1a9a8b2a72f0" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Lasso model coef: [ 0.00728877 0.19201382 -0.11069878 -0.62910384 0.02365075 0.37598182\n", + " -0.17839937 0.04919513 0. 
]\n" + ] + } + ], + "source": [ + "#alpha factor here is lambda (penalty term) which helps to reduce the magnitude of coeff\n", + "\n", + "lasso_model = Lasso(alpha = 0.001)\n", + "lasso_model.fit(X_train, y_train)\n", + "\n", + "print(f'Lasso model coef: {lasso_model.coef_}')\n", + "#As the data has 9 feature columns, 9 coefficients appear here " + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "F2KmGpNXlmp0" + }, + "outputs": [], + "source": [ + "# Score Comparison" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "8YBStPwilmp3", + "outputId": "08dd9aba-9341-4afa-c3f6-a2ab96bdc325" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Simple Train: 0.815778380699972\n", + "Simple Test: 0.8244176296131174\n", + "*************************\n", + "Lasso Train: 0.815720478596037\n", + "Lasso Test: 0.824524813598757\n", + "*************************\n", + "Ridge Train: 0.8157699657350654\n", + "Ridge Test: 0.8239615515502365\n" + ] + } + ], + "source": [ + "#Model score - r^2 or coefficient of determination\n", + "#r^2 = 1-(RSS/TSS), where RSS = residual sum of squares and TSS = total sum of squares\n", + "\n", + "\n", + "#Simple Linear Model\n", + "print(\"Simple Train: \", regression_model.score(X_train, y_train))\n", + "print(\"Simple Test: \", regression_model.score(X_test, y_test))\n", + "print('*************************')\n", + "#Lasso\n", + "print(\"Lasso Train: \", lasso_model.score(X_train, y_train))\n", + "print(\"Lasso Test: \", lasso_model.score(X_test, y_test))\n", + "print('*************************')\n", + "#Ridge\n", + "print(\"Ridge Train: \", ridge_model.score(X_train, y_train))\n", + "print(\"Ridge Test: \", ridge_model.score(X_test, y_test))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Resources\n", + "\n", + "https://towardsdatascience.com/regularization-in-machine-learning-76441ddcf99a "
+ ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "include_colab_link": true, + "name": "r.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Regularization/regularization.ipynb b/Regularization/regularization.ipynb index 7b59912..9c9c351 100644 --- a/Regularization/regularization.ipynb +++ b/Regularization/regularization.ipynb @@ -1299,7 +1299,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.9" } }, "nbformat": 4, diff --git a/Unsupervised Learning/.ipynb_checkpoints/unsupervised_learning-checkpoint.ipynb b/Unsupervised Learning/.ipynb_checkpoints/unsupervised_learning-checkpoint.ipynb new file mode 100644 index 0000000..d515b10 --- /dev/null +++ b/Unsupervised Learning/.ipynb_checkpoints/unsupervised_learning-checkpoint.ipynb @@ -0,0 +1,1019 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\n", + " \"Open\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "GyzG2YmSs89r" + }, + "source": [ + "# Clustering" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "WmS8ct5vs89s" + }, + "source": [ + "Clustering can be considered the most important unsupervised learning problem; so, as every other problem of this kind, it deals with finding a structure in a collection of unlabeled data. A loose definition of clustering could be “the process of organizing objects into groups whose members are similar in some way”. 
A cluster is therefore a collection of objects which are “similar” between them and are “dissimilar” to the objects belonging to other clusters." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "AyDGFgYgs89s" + }, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "BDGsmEBIs89t" + }, + "source": [ + "#### Types of Clustering" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "75FkrRlts89u" + }, + "source": [ + "\"Centroid-based Clustering\" organizes the data into non-hierarchical clusters, in contrast to hierarchical clustering defined below. k-means is the most widely-used centroid-based clustering algorithm. Centroid-based algorithms are efficient but sensitive to initial conditions and outliers." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "R7kc2Op3s89u" + }, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "tNSZvo3ts89u" + }, + "source": [ + "\"Density-based Clustering\" connects areas of high example density into clusters. This allows for arbitrary-shaped distributions as long as dense areas can be connected. These algorithms have difficulty with data of varying densities and high dimensions. Further, by design, these algorithms do not assign outliers to clusters." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "NSqTRsYUs89v" + }, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "zJB0jOW5s89v" + }, + "source": [ + "\"Distribution-based Clustering\" approach assumes data is composed of distributions, such as Gaussian distributions. In the picture, the distribution-based algorithm clusters data into three Gaussian distributions. 
As distance from the distribution's center increases, the probability that a point belongs to the distribution decreases. The bands show that decrease in probability. When you do not know the type of distribution in your data, you should use a different algorithm." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "nG0k0sDWs89w" + }, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "9IkgcrVEs89w" + }, + "source": [ + "\"Hierarchical clustering\" creates a tree of clusters. Hierarchical clustering, not surprisingly, is well suited to hierarchical data, such as taxonomies. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "XCuBIfhfs89w" + }, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "cHGA4U1rs89x" + }, + "source": [ + "## K-Means Clustering" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "x7PArFvQs89x" + }, + "source": [ + "The K-Means algorithm clusters data by trying to separate samples in n groups of equal variance, minimizing a criterion known as the inertia or within-cluster sum-of-squares. This algorithm requires the number of clusters to be specified. It scales well to large number of samples and has been used across a large range of application areas in many different fields." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "tmG7LgDds89x" + }, + "source": [ + "The k-means algorithm divides a set of N samples X into K disjoint clusters C, each described by the mean $\\mu_j$\n", + " of the samples in the cluster. The means are commonly called the cluster “centroids”; note that they are not, in general, points from X, although they live in the same space." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "IvvIYnNOs89y" + }, + "source": [ + "The K-means algorithm aims to choose centroids that minimise the inertia, or within-cluster sum-of-squares criterion:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "eomhyzRxs89y" + }, + "source": [ + "
$$\\sum_{i=1}^{N} \\min_{\\mu_j \\in C} \\lVert x_i - \\mu_j \\rVert^2$$
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "PaLk-isXs89y" + }, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "HcRGNffYs89z" + }, + "source": [ + "# Step One" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "6EhWH_t1s89z" + }, + "source": [ + "The algorithm randomly chooses a centroid for each cluster. In our example, we choose a k of 3, and therefore the algorithm randomly picks 3 centroids." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "6eLi7FIXs89z" + }, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "cnb2x8c7s890" + }, + "source": [ + "# Step Two" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "aMZfgFGGs890" + }, + "source": [ + "The algorithm assigns each point to the closest centroid to get k initial clusters." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "_qkwP2zas890" + }, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "rnpIjVfMs891" + }, + "source": [ + "# Step Three" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "4cuTpWaJs891" + }, + "source": [ + "For every cluster, the algorithm recomputes the centroid by taking the average of all points in the cluster. The changes in centroids are shown in Figure 3 by arrows. Since the centroids change, the algorithm then re-assigns the points to the closest centroid. Figure 4 shows the new clusters after re-assignment."
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "W6qwozARs891" + }, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "hvjUagQLs892" + }, + "source": [ + "# Step Four" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "X0AtQu60s892" + }, + "source": [ + "The algorithm repeats the calculation of centroids and assignment of points until points stop changing clusters. When clustering large datasets, you may stop the algorithm before reaching convergence, using other criteria instead." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "W07RLZ4Ws892" + }, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "IFEIUDv7s893" + }, + "source": [ + "#### K Means Parameters : https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "oLENs24Xs893" + }, + "source": [ + "#### Mini Batch K-Means" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "EUDMd38ls893" + }, + "source": [ + "The MiniBatchKMeans is a variant of the KMeans algorithm which uses mini-batches to reduce the computation time, while still attempting to optimise the same objective function. Mini-batches are subsets of the input data, randomly sampled in each training iteration. These mini-batches drastically reduce the amount of computation required to converge to a local solution. In contrast to other algorithms that reduce the convergence time of k-means, mini-batch k-means produces results that are generally only slightly worse than the standard algorithm."
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "G5ODhTs8s893" + }, + "source": [ + "MiniBatchKMeans converges faster than KMeans, but the quality of the results is reduced. In practice this difference in quality can be quite small, as shown in the example and cited reference.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "gjA6DTqbs894" + }, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Example" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "b5aRPk_Zs894" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "7sYybz9us898" + }, + "outputs": [], + "source": [ + "from sklearn.datasets import make_blobs\n", + "\n", + "X, y = make_blobs(n_samples=1000, n_features=2, cluster_std=5.5, random_state=42) " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "FxY0w9sHs89_" + }, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split \n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 6.1086553 , -2.93325702, -1.38963414, ..., 0.32349635,\n", + " 4.88084193, -7.5865835 ],\n", + " [ 6.86613906, 10.89062199, -5.94711017, ..., 11.84010886,\n", + " 1.16465867, -17.23027934]])" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "X_train.T" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Plots" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "sns.set_style(\"whitegrid\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sns.scatterplot(X_train.T[0], X_train.T[1],color='g')" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sns.scatterplot(X_test.T[0], X_test.T[1],color='g')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sns.distplot(X_train.T[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sns.distplot(X_train.T[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Scaling" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "scaler = StandardScaler().fit(X_train)\n", + "X_train_n = scaler.transform(X_train)\n", + "X_test_n = scaler.transform(X_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "MtTnveWEs8-D" + }, + "outputs": [], + "source": [ + "from sklearn.cluster import KMeans\n", + "\n", + "model = KMeans(n_clusters=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "jraBTRB0s8-H", + "outputId": "c2d0ca2b-f8a6-4c92-92bb-04946f52eade" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,\n", + " n_clusters=1, n_init=10, n_jobs=None, precompute_distances='auto',\n", + " random_state=None, tol=0.0001, verbose=0)" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.fit(X_train_n) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Score" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "CY_-eShbs8-J", + "outputId": "4dbc6aa8-b42e-4a74-cd58-52a7fc3540dc" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train score: -1600.000000000001\n", + "Test score: -375.1898042304475\n" + ] + } + ], + "source": [ + "# model score, Opposite of the value of X on the K-means objective.\n", + "print(\"Train score:\", 
model.score(X_train_n))\n", + "print(\"Test score:\",model.score(X_test_n))" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "e6p0mK2Ps8-M" + }, + "outputs": [], + "source": [ + "train=[]\n", + "test=[]\n", + "\n", + "for n in range(1, 10):\n", + " model = KMeans(n_clusters=n)\n", + " model.fit(X_train_n, y_train)\n", + " \n", + " train.append(model.score(X_train_n, y_train))\n", + " test.append(model.score(X_test_n, y_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "stWbC65As8-O", + "outputId": "5eb2f4b9-fd47-4eb3-8236-f079a82a731a" + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAD4CAYAAAAD6PrjAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nO3deXRU9cH/8fdkh0zC7gIkEKyRIEsMKeFnE6h1a1Or1hIJuPxcWh78FRQKFkR2IeDTQtsDimi18iAqRNweT13QUpGIgNSAxAEFWRIQCAnbTDKZZOb+/oiMRgJJZu4wzOTzOodj5ubez3zDlfnk7hbDMAxERESAiGAPQERELhwqBRER8VIpiIiIl0pBRES8VAoiIuKlUhAREa+oYL2xx+Nh5syZ7Ny5k5iYGObMmUOPHj3OOn9xcTGxsbE+vVdNTY3Py57v3FDJDFRuqGQGKjdUMgOV25ozA5VbU1NDenp68xcwguTdd981Jk2aZBiGYXz22WfG6NGjzzn/F1984fN7+bPs+c4NlcxA5YZKZqByQyUzULmtOTNQuS3NDNruoy1btpCTkwNAeno627dvD9ZQRETkWxbDCM4VzY8++ig33HADQ4cOBeCnP/0p77//PlFRje/R8mf3kdPpJC4uzuexns/cUMkMVG6oZAYqN1QyA5XbmjMDmZuWltbseYN2TMFqteJwOLyvPR7PWQsBIDY2tkU/2PfZbDaflz3fuaGSGajcUMkMVG6oZAYqtzVnBirXZrO1aP6glUJGRgZr164lNzeX4uJiUlNTgzUUEWklamtrKSsrw+l0+p3T0g/bQOfGxcXRvXt3oqOj/RpD0Erh+uuvp6ioiPz8fAzDoKCgIFhDEZFWoqysjISEBHr27InFYvE5p7q6mjZt2pg4Mv9yDcOgoqKCsrIyUlJS/BpD0EohIiKC2bNnB+vtRaQVcjqdfhfChchisdCpUyfKy8v9ztLFayLSqoRbIZxm1s8VtC0FOc88bqg5Vf/HZf/265NQY/9umssOabdAFx3fEWmtVAqhqM4Fp76BqgqoqoTqyu++rqr43utjUFXBFVWV4G7mgbW49ioFkQCqqanhzTffJC8vr8l5X331Vdq1a8e11157HkZWT6VwoTGM+g/3E6VwouzbP9//ugxOHQIau7zEAm06QNtO0LYjtE+CSwdwrMpNp64pEJsAsdb6/8YkfPv6B9Mi9b+ESCCVl5dTWFjYrFK4
7bbbzsOIGtInQLDUuaDyayjf8b0/O7mi4uszf6uPjIV23ev/XHYttOsGid0gvst3BdC2E8S1g4jIM97qiM1GpwCcUy0SylZvKWPVp6U+LevxeIiIOPOQ7O2ZSfxmYPdzLvvUU0+xa9cuevfuzdVXX01VVRVz587l9ddfZ+vWrTidTi677DLmzZvHokWL6Ny5M7169eKZZ54hOjqasrIycnNzeeCBB3wae1NUCoHmroOjO70f+hyx1f+3cjd46r6dyQIdekKX3hxr359OKQO+K4F2SRDfGcL04JhIazN69Gi+/PJLcnJyOHHiBFOnTsVut5OYmMjSpUuJjY3ll7/8JYcPH26w3MGDB3nzzTdxuVzk5OSoFEKGxw2HtsGej2DvR7BvA7hO1X/PEgEdUuCiNEi7Cbr0hi5XQOdUiK4/N1m/1YucH78Z2L3J3+rPxqzrFE5fUxAbG0tlZSWTJ08mISGBqqoqamtrG8ybmppKVFQUUVFRAbkVxmkqBX95PHDki/oC2PMR7FsPzhP13+t0OfTPg+T/Axf1gU4/gujArUwRufBFRETg8Xi8XwOsW7eOb775hvnz51NdXc2aNWv44W3pzteptCoFX5w8CDv/Sbetb8H/bqs/0wfqdwGl3QwpQ6BnDiReGtRhisiFp1OnTtTW1ja41Ub//v158sknueuuu4iLiyMpKYkjR44EZXwqheY6tg9sb8IXb0LZJgDatL0ELr8RUnLqS6B9UpAHKSIXutjYWN54440G07p06cLq1avP2C01cOBA79dZWVner4uKigI2PpXCuRzdBbY36ovgm+L6aZf0h59NhbRb2FVeR1qfPsEdo4iIiVQK32cY9WcJffFtERwpqZ/eLROun12/a6jj9242ddT8uySKiASTSuG0o7vg9dFQthmwQPJg+Pl8SPtV/amhIiKtgErBMOCz5fD2JIiMgV/8N/S5BRIuCfbIRETOu9ZdClWV8L8P1R9A7pkDv15af7WwiEgr1XpLYc9H8Np/gf0wXDcTrn6w0VtEiIi0Jq3veQruWnh/Jiz7FUTFwf1rIHu8CkFEzouamhoKCwtbtMzmzZvZsWNHgEbUUOsqhYrd8Oz1sP4vcNWd8F/roFtGsEclIq3I6buktsTq1avP28Vspu0+OnXqFA8//DB2u53a2lomT57MVVddRXFxMXPnziUyMpLs7GzGjBmDx+Nh5syZ7Ny5k5iYGObMmUOPHj3MGsqZDAP+c/pgcjTc/j/1B5NFpPUqfgk+e8GnRWM87sb3Llx1J6SPOOeyp++SunjxYr788kuOHTsGwNSpU0lOTmby5Mns37+fmpoa7r//fpKTk/noo48oKSnhRz/6EV27dvVpzM1lWin84x//YPDgwdxzzz18/fXXTJgwgddee40ZM2awaNEikpKSGDVqFCUlJRw4cACXy8XKlSspLi5m/vz5LFmyxKyhNOQ8QbcNU6H0g28PJj+lU0xFJGhO3yW1urqawYMHM3LkSPbu3csjjzzCokWL2LhxI6tXrwbqr1zu27cvOTk55ObmBrwQwMRSuOeee4iJiQHA7XYTGxuL3W7H5XKRnJwMQHZ2Nhs2bKC8vJycnBwA0tPT2b59e5P5NTU12Gwtv1js0k1zaVe2liP9/x8VV9wBB0/BQXMuOnM6nT6NKRwyA5UbKpmByg2VzEDlBjqztraW6urq+m9ccWv9Hx8YhnH2G9Sdzj/HeDweDzabjY8//pi33noLgOPHj9O2bVsmTZrElClTcDgc5ObmUl1dTV1dHS6X67uxn0Vtba3ff38+lUJhYSHLli1rMK2goID+/ftTXl7Oww8/zJQpU7Db7VitVu888fHxlJaWnjE9MjKSuro6oqLOPpzY2FjSWnpL6RNlsO8dKi+7jYtum8dFLVu6STabreVjCpPMQOWGSmagckMlM1C5gc602Wym3PLan1tnt23bFoDLL7+cvn378qtf/YqKigoKCws5evQoX331FU899RQ1NTUMHTqU
vLw8oqOjiY6ObvI9o6Ojz/j7a2lJ+FQKeXl5jT5KbufOnfzhD3/gj3/8I4MGDcJut+NwOLzfdzgcJCYm4nQ6G0z3eDznLASffbwYMKjofQcdzU8XEWmx03dJdTgcvP3226xatQq73c6YMWPo3Lkz5eXl3HrrrbRt25b77ruPqKgoBgwYwJ///Ge6d+/OZZddFtDxmfZJvGvXLh566CH++te/0rt3bwCsVivR0dHs37+fpKQk1q9fz5gxYzh06BBr164lNzeX4uJiUlMD8KB4x1HY8jz0u526eN3CWkQuDI3dJfW06upqZs+efcb0/Px88vPzAz00wMRSWLBgAS6Xi7lz5wL1hbBkyRJmzZrFxIkTcbvdZGdnM2DAAPr160dRURH5+fkYhkFBQYFZw/jOJ0ugzgnZ4+Cox/z8MGP75iTvlhxiaGoXrkruEOzhiEiQmFYKZzt7KD09nVWrVjWYFhER0WgbmsZ5EjY9U38zuy5X6G6mZ1FaWcWbWw/yZvFBdh4+RWSEha7t26gURFqx8LzNxafPQs0JyPlDsEdywamw1/DPz7/h9eKDbNlXf350Zo8OPHbLleT2u5RO1tggj1AksM555lAI++HjO30VfqVQWw0bnoDLfgZdrwr2aC4I1bUeXvusjDeKD/LRV0dxewxSL7by8I1XcPOAriR1bBvsIYqcF3FxcVRUVNCpU6ewKgbDMKioqCAuzv9nwIdfKXz2AjjKIWdCsEcSdLZvTvKPoj288dkBatwG3dq34Xc5vbj1qq70viQx2MMTOe+6d+9OWVkZ5eXlfuXU1tYSHR1t0qjMyY2Li6N7d/8vzA2vUnDXQtHfICkLevwk2KMJCo/H4F87jvBc0R4+3l1BXHQE1/Syct/P+pLZowMREeHz25FIS0VHR5OSktL0jE0IpWs/Wiq8SuHzQjhRCr9cAGG0adgc9po6Xvm0lOc/3sveiioubRfHpJ/3ZsSgJL7Zt5u0FF2pISJNC59S8Hjq7356cV+4/IZgj+a8Ka2s4n827OXlzaWcctZxVXJ7JtxwBT/vewnRkfU3wf0muEMUkRASPqWw4y04+iUMe65VbCVs3lvJc+v38G7JISwWC7n9LuXen/QkQ6eTiogfwqMUDAM+WgAde0Ef325wFSo+23+MP7+3k6JdFbRrE81/Db2Muwb3oGt7/+/nIiISHqWw+1/wTTHcvChsn6C289ApFry3k/e+OEzH+Bim/jKNkVnJtI0Jj1UoIheG8PhE+WghJHaD/ufn3iDn0/6KKv7y/pe8XnwAa0wUE65P5d7sFKyx4bHqROTCEvqfLPs3wr718PP5EBUT7NGY5vBJJ4v+9RUvbyolKtLCqCG9eGDoZbRvGz4/o4hceEK/FNYvhDYdIePuYI/EFCedbub908bzH+/F7TEYMSiZMT/7ERcn+n+loohIU0K7FA59Dl++A9dMhZj4YI/GL3VuD09/9DWLP9hPdZ3Br9O7Me66VJI76RYUInL+hHYprP8LxCTAoN8GeyR++brczvhVW9laepzBSW2ZPSyT1IsTgj0sEWmFQrcUKnZDyWtw9YPQJjTPzTcMg+Wf7KPgnzZioyJZPPIqLos+oUIQkaAJ3VIo+itExsD/+X2wR+KTQyecPPzKVj766ihDU7vw38P6c3FiHDbbiWAPTURasdAsBZcDil+CgfeA9aJgj6bF3ig+wLTXt1PrNphza1/uyEoOq9v4ikjoijA7cPfu3QwcOJCamhoAiouLycvLIz8/n8WLFwPg8XiYPn06w4cP56677mLfvn0te5OoOLhuJlwzxdzBB9jxKhdjXvwPD71czGUXWfnnQzncObiHCkFELhimbinY7XYef/xxYmK+O5d+xowZLFq0iKSkJEaNGkVJSQkHDhzA5XKxcuVKiouLmT9//lkf59moiEi4eoyZQw+4f+88wh9f2caxKhcP33gF/zWkF1GRpneyiIhfTPtUMgyDadOm8Yc//IE2bervw2O323G5XCQn
1+8eyc7OZsOGDWzZsoWcnByg/hnO27dvN2sYF5wqVx2PvvY59/xjM+3bRvP673/C76/5kQpBRC5IPm0pFBYWsmzZsgbTunbtSm5uLr179/ZOs9vtWK1W7+v4+HhKS0vPmB4ZGUldXR1RUWcfTk1NDTabzZfh4nQ6fV7Wn1y7y8209w+xs7yG31zZjruv6kDEiYPYThw8r2MN1s8fzpmByg2VzEDltubMQOa2hE+lkJeXR15eXoNp119/PatXr2b16tWUl5dz3333sXTpUhwOh3ceh8NBYmIiTqezwXSPx3POQgCIjY31+YlEwXhK0vEqF3c9u4ndlS6W3JnBz/te6nemr0LpKVGhkhmo3FDJDFRua84MVG5LS8a0Ywpr1qzxfv2zn/2M5557jtjYWKKjo9m/fz9JSUmsX7+eMWPGcOjQIdauXUtubi7FxcWkpqaaNYwLQoW9hjuf3cTuI3aeunMg16ZdHOwhiYg0S8BPSZ01axYTJ07E7XaTnZ3NgAED6NevH0VFReTn52MYBgUFBYEexnlz5JSTO/++kX0VVfz9/2YyJLVLsIckItJsASmFf/3rX96v09PTWbVqVYPvR0REMHv27EC8dVAdOuFk5N8/4ZvjTv5x74+5+rLOwR6SiEiLhObFaxegA8erGfnMJ1TYXfzP/YP4cc+OwR6SiEiLqRRMsL+iihHPfMJJZy3L7x/EVXpOsoiEKJWCn/YcdTDymU+ornXz0u8G07dbu2APSUTEZ7qCyg/7j7u4fekGXHUeFYKIhAWVgo92HDrJH9+tvwjt5VGDSbs0McgjEhHxn3Yf+WDvUQcjnv6EqAgLK0cNplcXa9MLiYiEAG0ptFC1y83oF7ZgAP99Y1cVgoiEFW0ptIBhGEx9fTs7D5/i+XsHcZH7aLCHJCJiKm0ptMDLm0tZ/Z8yHvzZ5QzVlcoiEoZUCs30edkJZrxZwpDULjx47eXBHo6ISECoFJrheJWLB1ZsoXN8DH8dnk5khJ6UJiLhSccUmuDxGPxh1VYOn3RSOPpqOsbHNL2QiEiI0pZCE5789y7+teMI02/qQ3pS+2APR0QkoFQK51C06ygL13zJLelduXNwj2APR0Qk4FQKZ/HNiWoefOkzLutiZd5t/bBYdBxBRMKfSqERrjoPv1/xH5y1bpbcOZC2MTr0IiKtgz7tGjHvbRv/2X+cJ0Zm8KOLdMWyiLQe2lL4gbe2HeQfRXu59yc9+WX/S4M9HBGR88q0LQW32828efPYvn07LpeLsWPHcs0111BcXMzcuXOJjIwkOzubMWPG4PF4mDlzJjt37iQmJoY5c+bQo0fwD+TuOmJn0ivbyEhuzyO/SAv2cEREzjvTSuGNN96grq6Ol19+mcOHD/P2228DMGPGDBYtWkRSUhKjRo2ipKSEAwcO4HK5WLlyJcXFxcyfP58lS5aYNRSfTV69jbjoSJ64I4OYKG1EiUjrY1oprF+/ntTUVEaNGoVhGEybNg273Y7L5SI5ORmA7OxsNmzYQHl5OTk5OQCkp6ezfft2s4bhs5POWrbsP8ZD117Ope3aBHs4IiJB4VMpFBYWsmzZsgbTOnToQGxsLEuXLmXz5s088sgjLFiwAKv1uwO18fHxlJaWYrfbG0yPjIykrq6OqKizD6empgabzebLcHE6nU0uu6msCsOASyPtzX6f5uS2VKhkBio3VDIDlRsqmYHKbc2ZgcxtCZ9KIS8vj7y8vAbTxo8fz09/+lMsFguDBg1i7969WK1WHA6Hdx6Hw0FiYiJOp7PBdI/Hc85CAIiNjSUtzbf9/Dabrcll39i7g+hIC7dkDyAuOtK03JYKlcxA5YZKZqByQyUzULmtOTNQuS0tGdN2nA8cOJAPP/wQgB07dnDppZditVqJjo5m//79GIbB+vXryczMJCMjg3Xr1gFQXFxMamqqWcPw2aY9FfTv3r7ZhSAiEo5MO6Zw++23M2PGDG6//XYMw2DWrFkAzJo1i4kTJ+J2u8nOzmbA
gAH069ePoqIi8vPzMQyDgoICs4bhk2qXm88PnOC3Ob2COg4RkWAzrRRiYmKYN2/eGdPT09NZtWpVg2kRERHMnj3brLf222elx6h1Gwzq2THYQxERCSqddwls2lOJxQIDe3YI9lBERIJKpUB9KfS5NJHEuOhgD0VEJKhafSm46jz8Z/8xBqVo15GISKsvhe0HT+Cs9eh4gogIKgU27akE4MfaUhARUSls2lPJZV3i6WyNDfZQRESCrlWXgttjsHlvJYNSOgV7KCIiF4RWXQo7D53ilLOOQSk6FVVEBFp5KWzaUwGgLQURkW+17lLYW0m39m3o1l63yhYRgVZcCoZhsGnPMbJ01pGIiFerLYU9Rx0ctdfoVFQRke9ptaVw+voEXcksIvKdVl0Kna0x9OocH+yhiIhcMFpvKeyt5Mc9O2KxWII9FBGRC0arLIUDx6spO1atXUciIj/QKkths44niIg0qlWWwsY9lSTERdH7ksRgD0VE5IJi2uM4T506xfjx46muriY6Opo//elPdOnSheLiYubOnUtkZCTZ2dmMGTMGj8fDzJkz2blzJzExMcyZM4cePXqYNZQmbd5bSWaPDkRG6HiCiMj3mbal8Oqrr5KamsqKFSvIzc3l2WefBWDGjBksWLCAl156ia1bt1JSUsL777+Py+Vi5cqVTJgwgfnz55s1jCYdtdew64hdt7YQEWmEaVsKqampfP311wDY7XaioqKw2+24XC6Sk5MByM7OZsOGDZSXl5OTkwNAeno627dvbzK/pqYGm83m09icTqd32aJ9DgAutpz0Oa+xXLOESmagckMlM1C5oZIZqNzWnBnI3JbwqRQKCwtZtmxZg2nTp0+nqKiI3NxcTpw4wYoVK7Db7VitVu888fHxlJaWnjE9MjKSuro6oqLOPpzY2FjS0tJ8GS42m8277KpdJcRFR3DTTwYQE+XfhtL3c80SKpmByg2VzEDlhkpmoHJbc2agcltaMj6VQl5eHnl5eQ2mjRkzht/+9rfk5+ezY8cOxo4dy0svvYTD4fDO43A4SExMxOl0Npju8XjOWQhm2ry3kquSOvhdCCIi4ci0T8bExEQSEhIA6NSpEw6HA6vVSnR0NPv378cwDNavX09mZiYZGRmsW7cOgOLiYlJTU80axjmddNbyxcGTOhVVROQsTPv1/KGHHmLq1Km8+OKL1NXV8dhjjwEwa9YsJk6ciNvtJjs7mwEDBtCvXz+KiorIz8/HMAwKCgrMGsY5bdl3DI+B7owqInIWppXCxRdfzDPPPHPG9PT0dFatWtVgWkREBLNnzzbrrZtt055KoiIsXJWsJ62JiDSmVe1Y37ynkn7d29EmJjLYQxERuSC1mlJw1rrZWnZcxxNERM6h1ZTCZ/uPU+s2dDxBROQcWk0pbNpTicUCA3uoFEREzqbVlMLmvZX0viSRdm2igz0UEZELVqsohTqPwZZ9x7TrSESkCa2iFHZV1FBd69ZBZhGRJrSKUth+2AnAj3uqFEREzqXVlEKvzvF0SYgN9lBERC5oYV8KHo9ByRGndh2JiDRD2JfCzsOnsLs8KgURkWYI+1LYvLcS0PEEEZHmCPtS2HXETnx0BN07tAn2UERELnhhXwplx6q5OCEKi8US7KGIiFzwWkEpVHFx/Pl5qpuISKgL61IwDIPSymoutqoURESaI6xLodLhorrWzcVW3e9IRKQ5/CqFNWvWMGHCBO/r4uJi8vLyyM/PZ/HixQB4PB6mT5/O8OHDueuuu9i3b99Z5zVb2bFqAG0piIg0k8+lMGfOHBYsWIDH4/FOmzFjBgsWLOCll15i69atlJSU8P777+NyuVi5ciUTJkxg/vz5Z53XbKdL4RKVgohIs/hcChkZGcycOdP72m6343K5SE5OxmKxkJ2dzYYNG9iyZQs5OTlA/fOat2/fftZ5zVZ2rAqAi7T7SESkWZr8FbqwsJBly5Y1mFZQUEBubi4bN270TrPb7VitVu/r+Ph4SktLz5ge
GRl51nnPpaamBpvN1vRP9D2ff30Ua0wEkR5Xi5dtDqfTaXpuqGQGKjdUMgOVGyqZgcptzZmBzG2JJkshLy+PvLy8JoOsVisOh8P72uFwkJiYiNPpbDDd4/Gcdd5ziY2NJS0trclxfJ/jk0306AxxcXEtXrY5bDab6bmhkhmo3FDJDFRuqGQGKrc1ZwYqt6UlY9rZR1arlejoaPbv349hGKxfv57MzEwyMjJYt24dUH9wOTU19azzmq3sWLWuZBYRaQFTj8DOmjWLiRMn4na7yc7OZsCAAfTr14+ioiLy8/MxDIOCgoKzzmsmwzAoO1bNkNQupuaKiIQzv0ohKyuLrKws7+v09HRWrVrVYJ6IiAhmz559xrKNzWumim+vUajfUnAG7H1ERMJJ2F68dvp01O4d2gZ5JCIioSOMS6H+dFQdUxARab4wLoXTWwoqBRGR5grjUqiifdtoEuJ04ZqISHOFcSnodFQRkZYK71Jor4PMIiItEZalUH+NQpW2FEREWigsS6HC4cJZ61EpiIi0UFiWgq5REBHxTViWQmnlt9codNSWgohIS4RlKWhLQUTEN2FaClV0aBuNNVZPXBMRaYkwLYVqbSWIiPggTEtBp6OKiPgi7Erh9HMUVAoiIi0XdqVw1O6ips6j3UciIj4Iu1LQLbNFRHwXhqWg01FFRHzlVymsWbOGCRMmeF9v2LCB4cOHc8cdd/Dggw9SXV3/Ab148WKGDRtGfn4+27ZtA2Dfvn2MGDGCkSNHMmPGDDwejz9D8Sr9dkuhm7YURERazOdSmDNnDgsWLGjwYT5z5kyeeOIJVqxYQY8ePSgsLKSkpIRNmzZRWFjIwoULmTVrFgDz5s1j3LhxvPjiixiGwQcffOD/T0P9loKuURAR8Y3PpZCRkcHMmTMbTFu+fDmdO3cGoK6ujtjYWLZs2UJ2djYWi4WuXbvidruprKykpKSEQYMGATBkyBA+/vhj33+K7yk7Vk1SR+06EhHxRZO/ThcWFrJs2bIG0woKCsjNzWXjxo0Npl900UVA/W6ljRs3Mm7cOJ599lnat2/vnSc+Pp5Tp05hGAYWi6XBtHOpqanBZrM1+QN9feg4PTvENJjX6XQ2a9mWCkRuqGQGKjdUMgOVGyqZgcptzZmBzG2JJkshLy+PvLy8Zgc+//zzvPPOO/z9738nNjYWq9WKw+Hwft/hcJCQkEBERESDaYmJiefMjY2NJS0t7ZzzGIZBedVectMvbjCvzWZrcllfBCI3VDIDlRsqmYHKDZXMQOW25sxA5ba0ZEw9+2jJkiV8+umnPP/883Ts2BGo3820fv16PB4PBw8exOPx0LFjR/r06ePd0li3bh2ZmZl+v3+5vebbaxR0kFlExBemHY09evQoTzzxBH369OF3v/sdAL/4xS8YOXIkmZmZDB8+HI/Hw/Tp0wGYNGkS06ZNY+HChfTq1Ysbb7zR7zF8dzqqSkFExBd+lUJWVhZZWVkAdO7cme3btzc639ixYxk7dmyDaSkpKbzwwgv+vP0ZdI2CiIh/wuritdNXM3drry0FERFfhFkpVNMxPoZ4XaMgIuKTsCqF0krdMltExB9hVQoHjlWTpOMJIiI+C5tS8HgMyo7rOQoiIv4Im1I4aq/BpWsURET8EjalUKrTUUVE/BY2paCH64iI+C+MSqF+S0HPURAR8V1YlUKn+BjaxugaBRERX4VRKegaBRERf4VRKVTrILOIiJ/CohQ8HoMDx3SNgoiIv8KiFMrtNbjcHrrrMZwiIn4Ji1LQ6agiIuYIk1KoPx01SaUgIuKXsCqFbu21+0hExB9hUgpVdLbG0CYmMthDEREJaX6Vwpo1a5gwYcIZ05csWcL48eO9rxcvXsywYYfvCc0AAAvpSURBVMPIz89n27ZtAOzbt48RI0YwcuRIZsyYgcfj8XkcZceq6abTUUVE/OZzKcyZM4cFCxac8WH+4Ycfsm7dOu/rkpISNm3aRGFhIQsXLmTWrFkAzJs3j3Hj
xvHiiy9iGAYffPCBr0P59hoFHU8QEfGXz/eEyMjI4LrrrmPlypXeafv27WPlypWMHTuWwsJCALZs2UJ2djYWi4WuXbvidruprKykpKSEQYMGATBkyBCKioq4/vrrz/p+NTU12Gy2M6Z7DIPSSgcDL4lq9PsATqfzrN/zRyByQyUzULmhkhmo3FDJDFRua84MZG5LNFkKhYWFLFu2rMG0goICcnNz2bhxo3eaw+Fg9uzZPP744+zevds73W630759e+/r+Ph4Tp06hWEYWCyWBtPOJTY2lrS0tDOmHzrhpM6zhwE/SiItrUejy9pstkaX9VcgckMlM1C5oZIZqNxQyQxUbmvODFRuS0umyVLIy8sjLy+vyaCioiLKy8sZP348J0+e5MiRIzz99NNYrVYcDod3PofDQUJCAhEREQ2mJSYmtmjgp52+RkGno4qI+M+0s49uuOEG3nzzTZYvX86UKVMYPHgwo0aNIiMjg/Xr1+PxeDh48CAej4eOHTvSp08f75bGunXryMzM9Ol9y/RwHRER0wT8PtN9+/YlMzOT4cOH4/F4mD59OgCTJk1i2rRpLFy4kF69enHjjTf6lK+rmUVEzONXKWRlZZGVldXk9LFjxzJ27NgG86SkpPDCCy/48/ZA/ZZCZ2sscdG6RkFExF8hf/GaTkcVETFPGJSCHq4jImKWkC4Fj8fgwHE9XEdExCwhXQpHTtVQ6za0pSAiYpKQLoVSnXkkImKqkC4F74VreuKaiIgpQrsUKk8/R0FbCiIiZgjtUjhWTZcEXaMgImKW0C6F4zodVUTETKFdCsd0OqqIiJlCthTcHoODx3U1s4iImUK2FI6ccuoaBRERk4VsKeiW2SIi5gvZUiit1IVrIiJmC9lSOL2loGsURETME8KlUMVFukZBRMRUIVwKOvNIRMRsfpXCmjVrmDBhgvf1vn37uOeee7jjjju49957OXbsGACLFy9m2LBh5Ofns23bNu+8I0aMYOTIkcyYMQOPx9Oi99Y1CiIi5vO5FObMmcOCBQsafJhPmzaNcePGsWLFCvLz89m7dy8lJSVs2rSJwsJCFi5cyKxZswCYN28e48aN48UXX8QwDD744INmv7euURARCQyfSyEjI4OZM2d6XzudTiorK1m7di133XUXxcXF9O/fny1btpCdnY3FYqFr16643W4qKyspKSlh0KBBAAwZMoSPP/642e9dfqqGOo+hLQUREZNFNTVDYWEhy5YtazCtoKCA3NxcNm7c6J124sQJvvrqK6ZOncq4ceN49NFHee2117Db7bRv3947X3x8PKdOncIwDCwWS4Np51JTU4PNZgPqtxTy+rajV8xJ77RzcTqdzZqvpQKRGyqZgcoNlcxA5YZKZqByW3NmIHNboslSyMvLIy8vr8mgdu3aER8fz+DBgwG45pprKCoqolevXjgcDu98DoeDhIQEIiIiGkxLTEw8Z35sbCxpaWne13+6sskhedlstgbLmiUQuaGSGajcUMkMVG6oZAYqtzVnBiq3pSVj2tlHcXFx9OzZk08//RSAzZs3c/nll5ORkcH69evxeDwcPHgQj8dDx44d6dOnj3dLY926dWRmZpo1FBER8VGTWwotUVBQwKxZs3C73XTv3p2JEycSExNDZmYmw4cPx+PxMH36dAAmTZrEtGnTWLhwIb169eLGG280cygiIuIDv0ohKyuLrKws7+vevXvz0ksvnTHf2LFjGTt2bINpKSkpvPDCC/68vYiImCxkL14TERHzqRRERMRLpSAiIl4qBRER8VIpiIiIl8UwDCPYg2iO4uJiYmNjgz0MEZGQUlNTQ3p6erPnD5lSEBGRwNPuIxER8VIpiIiIl0pBRES8VAoiIuKlUhARES+VgoiIeJl66+wL1datW/nzn//M8uXL/c5yu91MnTqVPXv2EBkZybx580hOTjZhlHDrrbeSkJAAQPfu3Zk3b55fea+++iqvvfYa8N2T64qKipp8oNG5uFwuHnnkEUpL
S7FarUyfPp2ePXv6nNfYuikoKCAlJYURI0aYkrtr1y6mTZuGYRj07t2badOmERkZ6VdmSUkJo0eP9v7sI0aMIDc316/M8ePHc/ToUQAOHDjAgAED+Mtf/tLizMbGOmPGDGJiYkhLS+PRRx9t8JCrptTW1jJlyhQOHDiAy+XigQce4NprrwV8X1eNZfbo0cPv9dRY7iWXXOLXumos86233vJ7XZ1trP6sq3N9PjV7XRlh7umnnzZuuukmIy8vz5S8NWvWGJMnTzYMwzA++eQTY/To0abkOp1O45ZbbjElqzEzZ840Xn75Zb9zli9fbkydOtUwDMPYvXu3cd999/mc9cN1U1FRYdx///3Gtddea7z44oum5T7wwAPGpk2bDMMwjEmTJhnvvfee35mrVq0ynn32WZ/H2FjmacePHzduvvlm4/Dhw6bk/vrXvza2bNliGIZhLFy40Hj99ddblPfKK68Yc+bMMQzDMCorK42hQ4f6va4ayzRjPTWW6++6aizzNH/WVWO5/q6rxj6fWrquwn73UXJyMosWLTIt77rrruOxxx4D4ODBg3Tu3NmU3B07dlBdXc19993H3XffTXFxsSm5AJ9//jm7du1i+PDhfmft2rWLIUOGANCrVy92797tc9YP143D4WDs2LHccsstfo3xh7mLFi3ixz/+MS6Xi/Lycjp16uR35vbt2/n3v//NHXfcwZQpU7Db7X5nfn+8d955JxdddFGLMxvLPXz4MBkZGQBkZGSwZcuWFuX9/Oc/56GHHvK+joyM9HtdNZZpxnpqLNffddVY5mn+rKvGcv1dV419PrV0XYV9Kdx4441ERZm7lywqKopJkybx2GOPmfbEuLi4OO6//36effZZZs2axcSJE6mrqzMle+nSpfz+9783JSstLY21a9diGAbFxcUcPnwYt9vtU9YP101SUhIDBgzwe4w/zI2MjOTAgQPcdNNNHDt2jJSUFL8z+/fvzx//+EdWrFhBUlISTzzxhN+ZABUVFWzYsIHbbrutxXlny01KSmLTpk0ArF27lurq6hblxcfHY7VasdvtPPjgg4wbN87vddVYphnrqbFcf9dVY5ng/7o629+rP+sKzvx8aum6CvtSCJTHH3+cd999l2nTplFVVeV3XkpKCjfffDMWi4WUlBTat29PeXm537knT57k66+/ZvDgwX5nAfzmN7/BarVy9913s3btWq688kqf9s+fb926deO9995jxIgRzJ8/3++866+/nr59+3q//uKLL/zOBHjnnXe46aabTP07LSgoYOnSpYwaNYpOnTrRoUOHFmd888033H333dxyyy386le/MmVcjWWasZ5+mGvGumpsrGasqx/mmrGuwL/PJ5VCC73++ussXboUgDZt2mCxWEz5B/zKK694/xEcPnwYu91Oly5d/M7dvHkzV199td85p33++ecMHDiQ5cuXc91115GUlGRadqCMHj2avXv3AvW/nbXkwN3Z3H///Wzbtg2ADRs2cOWVV/qdeTrr9O45s3z44YcUFBTw9NNPc/z4cX7yk5+0aPmjR49y33338fDDDzNs2DBTxtRYphnrqbFcf9fV2X5+f9dVY7n+riszPp9axdlHZrrhhht45JFHuOOOO6irq2PKlCmm3L112LBhPPLII4wYMQKLxUJBQYEpu7327NlD9+7d/c45rUePHvztb3/jueeeIyEhgblz55qWHSijRo1i8uTJREdH06ZNG+bMmeN35syZM3nssceIjo6mc+fO3v24/tqzZ4/pRdujRw9GjRpFmzZtyMrKYujQoS1a/qmnnuLkyZM8+eSTPPnkkwA888wzxMXF+TymxjLHjRvn93pqLHfy5MkUFBT4vK7O9vP7u64ay7333nv9WldmfD7pLqkiIuKl3UciIuKlUhARES+VgoiIeKkURETES6UgIiJeKgUREfFSKYiIiNf/B15DmbQw7gQHAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(train)\n", + "plt.plot(test)\n", + "plt.legend((\"train\", \"test\"))\n", + "plt.xticks(range(1, 33, 2))\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "var = []\n", + "\n", + "for n in range(1, 10):\n", + " model = KMeans(n_clusters=n)\n", + " model.fit(X_train_n)\n", + " \n", + " var.append(model.inertia_) " + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[1600.000000000001,\n", + " 937.22924469758,\n", + " 592.4940471817132,\n", + " 484.6290107428904,\n", + " 404.09818257937934,\n", + " 345.79343614790093,\n", + " 302.78693683830807,\n", + " 266.712394146979,\n", + " 236.4394596476161]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "var" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(var)\n", + "plt.xticks(range(len(var)), range(1, len(var) + 1))  # one tick per fitted k, labelled 1..len(var)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Silhouette Score\n", + "Silhouette Coefficient or silhouette score is a metric used to calculate the goodness of a clustering technique. Its value ranges from -1 to 1. \n", + "\n", + "1: Means clusters are well apart from each other and clearly distinguished. \n", + "\n", + "0: Means clusters are indifferent, or we can say that the distance between clusters is not significant. \n", + "\n", + "-1: Means clusters are assigned in the wrong way.\n", + "\n", + "![](img/sil.jpeg)\n", + "\n", + "Silhouette Score = $\\frac{(b-a)}{max(a,b)}$\n", + "\n", + "Where: \n", + "$a$: average intra-cluster distance, i.e. the average distance between a point and the other points in its own cluster. \n", + "$b$: average nearest-cluster distance, i.e. the average distance between a point and the points of the nearest cluster it does not belong to." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Mean silhouette score of KMeans for each k in 2..9\n", + "from sklearn.metrics import silhouette_score\n", + "\n", + "silhouette = []\n", + "\n", + "for n in range(2, 10):\n", + " model = KMeans(n_clusters=n)\n", + " model.fit(X_train_n)\n", + " label = model.predict(X_train_n)\n", + " silhouette.append(silhouette_score(X_train_n, label))\n", + " \n", + "\n", + "plt.plot(silhouette)\n", + "plt.xticks(range(8), range(2,10))  # position i holds the score for k = i + 2\n", + "plt.title(\"Mean Silhouette Score for Each K\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Final run\n", + "\n", + "model = KMeans(n_clusters=3)\n", + "model.fit(X_train_n)\n", + "\n", + "clusters = model.predict(X_train_n)\n", + "sns.scatterplot(x=X_train_n.T[0], y=X_train_n.T[1], hue=clusters)  # x/y must be keywords in seaborn >= 0.12\n", + "plt.scatter(x=model.cluster_centers_[0][0], y=model.cluster_centers_[0][1], color='r')  # overlay each cluster center in red\n", + "plt.scatter(x=model.cluster_centers_[1][0], y=model.cluster_centers_[1][1], color='r')\n", + "plt.scatter(x=model.cluster_centers_[2][0], y=model.cluster_centers_[2][1], color='r')" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sns.scatterplot(X_train.T[0], X_train.T[1],color='g')" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[-0.26104373, 0.98223569],\n", + " [-0.84175757, -0.97962535],\n", + " [ 1.04528306, -0.02746083]])" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.cluster_centers_" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAW8AAAD3CAYAAADSftWOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nOydd5hcZdmH79Oml+29pWwaCQSkhd5DC1V6kV5EAUVQAZH+IagoSJEgKlIEBSQ0kSZVqlIC6X0329v0mdO+P2Z3kslOki0zW5JzX5eX7JuZc96zO/Oc9zzv8/x+gmmaJhYWFhYW4wpxtCdgYWFhYTF4rOBtYWFhMQ6xgreFhYXFOMQK3hYWFhbjECt4W1hYWIxD5JE60eeff47dbh+p040a8Xh8u7jOTFjXbl379sZIXHs8Hmf27Nn9xkcseNvtdqZPnz5Spxs1Fi1atF1cZyasa7eufXtjJK590aJFGcettImFhYXFOMQK3hYWFhbjECt4W1hYWIxDRiznbWFhYTFaqKpKQ0MDsVgs68fdXE56sDgcDqqqqlAUZUCvt4K3hYXFNk9DQwNer5e6ujoEQcjacaPRKE6nc9jHMU2Tjo4OGhoamDBhwoDeY6VNLCy2IUzDGO0pjElisRiFhYVZDdzZRBAECgsLB/VkYK28LSy2AQxNJRHoQQ32oHj92Hx+RHlgj9/bC2M1cPcx2PlZwdvCYpxj6BqR9etIdHcCoPZ0oeUV4KqsQZSsr/i2ipU2sbAY55iGkQrcfSS6O60UyhjCMAxuuOEGTjnlFM466yzWrFkz7GNat2ULi3GOACAIsLGviiAwtpMEY5t4VwfR5kYMNYGo2HCWVWLPLxzy8V5//XUSiQRPPfUUn3/+OXfccQcPPPDAsOZorbwtLMY7ooSjsDRtyFFUCqI0ShMa38S7Ogg3rMFQEwAYaoJwwxriXR1DPuZnn33GvvvuC8Ds2bNZuHDhsOc5pJW3qqpce+21NDY2kkgkuPTSSzn44IOHPRkLC4vBI0oSjpJSFJ8fNRRA8fiQHA5EyQreQyHa3AjmJikn0yDa3Djk1XcoFMLj8aR+liQJTdOQ5aEnP4b0zgULFpCXl8ddd91FV1cXxx9/vBW8LbZJTNPENA3EMb6KFWUF0aOgeLyjPZVxT9+Ke6DjA8Hj8RAOhzccyzCGFbhhiGmTww8/nCuuuCL1s2Td4S22QQx
VJdbeQnjtKuLdnRiaOtpTshgBRMU2qPGBsMsuu/DOO+8ASXnsKVOmDPlYfQjDcY8PhUJceumlnHzyycybN2+Lr91e9LxjsRgOh2O0pzEqbEvXXllWhtnRgh4JpcZsRaWEBJmu7u5+r9+Wrn2wjIdrV1WV+vr6Ab1WC3Sjtq7vtwGslFQg+/LSXmua5oDqsw3D4Pbbb2fp0qUA3HTTTRk7KZctW5axPT6T7OyQ1+1NTU1cdtllnH766VsN3GDpeW8PbEvXrifi9DRE0sYSHW2UTJtJWXl5v9dvS9c+WMbDtS9atGjgbexOJ3GbbUDVJoNpj7/tttu2+hpFUfr9LjennTKk4N3e3s55553HDTfcwJw5c4ZyCAuLMYlpmpiahqEm8E6YjJFIEF6/FgwDQRTBKsDbLrDnFw6rNHAkGFLwfvDBBwkEAtx///3cf//9AMyfP3/MPzpZWGwNQ00QWL4IU9MAULx+3FW1hNeuwllWiZDF/R1DUzFUFVPXkOwOBFkZ8y3cFmOHIQXv66+/nuuvvz7bc7GwGFUMXSfasj4VuAHUYA+O4lJ89TMQFVvv6jsL59JUQmtXoYUCAAiShG/ydCS7tQCyGBhWk46FRR+miRGP9xs2NA3Z6UIcZmlX2jHjcbRQANHuwF5QhOzyEG1ej6HrWTuHxbaN1R5vYdGLIEnY8gvRNqowQRCQXe6sn0vXErgqahBkGTXQg+xyo3j9vc0hVumtxdaxgreFRS+CIGDz52MaOvGONkRZxllRkxNpVdnlJdHdQWTtytRYItCNt25g5WwWFlbaxMJiI0RZxlFUgm/SNDx19Sgud9by3OmYxNtb00b0aART1zbzeottgS+++IKzzjorK8eyVt4WFpsgCCKCMriA7XK5BneO5IkynXxQx7HIDWs+WsxXz79PpDOIq8DLrGP3pnaPacM65vz581mwYEFWbNPAWnlv1xi6Zmk+ZwFDVSly2Qk3rEYNhzC0ra+eBVnBWVqRNia7PVktRbQYGms+Wsynj79OpDMIQKQzyKePv86ajxYP67g1NTXce++92ZgiYAXv7RJDU5Oyl2tWprrILIaGoaoEVy8n3tRAvLOd4IrFqIEutqY6IQgCis+Pr346juIy3DUT8dROsqzLxgBfPf8+eiL9BqwnNL56/v1hHXfu3LnDFqPaGCttsp1hGgbxzvak7CWghgIkAt34Jk1DzKCpYLFlTF1Hj4bTxqItTSjePISt/D5FSUZ0ysjO7FezWAydvhX3QMdHC2vlvZ1h6hrxjvSNMiMRx9AtxbysYaWtxzWugsyyupsbHy2s4L3dISBk0KYWBOujMBQEWepXB+4sqUDI4uOxxcgy69i9kWzpfz/JJjPr2L1HaUaZsT5h2xmiouCsqCa0allqTPH5ESyX8SEhygqe2snEujsxEzHseYWIdoelUbIVDF3HNPRewS8JQZbHzO+sr6ok29UmAFVVVTz99NPDPg5YwXu7RHZ58E+diRoKINkdSA5nVlu/tzUMXcPUdTBNBFHqtzcgKgqt4Sh1dXWjM8FxhqHrJLo7iaxfC6aJqNjwTpwypnRdaveYlpVgnUusb+x2iChJIElj6ssyVjE0jWhLI/GONgBEuwPfxCn9XFWi0ehoTG9cYuo6kcY1qZ8NNUG4cS2emonWImIQWIlOC4stYKiJVOAGMOIxoq1NVn38MDAz2Mnp0Uh/01+LLWIFbwuLLaDH+q+o9WhkQMHbNE0ryGdAVBTYZINc8fpgjJs8jzWs4G1hsQVkl6ffmOLP32onpKEmiLasJ7xuFWooMKCuy7GOaegYagI9HkuaSAzR/laQJLwT6xFtSU9bxevHVV6VTOdZDBgrwWRhsQUEWcZTN5lI41oMTcNeUIQ9v3CLlRGGqhJYvjjVuZro6cJdMxGbP3/MVFQMFtPQUUMhQmtXJCtEZBnvhCnIzsFpugAIooTs8uCb1LshKAqIw6x26mzvoqOjC0woKMqnsCh/WMcbD1jB28JiC4i
ShOL145s8DRBAFLe6QtQT8X6SA7HWJhS3d6tdl2MVQ9dTgRvA1DRCa1fimzR1SMcTBKHf70LTdLq7ehBFgYLCgQffjvZOLj37GhZ/nSx/nTpjMg/+5S4KiwqGNLdcoKoq1157LY2NjSQSCS699FIOPvjgYR3TSptYWGwFQRAQFRuiogzo0V4QM6yuRTFNMdDQNfR4nESgGz0exxjrUrCGkQrcqaF4DIaYOtmU7q4AT/3lH5x9wmVcctbVfPT+Z0QiA6vgeetf76cCN8CSb5bz5qvvDWs+L/3jNebudTI71R3A3L1O5qV/vDas4y1YsIC8vDyeeOIJ5s+fzy233DKs44G18rawyDqibENyupIVFL24yipTZXCmYaD2dBFu2FAu566qxZZXmCPt8OEjiCKCrKRVisguT9YkbD/5z3/5xY33pH6++MwfseCtx6ipq9zqe5cuXtF/bNHyIc/lpX+8xk0/uYtYNGmJ19TYwk0/uQuAo447dEjHPPzww5k7d27qZykL+f2x+UmxsBhnbPxlFBUFb1097pqJOMsq8E+ZibSR+JSha4TXr0t7f3j9uqwZMRiqihaLosWiGBnK8kzT3GjjMYE5gBI9QVbwTqhH7O0NkF1u3DUTsqKCGA5FeO7pl9OvwTD44J2PB/T+Y048vP/Yt48Y8nzuuXN+KnD3EYvGuefO+UM+ptvtxuPxEAqFuPzyy7nyyiuHfKw+rJW3hcUwMXSd2opyoi3rkRxOZJcHUVGw56XnXA1NRQuHEG22fikIDCMrGQhDVQmuWpoqcZRcbry1k1NdoaZpokcjBFcvw9Q0BCm5ISu73FvcTBUEAdnpSua4TRMEIRW4S4qK0GNRtGgE2elCUJRBbUDa7AqTp0zgvbc+ShufMKlmQO+vqavil/fdyAO/+RMmJpdccQ61E6oHfP5NaV7fOqjxgdLU1MRll13G6aefzrx584Z1LLCCt4XFsNHCIcKrN+RcZZcHT126NrdpGMTaW4m1NuGqqEF2e9HCGyRGZbc3c658kCS6O9Jq0/VIGDUUwJ5fmJyHphFaswKzt3TR1JM/++unI2zSNZqJTVfapmHgNDV6ln6dGnOWV2MvLEIcYN22oiiced5JvPby2zSuawJg7/13p37axAG93+f3cOhRB7DrnrMxgYLCvGFV9ZRVlNDU2JJxfKi0t7dz3nnnccMNNzBnzpwhH2djrOBtMeYwDQPT0JOCRWM0B9yHoalEmxvSxrRI0k0nLXjrGrH2ZECItjbhrZ1EvKsdLRxC9nhxlpQPOwVhmiZahqYiLRbBTm/wNo1+lTCmNvSabUPXiLU0pY1Fmxux5eUPqummpKyIvzx3P+2tHTgcdvz5fvIL/AN+vyAIFGSpPPDyay5My3kDOJx2Lr/mwiEf88EHHyQQCHD//fdz//33A0lbNIdj6BIVVvC2GFMYqkqsoxU12IPscieD2gBWhKOGSeYuygzBUBAETJLBMrBqKfb8Qjx1kxEVW1ZuUoIgYM8vItHVkTZuzyvc6DUiot2RrBTpRbTZhy4JbNK/rd00kuODpKi4gKLi0S/v69uUvOfO+TSvb6WsooTLr7lwyJuVANdffz3XX399tqYIWMHbYgxh6Drh9WtRe7qAZBu6Fgnjrasfsy4/gizjKC5LE1oS7fZ+q2hBknGUVBBt6t2oNAyM3pxzNp8uJIcTd1Ud0dYmEARcpZXJHHvf3BQFb+1kQmtXoMeiSHYHntpJQ9YfF0QR2eNDCwVSY4rHN+afmLbGUccdOqxgPRJYwdti7GDoqcDdhx6NDKgaYiQwNG2DBrUkIyoKgiBg8+djShJ6Txeiw4mjoKjfzUYQRez5hSguN4lAN7Lbg+xyZ11FT5RlbPmFKN5kyiGTTrbkcOCdMAVI33gc8vlKK5FdLrRQENnjxVFYaqkDjgDWb3gbxzQMTNMcN7oRgiSnl8wJQtZqiYeDoamEGzc8FWwsDSvKMk2d3UyomwCCsNnNMlGWEWUPsru/Xko
2ydS92G8uWXySWb1uHVOnTMEsMpL14MNYdUfCUbq7evhm4VImTKymqLgQf74vK/M0TXNMyxMMdt/BCt7bMHoiQay9GSMex15QhOz2jukVkSDJuCqqCa9blRpzlpRntG0bafR4LO2pwIjHiLW14CyrRBBF4vH4uE8VDBXTNIcdtAF0XefTjz7n8vOvxejdRzj3ktO48Htn4fEOz6TZ4XDQ0dFBYeGWdWlGC9M06ejoGNQG5tj9JlsMC0NVCa5YhKEmmzTUYA+uylrsBUVj8sMLydSC4svDP3VWsmbY4UCQB9aSnmsyScNqsSiGriGJY3hDdRzR1dnDbdffnQrcAH9+6ClOO+eEYQfvqqoqGhoaaGtr2/qLB4GqqihZeopxOBxUVVUN+PVW8N5G0dV4KnD3EW9vwebLG9PiSBtcfuyjPZU0FE//R3eb148aCiLmjX6FxDaBadLRnr7nYRgGaqJ/l+hgURSFCRMmDPs4m7Jo0SKmT5+e9eMOhO3zOW87IGPplySNifxxJgxNQ1cTGLo+2lPJiCAreGonIdrtqQoT0WYj3tqUanixGB4uj4tjTpybNjaxvhaXe/Cys9sD1sp7G0WUFWS3By0cSo25yqvGXM7bNE2MRJxw41r0WCQpzF9WNeZKA0VJQnQ4cJZUIIgiiUA3oTUrUbxeyEJnpAW4XE6+96PzKass5c1/vsv0mfVccsU524U291AYW99ki6whKgqemklosQhGPI7i9SFkQUQo25iaRnDl0g3GBV0dmLqOp3rCVt1qhoKh62DoSXkOURzUzUyUZKKB7tTGpSBKuMqrh2QkkCw7NIBeR/oxdlMdLQoK8znvktM46fR5OJ0OHE7LJHtzWJ+YbRhRUbApfvCO9kw2T5+11saoge5ke3yWg7ehacRam3vb1E0Urx93dd2A65xFWcFdWYNRWoGpqUh2x5CaWwxNJdLUSKKrHUjqmnhqJo65p43RQpZl8gvyRnsaYx4r520xumxiUgD0ehtmPxWhx2PE2pvp691Wgz0kujsHVV8rygqyw4ni8SXb2ofQVq7HoqnADaCFg8S7OwY8D0PPjpekxfhmuwvehq4lNYwtV++xgSjhmVCP4utdaQki7uoJOVmFapFgvzE1GBjxz4IWCfcfC4cG1ElqaBrxzja6F31Jz5KF9Cz7BiMR3+r7LLY9tqu0iZ6IE1nfkNwY8/lxFpdbj6qjiKGqJLraUUMBFI8PV0V1sjtQkjA0FUNN/k92OjerAWJoKkYiWaXSVxe+uTp2xe0jSmP6mC9vxJtrFE//edj8+QOSUDUNnWjTBhVDs7fz01Mz0cqbb2cM61P7xRdfcNZZZ2VrLjnFUFWCK5eiBrowEnHi7a1E1q8b+96B2yiGphFuWE20uREtFCTa3EhkfQOIIqZhEG5YQ2DZN4RWL6N7yUL0DKtLQ1UJrVlBYPkiQquW0rP06y2uQkWbHWdZJfSmOmz5hdhefgVhwoRk+qauDh5/PFeXnDYPV0VNMqcvCNiLSlC8A2sBTytLFJJNTaLNPmb0XyxGjiHfqufPn8+CBQtwOp3ZnE/OMA293xc70dOFq6IKRr+Bb7vDNAzUYE/amBrowjSqMTUNNdC94R8Mg+j6dbhrJ6ZVdujxWFoppKnrRHvNDjJ1ZYqyjL2oJGVMIPz1KYRLLoVIr9fkmjVw0UXJ/z7jjCxdaX9EWcZeUITN35sqEqUtdpEamoYej5EIdOEoKAZBRHa6cFVUk+jpSopl6QamZGy3LfrbI0P+S9fU1HDvvfdmcy45Jfmh3mRjTFH6jVmMDIJA/4ah3hVxJi9HQ1PBSG7MmaaZ2rvo97pEgi2JSYui1OsEb0P42c82BO4+IhG47rrU8Y0cNeAIopiax5YCt2kYxLvaCa5YTLythUhTA966SbgqqgmuWkqsrZl4R9tWnzostj2GvPKeO3cuDQ0NW39hL/F4nEWLFg31dMOmsKAAd0kZ8dZe1w9BwFFRzbrGRiLR/roVQyUWi43
qdY4mg7n2gvx8PEWlxNuaU2P2kjLa2jvI8/sQRCm5ouxF9hewvrmZPL8PMR5DC3TjqqhO3gA2qrZQ8gpYu66B6AD+ptPWrs146zbXriW0dhVaOIjkcuMoq6KhqZlEov/NYijXPhhqq6uIb+RUowa6ESQZ2enCTOtGNYm0NhFWnHR1d/c/UJaRZRmn04VpGiQSiW36My8IAm6Xh57uIGtXNTJz9jR0QyMWi47q933EdjjsdvuoaQD0YegajvwiDDWOZEvW6Nb6s9u9NZpaB6PNYK/d0DTseQVJKzC3B1Gx4ZZlTNNEqZ9OpGkdhqpizy/ClleASxSJtTUTbVkPQLRlPd4J9URbmzF1DUdhCYovj7rC4oFNoKYmmSrZBLO8DK03paOHgkTXrWLCxKlb3NzO1d/dUBPENikFNOIxcPUXahIEgdLSUsrKy7M+j40JhyI0rF3P/PmPU1RcwAmnHkl5ZRniNpqy6ekO8ps7fs8zT74AgCRJPPiXu9h9r11YvHhxzr/vm7s5bFfb06IkgySPOdGj7ZWkvnVyFbkxgiAg2R24qyeAaSYrTQSh1yJtgyqcGuhGj0XxTqhHkKTU6wbMbbclc9wbpU5Ml4vI9y5Ne5kRj2EaOqYpbbauu7iwEENTBz+HrSFJOIpLiLVueEIRRBHF60/XPhcEnMVlI5LzXrFsNWcd/91UffkzT77A3/75CMUlhVt55/gkEo6kAjckpWtvv+G3PPLUb0ZxVttZ8N5WGOui8tkiU9u5KEno2gaVub4875DcYPo2Ja+7DtauhZoazJtvQt11Nmx0DkGUMNREUrN6Ez9N0zAwEnFs4R6CgY7k6t/rz1rZnihKOIpKkZ1uEj1dyG4vNn8egiTjnzKDeFcHpm5gLygcEa/PcCjCQ/c8mtYY1NnRzf8++YrDjjog5+cfDWKx/nsJne1dGMboNkcN6xNWVVXF008/na25WGwFQ1NJBHrQQkFsefm9NlrbT526qCg4K6oJrVqWGlP8+cNroz/jjLTKEsE08YSCBFcvTxrpCgKuimpiHW3JCpFNg7eu0bNsEZhGsmRP0zC0RNJeLEvt/aKsYPPno/j8aSt/QbHhLMltimRTBFHAZu9/k1Bs2+7n0OvzUFldTuO6DXsPJ552ND6fl/aO1lGbl7XyHicYmkZ43epUeV2iuwNHcTmO0rIBNXdsK8guT9KsIRxEtDuQMpj9DgdBEJCcLnyTp2FqGoIoEutsQ+3pyhgo1XAITCPpdF9WRbSlkWhrE4ov++qIQ3Z4zyIul5NLrvwO/379fTQ1mbKpqCpj1uxtd5+nqLiAPz59D7//7Z9ZtmQlc+cdxNHHH4rdMbomHFbwHieYht6vLjrW3oKjqATGePA2jezVH4+EWYMoy5iaSmDNcuhtnVd8eRkDcd+Nw1laSWjtilQTTaKrAwwTV1XNkFQHh4qha2CS027Lmroqnn/jL7z8/GsUFhWw9/67UVQ89gwpAj1BIpEooiji9bpxuobek1JWUcI1N36feDSO1+9BGgPuTlbwHs8IsKWa5tHG0FTUYAA10I3s8WHz542bNI9ot5M3dSZ6LIYgJ53iM81dcjiQXR4QhH6mDIlANy6jekSawEzDQI9HiTY1YhoGjuLSnHmWOhx2qmsruPjy7wDJaojyyrKsn2c4dHZ0cet1v+aNf76LzaZw0ffP5uSzjsWfN3QzY6fTgXMMSdRawXucIIgSij8/zQTXWVyOMIKrusFg6DrR5vXEO5PVIYmeLtRQAHdlbVYDimkYyYoLYXDa3FtDEJKbk1vbBBR7HXY2rknvQ7LbR6wHzNBUAssXp2reQ2tCeCdMQdxK231nexdf/O8blny9jAPn7kNFZRleX27d7XONruu8+OxrvP7KOwDE4wnu/eXD7HPQnsMK3mON0U+iWQwIUZZxV9bgqZ2EvagU78Sp2AuLx247tKET30j2FEDt6cqqgp+hqcTaWwisXEJozQq0aGRU1CJFRaErEMBZXr3RoIi
7auBa4cNFDXSnNSsBxDpat2gr19XZzbU/vJ0rLriW++/+Iycdfj7v/fsj9EFa0WXLgDdbRKMx3n/7o37jn3/y1SjMJneM0W++RSb6qg7cFdUontw8EmeX3C07TdMg3tVBtLkRIx5HCwcJrFicsbV+JGhr78BeUIh/2ix89TPImzoTyTFyuj+ZnhCSeuOb/xuEghE+ePvjtLF77nyIrs6ezbxjA7qu09bSweuvvE1zQzsd7Z2Dn3SOcDod7L3/Hv3GZ+86cxRmkzvG+rffYrwiSThKSolt1NptKyzOWvmcqenJTcGNMQy0WLRfOd9I0dcENlxMw0g6CYnSgJ+sZJcHyeFEjyVlAQQpaZK8pfcbGVI9ibjabwWfiZamNk4+8gICPUmN9En1dTz817spLBr9jUtJkjj6hEP5/LOv0nLeFWMsLz9crOBtkRNEUcJRWIri8aEGAyhuD5LTlVaTbahqbyWKsFm97s0iCIiKLRWsUucdgxuihqYmXW80DcXlRpDlzZb9GWoyFaSGgihuT9KlfgBpCVFR8E6cgh6PYRoGssO1VYs2r9fDtB3qWfz1hrr5cy4+lbx8/xbfl0ioPPLAE6nADcmuy/9+/CVz9t0Nj7d/6/7WCIcjhAJhEokETpeTouICwuEosiRidwy+sqigMJ8bf3ENP/755QiCgNfnwekaO5uN2cAK3hY5I9n+7kVx9zfR1ONxgquXYsTjyfxwZW2yHG+AK3NRlnFVVBNYHkoJNCn+/DFnrpHUHF++wT1HFPHXz0Cy9w8kfRrnfSWhejSMFo3gqZ04oJuSKGeuiNkcBUX53P/nO3nx2Vf5+sslzDvxcHbceQaysuWwoOs67W390yRrVjVgdzrYa99dkQeR0gsGQjz71xf57S8eQtN0qmsrue9Pv2D+7/6CJElc+L0zKa8sQ5YH99Tm83vx+cewgeswsXLeFiOOoWmEG9ckAzeAYRBetxoyPMZvCdFmxz9lB7yTpuKbskNvJcvYCt56PJZue2YYRJobM24kmmZ/jXMtHMzpJmxRcQHfuehUbrv7OvY7aE/y8rdejeF0OjjrgpPSxux2G7vP2Zlbf/pLursCg5pDoCfIr257AE1L/k7WrWnk7v97kIrKMv7x9MucfOQFdI6hnPpYwVp5W4w8poEe3dTH0cTQtEHpcwiCMKByvtHCUNWkWJWiYM8rRJQV1FAAU8ucVxYg6eizcbAWxJzp2GiaTndnN5FIFKfTgS/Phz1D63smpkybxAOP3sWfH3oKl9vJ6eeeyJ8e+is93cEB5cw3pml9S7+xpYtWcNBh+wBJPZVPP/qCI489ZFDHzTaxWJyergBrVjdQXlE66mWHVvC2GHlEEdntQw1sqFlHEMZB9czA0eMxQquX466ZiKdmIrG2FtRQAJs/H5svL+O1CpKMq7yaSOMGmVpnWSXkoJvPNE2WLlrOpWdfTVdnD26Pi189cDO77jkb2wB0Srw+DzN3ms5hRx9Aw5r13PCjO1jf0My5l5yGxzu4OvHq2kpkRU612wPsuc+ufP3VktTPPt/opj9M0+TL/37DJWf/KDXPy646n8OPOWDU5mSlTSxGHFGScVdWI/fmwkVF6ZV1HXzwNjQNLRYh1tGKFo3kzPlmUHPq9dbU4zEwTUKrlyfla6MRos2NJII9GT0nBVHE5s/HP3Um7pqJ+KfMTKoF5kD+oLO9i6svuylVFhgORbj6shvp6dp6mWAf/jwvBxyyNy6Pi/qpE7n97us45+LTcDgHt8Hoz/Px4KN3UVldhiRJzD36II496XAW/P2fQLKSZfrM+kEdM9u0tXZw00/uSrvBPPibP6HIo/fUt+0sdSzGFaJiw1M7Kanch9BbgTG49EDSIqyDaNO61JiztAJ7UWnWShI3xuYGRswAACAASURBVNA0TF3D0DQkm20LTvUmeiyKICvoifgmjjcQ72zHnleIoPRfO4myDLKccUMzm2i6zro16Q72wUCIeHzzbkGZKCou4PBj9ue
s8749ZO0Qh8PObnN25rHnHsAwTex2G7FojJ/edAV5+X5m7jSNwlHWTjF0g+amdAVBXde36K6Ua6zgbTFq9KUOhqpPbuoa0Zb0ABRtbcJeUJT1VIOhqUSaGkn0do0KooRv8rTNNOIIiHYHhppIE6WSXe4NImKjrMdusynMmj2drz7f4NJSVlGCYwjaHeFwmJqammHNRxCEtADt83s59qQjhnXMbCJKIocesT8vP/96aqyyunzAewQ5mdOondliu8Y0TQw1QaSlkXDDmmTKY5Bt2UD65l7ywGlGAdki2RS0od3fNHTC69clVfxSp05ek6FreGqSTveGmsBWUIR30jRs/gIUtwd3ZQ2COLrBO78gjzvvu5Gdd50FwJTpk3jgz3dRUJg34nPp7OimpamV9tZOjFGQNxgIDoeDcy4+jVPOPo6aukoOmrsvdz9066j6l1srb4tRwdRUepYtSlZeAImudryTpiG6B7HZJYoovvy0jU/FkzQvzjbGRs46qTE1nnS0l5KGvEYiTrhxTbKuXZRw105ClCRkr4/gisW9zvbJpwN//YwRbZ/PRGVVGb99+DZUVUPXdbSESuO6JrxeD3kFW27UyRaN65r44SU/Y9HCZZRVlPDL+25kxqypW601H2l8fg+aVsRhRx7AfgfOweNzU1pWTEtrE+UVo9O5ObZ+QxbbDVokkgrcfcRa1iPVTBxw1Ykoybiraoh3ulCDAWSPF0dhcdarVkzDQLTZcJZXEe9oS1mv2fMKUx2j+Xl+tEgYe34R8a4OBElKenPKMnoknArcyQOaRFvW46quG/BmZDwWJxAIAcnOyMFuCm6OvHw/bS3tXHTGVaxesRaAAw/bh5/fcXXOV+HdXT1ce+VtLFqY7PBsXt/Kd8/5Mc+9/ucx6YdZUJhHwZydScQTKTeh5l4z7NHASptYjA6Zcr6CMOjHUFFWcBSX4amdhLOkLOtNOoamEmluILhiMWqgB0/tJLyTpuGqnpCm6mi320EQCK9bhRYKoPZ0Eey1a8vUZGOa5oCl2Lu7evjLH/7GvAPO4Oj9TucP9z9G9wDEowaCpmo88adnU4Eb4K1/vcfyJSuzcvytnft/n6Yr/QV6gkTC0c28Y2yQyQZuNLCCt8WoIDtd/ZprnKUVQ3KdEXprxLNtE2YYOtGWJuLtrRiqihYOEly5JOloLwiw0flMw0zLifcOJjVKPN5+PpuO4rIBV8QsX7KKe+6cTyQcJRqN8ft7HmXhF4uHfX2Q1LpeunhF/3MuXZ2V428JUZKYMWtq2pjb48K1jWmQ5AoreFuMCqKi4Js8DVdlDY6ScvxTZyI6xtiXVtdJ9KS3ZZu6jmkahBtWpznnRGMxBCmTTZqMICv46mdgLyzBlleAb/L0QeW733j13X5j/3zxzaxszLo9LuadcFjamCAI7LXfbsM+9tYoKMzjjnt+RlVNBZCs9/71g7fg24YME3KJlfPOMuJYNUcYg4iKDUdhyWhPY/MIApLNjrZJ448oSkn52Wgk5aXZEwhQVFuDGuhOuepIDiey0500NbbZcZVXJQ87yM/IrnvO5vFH/p42tsfeu2StbX7PfXblyp9ezBOPPIPL4+JH13+X4pKRqauunVDFo8/eRzwWx2ZTyCvwb9HcIR5P0NXZzVf/+4ayihIqqytGpUJmLGAF7ywQC4RpXrSWaHeY8h3r0BIq8gBajLdXDLV3o1IUc9JMky1EWcFVWUtgxeJUSaK9qBQ1lJRClTZ6UtB1HdFmwz91B9RwCFGSkRzONJXDoboe7bzrLObOO4hXX3gTgAMP3Tuj2cBgiMcShENhnG4nefl+zjrvJI45YS6CKFBQmJ8zPZVNEQRhUObFq5av4czjv0uit5lo3wP34NZfX0t+QeYA3t7WSVdnN263C7fHNep6JNnECt7DJBYI89av/k6wJVmutvD5Dzjkp6eRX108yjMbe5iGgRaLEFm3Gj0Rx+bPx1VenVMZ1z6PS0PXkzcKURrUDUOyO5JGxIk
EgiCghgJEW9Yn8/ObbI72+V7a84a3ak3EEyRUFY8nqYtdUJjH9bf+gKuuvRTTNJMBdxhBqKOtk4fvf4yP3vsvs3edyaVXnkNxaRFFY7DCY2MCPUHuuuW+VOAGePetj2hv7cwYvNc3NPOdE79HS3PSR/XE0+ZxxY8v3Kpe+XjBCt7DpLuxPRW4IRksFi74gD3PPwLFMTZ2pccKhq4RXLG0tyUeEt2dIEq4Kqpyot8BoMeiBFcu7U1lCLirapO63wMM4IIoIohJ5UJDU7HlFWDrLRHM9lODaZo0r2/lkfsfp2FdEyefdRy77DYLf54v9b/h0tMd5GdX38F7byU9HpcvXcXSxSu59+HbyR/j6QdV1ejq6O43vrEpRB+RSJR77no4FbgBnnnyBU4/54ScBe+e7gDLlqzipef+xY4778D+B8+hoCg/J+cCa8Ny2Bhq/65AXdUwjex3+Y13DDWRCtx9qMFuGEpn5UDOp6mE1q3ayNndJNy4JqPT+0AQZQXJZkey2XKS7ulo6+S0Yy7mqcee5/23P+aKC67l3bc+zGrHaCwWSwXuPr7879dEY7GsnSNX5OX7OPms4zYZ81Mzoarfa+OxOKuWr+k33rAuN3XZmqbxyoI3Oe/ky3nmyRf5+TW/4IeX3kBnhptNtrCC9zDJry3B7nWljc04Yndsruw0UYwGhqaix6LJlnW1f2fhUMlUgy3ZnWkld1nFBCO+SVAyzf4t9WOEVSvW0tnelTb25qvvEgpuqn0+dERR7Ocu43DYEQSB9tYO1Cz+vbONJEkcPu9AbvnlT5i960yOPO5QnljwIIUZVrden5cjjjkobUxWZKbvMCUnc+vu7GH+vY+mjf334y8Jh7L3t9sUK20yTBw+N4deexrL//0Fka4gE/fbkbzKotGe1pAxNJXQ2pVovZtyomLDN3laVgwPBFHCWVZJtDkpJiXICu7KmtzpeAsCitef5k4jyMqQNw5zjduTvgj46c1XUFJWzB0/v4fps6ZwxLyDhq2ul5fn49pbruSnV9yaWtFf/uOLeOGZV/nb4ws44dSjOfXs4za7ATja5OX7Oebbh7P/IXtjtyubVTKUZYljTzqCQE+IZ596ieKSQq695cqctv1n2uTN5cavFbyHiSAIuPK9zDx2L0zDZNXqVZRkqXV5NNCjkVTghmSqI9bWgrOscthBT5Rl7IUl2PMKMU0j6Y6eQwMGUZZxV9USblyLGgogO5y4q+oQttKFaRg6pqajx6NINjuCJI+IUURZRQmzvzWTzz9byEFz90VVNX5w0fUAvPDsqyz4+z958NG7hpVHVWwK+x+8Fy+/+yQrlq2mtq6Klxe8wQN3/xGAB+7+Iz6/h1PPPh5pjFYCCYIwILu2/II8Lr7ibE475wQkSaSgMHf557x8Pxdf8R1uufZXqbHd5uyMy+3awruGhxW8s4QoiiAyph87B4Le5yuZNhZLyrZm4fiiJOXEGWaz51NseKonJPPcgpAxddOn063HokhON0Y81tvanlyZOkrKcRSV5jyAFxTm85uHbmXp4pX487z88JIb0v598dfL6O4ODHsTzO1Jls1VVpfz1mvvpwJ3Hy899xpHHnvImF19Dwa73U5xSe4XU7Iic9hRB1A/bSIvP/86O86ewZz9dstpDboVvC3SULz9VzT2/MIxXY+9NQRJ6tee3oeh68Q721KpHHfNxN7/3rBJGOvVCNcTetI3Mwet+H0UFOWz5z7for2tM6Nbupjlx/Dyiv5NUnWTanA4xu/T42jhz/Mx+1szmf2tmUDS3KK9tQNRFHNSdTI2k38Wo4YgK3gnTkFyuhBtdlwV1cie3DU2GLqW0sAeDUxDJ7qRMlyfBvemGKpKz+Kv6Fn2DVo4nFNHd0jWdl921flpYzvvOmuzreOdHd2sb2imtbl9UMJOpeUlHH38oamfi0sKueyq84fsimORpK21gxt+dAeHzTmJC077Ad98tSTNQi0bWCtvizRESUL0+PDWJT0Dh2JPNlD
0RJxI41q0SAjZ5cFVWYtkG+HaeNNMcztXQwFseQUkujpSY6LNjtl7czE1jdCa5fin7IAg5m6uoiiy9/678+QLD/HKgtepm1jNbnN2RpKS6632tk7ee+tD1jc0c/gxB/Pmq+9yz53zkRWZS39wLqeceWy/qpJM5Bf4uebn3+fSK88lHI5QVFww6pZj451QKMydN92b0qRZvnQVF57+Q55/49GsNkJZK2+LjIiKgqhszqNx+BiqmjTmDfZg6jpqsIdww+oRX4ELoojk3LCpFOtoxVFYgqOkDMnuwOYvwFM9gWhTQ+o1pq5jGgaGmkBXExhDrBvfGol4gjtvvpf1Dc384+lXmHfAmTz/t3/S3tbJeadcwQ1X/4IHf/tnTjj0HOqnTqR+6kQ0VePeO+fTsUnJ4ZbIy/dTXVfJtB3qKSopHLHW+G2VaCTG+29/nDYWDITo7g5k9TzWytsiZyQicXRVRRAEbG4norSRhKppoMc2PN7bC4qw5RUSbV6PqCjY8wu3YPCbPURZwVs3mUhLE3okhOL1J40XSipwFJZimibhhlVJJ/heBEnG0HWCSxaCIOIsKaO8tDTrc+vs7OZ/n6TrXf/r5beYNrM+TX/bMAwee+RvzJ13IMt6dbiXL1nJhEnD85W0GBqKIjN56gQ+/3RhakyWJby+QbhEDQAreFvkhFggwmdPvsH6z1di97nY9YyDKZ5ajWJPVnsIgoAgSZi6jmizY/PnJ7Wy+97f3oK/fgZCFurL+zA0tTdFIqTpqYiKDVdFFRi95Yu9JZGCKKJGwjhLK4moq9HjMUTFhru6LrXBiWkQbVmPd9I0DDWBIMlZqyO3ZUghTZxclzF3qqlaWmnf9Jm5aUYB6OrspmHtetasamSX3WbhHGU7t7FGXr6fG39xDRecdiXtrZ3Iisy1N1+Jx+vO6nms4G2RddRYgm9e+ZjG/yVF/mM9Yd5/8AWOuvW8DcFbknFXTSC0dgU2fz6xjra0Y5iahhoODVvkqQ89HiO0dhV6NIzkcCYNgu2O1MpeFKUNzu4bISkKwVXLcJSUJxuVBIFEoBstlP4IrAZ7iMVi2AsKkT3erGi1+Pwe9jtoT95580MgecO78Htn0tnRTWl5MS1NG35nZ55/En988Eny8v385KbLh63foWkaoWAEp8uR5pDe3dXD/93wW/7Zq3AoSRIP/uUu6ibUDet82xq1E6p4+qWHCYcjOJwOvD4PrixvAlvB2yKrxENRgi1dtHyTrithGiaBlk5cBclNNEEUkb1e8qbNwjCMtJxytjFUldCaFak0jR6LEly1DN/kaVtd2QuSjLO8ivC61ZiamnTAyaCCKDtdJLo6CK3pJm/arIw3gsGSX5DHzb/8KcsWr2TFstUccMheLFuyknvunM+vH7yFN/75Du1tnRxz4lwmTK7l1w/egiAmV35b0sTeGp0d3Tzz5Iu8/cYH7LjzDM695LSUp2QwEEoFbkhK4d51y3089PivctoEM96QJImikkKKyJ1S45CDt2EY3HjjjSxZsgSbzcatt95KbW1tNudmMc6w2Wx0N7bTvHA1edXFaWqLAJ6i9NVg32pXJGmBpgY2iPgIspJ0Yc8Cpmmm5dch2Tk6EMEnQRRRPD789TMAs1eHxUQLBXvnK2AvLMLU9VSJoaGqWZETgGTJ4B5778Iee+8CwMIvFrFs8UrOPfly9jtoT3x+Ly88+yo/ufEKYtEYzU2ttLd2UlJWnFHzY2uEQxF+ffv9LPj7q0BStOp/n3zF3b+/BcPQM5Yh9nQFMPSxqRezLTPk5Nzrr79OIpHgqaee4qqrruKOO+7I5rwsxiFOp5O1Hy9m1QdfM+WgnfFVJFcdoiyx04n7YnNv/rFRtNvxT52JvbgUZ1kV/vrpWWudFwShXzAVpIGXQCbfryAqNkRZRpQV3FV1+KfNSq7eRYlwQ9+TRuYuzmyx865JidhEPMHrr7zDs399iRNOOYpEXOXcU67gtHkXc8pRF3LxmVfR3ta59QN
uQiQS5eV/vJ429vWXi2lqbOHwvU8lHIpQXpm+OfvtM+aR98pLUFcHopj8/8cfH8ZVWgyEIX87PvvsM/bdd18AZs+ezcKFC7fyDottHVVVKZpUwar3v+ajP73KzGP2wpnnxuF14cxzI2/BdVsUJbBLuMursz4vQZbx1E4kuGo5pq4hSBKe2okIgzQ7NvTeDktR7G2Vl9GhV/jKRBAlXFV1OW3/LyjK5+lXHuaxP/yd9rYOzjr/ZOomVvPsX19Mq0BZumgF7/37I4476YhBHV8AfH5vmpSpJEkIYvJGd9v1d3PfH3/Bk48+x8qlqzj6hLkc1LEG+bvXQCSSfMOaNXDRRcn/PuOMYV2vxeYZcvAOhUJ4PBtKXyRJQtM05M2sluLxOIsWLRrq6cYNsVhsu7jOTMRiMSbW11EyrYrWxQ18+PDLlM6oZfZp+7NsZX+H8pHE5XJRUjsJQRAwTWju7CS0rnFA760oK8Nm6mg9XQg2G0p+MY3NzcTjcWw2GyUlldh6W9nbunsINjVnbd6yLCPLMvF4PJXmEUWRU885FoB4PEZPoJsVy1b3e++KpatoaGggGOxvVrA5XE4XV//sMn565W2psdPPPZF33vgPAMuWrOQXN97Dzb/6MfF4AkE0cc+5aEPg7iMSIXH11azYZZdBXvH4YjS/70MO3h6Ph3B4g1atYRibDdyQFIiZPn36UE83bli0aNF2cZ2ZWLRoEf7ifOZccBRaLAGCgGxXsHuc+MdY1161Z2A1t6ZpEu9sJ9K4YQNWC/RQN3l62sZl39+9ypOdPD0kNw7XrWlk3ZpV7LLbjuQX+Dfbtn7iqUfz3FMvp43NO2EuVVX9jQq2RmFhES+98wQLP1/EhPpaPvvoS355y32pf//WnjtRXFyErCS/72Zz5puVrbl5m/8ujMT3fXM3hyEH71122YW33nqLI488ks8//5wpU3JXV2oxvrB7nNg920btr6lpxNpb0sYMNYGhJrLuvWkYRlKdkmRJ3q3X/ZrXX3kbSDZ5zH/ibr61x04Z31s3KVlt8vt7/owoinz/RxdQUVU2pHl4vG48XjfVtZWEg2FWr1iHP89LT3eQufMO4uQzjk0FbgC1rAxbU1P/A9VYTUK5ZMjB+9BDD+X999/n1FNPxTRNbr/99mzOy8JibCAkTST6kUVDh0BPkBVLV/Pc0y9RP20SRxxzMPFYPBW4ATRN586b7+WBR3+ZUWbU5/dw8OH7ssvuOyIIZE3O1e11c8jh+7HrHjthAk6no1+zSdsPfkDljTemp05cLrjtNixyx5CDtyiK3Hzzzdmci4XFmEOUFVzlVWndn7LbmzVtb8MweO+tD/nJFbemxp7760vc+8j/ceBh+3DMiYfjcNp5+40P+ODtj7dYkicIQk70o2VF3qKgUuDoo6msqIDrroO1a5Mr7ttuszYrc4zVpDNCGLqRpu0xHjF0nXgohhZXkW0yNrcDSdn2P0KS04V/6kzUYADJbkdyuLJWDtjV2cPv7/1L2tjypavQNI19D9yT2392N+FwhONPPpJfPXDzgBxkRoUzzrCC9Qiz7X/zRplYIMK6z5bSsaqJ2t2nUzChFPsW6p3HKqZp0t3Qzjv3PEciHEO2K8y56ChKplQjKZlL47SEihZTkZ22tBzpeKPP/UeyO3JyfHmT0kKf30s8nuDmn/4yNfb4H59hyvRJTJk+adDHN02T7q4eFEXJur6GxegxvpeCY5xYMML7D77A/576N2s/XsK7v/sHqz/4BkPLjYRoLokHI3z48Mskwkl1PS2u8uHDr5CIZBb+j/aE+eLv7/D2b57hq3+8TyyQOxft8UxBYR6X/SjddOHQIw/gk//8r99r//XSvwmHIv3Gt0RPV4AXn/0X3/3Oj/nx929m+dJVqInMVn09PUHaWtppb+scUPepxegyfpdD4wAtptKxMn0XfvG/PqVm92k4/eNrBWQaJqG2nrQxNRpHT/S/EcVDUT74/Yupa+9Z30GwpYs9zz0cm3t
gq1fTNIcsB5sIx4h0BWlb1kDhhHLcRX5ERUKLJtASKrLdhsPrSjWejCaCILDbnrP52yt/4KV/vEb91Insc+AeLF+6ut9rd9hpGuIg5/zBu59w3Q83FBN88p//8cK/H6e0vDjtdW0tHdxw9R188M4nVFaXc9vd17LDjtOw2ZLpoXAoTCgUQVM1nC5nTr0ZLQaGFbxzSYbv2XjNe4uyREFdGZ2rN9T0uvK9yLb+HyEt3v+m1bxwNVpCw7aFe5ZpmER7Qqx8byHxUJT6A3bCVeBDtg88v6ypGqs/XMTnf9tQqTFt7q6Uzazjnd88i6EbOPxuDvjBifjKsld7Phynda/Pw9QZk5k6Y3JqrKgonyOPOyTVqj595hTmHnUgDOKG1tMd4Om//CNtLBaL88V/v+awow5IjYVCYe68+d6UgUDD2vVccuaPePHtxykuLaK1uZ2nH3ueRx54HE3TmT5zCr975A6KS3MnujRe6e4OEOwJEQlHKCzOp7CoIGea9OMzkowTFIeNsh3q0sZ2mDcHuyc3udNcYvc4mXPhkRRNrgAgr6qYfS8/DrvX1e+1oiT228hUXHa25tkbC4R57bYn+Oalj1jx9pe8evNjBFu7t/ymTVAjMRYu+CBtbMlr/0UQhFSlRqwnzKePvU48NHCvxy0RiUQRUbj3rod5+rHnaW8dvKbIpnzz1VKmz5zCY8/dz5+f+R3nXHwqjzz4BLo+8JSbzab0W2EDlGwSdKORGP9599P0sWiMzs5uWlvaWbu6gYfufRStN923aOFS/vDA4yTi/b0+M6FpOu1tnbS1dhCNxrb+hnFKV2c3d954D0ftdxonHXE+px51EU2NLVt/4xCxVt45xO5xsvs5h9G+Yj2dq5up2rkeT5F/3Dqxuwt97H3JPAzdQBAFHBkCNyQD9Y4n7MP/nvp3amyXUw7YojAVQOvShrSAapom37zyEXt8Z+7AV98m6JuYFZiG0W/109PYgTGIQLglVi5bzVnHX5YKrH/6/V/5y7P3DcsLcpfdd+TYg84iFounxu770y/wegfuxuJ0ObnsqvN5962PCAZCAOw2Z2dqJqR3XdpsClNnTE7Ls0uShN/v47mnXsrohfnlf78mEo5i24JeDUAwGOKdN/7DXbfcRzgY5sTT53Hx5WdnrQ59c6gJlZ7uIGDi8XpwOO05PR9Ac1MbLz73Wurn1pZ27r/7Ea679Yc4ndlfsFnBO8c4vC6qZk+mavbkrb94lDEMA1M3N1s9Agyoc1K2KdTuMY2yHeoINnfiqyjE7nYgyVu+aWVyoBEFMWP6abPnttuo2X0aaz7c0FJcOqOWnvXtaa8rn1m3RaGsgRIKhrnvV4+krYgb1q5n6eKVzBli8O7s6OaVBa9z/5/v5LFH/k6gJ8iZ55/ETrvsMOhjVVSV8Y83HmXpohXk5fsoryztp7vtz/Nx4x1Xc9GZP6RxXTN2u43rb/shiiLz/r8/5gfXXtKrCbNhE3O/g/caUOVKe2sHP92ohv2JPz7DpPo6Tjzt6FQ3abbp6QnyzwVvcO9dD6MmVE4/90TOvvDknN8wGtb018pZtXwt8WjcCt4WuSPaHWLF218SbO1m0n6zyKsuxuYa+gfO5nJgcznwlgz8C1NcX4kzz020O1mZIkoiM47aA9mmYBomiUgMURJRtrCKUpw2Zp+4LwW1pTR9tYqSadXU7TmdaE8YX3kBwZYuKnacyE7f3g/FMfzgbZpmKp2wMZo2dCPl9/79Ib+54yHKK0s54piDcXtcFJcUDsgNflMkSaK4pDBlprA5qmor+MtzDxCNRLE77Ph8HmRZZs99d+XNV9/lZ7dfxe9+9QcC3QEOn3cQJ595zIDKPz/98It+Y6+9/DaHH3PQoJ4iBkPj2vXcdv3dqZ//cP/jTJ9Zz2FHHZiT8/UxY9ZUZEVOs6k74tiD8fpzc51W8LYgFgjzxp1PEelMqs+t+2wpcy48kqpd6kfUSdzhc3HIT05j3WfLiIejTJgzA4ffQzw
cZf0XK1nxzpc4/G52PH4fPMWbTz/ZvS4m778TdXNmICkyoiTi8Lk54AffxjRNJJuMLUuP0V6fh0uu/A4ff/Df1Mq0pLRoyB6SuqbzwTufANDU2MIjDzwBJE0SZs3OnQCSIAgUZXhSOOWs47j7/x6gcV0Tt/36WqpqyvHl+cgfoM3axpuwfey483Scjtzt+7z71of9xl598S32P2TvNEu3bJJIqPznvU/59QM388Bv/khnezdHHnsI+xywx7A2s7fEmA/euqahRRPIdgXJljuR++2ZUFtPKnD3seifn1A8pWqzee1cIAgCzjwPUw7eOTVmmibrPl3CJ49ulEtcvI4jbvoOzrzNr2gEUei3snb4Ml+LoRvEgxE6VjZhczvwlRdu9rWZmDZ9Mk+88CB/f/wFyipKOf6UIzMGwoEgyRIHHrp3P0OEvQ/YY0jHGyqaqtHS3MbfHl/AvgfuyazZM5BlmcLi/EEFo5raSk4+81j+9vgCTNNkxqypnPqdE3LatLXjzjP6je286yyUHJ5T13Xee+sjWpraOPG0efj8Xt5/+2M+/egL6ibmRqBrTAfvWCDMktf/S8uitRRNqmD6EbuPu/ro8YCYIRct2WSErZWHjACJcIwV73yZNqbFVbrWtW0xeA+GSGeA1257AjWWrJ7Iqy5mv+8fP+AA7va6sTkkfnb7VVnJ4+6+1y6ccvbxPPPEAiRJ4uwLT6F+6oQhHaurs5vPP13IO2/+h/0OmsPsXWcOKPfb3tbJiXPPTdme1U+bxG/n30p7aweyLFNQlD+gp7K8Aj+XX3MRF3zvTDRNxzUCNeJTZ9Rz9AmH8eKzMEYuvAAAIABJREFU/wJg1z1nc8Sxh+Qsxw5Jwa6zLziZc076Pl9/uRgAh8POZT88N2fnHLPBOxGJ8+njb7D+i5UAdK9ro2tNK/tcdsw2Izc6VnAVeMmrLqZ7XdKNXBAEdjxunzFR0ihKIo4MN+xsPRFoCY1vXvooFbgh+VnrbmijbMbAPVk1TctacMgvyOPKH1/EhZedCUIyNTOUDa9QKMz9v/4jT/XWej/z5IucfOaxXPmTi7e62fjGq++mAvek+jquu/VKrrjwOpYtXkl1bSW/evAm6qdOHNAq3Of34MtR3jcTBYV5/Pjnl3P51Rei6wYutyPnm5UA9dMm8oe//oZHHngCj9fDd39wLgVFudOxH7PBW0uorP9yZdpYx6omtLhqBe8s4/C62Pd7x9G2rIFAUycVO07E5nagJbSMTTi5JBaIEO4IYJomniIfDl8yx926eB1aPNnWXTKtGldhdkwPDF0nGujfch7PMDaSuD0u3J7h3aAioSjPPPlC2tgzT77I2ReezLLFK6mZULVZk+Ip0yZw359+QSQcxeVycvvPfsOyxcnv47o1jVz2nR/z1Evzt6g2OJr487z487JnjDEQvD4Pu83ZmekzpyBJ4maNM7LFmA3eggA2p51EZEOdqyhL47ZDcaxj6DrL3voCxWFj2ZufoydU5t5wFt7SwTuQD5VYIMy/736GQFOyycVTkseBV52Ep9jPETd9h651bTi8LlyF3qytvG1OO1MO2pmWbzY45Ug2meKpg3egGS+0t3ZyzknfZ/KUCcx/4tf96tG7Ort5962P+PsTL+Dze7n/z3eydFG6jV1ba0daDbrFBkZK/GvMRkK728nOpxyQNjbruL2QR6DYfntk7cdL6FixnuavV6NG4xi6wdI3/4dhbF4/Ots0frEyFbgBQq3drP1kMaIk4czzUDFrAgV1pVnfRC2cWMY+lx1LydQqqnap59Cfnp6xc3S84fa4+PYZx6SNHXvS4XzwbrKaZfnSVXy8iQCWYRi89vLb/On3fyUUDLO+oZklXy+nblL6plt+gR+7w/oujiZjduUtyhIVO07iyFvPpaehLdno4XGiDELnwmLgiBkacyRFRhhMh8wwCWVohQ82dw1LpGog2FwOKmZNoGhSOYIkotht6LpOtDtE59pWHF4X7iLfiFbeDIT
urh4ScRVRFCkoyuuXc3d7XFx65Tnsvf/uvPvmh+y82yzsdlua8UNTY7r/ZCgY5tUX30ob+9NDf+X2u6/jBxdfT0tTGwWFefz697cMuFzQIjeM2eANyYYLxWnDU2R9SHJN9bemsOjlj1OSr7Jdof6AnUZUea92j+ksee2ztLGJ+84asVrzjZuSwm09vP5/T6bl2fc8/4gxE8Bbmtv46RW38umHn1NeWcrtv7mOmTtN71fHnF+Qx/4H78X+B+/F4q+XcfKRF6T+TZIkDjli/7TXO10Ops+cktYqv2jhUuwOG0++8BDxWByb3UZ+vn9ca7RvC1i/fQsAnD43h/3sTBr+uwxD06nZdSp2/8gGKnehj/0uP56FC/6DaZrscNQeg+rQzBZqLMGXz72XCtyQrC2PdATGRPAOBkLcdv3dfPrh50Cymee7Z1+TUgHcHJXV5fzuj3fw8H2PYbfbufyaC/j/9s48Psry3N/X7Hsy2TcSEiCBsAQI+yYKIogLVaFahUqtWm1P1Wq11vbY1iqnPUr1/GzF09qjWFERV6wLsiPIJibsRAJhCQSyJ7Pvvz8GRobJnskkkzzXP37m4Z13nncmft/nvZ/7/t5JycHHKxQKfnjP9/lq8y5Kvy0D4KZbryMxKT4iGRuCtiPEWwD4i1q0Rj15M0a3fnAXodAoSR3an7isZPD5whZ39nl92E1WPC43MoUclV7dojmY1+3B3hCabWI3hceFsCNcNIGSy2XY7Y6QZg02m52GelOL4m2I0XPFjEmMHD0UiVTWbPpeckoi/3jreawWK3KFHJ1O26HS/Jbw+XzUVtfh9XpRqpTEGntoe7cejBBvQY8jnKmgPp+PhrPVfPm3j7DVmVEZtEy573ris1OaFXClTs3A6QVB3uVytZK4zFB71a6mvraB7Vu/5pMP1jJ0RB4LFs5DpVaSPzwvsPIGf5PgtgpsbBti1QmJcc2mEXYWl8vFkYOlPP7AHzl98gwTphTy9F+eICU18t9vNNNjs00EgnDgMFn56u+fYKszB15ve/nfOMzN+0pLJBLSC3IYv/gaEnLSyBg1kKsfvw2VoXM3FXuDhcbzdVhrTTitrftaO50uVr7xEb/6+VNs2bCdl/9nOfct/CVup5s//PdjZPbPAPzWr//1wm8xxESuEKYz1Nc1cu8dD3P6ggvfzm3f8PRv/hKwrRW0DbHyFvRqvB5vSBaL40IIpSVUOg39J+STNjwHqVzWaQdCS00jG//yLtaaRpDA4FljGDJ7HKoW2sI11Dfy5qvvBo0dLTmO2WQhMzuD19/7Kzab3R92iDVETeqeudEc0otz59Y92O2OqLkB9QTEylvQq5HKZcSkB1cBaoz6NmVKSCQSf3pqB4TbbrJia7Dgsjv9G6Dvb/ULN4APSr7Yg6OVGLpEIkGrC437yxVyJBIJCUnx9MtKJzklMWqEG/xeMIrLTOaGDM/tUuOo3ogQb0HU4fP5cJisOG2tV/ipDVqm/OT6gIDrk41M+495KDsZAmkOj9tD7YlzbH7+PT797avseXM9brsTS01DyLFNjV1KfIKRX/72p0Fjs+ZOR9vJsvnuxmDQ8/TSXwf8WlLTk3nq2cfR63XU1za0ub1aT8Nhd3TKx729iFudIGL4fBeyPpwupHI5CrWy3atah8VOxf4yjm4sRqVTU3DzVAwp8S12/zGkxHHlQ7fg9XiQyqSoDNouyx13mm1sev69QJrhqV0lyJUK8udOYNtLqwPHSWVSYtNb9gWRSqVMmDKGjza8zrbNu8kbMoBBgwdgjPLMDI1WzVWzpjJm80gcdgdanRYJ8Mrf3mDLhu2MHDOMu+6/o9UGEj0FU6OJkkPHeGv5+6T3S2PhXfNJTk3s8voEId69CIfZhrmqnpqycyTl9kMXb0B5IaZqN1lpOFNN3ekq0oZlozHqUWoj+6htrTPTUF6Fx+1BpVNjrTOTPnJAuxojVB45za7X1gReVx0t59o/LEYb33KmRXv8uTuDrd4clB8
OcHbfcfLnjmfgFQWc3HUEbZyeMbfPaLWnJ/h9MvQGHTkD2+5wGA2oNapAX0mzycIfn3iOz1ZvAODA3iPs3XOQv7765y63j+0sGo2GPTv38cDdTwTGPvnwC9759J8d9nRvK0K8ewkum4PDn+/m23XfBMZGfX86A6eNwO1w8c1bGyn/5igA+977kol3X0u/wtwmbUy9Xm/YvY9t9Wa2/u0jGs74e0lq4w1Muuc6XDZHm8XbabVzbEtwWy2Py0NV6Rn6jx8S1vl2FFWM1t9z87t2j8SkJyCRSBk+bzJDr5+ABEnEbibRgM1qY82/NwWNHdh7BJvVBj1cvPFJ+eeyFUFD1ZW1lJYc73LxFjHvXoLL7uTohuDCjQOrt+O02nHbXQHhvsi+97eGbJg5rXaqS8/y9b/WUbJuD/ZGS9jmd77kdEC4Aay1Js4UleJ1tb2Du1QuQxMXusLWxvWcDAWFRsXIm6cFbAXUsToKvjeFHa98StE7m8Dr63XCrVAosJitNDaYWj+4KSSSEPtWuUIeFeX3Egmom2jp1tRYuOn5346gTfh8PnxeX9CYx+kGH006A17+aO/z+jh38CQ7/vlZYKxs20Gu/MUtqGM6b3FpqQrdnLPWm1G0I3QjVyoYdv1EKvYdD1gFJw7KwJDS9hWO3WSlquQ0deVVZI7JQ5cQG9bwkVKjYsDUEWSOHYzb4cRhtvHNyk3Ulp2DY2exVDcw9f7e01DE6XThccFvH15CQ4OJO++5ldHjRrSrItMYF8Pjv3+AXz3wx8DYvf+xCL2+53fNcntcPPSre1l0808Djajz8geSmZ3R5Z8txLuXIFcqSMnP4vzhU4Gx7En5yNVKJDJpUKccgNwZo1DqvhMth9nKoU93Bp2zsaIWe6M1LOKdOTaPQ5/sDDTpBRgwbXi7Nyy18QZm/+6HNJypRqlVt8vb226ysv3vn1B11F8ccuTzr5l4z1wyR+eG1YDroqGatdbE58+9HvRvNccq/DfVXkJtdS0L592P40KGyNc7iln2+rNMmT6+zedQKBRMmzGJT7a8xaH9JQwanENiUkKnm1FEAqfTyYC8HD7a8AYb124lLSOFwrEjuqw69VKEePcSVHoNE340hxM7DlH5bTnpBQPoNzo3kNFxxX98j+PbDlB74jxZ4weTMiQLmfzSn1+CpKk4d5h2zDVxeq58eD77P9qG1+0hf844jBmJyNr5aCyVStHE6jrUy9RldQSE+yIHPvqK5NyMsNygLkcilaDQqHBdktKo0KqQyNr/nVrMVmxWGxKJhLiEUPvX7mLntm8Cwn2RFf/3LqPGDG+X+F7cmM3snx7uKXY5Go2azP7p/PDu70f0c4V49yLUMVryri5k4BUFyJWKoNWkOlbHkDnj8Lo8yJvwRFcZNAyfN4myrQdR6dVUlpSj1KnDFp9VqJQk5WYw5f4bwBde/5K2cnlY6eK8JDJZpz3DNWo1tnoz5w6dRKZUkJSbgVKvZswdM9j5z8/955dKGHvHzGazTLxeL06LHalcFrSJW1Ndx/P/9TKfrV5Pckoiv/vTLxlZOKzL22y1hcTk0JBVYnIC8iaaWgvCixDvXoZUKkXaTChCKpUiVTW9YpNIJMRlpdCYU4O5qp7xi2ejTzKG3QJV1Yb0uK5CqVMHhY+yxg9myKyxHPz4K2RKBYOmj0Rj1LXoONgccdpYPn/qX7guxOK1cQZmPn4baSNyuO6Zu7DUNqKLj0GhVSFrQtgcZhun9xzl+Jf7UcdqGXnLFeiTjXi8Xt745ypWv/s5AGdOV/DTOx/j0y/fDqt4O+wOGhpMOB1O1Bp1mzMlhgzLY8iwXI4c9G+IG2L03PvzRVFV8RmtCPEWAP7+kRufW4Wl2r+xWPbVIab+bB4aY043zyx8qGO0XPHz73FqdwnmqnoGTCtg3ZI38Xr8G7rHt+xn9u8WoW0io6UlvF4vx7fsDwg3gLXORMWBEwyYMgyFStliHrrP6+NMcSnfvOXPc6Ycqkvf5to/3In
V7WLT2m1Bx7vdHo4dPUFqenK75tkcNquNTeu+4ve/ehab1UZWdgbLXn+uTSGMhMQ4nv/7U1ScqaSxwcTwkfkRifcKRKqg4ALmqoaAcF/k0Cc7cZi7z8O6K1DH6MibWUjBLVdQurE4INzgT7csLypt/0l9vkAHoktxWtr23Tmsdo59eSBozO1wUXe6Co1GzeChg0Le0y8rrf3zbIbGRjO/+cUz/rxq4NSJMzz16+doqG9b6l+jqYGxE0Yx45ppJKckIuvAk4ug/QjxFgA0mW0R9hZoK1ZAdjZIpf7/rljR2ju6DKlUikQW+ucvbWKs1XPJZAy8soBL233KFDIyx+S16f0yuazJXHV1jBatTsODj99L1oXUM6lUyn0PLQ5rV5uGusZAmttFDu0vwemMTo+RvoIImwgA0CXEEpMW/133dgkMv3Fy+DYWV6yAe+8F6wUr0JMn/a8B7rgjPJ/RDqQyKYOvLuTEV4cC9rAqvYaMkQM7dD6nzMvVj/+AI2t2I1MoyL92fJszWBRqJQU3TaWypDyQmZI6LBtdvN/DJC09hdfe/StWixWlSole78/MCBdx8UbUahV2+3dhn/GTRgeMowQ9E4nv0sTbdrJ27Vo+//xzli5d2uqxhw8fJj8/v6MfFTVE83XaGy2c3X8CU2Ud2RPy0cbrUbRj46nFa8/O9gv25fTvDydOtHhel82B2+lGoVY2mSnTUTxuD/YGCyd3HkamVJA5Ng9NjK5DTxwXr93tcCGRStqdAun1eHGYrDRW1KAyaFHH6iLWL9PhcLK/+BBPPPQM585WMm7SaJa88Js2d7aJ5r/5zhKJa2/uMzq88n766afZunVrn/3ReiPqGB0DpgzrmpOfOtW+8QtYahopensTdacrSR2axfB5UzqU490UMrkMXUIMQ+dOCMv5gA7fXKQyKRqjHo0x8qX+KpWSwnEFvPnR/+L1eVEqlRjjotu5sC/Q4ZX3p59+Snx8PCtXruT5559v9fji4mJUqt6fPmS32yPia9ATaenaB86cibKiImTcmZbGsfXrm3xPSlwSO176d1AnnLSCHAbPm0BNY114Jh0mxO8urr0r6dDKe9WqVSxfvjxobMmSJcydO5edO3c2865QVCpVn1il96ZHSK/Hi8Nsw+fxIlXIUBu0LRaztHjtzz4bHPMG0GpRPvtss+8xVzWEtDCr2F/G2DtmkpyR2qFr6ip60+/eXsS1d33YpClaFe8FCxawYMGCsE9I0LPxuPwdYXb881Ns9RaMmUlMunsux7cdQBtnIHNMbvtKyi9uSv7mN/5QSVYWPPNMi5uVMoUMqVyG95JMCH8Odtea3AsE0YBIFRQ0idNi48u/fYSt3m8LW3+6iq/fWIfGqKdo5SY2/88H2ButrZzlMu64w7856fX6/9tKlolco2L0gukBrZbKZYz74ayIbeQJBD0ZkSoYRTgtdjxuDxKppMsFzOVw4bYH5/lWHzvL0Ov8m3sNZ6pxmG1d6k2tUCnIGj+E1BHZ2Oot6OINKLTq8OefCwRRSKfEe8KECUyYEL6dekHzWOtM7H59LeePnMKYkcSEu+ZgSI3rMnc5hUqBXKUI8v2Oz0ml8VztdwddpqFd0bPvor3qxZznvoLT5sBaa6JifxlxmUkYs5LFE4cgCBE2iQIcFju7Xlvj9+r2QX15FZtfeC+kE044UerUTP7J9YEemIaUOEbePI2jG4oBiMtOCRTwuB0urPVmElWx/h6OvcivOpLYGy3Un6nGXNNIxf4yvvjjG+z/cBtbXvyQPSvW9zqrAkHnEGGTKMDr9lBZUh40Zm+0hnTDCScyhZykvH7MfnIhXrcXiUyKy+YgvWAAsRmJpA3LRm3Q4nG7OXfoJDte+RSvx4tULmPqT28keXC/Nrvz+QtbpC12gG8Jj8uN0+pAIqHVzvAOix2v24NcKUdxWe/MQKHM+TpUeg2aWF3ErGttDRY2Ll2FubKeSffMZf+HwWZUZ4qPMWrB9F7TgUfQeYR4RwE
SiSS4dB2QKeXIlV3788nkMjSxlxSNGPWMmn9F0DFOs53dr68NGDx53R52vbaGWb+5o9ViGpfNQcPZGkrW7kGp15A/ZxzaOH27LFkdZhulm/ZSunkvCo2KUQumk5SbEdKhx+fzYaluYPe/1lF/qpLkIZkU3nZVUFGMuaqe9X9eGShRzxg9iLF3zEDVxR1dvF4vxzbvDaRFSuWyJrvtXGqiJRCIsEkUoI7RMuGuOYEQhkwhZ8KPZqPQdn9hhNfjDeoUA/6VtM/rpfFcLTVl57A1WGiqFqzhTDUbnn2HM8XHKNt6gC+eXoG9mVCQy+7EcZlLn8/n4+y+4xz89w4cJhvmynq2vvQRjiayYOyNVja/8D5V35bjsjs5U3yMna+twXHBDdBpc7Dv/a1B13KmqDSQbdOVeF0e6s/UfPe5xccYMG140DHGzKR2t4wLB2azhZrqOtxuEQrraYiVd5QQm57A7CcX4bY7kasUKLSqLl95twWZUk5segINZ78Tn4k/nkvRO5s5c8FeVWXQcvWvbkWXGBs4xuVwcuSLPUHnctudVB4+RfakoYExj9uDpaqB/R9uw26yknvVKFKH9kepU+OyOwPt1SzVDRxZ8zWm83VUfnsafXKw657H6cZS0xg0VnnkNN4LplRetwdrXagFqq3BgrFf2zw+OopcpSBn8lDO7j0GwMmdhxm/eDYT75nL6d0lxGenkjNpaKuZPV6vl4a6RuRKOQZD58rsvV4vZ05X8JdnlnGyrJy537uaW35wfVjdDAWdo/v/7xe0CalM5g9DtNPXw+10IZXLuiwrRW3QMvVn89jz5gZqyypIKxiAKkYTEG4Ah8nKgdXbGXPHzID3h0QiDYk5gz+75FIcJhvr/vRWIL5fc7yCiXfPpd/ogZgr69mzYgOW6gbispIZ98NZ7Fr+BTFpCSHnlSlkyJTyoHCELjEmkHao1KnJmTyMopWbAv8uVykwZiR2/MtpB4mDMhh925V8u+4b5EoFSp2apEHppI8YgFQubfX3q69rYN1nm1m14mOSUxL5xRP3kdU/A3k7DbIuUltdx6KbfkptjT+U8//++x/YrHZ+8uCdKJXhMwcTdBwh3r0Uh9lGVekZTu44TFxWMjlThofN0OlydAkxTPzxHMyNZgzGGM4fCnUPNFXV43G5A+ItV8oZOnc85UVHA4KqTzaSMCC4yUBNWUXIxmzppr0kDkzjy79+GMi4qTtVSfGqLYz6/nQMyaGdXBRaFWMXXs3u5V+gMmhJyk1n8DXjUF1Iv5NKpWSNG4zP56Ns6wHUF+L7SkNkNghVOjUDp40gszAXJG3P47ebrPi8XtZ9toWnfu139zx84Ft27yhi9YY3SE7t2M2nqrImINwXWf3e59x2500kJYfeHAWRR4h3L8Tr9lD21UH2vb8V8MdQT+85yvSHbu6yXGGlVs35k2XEpyYSn50aUtaeM2koysti9NqEGK79/Z1UHDyBSqcmcWB6SMl9U6ECTawOn88XkipZe+IcxoxEVE0IrlypIH3kQOY+/SNs9WbOFB/DXFWPxvid9apKr2HQ9JFkjRuMVCYNmW9XI5XJ2mU5YK01se3lj0kaPYB33/w4+N8sNkoOl3ZYvPVNhF1SUpOQy4Vk9BTEhmUvxGGx8+26b4LGGs5UB/VY7EpUBg0zfrmAhIFp6JNiKbh5Kv0Kc0MqI2VyGdp4AwOnjaBfYS7qJp4MDMlxJA76rpeiQq1k+LxJSGWykLBLbHoC0ha6lsvkMs4fOc36P6/kyJqv2f73T9j5f58H5U9LZVLUBm3Ehbu9OK129ry5gbpTlXhsTpKbWA0nJLatiXBTxMTqueUHNwReq1RKfv3UQ8TFx7bwLkEkEbfRXohE4t9IDBnvQIuvjiBTyInPTmXq/Tfi83pR6jQdai8G/pX35Huvx1zdgNNsIy4rGZVBg88Hk++7nu1//wSnxY423sDEu+e2+GThNNs4sPqroLHzh0/hsjujLn/a43JTffwsABVfH+X
+n9/J7p3FWMz+TJupV07oVIPiWGMMD/7qHhb9eD7nKioZlJeDUWxW9iiEePdQPB4PTrMdfD5/dkkTm3vNodRrKLhpKtv/8WlgLL0gJ+KpZuESRHWMtsnwSeLAdGb/50I8bg8yhbzVkJAP8DWVK92Kpb3H7cFptmGtNaGO0aHQqlBqu9ebXqaQkzggjYoDJ7DWmji36QBvvfsS5efOk5CcQGpqEnEJnRNbY1wsxrhYBuRmh2fSgrDSp8Tb6/Vib7ByavcRPE432ZOGoo7VIWvhUbs7cNkcVBw8QdHbm3Ba7fQrzGP096e32QRKKpWSMrQ/s59cxNn9ZcT1S8SYlRx1q8vWkMll7eo8o9KpGTx7LHtXbQmMxWenIG/lptZQXsXGv7wb2FgdfuMkBl01CmU7bqjhRqlVU3j7DLYt+5j601U0lFejdPmYOGUMcpEN0ifoU+Jtb7TyxdNv4LxQmHFkzdfMfnIh+qSe9TjosNjZ8cpngdenvy7BkGJk6LXjW4zpXorX5V+N9h8/GIVa2a6Ve7Ti8/mwN1pxWuzI1QoUKmWgsAn8lYvZE/OJTUvgxI7DJOSk+n3JW1ix201Wdv9rXVCK4cGPd9B/Yj5ylQKpVIrX68UYE/lYsC4+hiseuAmPy41UJkOpU/e4hYig6+hT4l3+zdGAcIM/bvjt+iJGfX96l+VBd4S6U5UhYxX7y8i9clSTmRSXY60zs+XFD2g8W4NEKiH/2vHkzRgdJGQd5WKWh8/r9zHpSat5S00jG59bha3eDMCgq0Yx7PqJqC65bpVOQ+rQ/iQPyWzTb+7z+kKKe3w+H7Y6M6d2HiF70lDKth2k9lQl3sk2Egelo9JF7jsRToN9l56jWBHA5w2NbTY11t3ENlFkkjAgDbm69cdht8PFwX9vp/FCxaPP6+PQJzuxm9rZOKEJvB4v9aer2PDcO3z8+Ct8+dePQoStu3DZnex7f2tAuAFKNxY3WSoPtPlmrVAr6VeYGzSmjtHidrrY/9FXFL+7xV+mv/cY25Z9zKldJXg9nmbOJhCEjz4l3pljcoPim1KZlLyZo3vUqhv84jD8xkmBDI24/inkzxmHrA3Vcm6ni7qToSt30/nON+x1mG1sefGDgIFS7Ylz7Pi/z8JiVepyOLHWmzFV1mNvxgulJTxOd7DX+AUstZ27uchVCgq+N4Xcq0ahidOTkp/FpHuu48Dq7YD/aS4pr1/g+G/XfRP0dCcQdBV9KmyijtEy+8mFHNu8D4/LzaArR7VrwytSKHVqcmeMJnvSUHweLzKVos2Px0qNivSRA6gvrwqMSSSSNvlzOC12vF4vKp2myW41bocrpDCm5lgFXo8Hl91JSlwSDrOt3aEUl83ByV0lFL+zCa/H6/cr+cUtTVZKNodSqyJzTC4HL/FYkcplxKZ3vrxdHaOl4KYp5F1dyKndJex6bU3giUOhUQXFw/0VpKLTj6Dr6VPiLZXJ0MXHUHDT1Ba7oPcEFGplh1L7pHIZg64cibXWxKldR1AZtBT+4KoWMyrcTjeNZ6vZ+/5WXFYHeTNHk14wICRGLlfJkauVQe3REgem4/PBN29v5Oy+48SkxjN20dUYkuPanNvtsjspentjYLVtqzOzZ8V6Jt97fZvj9FK5jIFXFOCyOTm54zAao57CH1wVljg/gEypQKnz4XEFG1wNv3ESJ3Yc8r+QQMHNU7u0NZxAcJE+Jd6X0pOFu7OoDVoKbp7KkNljMVfWc3zrfipLTjN07oQmV8UOk5UNz74T8IvetfxpDk67AAAL1klEQVQLptx/AxkjBwYdp9SqmXzPXLa/8hkumwONUc/4H81m33tfcmp3CeA3jtq09F2u+c+FgXPLFHKUOnWzK3KHyRYSJmk4U4PH3b7YsdqgZcSNkxg8a0yX9PlUqJXkzhhNv8JcGsqriM9JRa5UoEuMxZiZRMbIQWiMQrgFkaHPindvx2GyseapfwWNxaYnkjNlWMiN6/zhkyFG/8e27Cd5cGbQ6t/
fXSeTOb9bhMflCVRxnt13PPizzTacFjtbl63GUtUA+IuExi5quvO7OkYb0i8zdXh2wMSqPciUCjRdmOes0qlR6dRBboMaox6nXkJMatvDPAJBZ+lZO3WCsFFZcipkrLyotMnWadqE0Oa+usSYJsMeMoW/MEafFIsmVodEKsGQEixa/ni5LyDcAGf3lQV1AroUpU7D9AdvxpASh0QqIWP0QApumtotzQc6itXa+WwegaA9iJV3D8Hj9iCRSsKW+RKXlRIyljgwrUnPE2NGEomDMqguPQP4V8JDZo9tU3aL2qBl3KJZbHz+XVxWBxKJhBHzplB59GzIsY5m0hVlChkJA9K46pH5+Hx+X5burF4UCKIBId7djNPmwHS+jqMbilDH6MibMRqNUd9ktkd7MKTEMfCKERz7cj/4/HniA6YOb/LmoI7RMvkn12GrM+N2ONEnx7Vr082QHs+c3y3CUt2IRCql9sQ5JJfEsKVyGRN+NBuZUkHRyo2kDs8hvn9KSAy8PXaoAkFfJ+rF22m1Y6lppGJ/GfE5qRj7JUVV1Vn96So2/eXdwOuTOw5zzX8u7HTjBJVew4jvTSF/zni8Xi/yVtIN1QZth783mczfqLi2vIptL34EwIS75jDs+omUbT9E3sxCqo6eoXTTXgCObtxL7oxRDL9xcptCI/6GAz5kClmPt2oVCCJFVIu31+3h9J6j7FmxPjCWNW4wo2+7Kqgkuqfisjk4/NmuoDGH2UZNWQX9Rg3q9PmVWnVExU6TYCD3qlGUbt7LrtfWMHL+dGY8Mh+AT377WtCxpZv2MfiasS2Kt9fjpbGihp2vrqHxbA3J+VmMWzQLbVzPy80XCCJNVIu302Jn/4fbgsZO7S5hxE1TokK8kUiajEFHqytcZX01w+dNZvDssQErW6VWjbXejN+Q9VJar6B0mG1sfuH9QAXn+UMn+fqNtUz88dxut2QVCLqbqM428UFQq63AeA/0K2kKhVrJ8BsmBWV16JONGPtFpultuPH5fCjUSrRGPdo4Q2DVr1ApyZk8LOjYnMnDW00FdDucIaX35w+dwuNyN/MOgaDvENUrb6XGXzRxaeghcVA6clX0pJjpk+OY84c7Kf/mKOpYHalDsnrdxp1Co2T4vCmkDO1Pxf4y0kbkkJzbr9WMErlSEdILMzYjsdObuQJBbyCqxVumVJA3czTGfomc+vpbEgem03/8ENQR6vgdDuRKOfrEWIZcM7a7p9KlqA0aMgtz6Td6UJurWxVaFePvvIbdr3+Bx+VBZdAwfvE1UbUhLRB0FVEt3uDPqsgck0f6iAFI5TKxKuuheFxunBY7Pp8PmULeJvMqf8f3AVz7xx/hcbiQq5U9yj9cIOhOol68L9LUxp+gZ+CyOSgvKqXonc247U6SB/djwl3XtikdUq5URO0GrkDQlUT1hqUgOnBY7Ox+fW3AjbCypJzDn+3C4wwt1W/+HDZsDRZclzgaCgR9GbFcFXQ5TXmaVH5bjsvhQtbKqtrn9WGuqufrFetpOFNN6rD+jLzlik4XMQkE0Y4Qb0GXE5MWHzKWnJvRJtdAh8nKphfew1bnb292alcJHqebcXdeI/xPBH0aETYRdDlKnZqxC68OiHVSXj/y505oUyzb5XAFhPsiZ/cdD+peIxD0RcTKW9DlKDUqsiYMIW14tt+jRNm2bBPwe4hLZdIgv3F9kpFe3EtDIGgTYuUtiAhyhRyNUY823tCqcDtMViy1jdjqzchVCkbfemUgN1yuUvhzvXtZIZNA0F7EylvQo7DWmfjqfz+h9sQ5FFoVYxfOJKMwl7QROTjNNlQxWpQ6kestEHRo5W0ymbjvvvtYuHAht956K0VFReGel6AP4rI7KV61hdoT5/yvrQ52vPIZXpcbbZwBY2Yymlg9Mrmsm2cqEHQ/HVp5v/rqq0ycOJHFixdz/PhxHnnkET744INwz00QZRh0BmwNFiQSUOo17e4K5Ha4qD4W3IHH5/VhqzOjjTOEc6oCQdT
TIfFevHgxSqXf/Mnj8aBStZ6y5XA4OHz4cEc+Lqqw2+194jovJykuAVPJOYrWf4ZULmPoDZPQZBipM9W3+RxxMUYSB6VTvudoYEwilaDQq3v8d9pXf3cQ195d196qeK9atYrly5cHjS1ZsoSCggKqqqp49NFHeeKJJ1r9IJVKRX5+fsdnGiUcPny4T1zn5VQcKGP/e1sDr3e/uoZrfntHu7+LmPnTsdaaqC07h1KrYuyiWWhi9eQn9+zvtK/+7iCuvauvvbmbQ6vivWDBAhYsWBAyXlJSwsMPP8xjjz3G+PHjOz9DQdTicXs4sT30D6y8+BjGfkntOpc2Ts/Un87D63IhkUlR6jQixi0QNEGHwialpaU8+OCDvPDCCwwZMiTccxJEGVKplLjsFE7v+TZoPC6zfcJ9Eb+lr8goEQhaokPivXTpUpxOJ8888wwAer2eZcuWhXViguhBIpWQPWEIp3Ydof50FQApQ/uTMCCtm2cmEPReOiTeQqgFl6OO0TH27tkokCGRSlFoott722m147TYaayoJSY9AZVOjUJ4qQh6EKJIRxA2ztdW9YqNK7fTxcldJRS9vdE/IIEJi2eTOSYPqYi/C3oIojxeILgMl9XBvve+/G7AB0UrN+Gw2Jp/k0AQYYR4CwSX4fV4QzrUO60OfL5umpBA0ARCvAWCy5CrFCTkBG+2pg7PRq4QUUZBz0H8NQoEl6HSa5j8k+s4+OlOakrPkpyfRf7ssSh16u6emkAQQIi3QNAEGqOeUfOvwG13odAokYlVt6CHIf4iBYJmEJ3rBT0ZEfMWCASCKESIt0AgEEQhQrwFAoEgChHiLRAIBFGIEG+BQCCIQoR4CwQCQRQi8fkiU/RbXFzcpnZpAoFAIPgOh8PBqFGjQsYjJt4CgUAgCB8ibCIQCARRiBBvgUAgiEKEeAsEAkEUIsRbIBAIohAh3gKBQBCFCPEWCASCKESId5gxmUzcd999LFy4kFtvvZWioqLunlLEWbt2LY888kh3TyMieL1ennzySW699VYWLVrEyZMnu3tKEWfv3r0sWrSou6cRUVwuF48++ii333478+fPZ/369RGfg/DzDjOvvvoqEydOZPHixRw/fpxHHnmEDz74oLunFTGefvpptm7d2iu6yLeFdevW4XQ6WblyJcXFxfzpT39i2bJl3T2tiPGPf/yD1atXo9FounsqEWX16tUYjUaeffZZ6urquOmmm5g5c2ZE5yBW3mFm8eLF3HbbbQB4PJ4+V1VaWFjI73//++6eRsTYs2cP06ZNA2DUqFEcOHCgm2cUWbKysnjxxRe7exoRZ86cOTz44IOB1zKZLOJzECvvTrBq1SqWL18eNLZkyRIKCgqoqqri0Ucf5Yknnuim2XUtzV373Llz2blzZzfNKvKYzWb0en3gtUwmw+12I5f3jf+1Zs+eTXl5eXdPI+LodDrA//s/8MADPPTQQxGfQ9/4C+siFixYwIIFC0LGS0pKePjhh3nssccYP358N8ys62nu2vsaer0ei8USeO31evuMcPd1Kioq+NnPfsbtt9/ODTfcEPHPF2GTMFNaWsqDDz7I0qVLmT59endPR9DFFBYWsmXLFsBvvpaXl9fNMxJEgurqau666y4effRR5s+f3y1zEEuEMLN06VKcTifPPPMM4F+Z9aUNrL7GrFmz2LZtG7fddhs+n48lS5Z095QEEeDll1+msbGRl156iZdeegnwb96q1eqIzUG4CgoEAkEUIsImAoFAEIUI8RYIBIIoRIi3QCAQRCFCvAUCgSAKEeItEAgEUYgQb4FAIIhChHgLBAJBFPL/AeE7Fa2OewCvAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "cluster_test = model.predict(X_test_n)\n", + "sns.scatterplot(X_test_n.T[0], X_test_n.T[1], hue=cluster_test)\n", + "plt.scatter(x=model.cluster_centers_[0][0], y=model.cluster_centers_[0][1], color='r')\n", + "plt.scatter(x=model.cluster_centers_[1][0], y=model.cluster_centers_[1][1], color='r')\n", + "plt.scatter(x=model.cluster_centers_[2][0], y=model.cluster_centers_[2][1], color='r')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Resources\n", + "\n", + "https://realpython.com/k-means-clustering-python/ " + ] + } + ], + "metadata": { + "colab": { + "include_colab_link": true, + "name": "ul.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Unsupervised Learning/unsupervised_learning.ipynb b/Unsupervised Learning/unsupervised_learning.ipynb index d515b10..eef2a50 100644 --- a/Unsupervised Learning/unsupervised_learning.ipynb +++ b/Unsupervised Learning/unsupervised_learning.ipynb @@ -1011,7 +1011,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.9" } }, "nbformat": 4,