|
64 | 64 |
|
65 | 65 | # Evaluate the model
|
66 | 66 | y_pred = model.predict(X_test)
|
67 |
| - classification_rep = classification_report(y_test, y_pred) |
68 |
| - roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]) |
| 67 | + classification_rep = classification_report(y_test, y_pred, zero_division=1) |
| 68 | + roc_auc = roc_auc_score(pd.get_dummies(y_test).values[:, 1], model.predict_proba(X_test)[:, 1]) |
69 | 69 |
|
70 | 70 | # Get feature importance
|
71 | 71 | importances = model.named_steps['classifier'].feature_importances_
|
|
94 | 94 |
|
95 | 95 | # Plot ROC Curve
|
96 | 96 | st.header('ROC Curve')
|
97 |
| - y_test_binary = y_test.map({'No': 0, 'Yes': 1}) |
| 97 | + y_test_binary = pd.get_dummies(y_test).values[:, 1] # Convert to binary |
98 | 98 | fpr, tpr, _ = roc_curve(y_test_binary, model.predict_proba(X_test)[:, 1])
|
99 | 99 | roc_auc = auc(fpr, tpr)
|
100 | 100 | fig, ax = plt.subplots()
|
|
151 | 151 | except Exception as e:
|
152 | 152 | st.error(f"An error occurred during prediction: {e}")
|
153 | 153 |
|
| 154 | + # ================== EDA Enhancements ================== |
| 155 | + st.header('Enhanced Exploratory Data Analysis (EDA)') |
| 156 | + |
| 157 | + # Load full dataset for EDA |
| 158 | + eda_data = pd.read_csv(file_path) |
| 159 | + |
| 160 | + # Salary Analysis |
| 161 | + st.subheader('Salary Distribution') |
| 162 | + eda_data['ConvertedSalary'] = pd.to_numeric(eda_data['ConvertedSalary'], errors='coerce') |
| 163 | + fig, ax = plt.subplots() |
| 164 | + sns.histplot(eda_data['ConvertedSalary'].dropna(), kde=True, ax=ax) |
| 165 | + ax.set_title('Distribution of Salaries') |
| 166 | + ax.set_xlabel('Salary (USD)') |
| 167 | + st.pyplot(fig) |
| 168 | + |
| 169 | + # Job Satisfaction Analysis |
| 170 | + satisfaction_cols = ['JobSatisfaction', 'CareerSatisfaction'] |
| 171 | + for col in satisfaction_cols: |
| 172 | + st.subheader(f'Distribution of {col}') |
| 173 | + fig, ax = plt.subplots() |
| 174 | + eda_data[col].value_counts().plot(kind='bar', ax=ax) |
| 175 | + ax.set_title(f'Distribution of {col}') |
| 176 | + ax.set_xlabel('Satisfaction Level') |
| 177 | + ax.set_ylabel('Count') |
| 178 | + st.pyplot(fig) |
| 179 | + |
| 180 | + # Programming Languages Analysis |
| 181 | + st.subheader('Top 10 Programming Languages') |
| 182 | + languages = eda_data['LanguageWorkedWith'].str.split(';', expand=True).stack() |
| 183 | + fig, ax = plt.subplots() |
| 184 | + languages.value_counts().head(10).plot(kind='bar', ax=ax) |
| 185 | + ax.set_title('Top 10 Programming Languages') |
| 186 | + ax.set_xlabel('Language') |
| 187 | + ax.set_ylabel('Count') |
| 188 | + st.pyplot(fig) |
| 189 | + |
| 190 | + # Job Satisfaction by Company Size |
| 191 | + st.subheader('Job Satisfaction by Company Size') |
| 192 | + fig, ax = plt.subplots() |
| 193 | + sns.boxplot(x='CompanySize', y='JobSatisfaction', data=eda_data, ax=ax) |
| 194 | + ax.set_title('Job Satisfaction by Company Size') |
| 195 | + ax.set_xlabel('Company Size') |
| 196 | + ax.set_ylabel('Job Satisfaction') |
| 197 | + st.pyplot(fig) |
| 198 | + |
| 199 | + # Age Distribution |
| 200 | + st.subheader('Age Distribution of Respondents') |
| 201 | + fig, ax = plt.subplots() |
| 202 | + sns.histplot(eda_data['Age'], kde=True, ax=ax) |
| 203 | + ax.set_title('Age Distribution of Respondents') |
| 204 | + ax.set_xlabel('Age') |
| 205 | + st.pyplot(fig) |
| 206 | + |
| 207 | + # Top 10 Countries of Respondents |
| 208 | + st.subheader('Top 10 Countries of Respondents') |
| 209 | + country_counts = eda_data['Country'].value_counts().head(10) |
| 210 | + fig, ax = plt.subplots() |
| 211 | + ax.plot(country_counts.index, country_counts.values, marker='o') |
| 212 | + ax.set_title('Top 10 Countries of Respondents') |
| 213 | + ax.set_xlabel('Country') |
| 214 | + ax.set_ylabel('Number of Respondents') |
| 215 | + st.pyplot(fig) |
| 216 | + |
| 217 | + # Employment Status Distribution |
| 218 | + st.header("Employment Status Distribution") |
| 219 | + employment_counts = eda_data['Employment'].value_counts() |
| 220 | + fig, ax = plt.subplots() |
| 221 | + ax.pie(employment_counts.values, labels=employment_counts.index, autopct='%1.1f%%') |
| 222 | + ax.set_title('Employment Status Distribution') |
| 223 | + ax.axis('equal') |
| 224 | + st.pyplot(fig) |
| 225 | + |
| 226 | + # Databases Used |
| 227 | + st.header("Top 10 Databases Used") |
| 228 | + databases = eda_data['DatabaseWorkedWith'].str.split(';', expand=True).stack() |
| 229 | + db_counts = databases.value_counts().head(10) |
| 230 | + fig, ax = plt.subplots() |
| 231 | + db_counts.plot(kind='barh', ax=ax) |
| 232 | + ax.set_xlabel('Number of Users') |
| 233 | + ax.set_ylabel('Database') |
| 234 | + st.pyplot(fig) |
| 235 | + |
| 236 | + # Job Satisfaction by Gender |
| 237 | + st.header("Job Satisfaction by Gender") |
| 238 | + job_sat_gender = pd.crosstab(eda_data['JobSatisfaction'], eda_data['Gender']) |
| 239 | + fig, ax = plt.subplots() |
| 240 | + job_sat_gender.plot(kind='bar', ax=ax) |
| 241 | + ax.set_title('Job Satisfaction by Gender') |
| 242 | + ax.set_xlabel('Job Satisfaction Level') |
| 243 | + st.pyplot(fig) |
| 244 | + |
| 245 | + # Correlation Heatmap |
| 246 | + st.header("Correlation Heatmap of Numeric Variables") |
| 247 | + numeric_columns = eda_data.select_dtypes(include=['int64', 'float64']).columns |
| 248 | + fig, ax = plt.subplots() |
| 249 | + sns.heatmap(eda_data[numeric_columns].corr(), annot=True, cmap='coolwarm', ax=ax) |
| 250 | + ax.set_title('Correlation Heatmap of Numeric Variables') |
| 251 | + st.pyplot(fig) |
| 252 | + |
| 253 | + # Cumulative Distribution |
| 254 | + st.header(f"Cumulative Distribution of {numeric_columns[0]}") |
| 255 | + fig, ax = plt.subplots() |
| 256 | + sns.ecdfplot(data=eda_data, x=numeric_columns[0], ax=ax) |
| 257 | + ax.set_title(f'Cumulative Distribution of {numeric_columns[0]}') |
| 258 | + ax.set_xlabel(numeric_columns[0]) |
| 259 | + ax.set_ylabel('Cumulative Proportion') |
| 260 | + st.pyplot(fig) |
| 261 | + |
154 | 262 | except Exception as e:
|
155 | 263 | st.error(f"An error occurred while loading data: {e}")
|
0 commit comments