|
64 | 64 |
|
65 | 65 | # Evaluate the model |
66 | 66 | y_pred=model.predict(X_test) |
67 | | -classification_rep=classification_report(y_test,y_pred) |
68 | | -roc_auc=roc_auc_score(y_test,model.predict_proba(X_test)[:,1]) |
| 67 | +classification_rep=classification_report(y_test,y_pred,zero_division=1) |
| 68 | +roc_auc=roc_auc_score(pd.get_dummies(y_test).values[:,1],model.predict_proba(X_test)[:,1]) |
69 | 69 |
|
70 | 70 | # Get feature importance |
71 | 71 | importances=model.named_steps['classifier'].feature_importances_ |
|
94 | 94 |
|
95 | 95 | # Plot ROC Curve |
96 | 96 | st.header('ROC Curve') |
97 | | -y_test_binary=y_test.map({'No':0,'Yes':1}) |
| 97 | +y_test_binary=pd.get_dummies(y_test).values[:,1]  # Convert to binary |
98 | 98 | fpr,tpr,_=roc_curve(y_test_binary,model.predict_proba(X_test)[:,1]) |
99 | 99 | roc_auc=auc(fpr,tpr) |
100 | 100 | fig,ax=plt.subplots() |
|
151 | 151 | except Exception as e: |
152 | 152 | st.error(f"An error occurred during prediction: {e}") |
153 | 153 |
|
| 154 | +# ================== EDA Enhancements ================== |
| 155 | +st.header('Enhanced Exploratory Data Analysis (EDA)') |
| 156 | + |
| 157 | +# Load full dataset for EDA |
| 158 | +eda_data=pd.read_csv(file_path) |
| 159 | + |
| 160 | +# Salary Analysis |
| 161 | +st.subheader('Salary Distribution') |
| 162 | +eda_data['ConvertedSalary']=pd.to_numeric(eda_data['ConvertedSalary'],errors='coerce') |
| 163 | +fig,ax=plt.subplots() |
| 164 | +sns.histplot(eda_data['ConvertedSalary'].dropna(),kde=True,ax=ax) |
| 165 | +ax.set_title('Distribution of Salaries') |
| 166 | +ax.set_xlabel('Salary (USD)') |
| 167 | +st.pyplot(fig) |
| 168 | + |
| 169 | +# Job Satisfaction Analysis |
| 170 | +satisfaction_cols= ['JobSatisfaction','CareerSatisfaction'] |
| 171 | +for col in satisfaction_cols: |
| 172 | +st.subheader(f'Distribution of {col}') |
| 173 | +fig,ax=plt.subplots() |
| 174 | +eda_data[col].value_counts().plot(kind='bar',ax=ax) |
| 175 | +ax.set_title(f'Distribution of {col}') |
| 176 | +ax.set_xlabel('Satisfaction Level') |
| 177 | +ax.set_ylabel('Count') |
| 178 | +st.pyplot(fig) |
| 179 | + |
| 180 | +# Programming Languages Analysis |
| 181 | +st.subheader('Top 10 Programming Languages') |
| 182 | +languages=eda_data['LanguageWorkedWith'].str.split(';',expand=True).stack() |
| 183 | +fig,ax=plt.subplots() |
| 184 | +languages.value_counts().head(10).plot(kind='bar',ax=ax) |
| 185 | +ax.set_title('Top 10 Programming Languages') |
| 186 | +ax.set_xlabel('Language') |
| 187 | +ax.set_ylabel('Count') |
| 188 | +st.pyplot(fig) |
| 189 | + |
| 190 | +# Job Satisfaction by Company Size |
| 191 | +st.subheader('Job Satisfaction by Company Size') |
| 192 | +fig,ax=plt.subplots() |
| 193 | +sns.boxplot(x='CompanySize',y='JobSatisfaction',data=eda_data,ax=ax) |
| 194 | +ax.set_title('Job Satisfaction by Company Size') |
| 195 | +ax.set_xlabel('Company Size') |
| 196 | +ax.set_ylabel('Job Satisfaction') |
| 197 | +st.pyplot(fig) |
| 198 | + |
| 199 | +# Age Distribution |
| 200 | +st.subheader('Age Distribution of Respondents') |
| 201 | +fig,ax=plt.subplots() |
| 202 | +sns.histplot(eda_data['Age'],kde=True,ax=ax) |
| 203 | +ax.set_title('Age Distribution of Respondents') |
| 204 | +ax.set_xlabel('Age') |
| 205 | +st.pyplot(fig) |
| 206 | + |
| 207 | +# Top 10 Countries of Respondents |
| 208 | +st.subheader('Top 10 Countries of Respondents') |
| 209 | +country_counts=eda_data['Country'].value_counts().head(10) |
| 210 | +fig,ax=plt.subplots() |
| 211 | +ax.plot(country_counts.index,country_counts.values,marker='o') |
| 212 | +ax.set_title('Top 10 Countries of Respondents') |
| 213 | +ax.set_xlabel('Country') |
| 214 | +ax.set_ylabel('Number of Respondents') |
| 215 | +st.pyplot(fig) |
| 216 | + |
| 217 | +# Employment Status Distribution |
| 218 | +st.header("Employment Status Distribution") |
| 219 | +employment_counts=eda_data['Employment'].value_counts() |
| 220 | +fig,ax=plt.subplots() |
| 221 | +ax.pie(employment_counts.values,labels=employment_counts.index,autopct='%1.1f%%') |
| 222 | +ax.set_title('Employment Status Distribution') |
| 223 | +ax.axis('equal') |
| 224 | +st.pyplot(fig) |
| 225 | + |
| 226 | +# Databases Used |
| 227 | +st.header("Top 10 Databases Used") |
| 228 | +databases=eda_data['DatabaseWorkedWith'].str.split(';',expand=True).stack() |
| 229 | +db_counts=databases.value_counts().head(10) |
| 230 | +fig,ax=plt.subplots() |
| 231 | +db_counts.plot(kind='barh',ax=ax) |
| 232 | +ax.set_xlabel('Number of Users') |
| 233 | +ax.set_ylabel('Database') |
| 234 | +st.pyplot(fig) |
| 235 | + |
| 236 | +# Job Satisfaction by Gender |
| 237 | +st.header("Job Satisfaction by Gender") |
| 238 | +job_sat_gender=pd.crosstab(eda_data['JobSatisfaction'],eda_data['Gender']) |
| 239 | +fig,ax=plt.subplots() |
| 240 | +job_sat_gender.plot(kind='bar',ax=ax) |
| 241 | +ax.set_title('Job Satisfaction by Gender') |
| 242 | +ax.set_xlabel('Job Satisfaction Level') |
| 243 | +st.pyplot(fig) |
| 244 | + |
| 245 | +# Correlation Heatmap |
| 246 | +st.header("Correlation Heatmap of Numeric Variables") |
| 247 | +numeric_columns=eda_data.select_dtypes(include=['int64','float64']).columns |
| 248 | +fig,ax=plt.subplots() |
| 249 | +sns.heatmap(eda_data[numeric_columns].corr(),annot=True,cmap='coolwarm',ax=ax) |
| 250 | +ax.set_title('Correlation Heatmap of Numeric Variables') |
| 251 | +st.pyplot(fig) |
| 252 | + |
| 253 | +# Cumulative Distribution |
| 254 | +st.header(f"Cumulative Distribution of {numeric_columns[0]}") |
| 255 | +fig,ax=plt.subplots() |
| 256 | +sns.ecdfplot(data=eda_data,x=numeric_columns[0],ax=ax) |
| 257 | +ax.set_title(f'Cumulative Distribution of {numeric_columns[0]}') |
| 258 | +ax.set_xlabel(numeric_columns[0]) |
| 259 | +ax.set_ylabel('Cumulative Proportion') |
| 260 | +st.pyplot(fig) |
| 261 | + |
154 | 262 | except Exception as e: |
155 | 263 | st.error(f"An error occurred while loading data: {e}") |