# feature_selection.py
# To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'
#
# Feature-selection walkthrough on the heart-disease CSV. Three approaches are
# run on the same min-max-normalised feature table:
#   1. SelectKBest with the chi-squared score (k=5),
#   2. recursive feature elimination (RFE) around a random forest (5 features),
#   3. a bar chart of random-forest feature importances.

# %%
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the random-forest-based cells give the same result on every run.
RANDOM_STATE = 42

df = pd.read_csv("data/Heart_Disease_Prediction.csv")
print(df.shape)
df.head(5)

# %%
# Split the target column off; `df` keeps only the candidate features.
label = df["Heart Disease"]
df = df.drop(columns=["Heart Disease"])

# %%
# Class balance of the target.
print(label.value_counts())
label.value_counts().plot(kind="bar")

# %%
categorical_features = [
    "Sex",
    "Chest pain type",
    "FBS over 120",
    "EKG results",
    "Exercise angina",
    "Slope of ST",
    "Number of vessels fluro",
    "Thallium",
]
df[categorical_features] = df[categorical_features].astype("category")

# %%
# Every column that is not listed as categorical is treated as continuous.
# A list in DataFrame column order is used (rather than a set difference) so
# the column order is the same on every run.
continuous_features = [
    column for column in df.columns if column not in categorical_features
]

# Only the continuous columns are rescaled; categorical columns are copied as-is.
scaler = MinMaxScaler()
df_norm = df.copy()
df_norm[continuous_features] = scaler.fit_transform(df[continuous_features])

# %%
# Univariate selection: keep the 5 features with the highest chi-squared score.
# NOTE(review): chi2 expects non-negative feature values; the unscaled
# categorical columns are assumed to satisfy this - confirm against the data.
selector = SelectKBest(k=5, score_func=chi2)
X_new = selector.fit_transform(df_norm, label)
print("SelectKBest (chi2) features:", list(df_norm.columns[selector.get_support()]))
X_new

# %%
# Recursive feature elimination down to 5 features, ranked by a random forest.
rfe = RFE(
    estimator=RandomForestClassifier(random_state=RANDOM_STATE),
    n_features_to_select=5,
)
X_new = rfe.fit_transform(df_norm, label)
print("RFE features:", list(df_norm.columns[rfe.get_support()]))
X_new

# %%
clf = RandomForestClassifier(random_state=RANDOM_STATE)
clf.fit(df_norm, label)

# Bar chart: x axis lists the features, y axis shows the importance of each one.
plt.figure(figsize=(12, 12))
plt.bar(df_norm.columns, clf.feature_importances_)
plt.xticks(rotation=45)
# Needed when the file is run as a plain script rather than cell by cell.
plt.show()