|
from typing import List, Union, cast, Tuple |
|
from dataclasses import dataclass |
|
from sklearn.model_selection import train_test_split |
|
import pandas as pd |
|
|
|
import streamlit as st |
|
|
|
|
|
from src.features.util_build_features import ( |
|
Dataset, |
|
SplitDataset, |
|
undersample_training_data, |
|
select_predictors, |
|
import_data) |
|
|
|
from src.visualization.metrics import ( |
|
streamlit_2columns_metrics_df_shape, |
|
streamlit_2columns_metrics_series, |
|
streamlit_2columns_metrics_pct_series, |
|
streamlit_2columns_metrics_df, |
|
streamlit_2columns_metrics_pct_df, |
|
) |
|
|
|
|
|
def initialise_data() -> Tuple[Dataset, SplitDataset]:
    """Render the data-preparation part of the Streamlit page and split the data.

    The function, in order:

    1. Loads the dataset via ``import_data()`` and shows ``dataset.df``.
    2. Lets the user pick predictors via ``select_predictors(dataset)``.
    3. Reads the test-size percentage and random state from widgets and
       stores them on ``dataset``.
    4. Calls ``dataset.train_test_split(...)`` and reports the class counts
       of ``y_test`` in the sidebar and in the main page.
    5. Shows the training data and, if the user selects "Yes", replaces
       ``split_dataset.X_train`` / ``split_dataset.y_train`` with the result
       of ``undersample_training_data(...)``.

    Returns:
        A ``(dataset, split_dataset)`` tuple. ``dataset`` carries the
        ``test_size`` and ``random_state`` chosen in the UI; ``split_dataset``
        carries the (optionally undersampled) training data.
    """
    dataset = import_data()

    st.write(
        "Assuming data is already cleaned and relevant features (predictors) added."
    )

    with st.expander("Input Dataframe (X and y)"):
        st.dataframe(dataset.df)
        streamlit_2columns_metrics_df_shape(dataset.df)

    selected_x_values = select_predictors(dataset)

    with st.expander("Predictors Dataframe (X)"):
        st.dataframe(selected_x_values)
        streamlit_2columns_metrics_df_shape(selected_x_values)

    st.header("Split Testing and Training Data")

    test_size_slider_col, seed_col = st.columns(2)

    with test_size_slider_col:
        dataset.test_size = st.slider(
            label="Test Size Percentage of Input Dataframe:",
            min_value=0,
            max_value=100,
            value=dataset.test_size,
            key="init_test_size",
            format="%f%%",
        )

    with seed_col:
        # number_input may hand back a float; the random state is stored as int.
        dataset.random_state = int(
            st.number_input(label="Random State:", value=dataset.random_state)
        )

    split_dataset = dataset.train_test_split(selected_x_values)

    # Class counts of the test target, per the labels below: 1 = default,
    # 0 = non-default.
    true_status = split_dataset.y_test.to_frame().value_counts()

    # A class can be entirely absent from the test split (tiny test size or a
    # very imbalanced target). Without a default, .get() would return None and
    # the percentage computation below would raise TypeError.
    test_total = true_status.sum()
    default_count = true_status.get(1, 0)
    non_default_count = true_status.get(0, 0)

    def _share_of_test(count) -> str:
        """Format ``count`` as a whole-number percentage of the test rows."""
        # Guard against an empty split so we never divide by zero.
        return "{:.0%}".format(count / test_total if test_total else 0)

    st.sidebar.metric(
        label="Testing Data # of Actual Default (=1)",
        value=default_count,
    )

    st.sidebar.metric(
        label="Testing Data % of Actual Default",
        value=_share_of_test(default_count),
    )

    st.sidebar.metric(
        label="Testing Data # of Actual Non-Default (=0)",
        value=non_default_count,
    )

    st.sidebar.metric(
        label="Testing Data % of Actual Non-Default",
        value=_share_of_test(non_default_count),
    )

    X_y_test = split_dataset.X_y_test
    X_y_train = split_dataset.X_y_train

    with st.expander("Testing Dataframe (X and y)"):
        st.dataframe(X_y_test)
        streamlit_2columns_metrics_df_shape(X_y_test)

    streamlit_2columns_metrics_series(
        "# Defaults(=1) (Testing Data)",
        "# Non-Defaults(=0) (Testing Data)",
        true_status,
    )

    streamlit_2columns_metrics_pct_series(
        "% Defaults (Testing Data)",
        "% Non-Defaults (Testing Data)",
        true_status,
    )

    st.header("Training Data")

    with st.expander("Training Dataframe (X and y)"):
        st.dataframe(X_y_train)
        streamlit_2columns_metrics_df_shape(X_y_train)

    st.subheader("Class Count")

    streamlit_2columns_metrics_df(
        "# Defaults (Training Data Class Balance Check)",
        "# Non-Defaults (Training Data Class Balance Check)",
        split_dataset.y_train,
    )

    streamlit_2columns_metrics_pct_df(
        "% Defaults (Training Data Class Balance Check)",
        "% Non-Defaults (Training Data Class Balance Check)",
        split_dataset.y_train,
    )

    balance_the_classes = st.radio(
        label="Balance the Classes:", options=("Yes", "No")
    )

    if balance_the_classes == "Yes":
        st.subheader("Balanced Classes (by Undersampling)")

        # The third returned element is not used here, so it is discarded.
        (
            split_dataset.X_train,
            split_dataset.y_train,
            _,
            class_balance_default,
        ) = undersample_training_data(X_y_train, "loan_status", split_dataset)

        streamlit_2columns_metrics_series(
            "# Defaults (Training Data with Class Balance)",
            "# Non-Defaults (Training Data with Class Balance)",
            class_balance_default,
        )

        streamlit_2columns_metrics_pct_series(
            "% of Defaults (Training Data with Class Balance)",
            "% of Non-Defaults (Training Data with Class Balance)",
            class_balance_default,
        )

    return dataset, split_dataset
|
|