cleanup
Browse files
src/features/util_build_features.py
CHANGED
@@ -113,110 +113,6 @@ def boolean_int_condition_label(df, label_column_name, condition):
|
|
113 |
return y, df
|
114 |
|
115 |
|
116 |
-
@dataclass
class SplitDataset:
    """Hold the train and test partitions of a feature matrix and its target.

    Attributes are declared in the order ``X_test``, ``X_train``, ``y_test``,
    ``y_train``; positional construction depends on that order.
    """

    X_test: pd.DataFrame
    X_train: pd.DataFrame
    y_test: pd.Series
    y_train: pd.Series

    @staticmethod
    def _join_features_and_target(
        features: pd.DataFrame, target: pd.Series
    ) -> pd.DataFrame:
        """Return ``features`` and ``target`` side by side as one frame.

        Both inputs get a fresh 0..n-1 index first so the column-wise concat
        pairs rows by position instead of aligning on the original labels.
        """
        parts = cast(
            List[Union[pd.DataFrame, pd.Series]],
            [
                features.reset_index(drop=True),
                target.reset_index(drop=True),
            ],
        )
        return pd.concat(parts, axis=1)

    @property
    def X_y_test(self) -> pd.DataFrame:
        """Test features with the test target appended as the last column."""
        return self._join_features_and_target(self.X_test, self.y_test)

    @property
    def X_y_train(self) -> pd.DataFrame:
        """Train features with the train target appended as the last column."""
        return self._join_features_and_target(self.X_train, self.y_train)
|
148 |
-
|
149 |
-
|
150 |
-
@dataclass
class Dataset:
    """Wrap a DataFrame together with the settings used to split it.

    The target is read from the ``"loan_status"`` column; the candidate
    features are every column except the target and the ``loan_grade_*``
    columns listed in ``x_values``.
    """

    df: pd.DataFrame
    random_state: int
    # Test share expressed as a percentage; it is divided by 100 before
    # being handed to the splitter in ``train_test_split``.
    test_size: int

    @property
    def y_value(self) -> pd.Series:
        """Return the ``"loan_status"`` column (a single column, i.e. a Series)."""
        return self.df["loan_status"]

    @property
    def x_values(self) -> pd.DataFrame:
        """Return ``df`` without the target and the ``loan_grade_*`` columns."""
        return cast(
            pd.DataFrame,
            drop_columns(
                self.df,
                [
                    "loan_status",
                    "loan_grade_A",
                    "loan_grade_B",
                    "loan_grade_C",
                    "loan_grade_D",
                    "loan_grade_E",
                    "loan_grade_F",
                    "loan_grade_G",
                ],
            ),
        )

    @property
    def x_values_column_names(self) -> List[str]:
        """Return the column labels of ``x_values`` as a plain list."""
        return self.x_values.columns.tolist()

    def x_values_filtered_columns(self, columns: List[str]) -> pd.DataFrame:
        """Return only the requested columns of ``df``.

        Uses ``DataFrame.filter``, so labels that are not present in ``df``
        are skipped rather than raising.
        """
        return self.df.filter(columns)

    def train_test_split(
        self, selected_x_values: pd.DataFrame
    ) -> "SplitDataset":
        """Split the selected features and the target into train/test parts.

        Args:
            selected_x_values: Feature columns to split alongside ``y_value``.

        Returns:
            A ``SplitDataset`` holding the four resulting partitions.
        """
        # Inside the method body this name resolves to the module-level
        # splitter function, not to this method.
        X_train, X_test, y_train, y_test = train_test_split(
            selected_x_values,
            self.y_value,
            test_size=self.test_size / 100,  # test_size is given as a percentage
            random_state=self.random_state,
        )

        return SplitDataset(
            X_train=cast(pd.DataFrame, X_train),
            X_test=cast(pd.DataFrame, X_test),
            y_train=cast(pd.Series, y_train),
            y_test=cast(pd.Series, y_test),
        )
|
202 |
-
|
203 |
-
|
204 |
-
def drop_columns(df, columns):
    """Return a new frame equal to ``df`` minus the given column label(s).

    ``columns`` may be a single label or a list of labels; ``df`` itself is
    left untouched.
    """
    trimmed = df.drop(columns=columns)
    return trimmed
|
206 |
-
|
207 |
-
|
208 |
-
def remove_less_than_0_columns(df, column):
    """Keep the rows of ``df`` where at least one selected column is non-zero.

    Args:
        df: Frame to filter; it is not modified.
        column: A single column label or a list-like of column labels.

    Returns:
        The rows of ``df`` for which any of the selected columns differs
        from 0. Missing values compare unequal to 0, so rows holding NaN in
        a selected column are kept.

    NOTE(review): despite the name, the test is "!= 0", not "< 0" — confirm
    this matches what callers expect.
    """
    # Always select with a list so the comparison yields a DataFrame; a bare
    # label would give a Series, which has no axis 1 to reduce over.
    if pd.api.types.is_list_like(column):
        selected = list(column)
    else:
        selected = [column]

    # `axis` must be passed by keyword: pandas 2.x rejects a positional axis.
    # The former bare `df[column].dropna()` call was removed because its
    # result was discarded and it never affected the output.
    has_nonzero = (df[selected] != 0).any(axis=1)
    return df.loc[has_nonzero]
|
211 |
-
|
212 |
-
|
213 |
-
def boolean_int_condition_label(df, label_column_name, condition):
    """Turn a boolean condition into an integer label and split it from ``df``.

    Args:
        df: Source frame. It is left unmodified; the label column is built
            on a copy.
        label_column_name: Name given to the label; any column of that name
            is absent from the returned frame.
        condition: Values assigned as the label column (for example a boolean
            Series); they are cast to ``int``.

    Returns:
        A ``(y, df)`` tuple: ``y`` is the label as an integer Series named
        ``label_column_name``, and ``df`` is the frame without that column.
    """
    # Work on a copy so the caller's frame does not silently gain the label
    # column as a side effect of this call.
    labelled = df.copy()
    labelled[label_column_name] = condition
    y = labelled[label_column_name].astype(int)
    features = labelled.drop(columns=label_column_name)
    return y, features
|
218 |
-
|
219 |
-
|
220 |
@st.cache(suppress_st_warning=True)
|
221 |
def undersample_training_data(
|
222 |
df: pd.DataFrame, column_name: str, split_dataset
|
|
|
113 |
return y, df
|
114 |
|
115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
@st.cache(suppress_st_warning=True)
|
117 |
def undersample_training_data(
|
118 |
df: pd.DataFrame, column_name: str, split_dataset
|