pkiage committed on
Commit
830efed
1 Parent(s): b5a7ea1
Files changed (1) hide show
  1. src/features/util_build_features.py +0 -104
src/features/util_build_features.py CHANGED
@@ -113,110 +113,6 @@ def boolean_int_condition_label(df, label_column_name, condition):
113
  return y, df
114
 
115
 
116
- @dataclass
117
- class SplitDataset:
118
- X_test: pd.DataFrame
119
- X_train: pd.DataFrame
120
- y_test: pd.Series
121
- y_train: pd.Series
122
-
123
- @property
124
- def X_y_test(self) -> pd.DataFrame:
125
- return pd.concat(
126
- cast(
127
- List[Union[pd.DataFrame, pd.Series]],
128
- [
129
- self.X_test.reset_index(drop=True),
130
- self.y_test.reset_index(drop=True),
131
- ],
132
- ),
133
- axis=1,
134
- )
135
-
136
- @property
137
- def X_y_train(self) -> pd.DataFrame:
138
- return pd.concat(
139
- cast(
140
- List[Union[pd.DataFrame, pd.Series]],
141
- [
142
- self.X_train.reset_index(drop=True),
143
- self.y_train.reset_index(drop=True),
144
- ],
145
- ),
146
- axis=1,
147
- )
148
-
149
-
150
- @dataclass
151
- class Dataset:
152
- df: pd.DataFrame
153
- random_state: int
154
- test_size: int
155
-
156
- @property
157
- def y_value(self) -> pd.DataFrame:
158
- return self.df["loan_status"]
159
-
160
- @property
161
- def x_values(self) -> pd.DataFrame:
162
- return cast(
163
- pd.DataFrame,
164
- drop_columns(
165
- self.df,
166
- [
167
- "loan_status",
168
- "loan_grade_A",
169
- "loan_grade_B",
170
- "loan_grade_C",
171
- "loan_grade_D",
172
- "loan_grade_E",
173
- "loan_grade_F",
174
- "loan_grade_G",
175
- ],
176
- ),
177
- )
178
-
179
- @property
180
- def x_values_column_names(self):
181
- return self.x_values.columns.tolist()
182
-
183
- def x_values_filtered_columns(self, columns: List[str]) -> pd.DataFrame:
184
- return self.df.filter(columns)
185
-
186
- def train_test_split(
187
- self, selected_x_values: pd.DataFrame
188
- ) -> SplitDataset:
189
- X_train, X_test, y_train, y_test = train_test_split(
190
- selected_x_values,
191
- self.y_value,
192
- test_size=self.test_size / 100, # since up was given as pct
193
- random_state=self.random_state,
194
- )
195
-
196
- return SplitDataset(
197
- X_train=cast(pd.DataFrame, X_train),
198
- X_test=cast(pd.DataFrame, X_test),
199
- y_train=cast(pd.Series, y_train),
200
- y_test=cast(pd.Series, y_test),
201
- )
202
-
203
-
204
- def drop_columns(df, columns):
205
- return df.drop(columns, axis=1)
206
-
207
-
208
- def remove_less_than_0_columns(df, column):
209
- df[column].dropna()
210
- return df.loc[(df[column] != 0).any(1)]
211
-
212
-
213
- def boolean_int_condition_label(df, label_column_name, condition):
214
- df[label_column_name] = condition
215
- y = df[label_column_name].astype(int)
216
- df = drop_columns(df, label_column_name)
217
- return y, df
218
-
219
-
220
  @st.cache(suppress_st_warning=True)
221
  def undersample_training_data(
222
  df: pd.DataFrame, column_name: str, split_dataset
 
113
  return y, df
114
 
115
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  @st.cache(suppress_st_warning=True)
117
  def undersample_training_data(
118
  df: pd.DataFrame, column_name: str, split_dataset