Gaurav069 committed on
Commit ba67510
1 Parent(s): dc62f39

Upload 12 files

Files changed (9)
  1. .streamlit/config.toml +7 -0
  2. app.py +166 -53
  3. auto_optimizer.py +361 -317
  4. best_tts.py +2 -2
  5. eda.py +325 -0
  6. feature_selections.py +6 -6
  7. grid_search_cv.py +284 -0
  8. models.py +2 -0
  9. requirements.txt +5 -4
.streamlit/config.toml ADDED
@@ -0,0 +1,7 @@
+
+ [theme]
+ primaryColor="#F63366"
+ backgroundColor="#002148"
+ secondaryBackgroundColor="#576c86"
+ textColor="white"
+ font="serif"
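Streamlit reads .streamlit/config.toml automatically at startup, so the theme above takes effect without any code change in app.py. As a quick sanity check (a hypothetical snippet, not part of this commit), the active values can be read back at runtime:

    import streamlit as st

    # Streamlit loads .streamlit/config.toml on startup; get_option exposes the values.
    primary = st.get_option("theme.primaryColor")  # "#F63366" with the config above
    st.write("Active primary color:", primary)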
app.py CHANGED
@@ -8,6 +8,7 @@ import evaluationer,models, null_value_handling
  import auto_optimizer
  from sklearn.experimental import enable_iterative_imputer
  from sklearn.impute import SimpleImputer, IterativeImputer
+ import eda
  # st.set_page_config(layout="wide")

  st.set_page_config(
@@ -21,7 +22,23 @@ st.set_page_config(
  }
  )

- import streamlit as st
+
+
+ # Set the background image
+ background_image = """
+ <style>
+ [data-testid="stAppViewContainer"] > .main {
+ background-image: url("https://w.wallhaven.cc/full/jx/wallhaven-jx7w25.png");
+ background-size: 100vw 100vh; # This sets the size to cover 100% of the viewport width and height
+ background-position: center;
+ background-repeat: no-repeat;
+ }
+ </style>
+ """
+
+ st.markdown(background_image, unsafe_allow_html=True)
+
+

  # Title with Rainbow Transition Effect and Neon Glow
  html_code = """
@@ -67,23 +84,74 @@ html_code = """
  """

  st.markdown(html_code, unsafe_allow_html=True)
+ st.divider()
+
+ st.markdown(
+ """
+ <style>
+ .success-message {
+ font-family: Arial, sans-serif;
+ font-size: 24px;
+ color: green;
+ text-align: left;
+ }
+ .unsuccess-message {
+ font-family: Arial, sans-serif;
+ font-size: 24px;
+ color: red;
+ text-align: left;
+ }
+ .prompt-message {
+ font-family: Arial, sans-serif;
+ font-size: 24px;
+ color: #333;
+ text-align: center;
+ }
+ .success-message2 {
+ font-family: Arial, sans-serif;
+ font-size: 18px;
+ color: white;
+ text-align: left;
+ }
+ .message-box {
+ text-align: center;
+ background-color: white;
+ padding: 5px;
+ border-radius: 10px;
+ box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
+ font-size: 24px;
+ color: #333;
+ }
+ </style>
+ """,
+ unsafe_allow_html=True
+ )


+ # st.markdown('<p class="success-message">Train File uploaded successfully. ✅</p>', unsafe_allow_html=True)
  # file uploader
  csv_upload = st.sidebar.file_uploader("Input CSV File for ML modelling", type=['csv'])
+
+ sep = st.sidebar.text_input("Input Seperator")
+ if (len(sep) ==0):
+ sep = ","
  csv_upload2 = st.sidebar.file_uploader("Input CSV File of Test Data Prediction",type = ["csv"])
+
  test = pd.DataFrame()
  if csv_upload is not None:
  # read the uploaded file into dataframe
- df = pd.read_csv(csv_upload)
+ df = pd.read_csv(csv_upload,sep = sep)

  # saving the dataframe to a CSV file
  df.to_csv('csv_upload.csv', index=False)
- st.write("Train File uploaded successfully. ✅")
-
+ st.markdown('<p class="success-message">Train File uploaded successfully. ✅</p>', unsafe_allow_html=True)
+
  if csv_upload2 is not None:
- test = pd.read_csv(csv_upload2)
- id_col = st.selectbox("select column for submission i.e, ID",test.columns)
+ test = pd.read_csv(csv_upload2,sep = sep)
+ st.markdown('<p class="success-message">Test File uploaded successfully. ✅</p>', unsafe_allow_html=True)
+ st.divider()
+ id_col = st.selectbox("Select Column for Submission i.e, ID",test.columns)
+ st.divider()
  submission_id = test[id_col]
  # st.write("Train File upl",submission_id)

@@ -93,8 +161,10 @@ if csv_upload is not None:
  if len(test) >0:
  # saving the test dataframe to a CSV file
  test.to_csv('csv_upload_test.csv', index=False)
- st.write("Test File uploaded successfully. ✅")
+

+ st.markdown('<p class="message-box">Display Data</p>', unsafe_allow_html=True)
+ st.write("")
  display_train_data = st.radio("Display Train Data",["Yes","No"],index = 1)
  if display_train_data == "Yes":
  st.dataframe(df.head())
@@ -104,29 +174,40 @@ if csv_upload is not None:
  if display_test_data == "Yes":
  st.dataframe(test.head())

-
- if st.radio("Select Supervision Category",["Supervised","Un-Supervised"],index =0) == "Supervised":
-
- selected_column = st.selectbox('Select Target column', df.columns, index=(len(df.columns)-1))
+ st.divider()
+ st.markdown('<div class="message-box success">Select Supervision Category</div>', unsafe_allow_html=True)
+ if st.radio("",["Supervised","Un-Supervised"],index =0) == "Supervised":
+ st.divider()
+
+ st.write('<p class="success-message2">Select Target column</p>', unsafe_allow_html=True)
+ selected_column = st.selectbox('', df.columns, index=(len(df.columns)-1))

  # Display the selected column
  st.write('You selected:', selected_column)
-
+ st.divider()
+
+ st.markdown('<div class="message-box success ">Perform EDA</div>', unsafe_allow_html=True)
+ st.write("")
+ if st.checkbox("Proceed to perform EDA"):
+ eda.eda_analysis(df)
+ st.write('<p class="success-message">EDA Performed proceed for Pre-processing</p>', unsafe_allow_html=True)
+ st.divider()
  y = df[selected_column]

  if y.dtype == "O":
- st.write("⚠️⚠️⚠️ Target Column is Object Type ⚠️⚠️⚠️")
- if st.radio("Proceed for Label Encoding ",["Yes","No"],index = 1) == "Yes":
+ st.markdown('<p class="unsuccess-message">⚠️⚠️⚠️ Target Column is Object Type ⚠️⚠️⚠️</p>', unsafe_allow_html=True)
+
+ if st.checkbox("Proceed for Label Encoding "):
  from sklearn.preprocessing import LabelEncoder
  le = LabelEncoder()
  y= pd.Series(le.fit_transform(y))
- st.write("Label Encoding Completed ✅")
-
- if st.radio("Display Target Column",["Yes","No"],index =1) == "Yes":
+ st.markdown('<p class="success-message">Label Encoding Completed ✅</p>', unsafe_allow_html=True)
+ if st.checkbox("Display Target Column"):
  st.dataframe(y.head())

-
- select_target_trans = st.radio("Target column Transformation",["Yes","No"],index = 1)
+ st.divider()
+ st.markdown('<div class="message-box success">Target column Transformation</div>', unsafe_allow_html=True)
+ select_target_trans = st.radio("",["Yes","No"],index = 1)
  if select_target_trans == "Yes":
  selected_transformation = st.selectbox("Select Transformation method",["Log Transformation","Power Transformation"])
  if selected_transformation == "Log Transformation":
@@ -155,36 +236,52 @@ if csv_upload is not None:

  if st.radio("Display Target Column after Transformation",["Yes","No"],index =1) == "Yes":
  st.dataframe(y.head())
- # inverse of transformation
+
+

  X = df.drop(columns = selected_column)

  if st.radio("Display X-Train Data",["Yes","No"],index =1) == "Yes":
  st.dataframe(X.head())
- if st.radio("Check for duplicate Values",["Yes","No"],index = 1) == "Yes":
+ st.divider()
+
+ # st.checkbox()
+ st.markdown('<div class="message-box success">Check for duplicate Values</div>', unsafe_allow_html=True)
+ if st.radio(" ",["Yes","No"],index = 1) == "Yes":
  len_duplicates = len(X[X.duplicated()])
  if len_duplicates >0:
  st.write(f"There are {len_duplicates} duplicate values in Train")
+ if st.checkbox("Show Duplicate values"):
+ st.dataframe(X[X.duplicated()])
  if st.selectbox("Drop Duplicate values",["Yes","No"],index = 1) == "Yes":
  X = X.drop_duplicates()
  st.write("Duplicate values removed ✅")
  else:
  st.write("There are no duplicate values in Train")
+ st.divider()
  # dropping not important columns
- if st.radio("Drop Un-Important Column(s)",["Yes","No"],index = 1) == "Yes":
+ st.markdown('<div class="message-box success">Drop Unimportant Columns</div>', unsafe_allow_html=True)
+ if st.radio(" ",["Yes","No"],index = 1) == "Yes":
  selected_drop_column = st.multiselect('Select columns to be dropped', X.columns)
  X = X.drop(columns = selected_drop_column)
  if len(test) >0:
  test = test.drop(columns = selected_drop_column)
- st.write("Un-Important column(s) Delected ✅")
+ st.write("Un-Important column(s) Deleted ✅")
  st.dataframe(X.head())

+ st.divider()
  num_cols = X.select_dtypes(exclude = "O").columns
  cat_cols = X.select_dtypes(include = "O").columns
  st.write("Numerical Columns in Train Data: ", tuple(num_cols))
  st.write("Categorical Columns in Train Data: ", tuple(cat_cols))
-
- if st.radio("Select method for ML modelling", ["Manual","Auto Optimized"],index = 0) == "Auto Optimized":
+ if st.sidebar.button("Clear Evaluation DataFrame"):
+ evaluationer.reg_evaluation_df = evaluationer.reg_evaluation_df.drop(index =evaluationer.reg_evaluation_df.index)
+ evaluationer.classification_evaluation_df = evaluationer.classification_evaluation_df.drop(index =evaluationer.reg_evaluation_df.index)
+ st.divider()
+ # markdown
+ st.markdown('<div class="message-box success">Select method for ML modelling</div>', unsafe_allow_html = True)
+ if st.radio(" ", ["Manual","Auto Optimized"],index = 0) == "Auto Optimized":
+ st.divider()
  ml_cat_ao = st.radio("Select Machine Learning Category",["Regression","Classification"],index =0)

  if ml_cat_ao =="Regression":
@@ -192,7 +289,7 @@ if csv_upload is not None:
  st.write("Select ML algorithm")
  reg_model_name = st.selectbox("select model",models.Regression_models.index)
  reg_model = models.Regression_models.loc[reg_model_name].values[0]
- auto_optimizer.Auto_optimizer(X,y,eva,reg_model)
+ auto_optimizer.Auto_optimizer(X,y,eva,reg_model,reg_model_name)

  elif ml_cat_ao =="Classification":
  eva = "class"
@@ -201,10 +298,12 @@ if csv_upload is not None:
  class_model = models.Classification_models.loc[class_model_name].values[0]
  auto_optimizer.Auto_optimizer(X,y,eva,class_model)

-
+
  else:
+ st.divider()
  if X.isnull().sum().sum() >0 :
- st.write("⚠️⚠️⚠️ There are missing values in Train Data ⚠️⚠️⚠️")
+
+ st.markdown('<p class="unsuccess-message">⚠️⚠️⚠️ There are missing values in Train Data ⚠️⚠️⚠️</p>', unsafe_allow_html=True)

  if st.selectbox("Drop null values or Impute",["Drop Null Values","Impute Null Values"],index = 1) == "Drop Null Values":

@@ -241,7 +340,9 @@ if csv_upload is not None:


  clean_num_nvh_df_cat = pd.DataFrame()
+
  if X[cat_cols].isnull().sum().sum() >0:
+ st.divider()
  st.write("Categorical Columns with Percentage of Null Values: ")
  cat_cols_nvh = X[cat_cols].isnull().sum()[X[cat_cols].isnull().sum()>0].index
  st.dataframe(round(X[cat_cols].isnull().sum()[X[cat_cols].isnull().sum()>0]/len(X)*100,2))
@@ -270,33 +371,41 @@ if csv_upload is not None:
  null_value_handling.null_handling(X,clean_num_nvh_df,clean_num_nvh_df_cat)
  st.write("X Data after Null value handling", X.head())

- new_df = pd.concat([X,y[X.index]],axis = 1)
-
- csv = new_df.to_csv(index = False)
- if st.radio("Download Null Value Handled DataFrame as CSV File ? ",["Yes","No"],index = 1) == "Yes":
- st.download_button(label="Download Null Value Handled CSV File",data=csv,file_name='NVH_DataFrame.csv',mime='text/csv')
-
+ new_df = pd.concat([X,y[X.index]],axis = 1)
+
+ csv = new_df.to_csv(index = False)
+
+ st.markdown('<p class="success-message">Null Values Handled Successfully. ✅</p>', unsafe_allow_html=True)
+ if st.checkbox("Download Null Value Handled DataFrame as CSV File ? "):
+ st.download_button(label="Download Null Value Handled CSV File",data=csv,file_name='NVH_DataFrame.csv',mime='text/csv')
+ st.divider()
  ord_enc_cols = []

  if len(cat_cols) == 0:
  st.write("No Categorical Columns in Train")
  else:
- st.write("Select Columns for Ordinal Encoding")
+ st.markdown('<div class="message-box success">Features Encoding</div>', unsafe_allow_html=True)
+ st.markdown('<p class="unsuccess-message">There are Object type Features in Train Data ⚠️</p>', unsafe_allow_html=True)
+ st.markdown('<p class="success-message2">Select Columns for Ordinal Encoding</p>', unsafe_allow_html=True)
+
  for column in cat_cols:

  selected = st.checkbox(column)
  if selected:
  st.write(f"No. of Unique value in {column} column are", X[column].nunique())
  ord_enc_cols.append(column)
+ st.divider()
  ohe_enc_cols = set(cat_cols) -set(ord_enc_cols)
  ohe_enc_cols = list(ohe_enc_cols)
  if len(ord_enc_cols)>0:
  st.write("ordinal encoded columns" ,tuple(ord_enc_cols))
  if len(ohe_enc_cols)>0:
  st.write("one hot encoded columns" ,tuple(ohe_enc_cols))
-
+ st.divider()
+ st.markdown('<div class="message-box success">Proceed for Encoding</div>', unsafe_allow_html=True)
  if len(ord_enc_cols)>0:
- if st.radio("proceed for ordinal encoding",["Yes","No"],index = 1) == "Yes":
+
+ if st.checkbox("Proceed for Ordinal Encoding"):
  ordinal_order_vals = []

  for column in ord_enc_cols:
@@ -317,7 +426,7 @@ if csv_upload is not None:
  st.write("Ordinal Encoding Completed ✅")

  if len(ohe_enc_cols)>0:
- if st.radio("proceed for OnehotEncoding ",["Yes","No"],index = 1) == "Yes": # import one hot encoder
+ if st.checkbox("Proceed for OneHotEncoding "): # import one hot encoder
  from sklearn.preprocessing import OneHotEncoder
  ohe = OneHotEncoder(sparse_output = False,handle_unknown = "ignore")
  pd.options.mode.chained_assignment = None
@@ -331,39 +440,43 @@ if csv_upload is not None:

  st.write("DataFrame after One Hot Encoding",X.head())
  st.write("OneHot Encoding Completed ✅")
-
+ st.divider()
  new_df = pd.concat([X,y],axis = 1)

  csv = new_df.to_csv(index = False)
- if st.radio("Download Encoded DataFrame as CSV File ? ",["Yes","No"],index = 1) == "Yes":
+ if st.checkbox("Download Encoded DataFrame as CSV File ? "):
  st.download_button(label="Download Ordinal Encoded CSV File",data=csv,file_name='Encoded_DataFrame.csv',mime='text/csv')

-
- random_state = st.number_input("Enter Random_state",max_value=100,min_value=1,value=42)
- test_size = st.number_input("Enter test_size",max_value=0.99, min_value = 0.01,value =0.2)
- if st.radio("select Train Validation Split Method",
- [f"Train_Test_split, Default (Random_state = {random_state},Test_size = {test_size})",
- "KFoldCV, Default (CV = 5)"], index = 0)== f"Train_Test_split, Default (Random_state = {random_state},Test_size = {test_size})":
+ st.divider()
+ st.markdown('<div class="message-box success">Modelling</div>', unsafe_allow_html=True)
+ st.write("")
+ st.markdown('<p class="success-message">Select Train Validation Split Method</p>', unsafe_allow_html=True)
+ if st.radio("",["Train_Test_split","KFoldCV, Default (CV = 5)"], index = 0)== "Train_Test_split":
  ttsmethod = "Train_Test_split"
  else:
  ttsmethod = "KFoldCV"
  st.write('You selected:', ttsmethod)
  if ttsmethod == "Train_Test_split":
+ random_state = st.number_input("Enter Random_state",max_value=100,min_value=1,value=42)
+ test_size = st.number_input("Enter test_size",max_value=0.99, min_value = 0.01,value =0.2)
  X_train,X_Val,y_train,y_val = tts(X,y[X.index],random_state = random_state,test_size = test_size)
- st.write('X-Training Data shape:', (X_train.info()))

  st.write('X-Training Data shape:', X_train.shape)
  st.write('X-Validation Data shape:', X_Val.shape)
-
- ml_cat = st.radio("Select Machine Learning Category",["Regression","Classification"],index =0)
-
+ st.divider()
+ st.markdown('<p class="success-message2">Select Machine Learning Category</p>', unsafe_allow_html=True)
+ ml_cat = st.radio("___",options=["Regression","Classification"],index =0)
+ st.divider()
  if ml_cat =="Regression":
- method_name_selector = st.selectbox("Select Error Evaluation Method",evaluationer.method_df.index,index = 0)
+ st.markdown('<p class="success-message2">Select Error Evaluation Method</p>', unsafe_allow_html=True)
+ method_name_selector = st.selectbox(" ",evaluationer.method_df.index,index = 0)
+
+ st.divider()

  method = evaluationer.method_df.loc[method_name_selector].values[0]
  reg_algorithm = []
  selected_options = []
-
+ st.markdown('<div class="message-box success">Select ML Model(s)</div>', unsafe_allow_html=True)
  for option in models.Regression_models.index:
  selected = st.checkbox(option)
  if selected:
@@ -450,7 +563,7 @@ if csv_upload is not None:

  cla_algorithm = []
  selected_options = []
-
+ st.markdown('<div class="message-box success">Select ML Model(s)</div>', unsafe_allow_html=True)
  for option in models.Classification_models.index:
  selected = st.checkbox(option)
  if selected:
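A note on the new separator handling in app.py: the sidebar text input falls back to a comma when left empty, and the same sep is reused for both the train and test uploads, so mismatched delimiters between the two files would still fail. A minimal standalone sketch of the pattern (names taken from the diff; the .strip() guard against whitespace-only input is an added assumption):

    import pandas as pd
    import streamlit as st

    csv_upload = st.sidebar.file_uploader("Input CSV File for ML modelling", type=['csv'])
    sep = st.sidebar.text_input("Input Seperator")  # label spelled as in the commit
    if len(sep.strip()) == 0:
        sep = ","  # default to the standard CSV separator
    if csv_upload is not None:
        df = pd.read_csv(csv_upload, sep=sep)  # same separator reused for the test upload
        st.write("Parsed shape:", df.shape)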
auto_optimizer.py CHANGED
@@ -1,317 +1,361 @@
- import pandas as pd
- import numpy as np
- import streamlit as st
- from sklearn.impute import KNNImputer,SimpleImputer,IterativeImputer
- import best_tts, evaluationer,models
- from sklearn.experimental import enable_iterative_imputer
- from sklearn.model_selection import train_test_split as tts
- from collections import Counter
- #root_mean_squared_error
- from sklearn.metrics import root_mean_squared_error
- import seaborn as sns
- import matplotlib.pyplot as plt
- import outliers,best_tts
- import feature_selections
- def Auto_optimizer(X,y,eva,model,test= None):
- evaluationer.reg_evaluation_df =evaluationer.reg_evaluation_df.drop(index =evaluationer.reg_evaluation_df.index)
- num_cols = X.select_dtypes(exclude = "O").columns
- cat_cols = X.select_dtypes(include = "O").columns
- st.write("Num_cols",tuple(num_cols))
- st.write("cat_cols",tuple(cat_cols))
-
- # check for Duplicate and drop duplicated in X
-
- if len(X.isnull().sum()[(X.isnull().sum()/len(X)*100) >40]) >0:
- X = X.drop(columns = X.isnull().sum()[(X.isnull().sum()/len(X)*100) >40].index)
- st.write("Columns with more than 40% null values removed")
- # st.write("csx",X)
-
- len_null = X.isnull().sum().sum()
-
- st.write(f"There are {len_null} null values in Train")
-
- knn_imputed_num_X = X.copy()
- si_mean_imputed_num_X = X.copy()
- # st.write("sf",si_mean_imputed_num_X)
- si_median_imputed_num_X = X.copy()
- si_most_frequent_imputed_num_X = X.copy()
- iter_imputed_num_X = X.copy()
- knn_imputed_X_cat_dropped = knn_imputed_num_X.copy()
- si_mean_imputed_X_cat_dropped = si_mean_imputed_num_X.copy()
- si_median_imputed_X_cat_dropped = si_median_imputed_num_X.copy()
- si_most_frequent_imputed_X_cat_dropped = si_most_frequent_imputed_num_X.copy()
- iter_imputed_X_cat_dropped = iter_imputed_num_X.copy()
- if len_null >0:
-
- if X[num_cols].isnull().sum().sum() >0:
-
- knn_imputer = KNNImputer(n_neighbors = 5)
- knn_imputed_num_X[num_cols] = knn_imputer.fit_transform(knn_imputed_num_X[num_cols])
- si_imputer = SimpleImputer(strategy = "mean")
- si_mean_imputed_num_X[num_cols] = si_imputer.fit_transform(si_mean_imputed_num_X[num_cols])
- si_imputer = SimpleImputer(strategy = "median")
- si_median_imputed_num_X[num_cols] = si_imputer.fit_transform(si_median_imputed_num_X[num_cols])
- si_imputer = SimpleImputer(strategy = "most_frequent")
- si_most_frequent_imputed_num_X[num_cols] = si_imputer.fit_transform(si_most_frequent_imputed_num_X[num_cols])
- iter_imputer = IterativeImputer(max_iter = 200,random_state= 42)
- iter_imputed_num_X[num_cols] = iter_imputer.fit_transform(iter_imputed_num_X[num_cols])
- knn_imputed_X_cat_dropped = knn_imputed_num_X.copy()
- si_mean_imputed_X_cat_dropped = si_mean_imputed_num_X.copy()
- si_median_imputed_X_cat_dropped = si_median_imputed_num_X.copy()
- si_most_frequent_imputed_X_cat_dropped = si_most_frequent_imputed_num_X.copy()
- iter_imputed_X_cat_dropped = iter_imputed_num_X.copy()
-
- if X[cat_cols].isnull().sum().sum() >0:
- # treating missing values in categorical columns
- # st.write("si_mean_imputed_num_X",si_mean_imputed_num_X)
- si_imputer = SimpleImputer(strategy = "most_frequent")
-
- knn_imputed_num_X[cat_cols] = si_imputer.fit_transform(knn_imputed_num_X[cat_cols])
- si_imputer = SimpleImputer(strategy = "most_frequent")
- si_mean_imputed_num_X.loc[:,cat_cols] = si_imputer.fit_transform(si_mean_imputed_num_X.loc[:,cat_cols])
- # st.write("si_mean_imputed_num_X",si_mean_imputed_num_X)
- si_median_imputed_num_X[cat_cols] = si_imputer.fit_transform(si_median_imputed_num_X[cat_cols])
- si_most_frequent_imputed_num_X[cat_cols] = si_imputer.fit_transform(si_most_frequent_imputed_num_X[cat_cols])
- iter_imputed_num_X[cat_cols] = si_imputer.fit_transform(iter_imputed_num_X[cat_cols])
-
- knn_imputed_X_cat_dropped = knn_imputed_X_cat_dropped.dropna()
- si_mean_imputed_X_cat_dropped =si_mean_imputed_X_cat_dropped.dropna()
- si_median_imputed_X_cat_dropped =si_median_imputed_X_cat_dropped.dropna()
- si_most_frequent_imputed_X_cat_dropped =si_most_frequent_imputed_X_cat_dropped.dropna()
- iter_imputed_X_cat_dropped =iter_imputed_X_cat_dropped.dropna()
- st.write("sdds",knn_imputed_num_X)
- st.write("sddssd",knn_imputed_X_cat_dropped)
-
- miss_val_dropped_X = X.dropna()
-
- # list of dataframes
-
- list_X_after_missing_values= [knn_imputed_num_X,
- si_mean_imputed_num_X,
- si_median_imputed_num_X,
- si_most_frequent_imputed_num_X,
- iter_imputed_num_X,
- knn_imputed_X_cat_dropped,
- si_mean_imputed_X_cat_dropped,
- si_median_imputed_X_cat_dropped,
- si_most_frequent_imputed_X_cat_dropped,
- iter_imputed_X_cat_dropped,
- miss_val_dropped_X]
- list_X_after_missing_values_names= ["knn_imputed_num_X",
- "si_mean_imputed_num_X",
- "si_median_imputed_num_X",
- "si_most_frequent_imputed_num_X",
- "iter_imputed_num_X",
- "knn_imputed_X_cat_dropped",
- "si_mean_imputed_X_cat_dropped",
- "si_median_imputed_X_cat_dropped",
- "si_most_frequent_imputed_X_cat_dropped",
- "iter_imputed_X_cat_dropped",
- "miss_val_dropped_X"]
- # st.write("si_most_frequent_imputed_num_X",si_most_frequent_imputed_num_X,)
- ord_enc_cols = []
- ohe_enc_cols = []
-
- if len(cat_cols) == 0:
- st.write("No Categorical Columns in Train")
- else:
- st.write("Select Columns for Ordinal Encoding")
- for column in cat_cols:
- selected = st.checkbox(column)
- if selected:
- st.write(f"No. of Unique value in {column} column are", X[column].nunique())
- ord_enc_cols.append(column)
- ohe_enc_cols = set(cat_cols) -set(ord_enc_cols)
- ohe_enc_cols = list(ohe_enc_cols)
-
- if len(ord_enc_cols)>0:
- st.write("ordinal encoded columns" ,tuple(ord_enc_cols))
- if len(ohe_enc_cols)>0:
- st.write("one hot encoded columns" ,tuple(ohe_enc_cols))
-
- if len(ord_enc_cols)>0:
-
- ordinal_order_vals = []
-
- for column in ord_enc_cols:
- unique_vals = X.dropna()[column].unique()
- # st.write(f"No. of Unique value in {column} column are", len(unique_vals))
-
- ordered_unique_vals = st.multiselect("Select values in order for Ordinal Encoding",unique_vals,unique_vals)
- ordinal_order_vals.append(ordered_unique_vals)
-
- st.write("order of values for Ordinal Encoding",tuple(ordinal_order_vals))
-
- if len_null > 0:
-
- for df_name, df in enumerate(list_X_after_missing_values):
- # st.write(f"{list_X_after_missing_values_names[df_name]}",df)
- from sklearn.preprocessing import OrdinalEncoder
- ord = OrdinalEncoder(categories=ordinal_order_vals,handle_unknown= "use_encoded_value",unknown_value = -1 )
- df[ord_enc_cols] = ord.fit_transform(df[ord_enc_cols])
- # st.write(f"{list_X_after_missing_values_names[df_name]}",df)
- else :
- from sklearn.preprocessing import OrdinalEncoder
- ord = OrdinalEncoder(categories=ordinal_order_vals,handle_unknown= "use_encoded_value",unknown_value = -1 )
- X[ord_enc_cols] = ord.fit_transform(X[ord_enc_cols])
-
- st.write("Ordinal Encoding Completed ✅")
-
- if len(ohe_enc_cols)>0:
- if len_null > 0:
- for df_name, df in enumerate(list_X_after_missing_values):
- from sklearn.preprocessing import OneHotEncoder
- ohe = OneHotEncoder(sparse_output = False,handle_unknown = "ignore")
- pd.options.mode.chained_assignment = None
- df.loc[:, ohe.get_feature_names_out()] = ohe.fit_transform(df[ohe_enc_cols])
- df.drop(columns = ohe_enc_cols,inplace = True)
- pd.options.mode.chained_assignment = 'warn'
- else:
- from sklearn.preprocessing import OneHotEncoder
- ohe = OneHotEncoder(sparse_output = False,handle_unknown = "ignore")
- pd.options.mode.chained_assignment = None
- X.loc[:, ohe.get_feature_names_out()] = ohe.fit_transform(X[ohe_enc_cols])
- X.drop(columns = ohe_enc_cols,inplace = True)
- pd.options.mode.chained_assignment = 'warn'
- st.write("OneHot Encoding Completed ✅")
-
-
- if len(ohe_enc_cols)>0:
- if len_null > 0:
- for name,df in enumerate(list_X_after_missing_values):
- X_train,X_test,y_train,y_test = tts(df,y[df.index],test_size =.2 ,random_state = 42)
- # best_tts.best_tts(df,y,model,eva)
- evaluationer.evaluation(f"{list_X_after_missing_values_names[name]}",X_train,X_test,y_train,y_test,model,root_mean_squared_error,eva)
- else:
- X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size =.2 ,random_state = 42)
- # best_tts.best_tts(X,y,model,eva)
-
- evaluationer.evaluation(f"baseline_model",X_train,X_test,y_train,y_test,model,root_mean_squared_error,eva)
-
- if len_null >0:
- for name,df in enumerate(list_X_after_missing_values):
- X_train,X_test,y_train,y_test = tts(df,y[df.index],test_size =.2 ,random_state = 42)
- st.write(f"this is test{list_X_after_missing_values_names[name]}",X_train.isnull().sum().sum())
- evaluationer.evaluation(f"{list_X_after_missing_values_names[name]}",X_train,X_test,y_train,y_test,model,root_mean_squared_error,eva)
-
- if eva == "class":
- counter = Counter(y)
- total = sum(counter.values())
- balance_ratio = {cls: count / total for cls, count in counter.items()}
- num_classes = len(balance_ratio)
- ideal_ratio = 1 / num_classes
- a = all(abs(ratio - ideal_ratio) <= 0.1 * ideal_ratio for ratio in balance_ratio.values())
- if a == True:
- st.write("Balanced Dataset ✅")
- st.write("Using accuracy for Evaluation")
- value = "test_acc"
- else:
- st.write("Unbalanced Dataset ❌")
- st.write("Using F1 score for Evaluation")
- value = "test_f1"
- st.write("SFdfs",evaluationer.classification_evaluation_df)
- evaluationer.classification_evaluation_df.sort_values(by = value,inplace= True)
- name = str(evaluationer.classification_evaluation_df.iloc[-1,0])
- st.write("df name",evaluationer.classification_evaluation_df.iloc[-1,0])
- if len_null >0:
- b = list_X_after_missing_values_names.index(name)
- st.write("Sdffsf",b)
- st.write("df",list_X_after_missing_values[b])
- X = list_X_after_missing_values[b]
- if eva == "reg":
- st.write("Using R2 score for Evaluation",evaluationer.reg_evaluation_df)
- value = "test_r2"
- evaluationer.reg_evaluation_df.sort_values(by = value,inplace= True)
- st.write("adfsdf",evaluationer.reg_evaluation_df.iloc[-1,0])
- name = str(evaluationer.reg_evaluation_df.iloc[-1,0])
- st.write("Sdffsf",name)
- if len_null >0:
- b = list_X_after_missing_values_names.index(name)
- st.write("Sdffsf",b)
- st.write("df",list_X_after_missing_values[b])
- X = list_X_after_missing_values[b]
-
-
- # Create a figure and axes
- num_plots = len(num_cols)
- cols = 2 # Number of columns in the subplot grid
- rows = (num_plots + cols - 1) // cols # Calculate the number of rows needed
-
- fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows))
-
- # Flatten the axes array for easy iteration, and remove any excess subplots
- axes = axes.flatten()
- for ax in axes[num_plots:]:
- fig.delaxes(ax)
-
- for i, col in enumerate(num_cols):
- sns.histplot(X[col], ax=axes[i],kde = True,color=sns.color_palette('Oranges', as_cmap=True)(0.7))
- axes[i].set_title(col)
-
- # Adjust layout
- plt.tight_layout()
-
- # Show the plot in Streamlit
- st.pyplot(fig)
-
- # Create a figure and axes
- num_plots = len(num_cols)
- cols = 3 # Number of columns in the subplot grid
- rows = (num_plots + cols - 1) // cols # Calculate the number of rows needed
-
- fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows))
-
- # Flatten the axes array for easy iteration, and remove any excess subplots
- axes = axes.flatten()
- for ax in axes[num_plots:]:
- fig.delaxes(ax)
-
- for i, col in enumerate(num_cols):
- sns.boxplot(y=X[col], ax=axes[i],palette="magma")
- axes[i].set_title(col)
-
- # Adjust layout
- plt.tight_layout()
-
- # Show the plot in Streamlit
- st.pyplot(fig)
-
- outlier_cols = st.multiselect("De-Select columns for Detecting Outliers", num_cols,default= list(num_cols))
-
- st.write("Checking for Outliers")
- outliers_df_X,outlier_indexes = outliers.detect_outliers(X,list(outlier_cols))
- st.write("Outliers in Dataframe Summary",outliers_df_X)
- st.write("Columns for Outliers handling",tuple(outliers_df_X["columns name"]))
-
- select_outlier_cols = st.multiselect("Select columns for Outlier Handling",tuple(outliers_df_X["columns name"]),default =tuple(outliers_df_X["columns name"]))
- resultant,outlier_handled_df,outlier_handled_df_name= outliers.outlier_handling(X,y,model,outlier_indexes = outlier_indexes,outlier_cols = select_outlier_cols ,method = root_mean_squared_error,test_size = 0.2, random_state = 42,eva = "reg")
- st.write("outlier handling with methods",resultant)
- st.write("Best method with outlier handling",resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])
- try :
- st.write("Best X Data Index No.",outlier_handled_df_name.index(resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0]))
-
- st.write("Best X DataFrame after outlier handling ",outlier_handled_df[outlier_handled_df_name.index(resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])])
- X = outlier_handled_df[outlier_handled_df_name.index(resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])]
- except :
- "evaluation of baseline model is better continuing with baseline model"
-
- # result_df ,X_train_b,X_test_b,y_train_b,y_test_b = best_tts.best_tts(X,y,model,eva)
- X_train,X_test,y_train,y_test = tts(X,y[X.index],random_state = 42,test_size = 0.2)
- st.write("result_df",X)
- st.write("fsdfs",X_train)
- result_df_1 = feature_selections.feature_selection(X_train,X_test,y_train,y_test,model,alpha = 0.05)
- st.write("sdchsvdgj",result_df_1)
+ import pandas as pd
+ import numpy as np
+ import streamlit as st
+ from sklearn.impute import KNNImputer,SimpleImputer,IterativeImputer
+ import best_tts, evaluationer,models
+ from sklearn.experimental import enable_iterative_imputer
+ from sklearn.model_selection import train_test_split as tts
+ from collections import Counter
+ from sklearn.preprocessing import PolynomialFeatures
+ from sklearn.metrics import root_mean_squared_error
+ import seaborn as sns
+ from sklearn.decomposition import PCA
+ import grid_search_cv
+ import matplotlib.pyplot as plt
+ import outliers,best_tts
+ import feature_selections
+ def Auto_optimizer(X,y,eva,model,model_name,test= None):
+ if st.button("Train Regression Model"):
+ num_cols = X.select_dtypes(exclude = "O").columns
+ cat_cols = X.select_dtypes(include = "O").columns
+ st.write("Num_cols",tuple(num_cols))
+ st.write("cat_cols",tuple(cat_cols))
+
+ # check for Duplicate and drop duplicated in X
+
+ if len(X.isnull().sum()[(X.isnull().sum()/len(X)*100) >40]) >0:
+ X = X.drop(columns = X.isnull().sum()[(X.isnull().sum()/len(X)*100) >40].index)
+ st.write("Columns with more than 40% null values removed")
+ # st.write("csx",X)
+
+ len_null = X.isnull().sum().sum()
+
+ st.write(f"There are {len_null} null values in Train")
+
+ knn_imputed_num_X = X.copy()
+ si_mean_imputed_num_X = X.copy()
+ # st.write("sf",si_mean_imputed_num_X)
+ si_median_imputed_num_X = X.copy()
+ si_most_frequent_imputed_num_X = X.copy()
+ iter_imputed_num_X = X.copy()
+ knn_imputed_X_cat_dropped = knn_imputed_num_X.copy()
+ si_mean_imputed_X_cat_dropped = si_mean_imputed_num_X.copy()
+ si_median_imputed_X_cat_dropped = si_median_imputed_num_X.copy()
+ si_most_frequent_imputed_X_cat_dropped = si_most_frequent_imputed_num_X.copy()
+ iter_imputed_X_cat_dropped = iter_imputed_num_X.copy()
+ if len_null >0:
+
+ if X[num_cols].isnull().sum().sum() >0:
+
+ knn_imputer = KNNImputer(n_neighbors = 5)
+ knn_imputed_num_X[num_cols] = knn_imputer.fit_transform(knn_imputed_num_X[num_cols])
+ si_imputer = SimpleImputer(strategy = "mean")
+ si_mean_imputed_num_X[num_cols] = si_imputer.fit_transform(si_mean_imputed_num_X[num_cols])
+ si_imputer = SimpleImputer(strategy = "median")
+ si_median_imputed_num_X[num_cols] = si_imputer.fit_transform(si_median_imputed_num_X[num_cols])
+ si_imputer = SimpleImputer(strategy = "most_frequent")
+ si_most_frequent_imputed_num_X[num_cols] = si_imputer.fit_transform(si_most_frequent_imputed_num_X[num_cols])
+ iter_imputer = IterativeImputer(max_iter = 200,random_state= 42)
+ iter_imputed_num_X[num_cols] = iter_imputer.fit_transform(iter_imputed_num_X[num_cols])
+ knn_imputed_X_cat_dropped = knn_imputed_num_X.copy()
+ si_mean_imputed_X_cat_dropped = si_mean_imputed_num_X.copy()
+ si_median_imputed_X_cat_dropped = si_median_imputed_num_X.copy()
+ si_most_frequent_imputed_X_cat_dropped = si_most_frequent_imputed_num_X.copy()
+ iter_imputed_X_cat_dropped = iter_imputed_num_X.copy()
+
+ if X[cat_cols].isnull().sum().sum() >0:
+ # treating missing values in categorical columns
+ # st.write("si_mean_imputed_num_X",si_mean_imputed_num_X)
+ si_imputer = SimpleImputer(strategy = "most_frequent")
+
+ knn_imputed_num_X[cat_cols] = si_imputer.fit_transform(knn_imputed_num_X[cat_cols])
+ si_imputer = SimpleImputer(strategy = "most_frequent")
+ si_mean_imputed_num_X.loc[:,cat_cols] = si_imputer.fit_transform(si_mean_imputed_num_X.loc[:,cat_cols])
+ # st.write("si_mean_imputed_num_X",si_mean_imputed_num_X)
+ si_median_imputed_num_X[cat_cols] = si_imputer.fit_transform(si_median_imputed_num_X[cat_cols])
+ si_most_frequent_imputed_num_X[cat_cols] = si_imputer.fit_transform(si_most_frequent_imputed_num_X[cat_cols])
+ iter_imputed_num_X[cat_cols] = si_imputer.fit_transform(iter_imputed_num_X[cat_cols])
+
+ knn_imputed_X_cat_dropped = knn_imputed_X_cat_dropped.dropna()
+ si_mean_imputed_X_cat_dropped =si_mean_imputed_X_cat_dropped.dropna()
+ si_median_imputed_X_cat_dropped =si_median_imputed_X_cat_dropped.dropna()
+ si_most_frequent_imputed_X_cat_dropped =si_most_frequent_imputed_X_cat_dropped.dropna()
+ iter_imputed_X_cat_dropped =iter_imputed_X_cat_dropped.dropna()
+
+
+ miss_val_dropped_X = X.dropna()
+
+ # list of dataframes
+
+ list_X_after_missing_values= [knn_imputed_num_X,
+ si_mean_imputed_num_X,
+ si_median_imputed_num_X,
+ si_most_frequent_imputed_num_X,
+ iter_imputed_num_X,
+ knn_imputed_X_cat_dropped,
+ si_mean_imputed_X_cat_dropped,
+ si_median_imputed_X_cat_dropped,
+ si_most_frequent_imputed_X_cat_dropped,
+ iter_imputed_X_cat_dropped,
+ miss_val_dropped_X]
+ list_X_after_missing_values_names= ["knn_imputed_num_X",
+ "si_mean_imputed_num_X",
+ "si_median_imputed_num_X",
+ "si_most_frequent_imputed_num_X",
+ "iter_imputed_num_X",
+ "knn_imputed_X_cat_dropped",
+ "si_mean_imputed_X_cat_dropped",
+ "si_median_imputed_X_cat_dropped",
+ "si_most_frequent_imputed_X_cat_dropped",
+ "iter_imputed_X_cat_dropped",
+ "miss_val_dropped_X"]
+ # st.write("si_most_frequent_imputed_num_X",si_most_frequent_imputed_num_X,)
+ ord_enc_cols = []
+ ohe_enc_cols = []
+
+ if len(cat_cols) == 0:
+ st.write("No Categorical Columns in Train")
+ else:
+ st.write("Select Columns for Ordinal Encoding")
+ for column in cat_cols:
+ selected = st.checkbox(column)
+ if selected:
+ st.write(f"No. of Unique value in {column} column are", X[column].nunique())
+ ord_enc_cols.append(column)
+ ohe_enc_cols = set(cat_cols) -set(ord_enc_cols)
+ ohe_enc_cols = list(ohe_enc_cols)
+
+ if len(ord_enc_cols)>0:
+ st.write("ordinal encoded columns" ,tuple(ord_enc_cols))
+ if len(ohe_enc_cols)>0:
+ st.write("one hot encoded columns" ,tuple(ohe_enc_cols))
+
+ if len(ord_enc_cols)>0:
+
+ ordinal_order_vals = []
+
+ for column in ord_enc_cols:
+ unique_vals = X.dropna()[column].unique()
+ # st.write(f"No. of Unique value in {column} column are", len(unique_vals))
+
+ ordered_unique_vals = st.multiselect("Select values in order for Ordinal Encoding",unique_vals,unique_vals)
+ ordinal_order_vals.append(ordered_unique_vals)
+
+ st.write("order of values for Ordinal Encoding",tuple(ordinal_order_vals))
+
+ if len_null > 0:
+
+ for df_name, df in enumerate(list_X_after_missing_values):
+ # st.write(f"{list_X_after_missing_values_names[df_name]}",df)
+ from sklearn.preprocessing import OrdinalEncoder
+ ord = OrdinalEncoder(categories=ordinal_order_vals,handle_unknown= "use_encoded_value",unknown_value = -1 )
+ df[ord_enc_cols] = ord.fit_transform(df[ord_enc_cols])
+ # st.write(f"{list_X_after_missing_values_names[df_name]}",df)
+ else :
+ from sklearn.preprocessing import OrdinalEncoder
+ ord = OrdinalEncoder(categories=ordinal_order_vals,handle_unknown= "use_encoded_value",unknown_value = -1 )
+ X[ord_enc_cols] = ord.fit_transform(X[ord_enc_cols])
+
+ st.write("Ordinal Encoding Completed ✅")
+
+ if len(ohe_enc_cols)>0:
+ if len_null > 0:
+ for df_name, df in enumerate(list_X_after_missing_values):
+ from sklearn.preprocessing import OneHotEncoder
+ ohe = OneHotEncoder(sparse_output = False,handle_unknown = "ignore")
+ pd.options.mode.chained_assignment = None
+ df.loc[:, ohe.get_feature_names_out()] = ohe.fit_transform(df[ohe_enc_cols])
+ df.drop(columns = ohe_enc_cols,inplace = True)
+ pd.options.mode.chained_assignment = 'warn'
+ else:
+ from sklearn.preprocessing import OneHotEncoder
+ ohe = OneHotEncoder(sparse_output = False,handle_unknown = "ignore")
+ pd.options.mode.chained_assignment = None
+ X.loc[:, ohe.get_feature_names_out()] = ohe.fit_transform(X[ohe_enc_cols])
+ X.drop(columns = ohe_enc_cols,inplace = True)
+ pd.options.mode.chained_assignment = 'warn'
+ st.write("OneHot Encoding Completed ✅")
+
+
+ if len(ohe_enc_cols)>0:
+ if len_null > 0:
+ for name,df in enumerate(list_X_after_missing_values):
+ X_train,X_test,y_train,y_test = tts(df,y[df.index],test_size =.2 ,random_state = 42)
+ # best_tts.best_tts(df,y,model,eva)
+ evaluationer.evaluation(f"{list_X_after_missing_values_names[name]}",X_train,X_test,y_train,y_test,model,root_mean_squared_error,eva)
+ else:
+ X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size =.2 ,random_state = 42)
+ # best_tts.best_tts(X,y,model,eva)
+
+ evaluationer.evaluation(f"baseline_model",X_train,X_test,y_train,y_test,model,root_mean_squared_error,eva)
+
+ if len_null >0:
+ for name,df in enumerate(list_X_after_missing_values):
+ X_train,X_test,y_train,y_test = tts(df,y[df.index],test_size =.2 ,random_state = 42)
+
+ evaluationer.evaluation(f"{list_X_after_missing_values_names[name]}",X_train,X_test,y_train,y_test,model,root_mean_squared_error,eva)
+
+ if eva == "class":
+ counter = Counter(y)
+ total = sum(counter.values())
+ balance_ratio = {cls: count / total for cls, count in counter.items()}
+ num_classes = len(balance_ratio)
+ ideal_ratio = 1 / num_classes
+ a = all(abs(ratio - ideal_ratio) <= 0.1 * ideal_ratio for ratio in balance_ratio.values())
+ if a == True:
+ st.write("Balanced Dataset ")
+ st.write("Using accuracy for Evaluation")
+ value = "test_acc"
+ else:
+ st.write("Unbalanced Dataset ")
+ st.write("Using F1 score for Evaluation")
+ value = "test_f1"
+
+ evaluationer.classification_evaluation_df.sort_values(by = value,inplace= True)
+ name = str(evaluationer.classification_evaluation_df.iloc[-1,0])
+ st.write("df name",evaluationer.classification_evaluation_df.iloc[-1,0])
+ if len_null >0:
+ b = list_X_after_missing_values_names.index(name)
+
+ st.write("df",list_X_after_missing_values[b])
+ X = list_X_after_missing_values[b]
+ if eva == "reg":
+ st.write("Using R2 score for Evaluation",evaluationer.reg_evaluation_df)
+ value = "test_r2"
+ evaluationer.reg_evaluation_df.sort_values(by = value,inplace= True)
+
+ name = str(evaluationer.reg_evaluation_df.iloc[-1,0])
+
+ if len_null >0:
+ b = list_X_after_missing_values_names.index(name)
+
+ st.write("df",list_X_after_missing_values[b])
+ X = list_X_after_missing_values[b]
+
+
+ # Create a figure and axes
+ num_plots = len(num_cols)
+ cols = 2 # Number of columns in the subplot grid
+ rows = (num_plots + cols - 1) // cols # Calculate the number of rows needed
+
+ fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows))
+
+ # Flatten the axes array for easy iteration, and remove any excess subplots
+ axes = axes.flatten()
+ for ax in axes[num_plots:]:
+ fig.delaxes(ax)
+
+ for i, col in enumerate(num_cols):
+ sns.histplot(X[col], ax=axes[i],kde = True,color=sns.color_palette('Oranges', as_cmap=True)(0.7))
+ axes[i].set_title(col)
+
+ # Adjust layout
+ plt.tight_layout()
+
+ # Show the plot in Streamlit
+ st.pyplot(fig)
+
+ # Create a figure and axes
+ num_plots = len(num_cols)
+ cols = 3 # Number of columns in the subplot grid
+ rows = (num_plots + cols - 1) // cols # Calculate the number of rows needed
+
+ fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows))
+
+ # Flatten the axes array for easy iteration, and remove any excess subplots
+ axes = axes.flatten()
+ for ax in axes[num_plots:]:
+ fig.delaxes(ax)
+
+ for i, col in enumerate(num_cols):
+ sns.boxplot(y=X[col], ax=axes[i],palette="magma")
+ axes[i].set_title(col)
+
+ # Adjust layout
+ plt.tight_layout()
+
+ # Show the plot in Streamlit
+ st.pyplot(fig)
+
+ outlier_cols = st.multiselect("De-Select columns for Detecting Outliers", num_cols,default= list(num_cols))
+
+ st.write("Checking for Outliers")
+ outliers_df_X,outlier_indexes = outliers.detect_outliers(X,list(outlier_cols))
+ st.write("Outliers in Dataframe Summary",outliers_df_X)
+ st.write("Columns for Outliers handling",tuple(outliers_df_X["columns name"]))
+
+ select_outlier_cols = st.multiselect("Select columns for Outlier Handling",tuple(outliers_df_X["columns name"]),default =tuple(outliers_df_X["columns name"]))
+ resultant,outlier_handled_df,outlier_handled_df_name= outliers.outlier_handling(X,y,model,outlier_indexes = outlier_indexes,outlier_cols = select_outlier_cols ,method = root_mean_squared_error,test_size = 0.2, random_state = 42,eva = "reg")
+ st.write("outlier handling with methods",resultant)
+ st.write("Best method with outlier handling",resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])
+ try :
+ st.write("Best X Data Index No.",outlier_handled_df_name.index(resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0]))
+
+ st.write("Best X DataFrame after outlier handling ",outlier_handled_df[outlier_handled_df_name.index(resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])])
+ X = outlier_handled_df[outlier_handled_df_name.index(resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])]
+ except :
+ "evaluation of baseline model is better continuing with baseline model"
+
+
+ X_train,X_test,y_train,y_test = tts(X,y[X.index],random_state = 42,test_size = 0.2)
+ st.write("result_df",X)
+
+
+
+ try:
+ result_df_1 , feature_col, feature_col_name = feature_selections.feature_selection(X_train,X_test,y_train,y_test,model,alpha = 0.05)
+ X = X.drop(columns = feature_col[feature_col_name.index(result_df_1.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])])
+ except:
+ "evaluation by feature selection is not better than previous"
+
+ try:
+ result,X_train_b,X_test_b,y_train_b,y_test_b = best_tts.best_tts(X,y,model,eva)
+ st.write("result_df",result)
+ except:
+ X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size =0.2,random_state = 42)
+
+ st.write("cheking with polynomial features")
+ poly = PolynomialFeatures(degree=(2))
+ X_train_poly = poly.fit_transform(X_train)
+ X_test_poly = poly.transform(X_test)
+ result_df_2 = evaluationer.evaluation("polynomial features degree 2",X_train_poly,X_test_poly,y_train,y_test,model,root_mean_squared_error,eva)
+ st.write("after polynomial features degree 2",evaluationer.reg_evaluation_df)
+ poly1 = PolynomialFeatures(degree=(3))
+ X_train_poly1 = poly.fit_transform(X_train)
+ X_test_poly1 = poly.transform(X_test)
+ evaluationer.evaluation("polynomial features degree 3",X_train_poly1,X_test_poly1,y_train,y_test,model,root_mean_squared_error,eva)
+ st.write("after polynomial features degree 3",evaluationer.reg_evaluation_df)
+
+ pca = PCA(n_components=0.95)
+ X_train_pca = pca.fit_transform(X_train)
+ X_test_pca = pca.transform(X_test)
+ evaluationer.evaluation("PCA",X_train_pca,X_test_pca,y_train,y_test,model,root_mean_squared_error,eva)
+ st.write("After PCA",evaluationer.reg_evaluation_df)
+
+ grid_search_cv.perform_grid_search(model,model_name,X_train,X_test,y_train,y_test,eva)
+ st.write("best param",evaluationer.reg_evaluation_df)
+ st.sidebar.button("click to clear evaluation metrics",evaluationer.reg_evaluation_df.drop(index = evaluationer.reg_evaluation_df.index))
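One detail a reviewer might flag in the new polynomial-features step: poly1 = PolynomialFeatures(degree=(3)) is created, but the degree-3 transforms still call poly, the degree-2 object, so the "polynomial features degree 3" row re-evaluates degree 2. A sketch of what the block presumably intends (variable names taken from the diff):

    from sklearn.preprocessing import PolynomialFeatures

    # Fit and apply the degree-3 expansion with the degree-3 object itself.
    poly1 = PolynomialFeatures(degree=3)
    X_train_poly1 = poly1.fit_transform(X_train)  # the committed code uses poly (degree 2) here
    X_test_poly1 = poly1.transform(X_test)

Similarly, PCA(n_components=0.95) keeps however many components are needed to explain 95% of the variance rather than a fixed count, which is why no integer is passed.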
best_tts.py CHANGED
@@ -10,9 +10,9 @@ def best_tts(X,y,model,eva):
  if eva == "reg":

  test_r2_,test_r2_ts,test_r2_rs = 0,0,0
- for k in range(10,25):
+ for k in range(10,25,3):
  i = k/100
- for j in range(1,100):
+ for j in range(1,100,10):
  X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size = i, random_state = j,)

  model = model
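The only change in best_tts.py is the loop stride: the sweep over test sizes and random states drops from 15 × 99 = 1485 train/test splits to 5 × 10 = 50, trading granularity for speed. A quick check of the two grids (plain Python, ranges copied from the diff):

    # Old grid: every test_size from 0.10 to 0.24 crossed with random_state 1..99.
    old_grid = [(k / 100, j) for k in range(10, 25) for j in range(1, 100)]
    # New grid: stride 3 over test_size, stride 10 over random_state.
    new_grid = [(k / 100, j) for k in range(10, 25, 3) for j in range(1, 100, 10)]
    print(len(old_grid), len(new_grid))  # 1485 50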
eda.py ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ import numpy as np
+ import pandas as pd
+ import streamlit as st
+ import streamlit.components.v1 as components
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ import plotly.express as px
+ from plotly.subplots import make_subplots
+ import plotly.graph_objects as go
+ import datashader as ds
+ import datashader.transfer_functions as tf
+ from colorcet import fire
+
+ # function for EDA analysis
+ def eda_analysis(df):
+
+     target_col = st.sidebar.selectbox("Select Target Column", df.columns, index = len(df.columns)-1)
+     y = df[target_col]
+     X = df.drop(columns = target_col)
+     num_cols = X.select_dtypes(exclude = "O").columns.tolist()
+     cat_cols = X.select_dtypes(include = "O").columns.tolist()
+     st.write("num_cols", tuple(num_cols))
+     st.write("cat_cols", tuple(cat_cols))
+     st.divider()
+
+     results = []
+     for column in X[num_cols].columns:
+         skewness = X[column].skew()
+         kurtosis = X[column].kurtosis()
+
+         skewness_html = f'<span style="color: {"red" if abs(skewness) > .5 else "white"}">{skewness:.2f}</span>'
+         kurtosis_html = f'<span style="color: {"red" if abs(kurtosis) > 3 else "white"}">{kurtosis:.2f}</span>'
+
+         results.append({
+             'Column': column,
+             'Skewness': skewness,
+             'Kurtosis': kurtosis,
+             'Skewness_': skewness_html,
+             'Kurtosis_': kurtosis_html
+         })
+
+     result_df = pd.DataFrame(results)
+
+     # Display the data types of Skewness and Kurtosis columns
+     # st.write("Data types of Skewness and Kurtosis columns:", result_df[["Skewness", "Kurtosis"]].dtypes)
+
+     if st.toggle("Show Skewness and Kurtosis of DataFrame columns"):
+         st.write("Columns with Skewness and Kurtosis:")
+         if st.checkbox("Filter Skewed columns"):
+             filtered_df = result_df[abs(result_df["Skewness"]) > 0.5]
+             st.write(filtered_df[['Column', 'Skewness_', 'Kurtosis_']].to_html(escape=False), unsafe_allow_html=True)
+         else:
+             st.write(result_df[['Column', 'Skewness_', 'Kurtosis_']].to_html(escape=False), unsafe_allow_html=True)
+
+     st.divider()
+     st.write("Plotting Numerical Columns for Visual EDA")
+
+     # Create two columns
+     column1, column2 = st.columns(2)
+
+     # Checkbox for plotting distribution in the first column
+     with column1:
+         plot_distribution = st.checkbox("Plot Distribution of Target Column")
+
+     # Show the second checkbox in the second column only if the first checkbox is clicked
+     if plot_distribution:
+         with column2:
+             show_kde = st.checkbox("Show KDE Plot")
+         kde = show_kde
+     else:
+         kde = False
+
+     # Plot the histogram if the first checkbox is checked
+     if plot_distribution:
+         fig, ax = plt.subplots()
+         sns.histplot(y, ax=ax, kde=kde)
+
+         # Show the plot in the Streamlit app
+         st.pyplot(fig)
+
+     column3, column4 = st.columns(2)
+     with column3:
+         plot_distribution_nc = st.checkbox("Plot Distribution of Input Numerical columns")
+     if plot_distribution_nc:
+         with column4:
+             show_kde_1 = st.checkbox("Show KDE Plot for Numerical Columns")
+         kde_1 = show_kde_1
+     if plot_distribution_nc:
+         for column in num_cols:
+             fig, ax = plt.subplots()
+             sns.histplot(df[column], ax=ax, kde=kde_1)
+             st.write(f"Distribution of {column}:")
+             st.pyplot(fig)
+     st.divider()
+     # plot count plot for categorical columns
+     st.write("Plotting Categorical Columns for Visual EDA")
+     if st.checkbox("Plot Distribution of Input Categorical columns"):
+         for column in cat_cols:
+             fig = px.histogram(df.fillna('Null'), x=column, color=target_col)
+             st.write(fig)
+
+     st.divider()
+     # plot correlation matrix using plotly
+     st.write("Plotting Correlation Matrix for Numerical Columns")
+
+     column5, column6 = st.columns(2)
+     with column5:
+         plot_corr = st.checkbox("Plot Correlation Matrix")  # renamed from plot_distribution to avoid shadowing the earlier flag
+     show_value = False
+     if plot_corr:
+         with column6:
+             show_value = st.checkbox("Correlation values > 0.5")
+         if show_value:
+             # Compute correlation matrix
+             corr_matrix = df[num_cols].corr()
+
+             # Plot correlation matrix heatmap
+             fig = px.imshow(corr_matrix[abs(corr_matrix) > 0.5], color_continuous_scale='RdBu')
+
+             # Add annotations for values greater than 0.5
+             for i in range(corr_matrix.shape[0]):
+                 for j in range(corr_matrix.shape[1]):
+                     correlation_value = corr_matrix.iloc[i, j]
+                     if abs(correlation_value) > 0.5:  # Filter values greater than 0.5
+                         fig.add_annotation(
+                             x=i, y=j,
+                             text=str(round(correlation_value, 2)),
+                             showarrow=False
+                         )
+
+             # Update layout
+             fig.update_layout(
+                 xaxis=dict(side="top"),
+                 width=600,
+                 height=600,
+                 margin=dict(l=20, r=20, t=40, b=20)
+             )
+
+             # Display the heatmap
+             st.write(fig)
+     if plot_corr and not show_value:
+
+         corr_matrix = df[num_cols].corr()
+         fig = px.imshow(corr_matrix, color_continuous_scale='RdBu')
+         for i in range(corr_matrix.shape[0]):
+             for j in range(corr_matrix.shape[1]):
+                 fig.add_annotation(
+                     x=i, y=j,
+                     text=str(round(corr_matrix.iloc[i, j], 2)),
+                     showarrow=False
+                 )
+
+         # Update the layout to ensure annotations are displayed properly
+         fig.update_layout(
+             xaxis=dict(side="top"),
+             width=600,
+             height=600,
+             margin=dict(l=20, r=20, t=40, b=20)
+         )
+
+         st.write(fig)
+     st.divider()
+     outlier_cols = st.multiselect("Select Continuous numerical columns for Outlier Plot", num_cols)
+
+     # plot violin/box plots for outlier cols
+     if st.toggle("Toggle for Violin Plot"):
+         if st.checkbox("Plot Violin Plot for Outlier Cols"):  # label said BoxPlot even though this branch draws violins
+             if st.toggle("Split by Target"):
+                 for col in outlier_cols:
+                     fig = px.violin(df, x=col, color=y)
+                     st.write(fig)
+                     st.divider()
+             else:
+                 for col in outlier_cols:
+                     fig = px.violin(df, x=col)
+                     st.write(fig)
+                     st.divider()
+         if st.checkbox("check outlier distribution of Target column"):
+             fig = px.violin(y)
+             st.write(fig)
+
+     else:
+         if st.checkbox("Plot BoxPlot for Outlier Cols"):
+             if st.toggle("Split by Target"):
+                 for col in outlier_cols:
+                     fig = px.box(df, x=col, color=y)
+                     st.write(fig)
+                     st.divider()
+             else:
+                 for col in outlier_cols:
+                     fig = px.box(df, x=col)
+                     st.write(fig)
+                     st.divider()
+         if st.checkbox("check outlier distribution of Target column"):
+             fig = px.box(y)
+             st.write(fig)
+
+     # plot scatter plot using px
+     st.divider()
+
+     if st.checkbox("Plot Scatter Plot"):
+         column7, column8, column9 = st.columns(3)
+         with column7:
+             # Select y-axis column (unique key= avoids DuplicateWidgetID with the other charts below)
+             y_col = st.selectbox("Select y axis column", df.columns, key="scatter_y")
+
+         # all columns are offered for the x-axis selection
+         categorical_columns = df.columns
+         with column8:
+             # Allow user to select the x-axis column
+             x_col = st.selectbox("Select x axis column", categorical_columns, key="scatter_x")
+         with column9:
+             hue_col = st.selectbox("Select Hue column", categorical_columns, key="scatter_hue")
+         # Plot scatter plot using Plotly
+         fig = px.scatter(df, x=x_col, y=y_col, color=hue_col)
+         st.write(fig)
+
+     # bar chart and line chart
+     st.divider()
+     if st.checkbox("Plot Bar Chart"):
+         column10, column11 = st.columns(2)
+         with column10:
+             # Select y-axis column
+             y_col = st.selectbox("Select y axis column", df.columns, key="bar_y")
+
+         categorical_columns = df.columns
+         with column11:
+             # Allow user to select the x-axis column
+             x_col = st.selectbox("Select x axis column", categorical_columns, key="bar_x")
+         fig = px.bar(df, x=x_col, y=y_col, color=x_col)
+         st.write(fig)
+     st.divider()
+     if st.checkbox("Plot Line Chart"):
+         column12, column13, colx = st.columns(3)
+         with column12:
+             # Select y-axis column
+             y_col = st.selectbox("Select y axis column", df.columns, key="line_y")
+
+         categorical_columns = df.columns
+         with column13:
+             # Allow user to select the x-axis column
+             x_col = st.selectbox("Select x axis column", categorical_columns, key="line_x")
+         with colx:
+             hue_col1 = st.selectbox("Select Line split column", categorical_columns, key="line_hue")
+         fig = px.line(df.sort_values(by=y_col), x=x_col, y=y_col, color=hue_col1)
+         st.write(fig)
+     st.divider()
+     # plot pie chart
+     if st.checkbox("Plot Pie Chart"):
+         column14, column15 = st.columns(2)
+         with column14:
+             # Select values column
+             y_col = st.selectbox("Select values columns", df.columns, key="pie_values")
+
+         categorical_columns = df.columns
+         with column15:
+             # Allow user to select the names column
+             x_col = st.selectbox("Select names column", categorical_columns, key="pie_names")
+         fig = px.pie(df, values=y_col, names=x_col)
+         st.write(fig)
+
+     st.divider()
+     # check if there are latitude and longitude columns
+     if st.checkbox("Plot on Map"):
+         lat_col = st.selectbox("Select Latitude Column", df.columns)
+         long_col = st.selectbox("Select Longitude Column", df.columns)
+         # (a dangling `color = st.selectbox` assignment was removed here; it never called the widget)
+
+         # # Create the datashader canvas and aggregate points
+         # cvs = ds.Canvas(plot_width=1000, plot_height=1000)
+         # agg = cvs.points(df, x=long_col, y=lat_col)
+
+         # # Get the coordinates for the mapbox layer
+         # coords_lat, coords_lon = agg.coords[lat_col].values, agg.coords[long_col].values
+         # coordinates = [
+         #     [coords_lon[0], coords_lat[0]],
+         #     [coords_lon[-1], coords_lat[0]],
+         #     [coords_lon[-1], coords_lat[-1]],
+         #     [coords_lon[0], coords_lat[-1]]
+         # ]
+
+         # # Generate the datashader image
+         # img = tf.shade(agg, cmap=fire)[::-1].to_pil()
+
+         # # Create the Plotly figure with a mapbox layer
+         # fig = px.scatter_mapbox(df[:1], lat=lat_col, lon=long_col, zoom=10)  # Adjust zoom level as needed
+         # fig.update_layout(mapbox_style="carto-darkmatter",
+         #                   mapbox_layers=[
+         #                       {
+         #                           "sourcetype": "image",
+         #                           "source": img,
+         #                           "coordinates": coordinates
+         #                       }
+         #                   ])
+
+         # # Display the figure in Streamlit
+         # st.plotly_chart(fig)
+
+         # Create a scatter mapbox plot
+         if st.button("Proceed to plot map"):
+             fig = px.scatter_mapbox(df, lat=lat_col, lon=long_col,
+                                     size_max=15,  # Max marker size
+                                     mapbox_style="open-street-map",  # open-street-map needs no access token
+                                     zoom=1,
+                                     title='Latitude and Longitude Plotting')
+
+             # fig.update_layout(mapbox_accesstoken='your_mapbox_access_token')  # only needed for Mapbox-hosted styles
+             st.write(fig)
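The skew/kurtosis flagging in eda.py boils down to two thresholds; a small standalone sketch on synthetic data (column names are illustrative):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({"a": rng.exponential(size=500), "b": rng.normal(size=500)})
stats = pd.DataFrame({"Skewness": df.skew(), "Kurtosis": df.kurtosis()})
# same thresholds as eda.py: |skewness| > 0.5 or |kurtosis| > 3 gets highlighted
stats["flagged"] = (stats["Skewness"].abs() > 0.5) | (stats["Kurtosis"].abs() > 3)
print(stats)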
feature_selections.py CHANGED
@@ -8,12 +8,10 @@ import pandas as pd
 import numpy as np
 import evaluationer
 import streamlit as st
-# import root_mean_squared_error
+
+
 from sklearn.metrics import root_mean_squared_error
 def feature_selection(X_train, X_test,y_train,y_test,model_reg,alpha = 0.05):
-
-    st.write("dvsdv",y_train)
-    st.write("dvfssdv",X_train)
 
     model = sm.OLS(y_train, sm.add_constant(X_train))
     model_fit = model.fit()
@@ -100,5 +98,7 @@ def feature_selection(X_train, X_test,y_train,y_test,model_reg,alpha = 0.05):
     feature_cols_name = ["pval_cols","coef_cols","pval_and_coef_cols","mi_cols","corr_u_cols","corr_l_cols","vif_cols","lasso_cols"]
     st.write("feature_cols", vif_cols)
     for i,j in enumerate(feature_cols):
-        evaluationer.evaluation(f"{feature_cols_name[i]} dropped" ,X_train.drop(columns = j),X_test.drop(columns = j),y_train,y_test,model_reg,method = root_mean_squared_error,eva = "reg")
-        return evaluationer.reg_evaluation_df
+        evaluationer.evaluation(f"{feature_cols_name[i]}" ,X_train.drop(columns = j),X_test.drop(columns = j),y_train,y_test,model_reg,method = root_mean_squared_error,eva = "reg")
+    return evaluationer.reg_evaluation_df,feature_cols,feature_cols_name
+
+
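A hypothetical caller for the new three-value return (the old version returned only the evaluation frame, and did so from inside the loop, so only the first strategy was ever evaluated):

import feature_selections

eval_df, feature_cols, feature_cols_name = feature_selections.feature_selection(
    X_train, X_test, y_train, y_test, model_reg)
for name, cols in zip(feature_cols_name, feature_cols):
    print(name, "drops", list(cols))  # which columns each selection strategy removed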
grid_search_cv.py ADDED
@@ -0,0 +1,284 @@
+ from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, Lasso, ElasticNet
+ from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
+ from sklearn.neighbors import KNeighborsRegressor
+ from sklearn.tree import DecisionTreeRegressor
+ from sklearn.svm import SVR
+ from xgboost import XGBRegressor, XGBRFRegressor
+ from sklearn.neural_network import MLPRegressor
+ from lightgbm import LGBMRegressor
+ from sklearn.naive_bayes import GaussianNB
+ from sklearn.model_selection import GridSearchCV
+ import streamlit as st
+ import evaluationer
+
+ from sklearn.metrics import root_mean_squared_error
+
+ from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
+ from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
+ from sklearn.neighbors import KNeighborsClassifier
+ from sklearn.tree import DecisionTreeClassifier
+ from sklearn.svm import SVC
+ from xgboost import XGBClassifier, XGBRFClassifier
+ from sklearn.neural_network import MLPClassifier
+ from lightgbm import LGBMClassifier
+ from sklearn.naive_bayes import MultinomialNB, CategoricalNB
+
+ param_grids_class = {
+     "Logistic Regression": {
+         'penalty': ['l1', 'l2', 'elasticnet', None],  # the string 'none' was removed in scikit-learn 1.4; use None
+         'C': [0.01, 0.1, 1, 10],
+         'solver': ['lbfgs', 'liblinear', 'saga']
+     },
+
+     "SGD Classifier": {
+         'loss': ['hinge', 'log_loss', 'modified_huber', 'squared_hinge'],  # 'log' was renamed to 'log_loss'
+         'penalty': ['l2', 'l1', 'elasticnet'],
+         'alpha': [0.0001, 0.001, 0.01],
+         'max_iter': [1000, 5000, 10000]
+     },
+
+     "Ridge Classifier": {
+         'alpha': [0.1, 1, 10, 100]
+     },
+
+     "Random Forest Classifier": {
+         'n_estimators': [100, 200, 300],
+         'max_depth': [None, 10, 20, 30],
+         'min_samples_split': [2, 5, 10],
+         'min_samples_leaf': [1, 2, 4]
+     },
+
+     "AdaBoost Classifier": {
+         'n_estimators': [50, 100, 200],
+         'learning_rate': [0.01, 0.1, 1]
+     },
+
+     "Gradient Boosting Classifier": {
+         'n_estimators': [100, 200, 300],
+         'learning_rate': [0.01, 0.1, 0.2],
+         'max_depth': [3, 5, 7]
+     },
+
+     "Hist Gradient Boosting Classifier": {
+         'learning_rate': [0.01, 0.1, 0.2],
+         'max_depth': [None, 10, 20],
+         'min_samples_leaf': [20, 50, 100]
+     },
+
+     "K Neighbors Classifier": {
+         'n_neighbors': [3, 5, 7],
+         'weights': ['uniform', 'distance'],
+         'metric': ['euclidean', 'manhattan']
+     },
+
+     "Decision Tree Classifier": {
+         'max_depth': [None, 10, 20, 30],
+         'min_samples_split': [2, 5, 10],
+         'min_samples_leaf': [1, 2, 4]
+     },
+
+     "SVC": {
+         'C': [0.1, 1, 10],
+         'kernel': ['linear', 'poly', 'rbf'],
+         'degree': [3, 4, 5],
+         'gamma': ['scale', 'auto']
+     },
+
+     "XGB Classifier": {
+         'n_estimators': [100, 200, 300],
+         'learning_rate': [0.01, 0.1, 0.2],
+         'max_depth': [3, 5, 7]
+     },
+
+     "XGBRF Classifier": {
+         'n_estimators': [100, 200, 300],
+         'learning_rate': [0.01, 0.1, 0.2],
+         'max_depth': [3, 5, 7]
+     },
+
+     "MLP Classifier": {
+         'hidden_layer_sizes': [(50,), (100,), (50, 50)],
+         'activation': ['tanh', 'relu'],
+         'solver': ['adam', 'sgd'],
+         'alpha': [0.0001, 0.001, 0.01],
+         'learning_rate': ['constant', 'adaptive']
+     },
+
+     "LGBM Classifier": {
+         'n_estimators': [100, 200, 300],
+         'learning_rate': [0.01, 0.1, 0.2],
+         'max_depth': [-1, 10, 20]
+     },
+
+     "Multinomial Naive Bayes": {
+         'alpha': [0.1, 0.5, 1.0]
+     },
+
+     "Categorical Naive Bayes": {
+         'alpha': [0.1, 0.5, 1.0]
+     }
+ }
+
+ param_grids_reg = {
+     "Linear Regression": {},
+
+     "SGD Regressor": {
+         'loss': ['squared_error', 'huber'],  # 'squared_loss' was renamed to 'squared_error'
+         'penalty': ['l2', 'l1', 'elasticnet'],
+         'alpha': [0.0001, 0.001, 0.01],
+         'max_iter': [1000, 5000, 10000]
+     },
+
+     "Ridge Regressor": {
+         'alpha': [0.1, 1, 10, 100],
+         'solver': ['auto', 'svd', 'cholesky', 'lsqr']
+     },
+
+     "Lasso Regressor": {
+         'alpha': [0.1, 1, 10, 100]
+     },
+
+     "ElasticNet Regressor": {
+         'alpha': [0.1, 1, 10, 100],
+         'l1_ratio': [0.1, 0.5, 0.9]
+     },
+
+     "Random Forest Regressor": {
+         'n_estimators': [100, 200, 300],
+         'max_depth': [None, 10, 20, 30],
+         'min_samples_split': [2, 5, 10],
+         'min_samples_leaf': [1, 2, 4]
+     },
+
+     "AdaBoost Regressor": {
+         'n_estimators': [50, 100, 200],
+         'learning_rate': [0.01, 0.1, 1]
+     },
+
+     "Gradient Boosting Regressor": {
+         'n_estimators': [100, 200, 300],
+         'learning_rate': [0.01, 0.1, 0.2],
+         'max_depth': [3, 5, 7]
+     },
+
+     "Hist Gradient Boosting Regressor": {
+         'learning_rate': [0.01, 0.1, 0.2],
+         'max_depth': [None, 10, 20],
+         'min_samples_leaf': [20, 50, 100]
+     },
+
+     "K Neighbors Regressor": {
+         'n_neighbors': [3, 5, 7],
+         'weights': ['uniform', 'distance'],
+         'metric': ['euclidean', 'manhattan']
+     },
+
+     "Decision Tree Regressor": {
+         'max_depth': [None, 10, 20, 30],
+         'min_samples_split': [2, 5, 10],
+         'min_samples_leaf': [1, 2, 4]
+     },
+
+     "SVR": {
+         'C': [0.1, 1, 10],
+         'kernel': ['linear', 'poly', 'rbf'],
+         'degree': [3, 4, 5],
+         'gamma': ['scale', 'auto']
+     },
+
+     "XGB Regressor": {
+         'n_estimators': [100, 200, 300],
+         'learning_rate': [0.01, 0.1, 0.2],
+         'max_depth': [3, 5, 7]
+     },
+
+     "XGBRF Regressor": {
+         'n_estimators': [100, 200, 300],
+         'learning_rate': [0.01, 0.1, 0.2],
+         'max_depth': [3, 5, 7]
+     },
+
+     "MLP Regressor": {
+         'hidden_layer_sizes': [(50,), (100,), (50, 50)],
+         'activation': ['tanh', 'relu'],
+         'solver': ['adam', 'sgd'],
+         'alpha': [0.0001, 0.001, 0.01],
+         'learning_rate': ['constant', 'adaptive']
+     },
+
+     "LGBM Regressor": {
+         'n_estimators': [100, 200, 300],
+         'learning_rate': [0.01, 0.1, 0.2],
+         'max_depth': [-1, 10, 20]
+     },
+
+     "Gaussian Naive Bayes": {
+         'var_smoothing': [1e-9, 1e-8, 1e-7]
+     }
+ }
+
+ # Define the regressors
+ regressors = {
+     "Linear Regression": LinearRegression(),
+     "SGD Regressor": SGDRegressor(),
+     "Ridge Regressor": Ridge(),
+     "Lasso Regressor": Lasso(),
+     "ElasticNet Regressor": ElasticNet(),
+     "Random Forest Regressor": RandomForestRegressor(),
+     "AdaBoost Regressor": AdaBoostRegressor(),
+     "Gradient Boosting Regressor": GradientBoostingRegressor(),
+     "Hist Gradient Boosting Regressor": HistGradientBoostingRegressor(),
+     "K Neighbors Regressor": KNeighborsRegressor(),
+     "Decision Tree Regressor": DecisionTreeRegressor(),
+     "SVR": SVR(),
+     "XGB Regressor": XGBRegressor(),
+     "XGBRF Regressor": XGBRFRegressor(),
+     "MLP Regressor": MLPRegressor(),
+     "LGBM Regressor": LGBMRegressor(),
+     "Gaussian Naive Bayes": GaussianNB()
+ }
+
+ classifiers = {
+     "Logistic Regression": LogisticRegression(),
+     "SGD Classifier": SGDClassifier(),
+     "Ridge Classifier": RidgeClassifier(),
+     "Random Forest Classifier": RandomForestClassifier(),
+     "AdaBoost Classifier": AdaBoostClassifier(),
+     "Gradient Boosting Classifier": GradientBoostingClassifier(),
+     "Hist Gradient Boosting Classifier": HistGradientBoostingClassifier(),
+     "K Neighbors Classifier": KNeighborsClassifier(),
+     "Decision Tree Classifier": DecisionTreeClassifier(),
+     "SVC": SVC(),
+     "XGB Classifier": XGBClassifier(),
+     "XGBRF Classifier": XGBRFClassifier(),
+     "MLP Classifier": MLPClassifier(),
+     "LGBM Classifier": LGBMClassifier(),
+     "Multinomial Naive Bayes": MultinomialNB(),
+     "Categorical Naive Bayes": CategoricalNB()
+ }
+
+ def perform_grid_search(model,model_name,X_train,X_test,y_train,y_test,eva):
+     if eva == "reg":
+         regressor = regressors[model_name]
+
+         param_grid_reg = param_grids_reg[model_name]
+
+         grid_search = GridSearchCV(estimator=regressor, param_grid=param_grid_reg, cv=5, scoring='neg_mean_squared_error')
+         grid_search.fit(X_train,y_train)
+         st.write(f"Best Parameters for {model_name}: {grid_search.best_params_}")
+         st.write(f"Best Score for {model_name}: {grid_search.best_score_}")
+         best_model = grid_search.best_estimator_
+         # evaluate the tuned estimator, not the untuned `model` that was passed in
+         evaluationer.evaluation("best hyperparams",X_train,X_test,y_train,y_test,best_model,root_mean_squared_error,eva)
+     elif eva == "class":
+         classifier = classifiers[model_name]
+         param_grid_class = param_grids_class[model_name]
+
+         grid_search = GridSearchCV(estimator=classifier, param_grid=param_grid_class, cv=5, scoring='accuracy')
+         grid_search.fit(X_train,y_train)
+         st.write(f"Best Parameters for {model_name}: {grid_search.best_params_}")
+         st.write(f"Best Score for {model_name}: {grid_search.best_score_}")
+         best_model = grid_search.best_estimator_
+         evaluationer.evaluation("best hyperparams",X_train,X_test,y_train,y_test,best_model,root_mean_squared_error,eva)
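A hypothetical regression call into this module; `model_name` must be a key of the `regressors` dict above, and `eva` picks the branch:

import grid_search_cv
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
grid_search_cv.perform_grid_search(model, "Random Forest Regressor",
                                   X_train, X_test, y_train, y_test, eva="reg")

Note that with cv=5 the search refits the estimator once per parameter combination and fold, so the larger grids (e.g. Random Forest: 3*4*3*3 = 108 combinations * 5 folds) can take a while on big data.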
models.py CHANGED
@@ -23,6 +23,8 @@ from sklearn.neural_network import MLPRegressor
 from lightgbm import LGBMRegressor
 from sklearn.naive_bayes import GaussianNB
 
+
+
 # dictionary where keys are name of algorithm and values are algorithm for classifier
 algos_class = {
     "Logistic Regression": LogisticRegression(),
requirements.txt CHANGED
@@ -1,10 +1,15 @@
+
 streamlit==1.34.0
 joblib==1.4.2
 numpy==1.26.4
 pandas==2.2.2
 scikit-learn==1.4.2
-seaborn==0.13.2
+datashader==0.16.2
+colorcet==3.1.0
+plotly==5.22.0
 matplotlib==3.9.0
-xgboost==2.0.3
-lightgbm==4.3.0
-statsmodels==0.14.2
+seaborn==0.13.2
+# xgboost, lightgbm and statsmodels are still imported by models.py, grid_search_cv.py and feature_selections.py
+xgboost==2.0.3
+lightgbm==4.3.0
+statsmodels==0.14.2
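With the pinned versions above, a fresh environment is set up with:

pip install -r requirements.txt

(xgboost, lightgbm and statsmodels are kept in the list because models.py, grid_search_cv.py and feature_selections.py still import them; dropping them, as the original commit did, would break those modules at import time.)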