Gaurav069 committed
Commit f736b41
Parent: 9d86a56

Upload 12 files

Files changed (4)
  1. app.py +65 -20
  2. auto_optimizer.py +25 -12
  3. feature_selections.py +58 -5
  4. requirements.txt +11 -14
app.py CHANGED
@@ -8,7 +8,7 @@ import evaluationer,models, null_value_handling
 import auto_optimizer
 from sklearn.experimental import enable_iterative_imputer
 from sklearn.impute import SimpleImputer, IterativeImputer
-import eda
+import eda,outliers
 # st.set_page_config(layout="wide")
 
 st.set_page_config(
@@ -86,6 +86,8 @@ html_code = """
 st.markdown(html_code, unsafe_allow_html=True)
 st.divider()
 
+
+
 st.markdown(
     """
     <style>
@@ -137,6 +139,37 @@ if (len(sep) ==0):
     sep = ","
 csv_upload2 = st.sidebar.file_uploader("Input CSV File of Test Data Prediction",type = ["csv"])
 
+if csv_upload is None:
+    st.title("LazyML")
+
+    st.header("Welcome to LazyML – your go-to app for effortless machine learning!")
+
+    st.subheader("Overview")
+    st.write("""
+    LazyML is designed to make machine learning accessible to everyone, regardless of their technical expertise. Whether you're a seasoned data scientist or a complete beginner, LazyML takes the complexity out of building and deploying machine learning models.
+    """)
+
+    st.subheader("Key Features")
+    st.write("""
+    - **Automated Model Building:** Automatically preprocess your data, select the best algorithms, and fine-tune models with minimal effort.
+    - **User-Friendly Interface:** Intuitive and easy-to-navigate interface that guides you through the entire machine learning workflow.
+    - **Data Visualization:** Comprehensive visualization tools to help you understand your data and model performance.
+    - **Customizable Pipelines:** Flexibility to customize data preprocessing, feature engineering, and model selection to suit your needs.
+    - **Performance Metrics:** Detailed performance metrics and comparison reports for informed decision-making.
+    - **Deployment Ready:** Easily deploy your models and start making predictions with just a few clicks.
+    """)
+
+    st.subheader("How It Works")
+    st.write("""
+    1. **Upload Your Data:** Start by uploading your dataset in CSV format.
+    2. **Data Preprocessing:** LazyML automatically cleans and preprocesses your data, handling missing values and scaling features as needed.
+    3. **Model Selection:** The app evaluates multiple algorithms and selects the best-performing ones for your specific data.
+    4. **Model Training:** Selected models are trained and fine-tuned using cross-validation to ensure robustness.
+    5. **Evaluation:** Get detailed reports on model performance with key metrics like accuracy, precision, recall, and F1 score.
+    6. **Deployment:** Once satisfied with the model, deploy it and start making real-time predictions.
+    """)
+
+
 test = pd.DataFrame()
 if csv_upload is not None:
     # read the uploaded file into dataframe
@@ -260,14 +293,15 @@ if csv_upload is not None:
         st.write("There are no duplicate values in Train")
     st.divider()
     # dropping not important columns
-    st.markdown('<div class="message-box success">Drop Unimportant Columns</div>', unsafe_allow_html=True)
-    if st.radio(" ",["Yes","No"],index = 1) == "Yes":
-        selected_drop_column = st.multiselect('Select columns to be dropped', X.columns)
-        X = X.drop(columns = selected_drop_column)
-        if len(test) >0:
-            test = test.drop(columns = selected_drop_column)
-        st.write("Un-Important column(s) Deleted ✅")
-        st.dataframe(X.head())
+    if len(X.columns) >1:
+        st.markdown('<div class="message-box success">Drop Unimportant Columns</div>', unsafe_allow_html=True)
+        if st.radio(" ",["Yes","No"],index = 1) == "Yes":
+            selected_drop_column = st.multiselect('Select columns to be dropped', X.columns)
+            X = X.drop(columns = selected_drop_column)
+            if len(test) >0:
+                test = test.drop(columns = selected_drop_column)
+            st.write("Un-Important column(s) Deleted ✅")
+            st.dataframe(X.head())
 
     st.divider()
     num_cols = X.select_dtypes(exclude = "O").columns
@@ -296,7 +330,7 @@ if csv_upload is not None:
         st.write("Select ML algorithm")
         class_model_name = st.selectbox("select model",models.Classification_models.index)
         class_model = models.Classification_models.loc[class_model_name].values[0]
-        auto_optimizer.Auto_optimizer(X,y,eva,class_model)
+        auto_optimizer.Auto_optimizer(X,y,eva,class_model,class_model_name)
 
 
     else:
@@ -349,7 +383,7 @@ if csv_upload is not None:
 
         dict_2= {}
         for nvh_method in null_value_handling.null_value_handling_method_cat_cols :
-            st.write("dsff",nvh_method)
+
 
             selected_nvh_num_cols = st.multiselect(f'method:- \"{nvh_method}\" for Numerical columns', cat_cols_nvh,)
             dict_2[nvh_method] = selected_nvh_num_cols
@@ -368,17 +402,22 @@ if csv_upload is not None:
             test[cat_cols] = SimpleImputer(strategy = "most_frequent").fit_transform(test[cat_cols])
 
 
-        null_value_handling.null_handling(X,clean_num_nvh_df,clean_num_nvh_df_cat)
-        st.write("X Data after Null value handling", X.head())
+        try:
+            null_value_handling.null_handling(X,clean_num_nvh_df,clean_num_nvh_df_cat)
+            st.write("X Data after Null value handling", X.head())
 
-        new_df = pd.concat([X,y[X.index]],axis = 1)
-
-        csv = new_df.to_csv(index = False)
-
-        st.markdown('<p class="success-message">Null Values Handled Successfully. ✅</p>', unsafe_allow_html=True)
-        if st.checkbox("Download Null Value Handled DataFrame as CSV File ? "):
-            st.download_button(label="Download Null Value Handled CSV File",data=csv,file_name='NVH_DataFrame.csv',mime='text/csv')
-        st.divider()
+            new_df = pd.concat([X,y[X.index]],axis = 1)
+
+            csv = new_df.to_csv(index = False)
+
+            st.markdown('<p class="success-message">Null Values Handled Successfully. ✅</p>', unsafe_allow_html=True)
+            if st.checkbox("Download Null Value Handled DataFrame as CSV File ? "):
+                st.download_button(label="Download Null Value Handled CSV File",data=csv,file_name='NVH_DataFrame.csv',mime='text/csv')
+            st.divider()
+        except:
+            st.markdown('<p class="unsuccess-message">⚠️⚠️⚠️ Categorical column null value not handled ⚠️⚠️⚠️</p>', unsafe_allow_html=True)
+
 
     ord_enc_cols = []
 
     if len(cat_cols) == 0:
@@ -448,6 +487,12 @@ if csv_upload is not None:
         st.download_button(label="Download Ordinal Encoded CSV File",data=csv,file_name='Encoded_DataFrame.csv',mime='text/csv')
 
     st.divider()
+    st.markdown('<div class="message-box success">Outlier Detection</div>', unsafe_allow_html=True)
+    st.write("")
+    if st.button("Click to check outliers"):
+        outlier,out_index = outliers.detect_outliers(new_df,num_cols)
+        st.write("outlier",outlier)
+    st.divider()
     st.markdown('<div class="message-box success">Modelling</div>', unsafe_allow_html=True)
     st.write("")
     st.markdown('<p class="success-message">Select Train Validation Split Method</p>', unsafe_allow_html=True)
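The new Outlier Detection panel calls `outliers.detect_outliers(new_df, num_cols)` and unpacks `(outlier, out_index)`, but `outliers.py` itself is not among the hunks shown in this commit. A minimal sketch of a compatible IQR-based helper, assuming only the names and return shape visible at the call site (a summary frame plus the outlier row indexes):

```python
import pandas as pd

def detect_outliers(df, num_cols, iqr_factor=1.5):
    # Sketch only -- not the actual outliers.py from this repo.
    # Flags values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] for each numeric column.
    rows, out_index = [], set()
    for col in num_cols:
        q1, q3 = df[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        mask = (df[col] < q1 - iqr_factor * iqr) | (df[col] > q3 + iqr_factor * iqr)
        rows.append({"columns name": col, "n_outliers": int(mask.sum())})
        out_index.update(df.index[mask])
    return pd.DataFrame(rows), sorted(out_index)
```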
auto_optimizer.py CHANGED
@@ -285,7 +285,7 @@ def Auto_optimizer(X,y,eva,model,model_name,test= None):
     st.write("Columns for Outliers handling",tuple(outliers_df_X["columns name"]))
 
     select_outlier_cols = st.multiselect("Select columns for Outlier Handling",tuple(outliers_df_X["columns name"]),default =tuple(outliers_df_X["columns name"]))
-    resultant,outlier_handled_df,outlier_handled_df_name= outliers.outlier_handling(X,y,model,outlier_indexes = outlier_indexes,outlier_cols = select_outlier_cols ,method = root_mean_squared_error,test_size = 0.2, random_state = 42,eva = "reg")
+    resultant,outlier_handled_df,outlier_handled_df_name= outliers.outlier_handling(X,y,model,outlier_indexes = outlier_indexes,outlier_cols = select_outlier_cols ,method = root_mean_squared_error,test_size = 0.2, random_state = 42,eva = eva)
     st.write("outlier handling with methods",resultant)
     st.write("Best method with outlier handling",resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])
     try :
@@ -302,17 +302,30 @@ def Auto_optimizer(X,y,eva,model,model_name,test= None):
 
 
 
-    try:
-        result_df_1 , feature_col, feature_col_name = feature_selections.feature_selection(X_train,X_test,y_train,y_test,model,alpha = 0.05)
-        X = X.drop(columns = feature_col[feature_col_name.index(result_df_1.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])])
-    except:
-        "evaluation by feature selection is not better than previous"
-
-    try:
-        result,X_train_b,X_test_b,y_train_b,y_test_b = best_tts.best_tts(X,y,model,eva)
-        st.write("result_df",result)
-    except:
-        X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size =0.2,random_state = 42)
+    if eva == "reg":
+        try:
+            result_df_1 , feature_col, feature_col_name = feature_selections.feature_selection(X_train,X_test,y_train,y_test,model,alpha = 0.05)
+            X = X.drop(columns = feature_col[feature_col_name.index(result_df_1.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])])
+        except:
+            "evaluation by feature selection is not better than previous"
+
+        try:
+            result,X_train_b,X_test_b,y_train_b,y_test_b = best_tts.best_tts(X,y,model,eva)
+            st.write("result_df",result)
+        except:
+            X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size =0.2,random_state = 42)
+    elif eva =="class":
+        try:
+            result_df_1 , feature_col, feature_col_name = feature_selections.clas_feature_selection(X_train,X_test,y_train,y_test,model)
+            X = X.drop(columns = feature_col[feature_col_name.index(result_df_1.sort_values(by = "test_acc").tail(1).iloc[:,0].values[0])])
+        except:
+            "evaluation by feature selection is not better than previous"
+
+        try:
+            result,X_train_b,X_test_b,y_train_b,y_test_b = best_tts.best_tts(X,y,model,eva)
+            st.write("result_df",result)
+        except:
+            X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size =0.2,random_state = 42)
 
 
 
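Both branches above pick the winning feature subset the same way: sort the evaluation DataFrame by its metric column (`test_r2` for regression, `test_acc` for classification), take the last row's first cell as the subset name, then map that name back onto the list of column sets. A toy, self-contained illustration of that lookup pattern (all values hypothetical):

```python
import pandas as pd

# Stand-in for result_df_1: first column names the subset, second holds the metric.
result_df_1 = pd.DataFrame({"name": ["rfe_cols", "lasso_cols"], "test_acc": [0.81, 0.86]})
feature_col_name = ["rfe_cols", "lasso_cols"]
feature_col = [["f1", "f2"], ["f3"]]

# Sort ascending, take the last (best) row, read its name from the first column.
best_name = result_df_1.sort_values(by="test_acc").tail(1).iloc[:, 0].values[0]
best_cols = feature_col[feature_col_name.index(best_name)]
print(best_name, best_cols)  # -> lasso_cols ['f3']
```

The bare `except:` clauses fall back to a plain 80/20 `train_test_split` whenever this lookup or `best_tts` raises.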
feature_selections.py CHANGED
@@ -8,8 +8,16 @@ import pandas as pd
 import numpy as np
 import evaluationer
 import streamlit as st
-
-
+from sklearn.feature_selection import RFE,RFECV
+from sklearn.linear_model import Lasso
+from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from sklearn.linear_model import LogisticRegression
+from sklearn.feature_selection import RFE, RFECV, SelectKBest, chi2, mutual_info_classif
+from sklearn.model_selection import StratifiedKFold
+from sklearn.metrics import f1_score
 from sklearn.metrics import root_mean_squared_error
 def feature_selection(X_train, X_test,y_train,y_test,model_reg,alpha = 0.05):
 
@@ -40,10 +48,10 @@ def feature_selection(X_train, X_test,y_train,y_test,model_reg,alpha = 0.05):
     vif = pd.DataFrame()
     vif["variables"] = X_new_vif.columns
     vif["VIF"] = [variance_inflation_factor(X_new_vif.values, i) for i in range(X_new_vif.shape[1])]
-    st.write("gdfgdsdsdfad",vif)
+    # st.write("gdfgdsdsdfad",vif)
     if len(vif[vif["variables"] == "const"]) == 1:
         vif = vif.drop(index = (vif[vif["variables"] == "const"].index[0]))
-    st.write("gdfgdsad",vif)
+    # st.write("gdfgdsad",vif)
     # drop const in vif cols
     # vif_cols = X_new_vif.drop(columns = "const")
     vif_cols = vif[vif.VIF >10].variables.tolist()
@@ -101,4 +109,49 @@ def feature_selection(X_train, X_test,y_train,y_test,model_reg,alpha = 0.05):
         evaluationer.evaluation(f"{feature_cols_name[i]}" ,X_train.drop(columns = j),X_test.drop(columns = j),y_train,y_test,model_reg,method = root_mean_squared_error,eva = "reg")
     return evaluationer.reg_evaluation_df,feature_cols,feature_cols_name
 
-
+def clas_feature_selection(X_train, X_test,y_train,y_test,model,n_features_to_select = None, step=1,importance_getter='auto',refcv_graph= False,C=0.05,k = 10):
+    global rfe_cols,rfecv_cols,lasso_cols,chi2_imp_col,mi_imp_col
+    rfe = RFE(estimator= model,n_features_to_select = n_features_to_select,importance_getter=importance_getter, step=1)
+    rfe.fit(X_train,y_train)
+    rfe_cols = X_train.columns[rfe.support_]
+    cv = StratifiedKFold(5)
+    rfecv = RFECV(estimator=model,
+                  step=1,
+                  cv=cv,
+                  scoring="f1",
+                  min_features_to_select=1,
+                  n_jobs=-1)
+    rfecv.fit(X_train,y_train)
+    rfecv_cols = X_train.columns[rfecv.support_]
+    if refcv_graph == True:
+        n_scores = len(rfecv.cv_results_["mean_test_score"])
+        plt.figure()
+        plt.xlabel("Number of features selected")
+        plt.ylabel("Mean test f1")
+        plt.errorbar(range(rfecv.min_features_to_select, n_scores + rfecv.min_features_to_select),
+                     rfecv.cv_results_["mean_test_score"],
+                     yerr=rfecv.cv_results_["std_test_score"],
+                     )
+        plt.grid(True)
+        plt.title("Recursive Feature Elimination \nwith correlated features")
+        plt.show()
+    clf = LogisticRegression(penalty = "l1", C = C,
+                             random_state = 42,
+                             solver = "liblinear")
+    clf.fit(X_train, y_train)
+    lasso_cols = clf.feature_names_in_[clf.coef_[0] != 0]
+
+    sk = SelectKBest(chi2, k=k)
+    X_chi2 = sk.fit_transform(X_train, y_train)
+    chi2_imp_col = X_train.columns[sk.get_support()]
+    sk = SelectKBest(mutual_info_classif, k=k)
+    X_mutual = sk.fit_transform(X_train, y_train)
+    mi_imp_col = X_train.columns[sk.get_support()]
+
+    feature_cols = [rfe_cols,rfecv_cols,lasso_cols,chi2_imp_col,mi_imp_col]
+    feature_cols_name = ["rfe_cols","rfecv_cols","lasso_cols","chi2_imp_col","mi_imp_col"]
+
+    for i,j in enumerate(feature_cols):
+        # evaluationer.evaluation(f"{feature_cols_name[i]} " ,X_train[j],X_test[j],y_train,y_test,model = model,eva = "class")
+        evaluationer.evaluation(f"{feature_cols_name[i]}" ,X_train[j],X_test[j],y_train,y_test,model,method = root_mean_squared_error,eva = "class")
+    return evaluationer.classification_evaluation_df , feature_cols, feature_cols_name
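The new `clas_feature_selection` gathers five candidate subsets (RFE, RFECV, L1-penalised logistic regression, chi-squared, mutual information) and scores each one through `evaluationer.evaluation`. A self-contained sketch of the same selector-ensemble idea on synthetic data, with the repo's `evaluationer` dependency left out (dataset, column names, and parameter values are illustrative):

```python
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE, SelectKBest, chi2, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

X, y = make_classification(n_samples=300, n_features=12, n_informative=4, random_state=42)
# chi2 requires non-negative features, so scale everything to [0, 1] first.
X = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=[f"f{i}" for i in range(12)])

model = LogisticRegression(max_iter=1000)

# Recursive feature elimination keeps the 4 strongest coefficients.
rfe_cols = X.columns[RFE(model, n_features_to_select=4).fit(X, y).support_]

# L1 logistic regression zeroes out weak coefficients; keep the survivors.
l1 = LogisticRegression(penalty="l1", C=0.05, solver="liblinear", random_state=42).fit(X, y)
lasso_cols = X.columns[l1.coef_[0] != 0]

# Univariate filters: chi-squared and mutual information.
chi2_cols = X.columns[SelectKBest(chi2, k=4).fit(X, y).get_support()]
mi_cols = X.columns[SelectKBest(mutual_info_classif, k=4).fit(X, y).get_support()]

for name, cols in [("rfe", rfe_cols), ("l1", lasso_cols), ("chi2", chi2_cols), ("mi", mi_cols)]:
    print(name, list(cols))  # compare the candidate subsets before scoring them
```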
requirements.txt CHANGED
@@ -1,14 +1,11 @@
-
-streamlit==1.34.0
-joblib==1.4.2
-numpy==1.26.4
-pandas==2.2.2
-scikit-learn==1.4.2
-datashader==0.16.2
-colorcet==3.1.0
-plotly==5.22.0
-matplotlib==3.9.0
-seaborn==0.13.2
-xgboost==2.0.3
-lightgbm==4.3.0
-statsmodels==0.14.2
+
+streamlit==1.34.0
+joblib==1.4.2
+numpy==1.26.4
+pandas==2.2.2
+scikit-learn==1.4.2
+datashader==0.16.2
+colorcet==3.1.0
+plotly==5.22.0
+matplotlib==3.9.0
+seaborn==0.13.2