Gaurav069 committed on
Commit ba67510
1 Parent(s): dc62f39

Upload 12 files

Files changed (9)
  1. .streamlit/config.toml +7 -0
  2. app.py +166 -53
  3. auto_optimizer.py +361 -317
  4. best_tts.py +2 -2
  5. eda.py +325 -0
  6. feature_selections.py +6 -6
  7. grid_search_cv.py +284 -0
  8. models.py +2 -0
  9. requirements.txt +5 -4
.streamlit/config.toml ADDED
@@ -0,0 +1,7 @@
+
+ [theme]
+ primaryColor="#F63366"
+ backgroundColor="#002148"
+ secondaryBackgroundColor="#576c86"
+ textColor="white"
+ font="serif"
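Streamlit reads .streamlit/config.toml automatically at startup, so the theme above takes effect without any code change in app.py. As a quick sanity check (a hypothetical snippet, not part of this commit), the active values can be read back at runtime:

    import streamlit as st

    # Streamlit loads .streamlit/config.toml on startup; get_option exposes the values.
    primary = st.get_option("theme.primaryColor")  # "#F63366" with the config above
    st.write("Active primary color:", primary)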
app.py CHANGED
@@ -8,6 +8,7 @@ import evaluationer,models, null_value_handling
  import auto_optimizer
  from sklearn.experimental import enable_iterative_imputer
  from sklearn.impute import SimpleImputer, IterativeImputer
+ import eda
  # st.set_page_config(layout="wide")

  st.set_page_config(
@@ -21,7 +22,23 @@ st.set_page_config(
  }
  )

- import streamlit as st
+
+
+ # Set the background image
+ background_image = """
+ <style>
+ [data-testid="stAppViewContainer"] > .main {
+ background-image: url("https://w.wallhaven.cc/full/jx/wallhaven-jx7w25.png");
+ background-size: 100vw 100vh; # This sets the size to cover 100% of the viewport width and height
+ background-position: center;
+ background-repeat: no-repeat;
+ }
+ </style>
+ """
+
+ st.markdown(background_image, unsafe_allow_html=True)
+
+

  # Title with Rainbow Transition Effect and Neon Glow
  html_code = """
@@ -67,23 +84,74 @@ html_code = """
  """

  st.markdown(html_code, unsafe_allow_html=True)
+ st.divider()
+
+ st.markdown(
+ """
+ <style>
+ .success-message {
+ font-family: Arial, sans-serif;
+ font-size: 24px;
+ color: green;
+ text-align: left;
+ }
+ .unsuccess-message {
+ font-family: Arial, sans-serif;
+ font-size: 24px;
+ color: red;
+ text-align: left;
+ }
+ .prompt-message {
+ font-family: Arial, sans-serif;
+ font-size: 24px;
+ color: #333;
+ text-align: center;
+ }
+ .success-message2 {
+ font-family: Arial, sans-serif;
+ font-size: 18px;
+ color: white;
+ text-align: left;
+ }
+ .message-box {
+ text-align: center;
+ background-color: white;
+ padding: 5px;
+ border-radius: 10px;
+ box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
+ font-size: 24px;
+ color: #333;
+ }
+ </style>
+ """,
+ unsafe_allow_html=True
+ )


+ # st.markdown('<p class="success-message">Train File uploaded successfully. ✅</p>', unsafe_allow_html=True)
  # file uploader
  csv_upload = st.sidebar.file_uploader("Input CSV File for ML modelling", type=['csv'])
+
+ sep = st.sidebar.text_input("Input Seperator")
+ if (len(sep) ==0):
+ sep = ","
  csv_upload2 = st.sidebar.file_uploader("Input CSV File of Test Data Prediction",type = ["csv"])
+
  test = pd.DataFrame()
  if csv_upload is not None:
  # read the uploaded file into dataframe
- df = pd.read_csv(csv_upload)
+ df = pd.read_csv(csv_upload,sep = sep)

  # saving the dataframe to a CSV file
  df.to_csv('csv_upload.csv', index=False)
- st.write("Train File uploaded successfully. ✅")
-
+ st.markdown('<p class="success-message">Train File uploaded successfully. ✅</p>', unsafe_allow_html=True)
+
  if csv_upload2 is not None:
- test = pd.read_csv(csv_upload2)
- id_col = st.selectbox("select column for submission i.e, ID",test.columns)
+ test = pd.read_csv(csv_upload2,sep = sep)
+ st.markdown('<p class="success-message">Test File uploaded successfully. ✅</p>', unsafe_allow_html=True)
+ st.divider()
+ id_col = st.selectbox("Select Column for Submission i.e, ID",test.columns)
+ st.divider()
  submission_id = test[id_col]
  # st.write("Train File upl",submission_id)

@@ -93,8 +161,10 @@ if csv_upload is not None:
  if len(test) >0:
  # saving the test dataframe to a CSV file
  test.to_csv('csv_upload_test.csv', index=False)
- st.write("Test File uploaded successfully. ✅")
+

+ st.markdown('<p class="message-box">Display Data</p>', unsafe_allow_html=True)
+ st.write("")
  display_train_data = st.radio("Display Train Data",["Yes","No"],index = 1)
  if display_train_data == "Yes":
  st.dataframe(df.head())
@@ -104,29 +174,40 @@ if csv_upload is not None:
  if display_test_data == "Yes":
  st.dataframe(test.head())

-
- if st.radio("Select Supervision Category",["Supervised","Un-Supervised"],index =0) == "Supervised":
-
- selected_column = st.selectbox('Select Target column', df.columns, index=(len(df.columns)-1))
+ st.divider()
+ st.markdown('<div class="message-box success">Select Supervision Category</div>', unsafe_allow_html=True)
+ if st.radio("",["Supervised","Un-Supervised"],index =0) == "Supervised":
+ st.divider()
+
+ st.write('<p class="success-message2">Select Target column</p>', unsafe_allow_html=True)
+ selected_column = st.selectbox('', df.columns, index=(len(df.columns)-1))

  # Display the selected column
  st.write('You selected:', selected_column)
-
+ st.divider()
+
+ st.markdown('<div class="message-box success ">Perform EDA</div>', unsafe_allow_html=True)
+ st.write("")
+ if st.checkbox("Proceed to perform EDA"):
+ eda.eda_analysis(df)
+ st.write('<p class="success-message">EDA Performed proceed for Pre-processing</p>', unsafe_allow_html=True)
+ st.divider()
  y = df[selected_column]

  if y.dtype == "O":
- st.write("⚠️⚠️⚠️ Target Column is Object Type ⚠️⚠️⚠️")
- if st.radio("Proceed for Label Encoding ",["Yes","No"],index = 1) == "Yes":
+ st.markdown('<p class="unsuccess-message">⚠️⚠️⚠️ Target Column is Object Type ⚠️⚠️⚠️</p>', unsafe_allow_html=True)
+
+ if st.checkbox("Proceed for Label Encoding "):
  from sklearn.preprocessing import LabelEncoder
  le = LabelEncoder()
  y= pd.Series(le.fit_transform(y))
- st.write("Label Encoding Completed ✅")
-
- if st.radio("Display Target Column",["Yes","No"],index =1) == "Yes":
+ st.markdown('<p class="success-message">Label Encoding Completed ✅</p>', unsafe_allow_html=True)
+ if st.checkbox("Display Target Column"):
  st.dataframe(y.head())

-
- select_target_trans = st.radio("Target column Transformation",["Yes","No"],index = 1)
+ st.divider()
+ st.markdown('<div class="message-box success">Target column Transformation</div>', unsafe_allow_html=True)
+ select_target_trans = st.radio("",["Yes","No"],index = 1)
  if select_target_trans == "Yes":
  selected_transformation = st.selectbox("Select Transformation method",["Log Transformation","Power Transformation"])
  if selected_transformation == "Log Transformation":
@@ -155,36 +236,52 @@ if csv_upload is not None:

  if st.radio("Display Target Column after Transformation",["Yes","No"],index =1) == "Yes":
  st.dataframe(y.head())
- # inverse of transformation
+
+

  X = df.drop(columns = selected_column)

  if st.radio("Display X-Train Data",["Yes","No"],index =1) == "Yes":
  st.dataframe(X.head())
- if st.radio("Check for duplicate Values",["Yes","No"],index = 1) == "Yes":
+ st.divider()
+
+ # st.checkbox()
+ st.markdown('<div class="message-box success">Check for duplicate Values</div>', unsafe_allow_html=True)
+ if st.radio(" ",["Yes","No"],index = 1) == "Yes":
  len_duplicates = len(X[X.duplicated()])
  if len_duplicates >0:
  st.write(f"There are {len_duplicates} duplicate values in Train")
+ if st.checkbox("Show Duplicate values"):
+ st.dataframe(X[X.duplicated()])
  if st.selectbox("Drop Duplicate values",["Yes","No"],index = 1) == "Yes":
  X = X.drop_duplicates()
  st.write("Duplicate values removed ✅")
  else:
  st.write("There are no duplicate values in Train")
+ st.divider()
  # dropping not important columns
- if st.radio("Drop Un-Important Column(s)",["Yes","No"],index = 1) == "Yes":
+ st.markdown('<div class="message-box success">Drop Unimportant Columns</div>', unsafe_allow_html=True)
+ if st.radio(" ",["Yes","No"],index = 1) == "Yes":
  selected_drop_column = st.multiselect('Select columns to be dropped', X.columns)
  X = X.drop(columns = selected_drop_column)
  if len(test) >0:
  test = test.drop(columns = selected_drop_column)
- st.write("Un-Important column(s) Delected ✅")
+ st.write("Un-Important column(s) Deleted ✅")
  st.dataframe(X.head())

+ st.divider()
  num_cols = X.select_dtypes(exclude = "O").columns
  cat_cols = X.select_dtypes(include = "O").columns
  st.write("Numerical Columns in Train Data: ", tuple(num_cols))
  st.write("Categorical Columns in Train Data: ", tuple(cat_cols))
-
- if st.radio("Select method for ML modelling", ["Manual","Auto Optimized"],index = 0) == "Auto Optimized":
+ if st.sidebar.button("Clear Evaluation DataFrame"):
+ evaluationer.reg_evaluation_df = evaluationer.reg_evaluation_df.drop(index =evaluationer.reg_evaluation_df.index)
+ evaluationer.classification_evaluation_df = evaluationer.classification_evaluation_df.drop(index =evaluationer.reg_evaluation_df.index)
+ st.divider()
+ # markdown
+ st.markdown('<div class="message-box success">Select method for ML modelling</div>', unsafe_allow_html = True)
+ if st.radio(" ", ["Manual","Auto Optimized"],index = 0) == "Auto Optimized":
+ st.divider()
  ml_cat_ao = st.radio("Select Machine Learning Category",["Regression","Classification"],index =0)

  if ml_cat_ao =="Regression":
@@ -192,7 +289,7 @@ if csv_upload is not None:
  st.write("Select ML algorithm")
  reg_model_name = st.selectbox("select model",models.Regression_models.index)
  reg_model = models.Regression_models.loc[reg_model_name].values[0]
- auto_optimizer.Auto_optimizer(X,y,eva,reg_model)
+ auto_optimizer.Auto_optimizer(X,y,eva,reg_model,reg_model_name)

  elif ml_cat_ao =="Classification":
  eva = "class"
@@ -201,10 +298,12 @@ if csv_upload is not None:
  class_model = models.Classification_models.loc[class_model_name].values[0]
  auto_optimizer.Auto_optimizer(X,y,eva,class_model)

-
+
  else:
+ st.divider()
  if X.isnull().sum().sum() >0 :
- st.write("⚠️⚠️⚠️ There are missing values in Train Data ⚠️⚠️⚠️")
+
+ st.markdown('<p class="unsuccess-message">⚠️⚠️⚠️ There are missing values in Train Data ⚠️⚠️⚠️</p>', unsafe_allow_html=True)

  if st.selectbox("Drop null values or Impute",["Drop Null Values","Impute Null Values"],index = 1) == "Drop Null Values":

@@ -241,7 +340,9 @@ if csv_upload is not None:


  clean_num_nvh_df_cat = pd.DataFrame()
+
  if X[cat_cols].isnull().sum().sum() >0:
+ st.divider()
  st.write("Categorical Columns with Percentage of Null Values: ")
  cat_cols_nvh = X[cat_cols].isnull().sum()[X[cat_cols].isnull().sum()>0].index
  st.dataframe(round(X[cat_cols].isnull().sum()[X[cat_cols].isnull().sum()>0]/len(X)*100,2))
@@ -270,33 +371,41 @@ if csv_upload is not None:
  null_value_handling.null_handling(X,clean_num_nvh_df,clean_num_nvh_df_cat)
  st.write("X Data after Null value handling", X.head())

- new_df = pd.concat([X,y[X.index]],axis = 1)
-
- csv = new_df.to_csv(index = False)
- if st.radio("Download Null Value Handled DataFrame as CSV File ? ",["Yes","No"],index = 1) == "Yes":
- st.download_button(label="Download Null Value Handled CSV File",data=csv,file_name='NVH_DataFrame.csv',mime='text/csv')
-
+ new_df = pd.concat([X,y[X.index]],axis = 1)
+
+ csv = new_df.to_csv(index = False)
+
+ st.markdown('<p class="success-message">Null Values Handled Successfully. ✅</p>', unsafe_allow_html=True)
+ if st.checkbox("Download Null Value Handled DataFrame as CSV File ? "):
+ st.download_button(label="Download Null Value Handled CSV File",data=csv,file_name='NVH_DataFrame.csv',mime='text/csv')
+ st.divider()
  ord_enc_cols = []

  if len(cat_cols) == 0:
  st.write("No Categorical Columns in Train")
  else:
- st.write("Select Columns for Ordinal Encoding")
+ st.markdown('<div class="message-box success">Features Encoding</div>', unsafe_allow_html=True)
+ st.markdown('<p class="unsuccess-message">There are Object type Features in Train Data ⚠️</p>', unsafe_allow_html=True)
+ st.markdown('<p class="success-message2">Select Columns for Ordinal Encoding</p>', unsafe_allow_html=True)
+
  for column in cat_cols:

  selected = st.checkbox(column)
  if selected:
  st.write(f"No. of Unique value in {column} column are", X[column].nunique())
  ord_enc_cols.append(column)
+ st.divider()
  ohe_enc_cols = set(cat_cols) -set(ord_enc_cols)
  ohe_enc_cols = list(ohe_enc_cols)
  if len(ord_enc_cols)>0:
  st.write("ordinal encoded columns" ,tuple(ord_enc_cols))
  if len(ohe_enc_cols)>0:
  st.write("one hot encoded columns" ,tuple(ohe_enc_cols))
-
+ st.divider()
+ st.markdown('<div class="message-box success">Proceed for Encoding</div>', unsafe_allow_html=True)
  if len(ord_enc_cols)>0:
- if st.radio("proceed for ordinal encoding",["Yes","No"],index = 1) == "Yes":
+
+ if st.checkbox("Proceed for Ordinal Encoding"):
  ordinal_order_vals = []

  for column in ord_enc_cols:
@@ -317,7 +426,7 @@ if csv_upload is not None:
  st.write("Ordinal Encoding Completed ✅")

  if len(ohe_enc_cols)>0:
- if st.radio("proceed for OnehotEncoding ",["Yes","No"],index = 1) == "Yes": # import one hot encoder
+ if st.checkbox("Proceed for OneHotEncoding "): # import one hot encoder
  from sklearn.preprocessing import OneHotEncoder
  ohe = OneHotEncoder(sparse_output = False,handle_unknown = "ignore")
  pd.options.mode.chained_assignment = None
@@ -331,39 +440,43 @@ if csv_upload is not None:

  st.write("DataFrame after One Hot Encoding",X.head())
  st.write("OneHot Encoding Completed ✅")
-
+ st.divider()
  new_df = pd.concat([X,y],axis = 1)

  csv = new_df.to_csv(index = False)
- if st.radio("Download Encoded DataFrame as CSV File ? ",["Yes","No"],index = 1) == "Yes":
+ if st.checkbox("Download Encoded DataFrame as CSV File ? "):
  st.download_button(label="Download Ordinal Encoded CSV File",data=csv,file_name='Encoded_DataFrame.csv',mime='text/csv')

-
- random_state = st.number_input("Enter Random_state",max_value=100,min_value=1,value=42)
- test_size = st.number_input("Enter test_size",max_value=0.99, min_value = 0.01,value =0.2)
- if st.radio("select Train Validation Split Method",
- [f"Train_Test_split, Default (Random_state = {random_state},Test_size = {test_size})",
- "KFoldCV, Default (CV = 5)"], index = 0)== f"Train_Test_split, Default (Random_state = {random_state},Test_size = {test_size})":
+ st.divider()
+ st.markdown('<div class="message-box success">Modelling</div>', unsafe_allow_html=True)
+ st.write("")
+ st.markdown('<p class="success-message">Select Train Validation Split Method</p>', unsafe_allow_html=True)
+ if st.radio("",["Train_Test_split","KFoldCV, Default (CV = 5)"], index = 0)== "Train_Test_split":
  ttsmethod = "Train_Test_split"
  else:
  ttsmethod = "KFoldCV"
  st.write('You selected:', ttsmethod)
  if ttsmethod == "Train_Test_split":
+ random_state = st.number_input("Enter Random_state",max_value=100,min_value=1,value=42)
+ test_size = st.number_input("Enter test_size",max_value=0.99, min_value = 0.01,value =0.2)
  X_train,X_Val,y_train,y_val = tts(X,y[X.index],random_state = random_state,test_size = test_size)
- st.write('X-Training Data shape:', (X_train.info()))

  st.write('X-Training Data shape:', X_train.shape)
  st.write('X-Validation Data shape:', X_Val.shape)
-
- ml_cat = st.radio("Select Machine Learning Category",["Regression","Classification"],index =0)
-
+ st.divider()
+ st.markdown('<p class="success-message2">Select Machine Learning Category</p>', unsafe_allow_html=True)
+ ml_cat = st.radio("___",options=["Regression","Classification"],index =0)
+ st.divider()
  if ml_cat =="Regression":
- method_name_selector = st.selectbox("Select Error Evaluation Method",evaluationer.method_df.index,index = 0)
+ st.markdown('<p class="success-message2">Select Error Evaluation Method</p>', unsafe_allow_html=True)
+ method_name_selector = st.selectbox(" ",evaluationer.method_df.index,index = 0)
+
+ st.divider()

  method = evaluationer.method_df.loc[method_name_selector].values[0]
  reg_algorithm = []
  selected_options = []
-
+ st.markdown('<div class="message-box success">Select ML Model(s)</div>', unsafe_allow_html=True)
  for option in models.Regression_models.index:
  selected = st.checkbox(option)
  if selected:
@@ -450,7 +563,7 @@ if csv_upload is not None:

  cla_algorithm = []
  selected_options = []
-
+ st.markdown('<div class="message-box success">Select ML Model(s)</div>', unsafe_allow_html=True)
  for option in models.Classification_models.index:
  selected = st.checkbox(option)
  if selected:
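A note on the new separator handling in app.py: the sidebar text input falls back to a comma when left empty, and the same sep is reused for both the train and test uploads, so mismatched delimiters between the two files would still fail. A minimal standalone sketch of the pattern (names taken from the diff; the .strip() guard against whitespace-only input is an added assumption):

    import pandas as pd
    import streamlit as st

    csv_upload = st.sidebar.file_uploader("Input CSV File for ML modelling", type=['csv'])
    sep = st.sidebar.text_input("Input Seperator")  # label spelled as in the commit
    if len(sep.strip()) == 0:
        sep = ","  # default to the standard CSV separator
    if csv_upload is not None:
        df = pd.read_csv(csv_upload, sep=sep)  # same separator reused for the test upload
        st.write("Parsed shape:", df.shape)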
auto_optimizer.py CHANGED
@@ -1,317 +1,361 @@
- import pandas as pd
- import numpy as np
- import streamlit as st
- from sklearn.impute import KNNImputer,SimpleImputer,IterativeImputer
- import best_tts, evaluationer,models
- from sklearn.experimental import enable_iterative_imputer
- from sklearn.model_selection import train_test_split as tts
- from collections import Counter
- #root_mean_squared_error
- from sklearn.metrics import root_mean_squared_error
- import seaborn as sns
- import matplotlib.pyplot as plt
- import outliers,best_tts
- import feature_selections
- def Auto_optimizer(X,y,eva,model,test= None):
- evaluationer.reg_evaluation_df =evaluationer.reg_evaluation_df.drop(index =evaluationer.reg_evaluation_df.index)
- num_cols = X.select_dtypes(exclude = "O").columns
- cat_cols = X.select_dtypes(include = "O").columns
- st.write("Num_cols",tuple(num_cols))
- st.write("cat_cols",tuple(cat_cols))
-
- # check for Duplicate and drop duplicated in X
-
- if len(X.isnull().sum()[(X.isnull().sum()/len(X)*100) >40]) >0:
- X = X.drop(columns = X.isnull().sum()[(X.isnull().sum()/len(X)*100) >40].index)
- st.write("Columns with more than 40% null values removed")
- # st.write("csx",X)
-
- len_null = X.isnull().sum().sum()
-
- st.write(f"There are {len_null} null values in Train")
-
- knn_imputed_num_X = X.copy()
- si_mean_imputed_num_X = X.copy()
- # st.write("sf",si_mean_imputed_num_X)
- si_median_imputed_num_X = X.copy()
- si_most_frequent_imputed_num_X = X.copy()
- iter_imputed_num_X = X.copy()
- knn_imputed_X_cat_dropped = knn_imputed_num_X.copy()
- si_mean_imputed_X_cat_dropped = si_mean_imputed_num_X.copy()
- si_median_imputed_X_cat_dropped = si_median_imputed_num_X.copy()
- si_most_frequent_imputed_X_cat_dropped = si_most_frequent_imputed_num_X.copy()
- iter_imputed_X_cat_dropped = iter_imputed_num_X.copy()
- if len_null >0:
-
- if X[num_cols].isnull().sum().sum() >0:
-
- knn_imputer = KNNImputer(n_neighbors = 5)
- knn_imputed_num_X[num_cols] = knn_imputer.fit_transform(knn_imputed_num_X[num_cols])
- si_imputer = SimpleImputer(strategy = "mean")
- si_mean_imputed_num_X[num_cols] = si_imputer.fit_transform(si_mean_imputed_num_X[num_cols])
- si_imputer = SimpleImputer(strategy = "median")
- si_median_imputed_num_X[num_cols] = si_imputer.fit_transform(si_median_imputed_num_X[num_cols])
- si_imputer = SimpleImputer(strategy = "most_frequent")
- si_most_frequent_imputed_num_X[num_cols] = si_imputer.fit_transform(si_most_frequent_imputed_num_X[num_cols])
- iter_imputer = IterativeImputer(max_iter = 200,random_state= 42)
- iter_imputed_num_X[num_cols] = iter_imputer.fit_transform(iter_imputed_num_X[num_cols])
- knn_imputed_X_cat_dropped = knn_imputed_num_X.copy()
- si_mean_imputed_X_cat_dropped = si_mean_imputed_num_X.copy()
- si_median_imputed_X_cat_dropped = si_median_imputed_num_X.copy()
- si_most_frequent_imputed_X_cat_dropped = si_most_frequent_imputed_num_X.copy()
- iter_imputed_X_cat_dropped = iter_imputed_num_X.copy()
-
- if X[cat_cols].isnull().sum().sum() >0:
- # treating missing values in categorical columns
- # st.write("si_mean_imputed_num_X",si_mean_imputed_num_X)
- si_imputer = SimpleImputer(strategy = "most_frequent")
-
- knn_imputed_num_X[cat_cols] = si_imputer.fit_transform(knn_imputed_num_X[cat_cols])
- si_imputer = SimpleImputer(strategy = "most_frequent")
- si_mean_imputed_num_X.loc[:,cat_cols] = si_imputer.fit_transform(si_mean_imputed_num_X.loc[:,cat_cols])
- # st.write("si_mean_imputed_num_X",si_mean_imputed_num_X)
- si_median_imputed_num_X[cat_cols] = si_imputer.fit_transform(si_median_imputed_num_X[cat_cols])
- si_most_frequent_imputed_num_X[cat_cols] = si_imputer.fit_transform(si_most_frequent_imputed_num_X[cat_cols])
- iter_imputed_num_X[cat_cols] = si_imputer.fit_transform(iter_imputed_num_X[cat_cols])
-
- knn_imputed_X_cat_dropped = knn_imputed_X_cat_dropped.dropna()
- si_mean_imputed_X_cat_dropped =si_mean_imputed_X_cat_dropped.dropna()
- si_median_imputed_X_cat_dropped =si_median_imputed_X_cat_dropped.dropna()
- si_most_frequent_imputed_X_cat_dropped =si_most_frequent_imputed_X_cat_dropped.dropna()
- iter_imputed_X_cat_dropped =iter_imputed_X_cat_dropped.dropna()
- st.write("sdds",knn_imputed_num_X)
- st.write("sddssd",knn_imputed_X_cat_dropped)
-
- miss_val_dropped_X = X.dropna()
-
- # list of dataframes
-
- list_X_after_missing_values= [knn_imputed_num_X,
- si_mean_imputed_num_X,
- si_median_imputed_num_X,
- si_most_frequent_imputed_num_X,
- iter_imputed_num_X,
- knn_imputed_X_cat_dropped,
- si_mean_imputed_X_cat_dropped,
- si_median_imputed_X_cat_dropped,
- si_most_frequent_imputed_X_cat_dropped,
- iter_imputed_X_cat_dropped,
- miss_val_dropped_X]
- list_X_after_missing_values_names= ["knn_imputed_num_X",
- "si_mean_imputed_num_X",
- "si_median_imputed_num_X",
- "si_most_frequent_imputed_num_X",
- "iter_imputed_num_X",
- "knn_imputed_X_cat_dropped",
- "si_mean_imputed_X_cat_dropped",
- "si_median_imputed_X_cat_dropped",
- "si_most_frequent_imputed_X_cat_dropped",
- "iter_imputed_X_cat_dropped",
- "miss_val_dropped_X"]
- # st.write("si_most_frequent_imputed_num_X",si_most_frequent_imputed_num_X,)
- ord_enc_cols = []
- ohe_enc_cols = []
-
- if len(cat_cols) == 0:
- st.write("No Categorical Columns in Train")
- else:
- st.write("Select Columns for Ordinal Encoding")
- for column in cat_cols:
- selected = st.checkbox(column)
- if selected:
- st.write(f"No. of Unique value in {column} column are", X[column].nunique())
- ord_enc_cols.append(column)
- ohe_enc_cols = set(cat_cols) -set(ord_enc_cols)
- ohe_enc_cols = list(ohe_enc_cols)
-
- if len(ord_enc_cols)>0:
- st.write("ordinal encoded columns" ,tuple(ord_enc_cols))
- if len(ohe_enc_cols)>0:
- st.write("one hot encoded columns" ,tuple(ohe_enc_cols))
-
- if len(ord_enc_cols)>0:
-
- ordinal_order_vals = []
-
- for column in ord_enc_cols:
- unique_vals = X.dropna()[column].unique()
- # st.write(f"No. of Unique value in {column} column are", len(unique_vals))
-
- ordered_unique_vals = st.multiselect("Select values in order for Ordinal Encoding",unique_vals,unique_vals)
- ordinal_order_vals.append(ordered_unique_vals)
-
- st.write("order of values for Ordinal Encoding",tuple(ordinal_order_vals))
-
- if len_null > 0:
-
- for df_name, df in enumerate(list_X_after_missing_values):
- # st.write(f"{list_X_after_missing_values_names[df_name]}",df)
- from sklearn.preprocessing import OrdinalEncoder
- ord = OrdinalEncoder(categories=ordinal_order_vals,handle_unknown= "use_encoded_value",unknown_value = -1 )
- df[ord_enc_cols] = ord.fit_transform(df[ord_enc_cols])
- # st.write(f"{list_X_after_missing_values_names[df_name]}",df)
- else :
- from sklearn.preprocessing import OrdinalEncoder
- ord = OrdinalEncoder(categories=ordinal_order_vals,handle_unknown= "use_encoded_value",unknown_value = -1 )
- X[ord_enc_cols] = ord.fit_transform(X[ord_enc_cols])
-
- st.write("Ordinal Encoding Completed ✅")
-
- if len(ohe_enc_cols)>0:
- if len_null > 0:
- for df_name, df in enumerate(list_X_after_missing_values):
- from sklearn.preprocessing import OneHotEncoder
- ohe = OneHotEncoder(sparse_output = False,handle_unknown = "ignore")
- pd.options.mode.chained_assignment = None
- df.loc[:, ohe.get_feature_names_out()] = ohe.fit_transform(df[ohe_enc_cols])
- df.drop(columns = ohe_enc_cols,inplace = True)
- pd.options.mode.chained_assignment = 'warn'
- else:
- from sklearn.preprocessing import OneHotEncoder
- ohe = OneHotEncoder(sparse_output = False,handle_unknown = "ignore")
- pd.options.mode.chained_assignment = None
- X.loc[:, ohe.get_feature_names_out()] = ohe.fit_transform(X[ohe_enc_cols])
- X.drop(columns = ohe_enc_cols,inplace = True)
- pd.options.mode.chained_assignment = 'warn'
- st.write("OneHot Encoding Completed ✅")
-
-
- if len(ohe_enc_cols)>0:
- if len_null > 0:
- for name,df in enumerate(list_X_after_missing_values):
- X_train,X_test,y_train,y_test = tts(df,y[df.index],test_size =.2 ,random_state = 42)
- # best_tts.best_tts(df,y,model,eva)
- evaluationer.evaluation(f"{list_X_after_missing_values_names[name]}",X_train,X_test,y_train,y_test,model,root_mean_squared_error,eva)
- else:
- X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size =.2 ,random_state = 42)
- # best_tts.best_tts(X,y,model,eva)
-
- evaluationer.evaluation(f"baseline_model",X_train,X_test,y_train,y_test,model,root_mean_squared_error,eva)
-
- if len_null >0:
- for name,df in enumerate(list_X_after_missing_values):
- X_train,X_test,y_train,y_test = tts(df,y[df.index],test_size =.2 ,random_state = 42)
- st.write(f"this is test{list_X_after_missing_values_names[name]}",X_train.isnull().sum().sum())
- evaluationer.evaluation(f"{list_X_after_missing_values_names[name]}",X_train,X_test,y_train,y_test,model,root_mean_squared_error,eva)
-
- if eva == "class":
- counter = Counter(y)
- total = sum(counter.values())
- balance_ratio = {cls: count / total for cls, count in counter.items()}
- num_classes = len(balance_ratio)
- ideal_ratio = 1 / num_classes
- a = all(abs(ratio - ideal_ratio) <= 0.1 * ideal_ratio for ratio in balance_ratio.values())
- if a == True:
- st.write("Balanced Dataset ✅")
- st.write("Using accuracy for Evaluation")
- value = "test_acc"
- else:
- st.write("Unbalanced Dataset ❌")
- st.write("Using F1 score for Evaluation")
- value = "test_f1"
- st.write("SFdfs",evaluationer.classification_evaluation_df)
- evaluationer.classification_evaluation_df.sort_values(by = value,inplace= True)
- name = str(evaluationer.classification_evaluation_df.iloc[-1,0])
- st.write("df name",evaluationer.classification_evaluation_df.iloc[-1,0])
- if len_null >0:
- b = list_X_after_missing_values_names.index(name)
- st.write("Sdffsf",b)
- st.write("df",list_X_after_missing_values[b])
- X = list_X_after_missing_values[b]
- if eva == "reg":
- st.write("Using R2 score for Evaluation",evaluationer.reg_evaluation_df)
- value = "test_r2"
- evaluationer.reg_evaluation_df.sort_values(by = value,inplace= True)
- st.write("adfsdf",evaluationer.reg_evaluation_df.iloc[-1,0])
- name = str(evaluationer.reg_evaluation_df.iloc[-1,0])
- st.write("Sdffsf",name)
- if len_null >0:
- b = list_X_after_missing_values_names.index(name)
- st.write("Sdffsf",b)
- st.write("df",list_X_after_missing_values[b])
- X = list_X_after_missing_values[b]
-
-
- # Create a figure and axes
- num_plots = len(num_cols)
- cols = 2 # Number of columns in the subplot grid
- rows = (num_plots + cols - 1) // cols # Calculate the number of rows needed
-
- fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows))
-
- # Flatten the axes array for easy iteration, and remove any excess subplots
- axes = axes.flatten()
- for ax in axes[num_plots:]:
- fig.delaxes(ax)
-
- for i, col in enumerate(num_cols):
- sns.histplot(X[col], ax=axes[i],kde = True,color=sns.color_palette('Oranges', as_cmap=True)(0.7))
- axes[i].set_title(col)
-
- # Adjust layout
- plt.tight_layout()
-
- # Show the plot in Streamlit
- st.pyplot(fig)
-
- # Create a figure and axes
- num_plots = len(num_cols)
- cols = 3 # Number of columns in the subplot grid
- rows = (num_plots + cols - 1) // cols # Calculate the number of rows needed
-
- fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows))
-
- # Flatten the axes array for easy iteration, and remove any excess subplots
- axes = axes.flatten()
- for ax in axes[num_plots:]:
- fig.delaxes(ax)
-
- for i, col in enumerate(num_cols):
- sns.boxplot(y=X[col], ax=axes[i],palette="magma")
- axes[i].set_title(col)
-
- # Adjust layout
- plt.tight_layout()
-
- # Show the plot in Streamlit
- st.pyplot(fig)
-
- outlier_cols = st.multiselect("De-Select columns for Detecting Outliers", num_cols,default= list(num_cols))
-
- st.write("Checking for Outliers")
- outliers_df_X,outlier_indexes = outliers.detect_outliers(X,list(outlier_cols))
- st.write("Outliers in Dataframe Summary",outliers_df_X)
- st.write("Columns for Outliers handling",tuple(outliers_df_X["columns name"]))
-
- select_outlier_cols = st.multiselect("Select columns for Outlier Handling",tuple(outliers_df_X["columns name"]),default =tuple(outliers_df_X["columns name"]))
- resultant,outlier_handled_df,outlier_handled_df_name= outliers.outlier_handling(X,y,model,outlier_indexes = outlier_indexes,outlier_cols = select_outlier_cols ,method = root_mean_squared_error,test_size = 0.2, random_state = 42,eva = "reg")
- st.write("outlier handling with methods",resultant)
- st.write("Best method with outlier handling",resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])
- try :
- st.write("Best X Data Index No.",outlier_handled_df_name.index(resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0]))
-
- st.write("Best X DataFrame after outlier handling ",outlier_handled_df[outlier_handled_df_name.index(resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])])
- X = outlier_handled_df[outlier_handled_df_name.index(resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])]
- except :
- "evaluation of baseline model is better continuing with baseline model"
-
- # result_df ,X_train_b,X_test_b,y_train_b,y_test_b = best_tts.best_tts(X,y,model,eva)
- X_train,X_test,y_train,y_test = tts(X,y[X.index],random_state = 42,test_size = 0.2)
- st.write("result_df",X)
- st.write("fsdfs",X_train)
- result_df_1 = feature_selections.feature_selection(X_train,X_test,y_train,y_test,model,alpha = 0.05)
- st.write("sdchsvdgj",result_df_1)
+ import pandas as pd
+ import numpy as np
+ import streamlit as st
+ from sklearn.impute import KNNImputer,SimpleImputer,IterativeImputer
+ import best_tts, evaluationer,models
+ from sklearn.experimental import enable_iterative_imputer
+ from sklearn.model_selection import train_test_split as tts
+ from collections import Counter
+ from sklearn.preprocessing import PolynomialFeatures
+ from sklearn.metrics import root_mean_squared_error
+ import seaborn as sns
+ from sklearn.decomposition import PCA
+ import grid_search_cv
+ import matplotlib.pyplot as plt
+ import outliers,best_tts
+ import feature_selections
+ def Auto_optimizer(X,y,eva,model,model_name,test= None):
+ if st.button("Train Regression Model"):
+ num_cols = X.select_dtypes(exclude = "O").columns
+ cat_cols = X.select_dtypes(include = "O").columns
+ st.write("Num_cols",tuple(num_cols))
+ st.write("cat_cols",tuple(cat_cols))
+
+ # check for Duplicate and drop duplicated in X
+
+ if len(X.isnull().sum()[(X.isnull().sum()/len(X)*100) >40]) >0:
+ X = X.drop(columns = X.isnull().sum()[(X.isnull().sum()/len(X)*100) >40].index)
+ st.write("Columns with more than 40% null values removed")
+ # st.write("csx",X)
+
+ len_null = X.isnull().sum().sum()
+
+ st.write(f"There are {len_null} null values in Train")
+
+ knn_imputed_num_X = X.copy()
+ si_mean_imputed_num_X = X.copy()
+ # st.write("sf",si_mean_imputed_num_X)
+ si_median_imputed_num_X = X.copy()
+ si_most_frequent_imputed_num_X = X.copy()
+ iter_imputed_num_X = X.copy()
+ knn_imputed_X_cat_dropped = knn_imputed_num_X.copy()
+ si_mean_imputed_X_cat_dropped = si_mean_imputed_num_X.copy()
+ si_median_imputed_X_cat_dropped = si_median_imputed_num_X.copy()
+ si_most_frequent_imputed_X_cat_dropped = si_most_frequent_imputed_num_X.copy()
+ iter_imputed_X_cat_dropped = iter_imputed_num_X.copy()
+ if len_null >0:
+
+ if X[num_cols].isnull().sum().sum() >0:
+
+ knn_imputer = KNNImputer(n_neighbors = 5)
+ knn_imputed_num_X[num_cols] = knn_imputer.fit_transform(knn_imputed_num_X[num_cols])
+ si_imputer = SimpleImputer(strategy = "mean")
+ si_mean_imputed_num_X[num_cols] = si_imputer.fit_transform(si_mean_imputed_num_X[num_cols])
+ si_imputer = SimpleImputer(strategy = "median")
+ si_median_imputed_num_X[num_cols] = si_imputer.fit_transform(si_median_imputed_num_X[num_cols])
+ si_imputer = SimpleImputer(strategy = "most_frequent")
+ si_most_frequent_imputed_num_X[num_cols] = si_imputer.fit_transform(si_most_frequent_imputed_num_X[num_cols])
+ iter_imputer = IterativeImputer(max_iter = 200,random_state= 42)
+ iter_imputed_num_X[num_cols] = iter_imputer.fit_transform(iter_imputed_num_X[num_cols])
+ knn_imputed_X_cat_dropped = knn_imputed_num_X.copy()
+ si_mean_imputed_X_cat_dropped = si_mean_imputed_num_X.copy()
+ si_median_imputed_X_cat_dropped = si_median_imputed_num_X.copy()
+ si_most_frequent_imputed_X_cat_dropped = si_most_frequent_imputed_num_X.copy()
+ iter_imputed_X_cat_dropped = iter_imputed_num_X.copy()
+
+ if X[cat_cols].isnull().sum().sum() >0:
+ # treating missing values in categorical columns
+ # st.write("si_mean_imputed_num_X",si_mean_imputed_num_X)
+ si_imputer = SimpleImputer(strategy = "most_frequent")
+
+ knn_imputed_num_X[cat_cols] = si_imputer.fit_transform(knn_imputed_num_X[cat_cols])
+ si_imputer = SimpleImputer(strategy = "most_frequent")
+ si_mean_imputed_num_X.loc[:,cat_cols] = si_imputer.fit_transform(si_mean_imputed_num_X.loc[:,cat_cols])
+ # st.write("si_mean_imputed_num_X",si_mean_imputed_num_X)
+ si_median_imputed_num_X[cat_cols] = si_imputer.fit_transform(si_median_imputed_num_X[cat_cols])
+ si_most_frequent_imputed_num_X[cat_cols] = si_imputer.fit_transform(si_most_frequent_imputed_num_X[cat_cols])
+ iter_imputed_num_X[cat_cols] = si_imputer.fit_transform(iter_imputed_num_X[cat_cols])
+
+ knn_imputed_X_cat_dropped = knn_imputed_X_cat_dropped.dropna()
+ si_mean_imputed_X_cat_dropped =si_mean_imputed_X_cat_dropped.dropna()
+ si_median_imputed_X_cat_dropped =si_median_imputed_X_cat_dropped.dropna()
+ si_most_frequent_imputed_X_cat_dropped =si_most_frequent_imputed_X_cat_dropped.dropna()
+ iter_imputed_X_cat_dropped =iter_imputed_X_cat_dropped.dropna()
+
+
+ miss_val_dropped_X = X.dropna()
+
+ # list of dataframes
+
+ list_X_after_missing_values= [knn_imputed_num_X,
+ si_mean_imputed_num_X,
+ si_median_imputed_num_X,
+ si_most_frequent_imputed_num_X,
+ iter_imputed_num_X,
+ knn_imputed_X_cat_dropped,
+ si_mean_imputed_X_cat_dropped,
+ si_median_imputed_X_cat_dropped,
+ si_most_frequent_imputed_X_cat_dropped,
+ iter_imputed_X_cat_dropped,
+ miss_val_dropped_X]
+ list_X_after_missing_values_names= ["knn_imputed_num_X",
+ "si_mean_imputed_num_X",
+ "si_median_imputed_num_X",
+ "si_most_frequent_imputed_num_X",
+ "iter_imputed_num_X",
+ "knn_imputed_X_cat_dropped",
+ "si_mean_imputed_X_cat_dropped",
+ "si_median_imputed_X_cat_dropped",
+ "si_most_frequent_imputed_X_cat_dropped",
+ "iter_imputed_X_cat_dropped",
+ "miss_val_dropped_X"]
+ # st.write("si_most_frequent_imputed_num_X",si_most_frequent_imputed_num_X,)
+ ord_enc_cols = []
+ ohe_enc_cols = []
+
+ if len(cat_cols) == 0:
+ st.write("No Categorical Columns in Train")
+ else:
+ st.write("Select Columns for Ordinal Encoding")
+ for column in cat_cols:
+ selected = st.checkbox(column)
+ if selected:
+ st.write(f"No. of Unique value in {column} column are", X[column].nunique())
+ ord_enc_cols.append(column)
+ ohe_enc_cols = set(cat_cols) -set(ord_enc_cols)
+ ohe_enc_cols = list(ohe_enc_cols)
+
+ if len(ord_enc_cols)>0:
+ st.write("ordinal encoded columns" ,tuple(ord_enc_cols))
+ if len(ohe_enc_cols)>0:
+ st.write("one hot encoded columns" ,tuple(ohe_enc_cols))
+
+ if len(ord_enc_cols)>0:
+
+ ordinal_order_vals = []
+
+ for column in ord_enc_cols:
+ unique_vals = X.dropna()[column].unique()
+ # st.write(f"No. of Unique value in {column} column are", len(unique_vals))
+
+ ordered_unique_vals = st.multiselect("Select values in order for Ordinal Encoding",unique_vals,unique_vals)
+ ordinal_order_vals.append(ordered_unique_vals)
+
+ st.write("order of values for Ordinal Encoding",tuple(ordinal_order_vals))
+
+ if len_null > 0:
+
+ for df_name, df in enumerate(list_X_after_missing_values):
+ # st.write(f"{list_X_after_missing_values_names[df_name]}",df)
+ from sklearn.preprocessing import OrdinalEncoder
+ ord = OrdinalEncoder(categories=ordinal_order_vals,handle_unknown= "use_encoded_value",unknown_value = -1 )
+ df[ord_enc_cols] = ord.fit_transform(df[ord_enc_cols])
+ # st.write(f"{list_X_after_missing_values_names[df_name]}",df)
+ else :
+ from sklearn.preprocessing import OrdinalEncoder
+ ord = OrdinalEncoder(categories=ordinal_order_vals,handle_unknown= "use_encoded_value",unknown_value = -1 )
+ X[ord_enc_cols] = ord.fit_transform(X[ord_enc_cols])
+
+ st.write("Ordinal Encoding Completed ✅")
+
+ if len(ohe_enc_cols)>0:
+ if len_null > 0:
+ for df_name, df in enumerate(list_X_after_missing_values):
+ from sklearn.preprocessing import OneHotEncoder
+ ohe = OneHotEncoder(sparse_output = False,handle_unknown = "ignore")
+ pd.options.mode.chained_assignment = None
+ df.loc[:, ohe.get_feature_names_out()] = ohe.fit_transform(df[ohe_enc_cols])
+ df.drop(columns = ohe_enc_cols,inplace = True)
+ pd.options.mode.chained_assignment = 'warn'
+ else:
+ from sklearn.preprocessing import OneHotEncoder
+ ohe = OneHotEncoder(sparse_output = False,handle_unknown = "ignore")
+ pd.options.mode.chained_assignment = None
+ X.loc[:, ohe.get_feature_names_out()] = ohe.fit_transform(X[ohe_enc_cols])
+ X.drop(columns = ohe_enc_cols,inplace = True)
+ pd.options.mode.chained_assignment = 'warn'
+ st.write("OneHot Encoding Completed ✅")
+
+
+ if len(ohe_enc_cols)>0:
+ if len_null > 0:
+ for name,df in enumerate(list_X_after_missing_values):
+ X_train,X_test,y_train,y_test = tts(df,y[df.index],test_size =.2 ,random_state = 42)
+ # best_tts.best_tts(df,y,model,eva)
+ evaluationer.evaluation(f"{list_X_after_missing_values_names[name]}",X_train,X_test,y_train,y_test,model,root_mean_squared_error,eva)
+ else:
+ X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size =.2 ,random_state = 42)
+ # best_tts.best_tts(X,y,model,eva)
+
+ evaluationer.evaluation(f"baseline_model",X_train,X_test,y_train,y_test,model,root_mean_squared_error,eva)
+
+ if len_null >0:
+ for name,df in enumerate(list_X_after_missing_values):
+ X_train,X_test,y_train,y_test = tts(df,y[df.index],test_size =.2 ,random_state = 42)
+
+ evaluationer.evaluation(f"{list_X_after_missing_values_names[name]}",X_train,X_test,y_train,y_test,model,root_mean_squared_error,eva)
+
+ if eva == "class":
+ counter = Counter(y)
+ total = sum(counter.values())
+ balance_ratio = {cls: count / total for cls, count in counter.items()}
+ num_classes = len(balance_ratio)
+ ideal_ratio = 1 / num_classes
+ a = all(abs(ratio - ideal_ratio) <= 0.1 * ideal_ratio for ratio in balance_ratio.values())
+ if a == True:
+ st.write("Balanced Dataset ")
+ st.write("Using accuracy for Evaluation")
+ value = "test_acc"
+ else:
+ st.write("Unbalanced Dataset ")
+ st.write("Using F1 score for Evaluation")
+ value = "test_f1"
+
+ evaluationer.classification_evaluation_df.sort_values(by = value,inplace= True)
+ name = str(evaluationer.classification_evaluation_df.iloc[-1,0])
+ st.write("df name",evaluationer.classification_evaluation_df.iloc[-1,0])
+ if len_null >0:
+ b = list_X_after_missing_values_names.index(name)
+
+ st.write("df",list_X_after_missing_values[b])
+ X = list_X_after_missing_values[b]
+ if eva == "reg":
+ st.write("Using R2 score for Evaluation",evaluationer.reg_evaluation_df)
+ value = "test_r2"
+ evaluationer.reg_evaluation_df.sort_values(by = value,inplace= True)
+
+ name = str(evaluationer.reg_evaluation_df.iloc[-1,0])
+
+ if len_null >0:
+ b = list_X_after_missing_values_names.index(name)
+
+ st.write("df",list_X_after_missing_values[b])
+ X = list_X_after_missing_values[b]
+
+
+ # Create a figure and axes
+ num_plots = len(num_cols)
+ cols = 2 # Number of columns in the subplot grid
+ rows = (num_plots + cols - 1) // cols # Calculate the number of rows needed
+
+ fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows))
+
+ # Flatten the axes array for easy iteration, and remove any excess subplots
+ axes = axes.flatten()
+ for ax in axes[num_plots:]:
+ fig.delaxes(ax)
+
+ for i, col in enumerate(num_cols):
+ sns.histplot(X[col], ax=axes[i],kde = True,color=sns.color_palette('Oranges', as_cmap=True)(0.7))
+ axes[i].set_title(col)
+
+ # Adjust layout
+ plt.tight_layout()
+
+ # Show the plot in Streamlit
+ st.pyplot(fig)
+
+ # Create a figure and axes
+ num_plots = len(num_cols)
+ cols = 3 # Number of columns in the subplot grid
+ rows = (num_plots + cols - 1) // cols # Calculate the number of rows needed
+
+ fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows))
+
+ # Flatten the axes array for easy iteration, and remove any excess subplots
+ axes = axes.flatten()
+ for ax in axes[num_plots:]:
+ fig.delaxes(ax)
+
+ for i, col in enumerate(num_cols):
+ sns.boxplot(y=X[col], ax=axes[i],palette="magma")
+ axes[i].set_title(col)
+
+ # Adjust layout
+ plt.tight_layout()
+
+ # Show the plot in Streamlit
+ st.pyplot(fig)
+
+ outlier_cols = st.multiselect("De-Select columns for Detecting Outliers", num_cols,default= list(num_cols))
+
+ st.write("Checking for Outliers")
+ outliers_df_X,outlier_indexes = outliers.detect_outliers(X,list(outlier_cols))
+ st.write("Outliers in Dataframe Summary",outliers_df_X)
+ st.write("Columns for Outliers handling",tuple(outliers_df_X["columns name"]))
+
+ select_outlier_cols = st.multiselect("Select columns for Outlier Handling",tuple(outliers_df_X["columns name"]),default =tuple(outliers_df_X["columns name"]))
+ resultant,outlier_handled_df,outlier_handled_df_name= outliers.outlier_handling(X,y,model,outlier_indexes = outlier_indexes,outlier_cols = select_outlier_cols ,method = root_mean_squared_error,test_size = 0.2, random_state = 42,eva = "reg")
+ st.write("outlier handling with methods",resultant)
+ st.write("Best method with outlier handling",resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])
+ try :
+ st.write("Best X Data Index No.",outlier_handled_df_name.index(resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0]))
+
+ st.write("Best X DataFrame after outlier handling ",outlier_handled_df[outlier_handled_df_name.index(resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])])
+ X = outlier_handled_df[outlier_handled_df_name.index(resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])]
+ except :
+ "evaluation of baseline model is better continuing with baseline model"
+
+
+ X_train,X_test,y_train,y_test = tts(X,y[X.index],random_state = 42,test_size = 0.2)
+ st.write("result_df",X)
+
+
+
+ try:
+ result_df_1 , feature_col, feature_col_name = feature_selections.feature_selection(X_train,X_test,y_train,y_test,model,alpha = 0.05)
+ X = X.drop(columns = feature_col[feature_col_name.index(result_df_1.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])])
+ except:
+ "evaluation by feature selection is not better than previous"
+
+ try:
+ result,X_train_b,X_test_b,y_train_b,y_test_b = best_tts.best_tts(X,y,model,eva)
+ st.write("result_df",result)
+ except:
+ X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size =0.2,random_state = 42)
+
+ st.write("cheking with polynomial features")
+ poly = PolynomialFeatures(degree=(2))
+ X_train_poly = poly.fit_transform(X_train)
+ X_test_poly = poly.transform(X_test)
+ result_df_2 = evaluationer.evaluation("polynomial features degree 2",X_train_poly,X_test_poly,y_train,y_test,model,root_mean_squared_error,eva)
+ st.write("after polynomial features degree 2",evaluationer.reg_evaluation_df)
+ poly1 = PolynomialFeatures(degree=(3))
+ X_train_poly1 = poly.fit_transform(X_train)
+ X_test_poly1 = poly.transform(X_test)
+ evaluationer.evaluation("polynomial features degree 3",X_train_poly1,X_test_poly1,y_train,y_test,model,root_mean_squared_error,eva)
+ st.write("after polynomial features degree 3",evaluationer.reg_evaluation_df)
+
+ pca = PCA(n_components=0.95)
+ X_train_pca = pca.fit_transform(X_train)
+ X_test_pca = pca.transform(X_test)
+ evaluationer.evaluation("PCA",X_train_pca,X_test_pca,y_train,y_test,model,root_mean_squared_error,eva)
+ st.write("After PCA",evaluationer.reg_evaluation_df)
+
+ grid_search_cv.perform_grid_search(model,model_name,X_train,X_test,y_train,y_test,eva)
+ st.write("best param",evaluationer.reg_evaluation_df)
+ st.sidebar.button("click to clear evaluation metrics",evaluationer.reg_evaluation_df.drop(index = evaluationer.reg_evaluation_df.index))
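One detail a reviewer might flag in the new polynomial-features step: poly1 = PolynomialFeatures(degree=(3)) is created, but the degree-3 transforms still call poly, the degree-2 object, so the "polynomial features degree 3" row re-evaluates degree 2. A sketch of what the block presumably intends (variable names taken from the diff):

    from sklearn.preprocessing import PolynomialFeatures

    # Fit and apply the degree-3 expansion with the degree-3 object itself.
    poly1 = PolynomialFeatures(degree=3)
    X_train_poly1 = poly1.fit_transform(X_train)  # the committed code uses poly (degree 2) here
    X_test_poly1 = poly1.transform(X_test)

Similarly, PCA(n_components=0.95) keeps however many components are needed to explain 95% of the variance rather than a fixed count, which is why no integer is passed.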
best_tts.py CHANGED
@@ -10,9 +10,9 @@ def best_tts(X,y,model,eva):
  if eva == "reg":

  test_r2_,test_r2_ts,test_r2_rs = 0,0,0
- for k in range(10,25):
+ for k in range(10,25,3):
  i = k/100
- for j in range(1,100):
+ for j in range(1,100,10):
  X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size = i, random_state = j,)

  model = model
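The only change in best_tts.py is the loop stride: the sweep over test sizes and random states drops from 15 × 99 = 1485 train/test splits to 5 × 10 = 50, trading granularity for speed. A quick check of the two grids (plain Python, ranges copied from the diff):

    # Old grid: every test_size from 0.10 to 0.24 crossed with random_state 1..99.
    old_grid = [(k / 100, j) for k in range(10, 25) for j in range(1, 100)]
    # New grid: stride 3 over test_size, stride 10 over random_state.
    new_grid = [(k / 100, j) for k in range(10, 25, 3) for j in range(1, 100, 10)]
    print(len(old_grid), len(new_grid))  # 1485 50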
eda.py ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ import numpy as np
+ import pandas as pd
+ import streamlit as st
+ import streamlit.components.v1 as components
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ import plotly.express as px
+ from plotly.subplots import make_subplots
+ import plotly.graph_objects as go
+ import datashader as ds
+ import datashader.transfer_functions as tf
+ from colorcet import fire
+
+ # function for EDA analysis
+ def eda_analysis(df):
+
+     target_col = st.sidebar.selectbox("Select Target Column", df.columns, index = len(df.columns)-1)
+     y = df[target_col]
+     X = df.drop(columns = target_col)
+     num_cols = X.select_dtypes(exclude = "O").columns.tolist()
+     cat_cols = X.select_dtypes(include = "O").columns.tolist()
+     st.write("num_cols", tuple(num_cols))
+     st.write("cat_cols", tuple(cat_cols))
+     st.divider()
+
+     results = []
+     for column in X[num_cols].columns:
+         skewness = X[column].skew()
+         kurtosis = X[column].kurtosis()
+
+         skewness_html = f'<span style="color: {"red" if abs(skewness) > .5 else "white"}">{skewness:.2f}</span>'
+         kurtosis_html = f'<span style="color: {"red" if abs(kurtosis) > 3 else "white"}">{kurtosis:.2f}</span>'
+
+         results.append({
+             'Column': column,
+             'Skewness': skewness,
+             'Kurtosis': kurtosis,
+             'Skewness_': skewness_html,
+             'Kurtosis_': kurtosis_html
+         })
+
+     result_df = pd.DataFrame(results)
+
+     # Display the data types of Skewness and Kurtosis columns
+     # st.write("Data types of Skewness and Kurtosis columns:", result_df[["Skewness", "Kurtosis"]].dtypes)
+
+     if st.toggle("Show Skewness and Kurtosis of DataFrame columns"):
+         st.write("Columns with Skewness and Kurtosis:")
+         if st.checkbox("Filter Skewed columns"):
+             filtered_df = result_df[abs(result_df["Skewness"]) > 0.5]
+             st.write(filtered_df[['Column', 'Skewness_', 'Kurtosis_']].to_html(escape=False), unsafe_allow_html=True)
+         else:
+             st.write(result_df[['Column', 'Skewness_', 'Kurtosis_']].to_html(escape=False), unsafe_allow_html=True)
+
+     st.divider()
+     st.write("Plotting Numerical Columns for Visual EDA")
+
+     # Create two columns
+     column1, column2 = st.columns(2)
+
+     # Checkbox for plotting distribution in the first column
+     with column1:
+         plot_distribution = st.checkbox("Plot Distribution of Target Column")
+
+     # Show the second checkbox in the second column only if the first checkbox is clicked
+     if plot_distribution:
+         with column2:
+             show_kde = st.checkbox("Show KDE Plot")
+         kde = show_kde
+     else:
+         kde = False
+
+     # Plot the histogram if the first checkbox is checked
+     if plot_distribution:
+         fig, ax = plt.subplots()
+         sns.histplot(y, ax=ax, kde=kde)
+
+         # Show the plot in the Streamlit app
+         st.pyplot(fig)
+
+     column3, column4 = st.columns(2)
+     with column3:
+         plot_distribution_nc = st.checkbox("Plot Distribution of Input Numerical columns")
+     if plot_distribution_nc:
+         with column4:
+             show_kde_1 = st.checkbox("Show KDE Plot for Numerical Columns")
+         kde_1 = show_kde_1
+     if plot_distribution_nc:
+         for column in num_cols:
+             fig, ax = plt.subplots()
+             sns.histplot(df[column], ax=ax, kde=kde_1)
+             st.write(f"Distribution of {column}:")
+             st.pyplot(fig)
+     st.divider()
+     # plot count plot for categorical columns
+     st.write("Plotting Categorical Columns for Visual EDA")
+     if st.checkbox("Plot Distribution of Input Categorical columns"):
+         for column in cat_cols:
+             fig = px.histogram(df.fillna('Null'), x=column, color=target_col)
+             st.write(fig)
+
+     st.divider()
+     # plot correlation matrix using plotly
+     st.write("Plotting Correlation Matrix for Numerical Columns")
+
+     column5, column6 = st.columns(2)
+     with column5:
+         plot_corr = st.checkbox("Plot Correlation Matrix")  # renamed from plot_distribution to avoid shadowing the earlier flag
+     show_value = False
+     if plot_corr:
+         with column6:
+             show_value = st.checkbox("Correlation values > 0.5")
+         if show_value:
+             # Compute correlation matrix
+             corr_matrix = df[num_cols].corr()
+
+             # Plot correlation matrix heatmap
+             fig = px.imshow(corr_matrix[abs(corr_matrix) > 0.5], color_continuous_scale='RdBu')
+
+             # Add annotations for values greater than 0.5
+             for i in range(corr_matrix.shape[0]):
+                 for j in range(corr_matrix.shape[1]):
+                     correlation_value = corr_matrix.iloc[i, j]
+                     if abs(correlation_value) > 0.5:  # Filter values greater than 0.5
+                         fig.add_annotation(
+                             x=i, y=j,
+                             text=str(round(correlation_value, 2)),
+                             showarrow=False
+                         )
+
+             # Update layout
+             fig.update_layout(
+                 xaxis=dict(side="top"),
+                 width=600,
+                 height=600,
+                 margin=dict(l=20, r=20, t=40, b=20)
+             )
+
+             # Display the heatmap
+             st.write(fig)
+     if plot_corr and not show_value:
+
+         corr_matrix = df[num_cols].corr()
+         fig = px.imshow(corr_matrix, color_continuous_scale='RdBu')
+         for i in range(corr_matrix.shape[0]):
+             for j in range(corr_matrix.shape[1]):
+                 fig.add_annotation(
+                     x=i, y=j,
+                     text=str(round(corr_matrix.iloc[i, j], 2)),
+                     showarrow=False
+                 )
+
+         # Update the layout to ensure annotations are displayed properly
+         fig.update_layout(
+             xaxis=dict(side="top"),
+             width=600,
+             height=600,
+             margin=dict(l=20, r=20, t=40, b=20)
+         )
+
+         st.write(fig)
+     st.divider()
+     outlier_cols = st.multiselect("Select Continuous numerical columns for Outlier Plot", num_cols)
+
+     # plot violin/box plots for outlier cols
+     if st.toggle("Toggle for Violin Plot"):
+         if st.checkbox("Plot Violin Plot for Outlier Cols"):  # label said BoxPlot even though this branch draws violins
+             if st.toggle("Split by Target"):
+                 for col in outlier_cols:
+                     fig = px.violin(df, x=col, color=y)
+                     st.write(fig)
+                     st.divider()
+             else:
+                 for col in outlier_cols:
+                     fig = px.violin(df, x=col)
+                     st.write(fig)
+                     st.divider()
+         if st.checkbox("check outlier distribution of Target column"):
+             fig = px.violin(y)
+             st.write(fig)
+
+     else:
+         if st.checkbox("Plot BoxPlot for Outlier Cols"):
+             if st.toggle("Split by Target"):
+                 for col in outlier_cols:
+                     fig = px.box(df, x=col, color=y)
+                     st.write(fig)
+                     st.divider()
+             else:
+                 for col in outlier_cols:
+                     fig = px.box(df, x=col)
+                     st.write(fig)
+                     st.divider()
+         if st.checkbox("check outlier distribution of Target column"):
+             fig = px.box(y)
+             st.write(fig)
+
+     # plot scatter plot using px
+     st.divider()
+
+     if st.checkbox("Plot Scatter Plot"):
+         column7, column8, column9 = st.columns(3)
+         with column7:
+             # Select y-axis column (unique key= avoids DuplicateWidgetID with the other charts below)
+             y_col = st.selectbox("Select y axis column", df.columns, key="scatter_y")
+
+         # all columns are offered for the x-axis selection
+         categorical_columns = df.columns
+         with column8:
+             # Allow user to select the x-axis column
+             x_col = st.selectbox("Select x axis column", categorical_columns, key="scatter_x")
+         with column9:
+             hue_col = st.selectbox("Select Hue column", categorical_columns, key="scatter_hue")
+         # Plot scatter plot using Plotly
+         fig = px.scatter(df, x=x_col, y=y_col, color=hue_col)
+         st.write(fig)
+
+     # bar chart and line chart
+     st.divider()
+     if st.checkbox("Plot Bar Chart"):
+         column10, column11 = st.columns(2)
+         with column10:
+             # Select y-axis column
+             y_col = st.selectbox("Select y axis column", df.columns, key="bar_y")
+
+         categorical_columns = df.columns
+         with column11:
+             # Allow user to select the x-axis column
+             x_col = st.selectbox("Select x axis column", categorical_columns, key="bar_x")
+         fig = px.bar(df, x=x_col, y=y_col, color=x_col)
+         st.write(fig)
+     st.divider()
+     if st.checkbox("Plot Line Chart"):
+         column12, column13, colx = st.columns(3)
+         with column12:
+             # Select y-axis column
+             y_col = st.selectbox("Select y axis column", df.columns, key="line_y")
+
+         categorical_columns = df.columns
+         with column13:
+             # Allow user to select the x-axis column
+             x_col = st.selectbox("Select x axis column", categorical_columns, key="line_x")
+         with colx:
+             hue_col1 = st.selectbox("Select Line split column", categorical_columns, key="line_hue")
+         fig = px.line(df.sort_values(by=y_col), x=x_col, y=y_col, color=hue_col1)
+         st.write(fig)
+     st.divider()
+     # plot pie chart
+     if st.checkbox("Plot Pie Chart"):
+         column14, column15 = st.columns(2)
+         with column14:
+             # Select values column
+             y_col = st.selectbox("Select values columns", df.columns, key="pie_values")
+
+         categorical_columns = df.columns
+         with column15:
+             # Allow user to select the names column
+             x_col = st.selectbox("Select names column", categorical_columns, key="pie_names")
+         fig = px.pie(df, values=y_col, names=x_col)
+         st.write(fig)
+
+     st.divider()
+     # check if there are latitude and longitude columns
+     if st.checkbox("Plot on Map"):
+         lat_col = st.selectbox("Select Latitude Column", df.columns)
+         long_col = st.selectbox("Select Longitude Column", df.columns)
+         # (a dangling `color = st.selectbox` assignment was removed here; it never called the widget)
+
+         # # Create the datashader canvas and aggregate points
+         # cvs = ds.Canvas(plot_width=1000, plot_height=1000)
+         # agg = cvs.points(df, x=long_col, y=lat_col)
+
+         # # Get the coordinates for the mapbox layer
+         # coords_lat, coords_lon = agg.coords[lat_col].values, agg.coords[long_col].values
+         # coordinates = [
+         #     [coords_lon[0], coords_lat[0]],
+         #     [coords_lon[-1], coords_lat[0]],
+         #     [coords_lon[-1], coords_lat[-1]],
+         #     [coords_lon[0], coords_lat[-1]]
+         # ]
+
+         # # Generate the datashader image
+         # img = tf.shade(agg, cmap=fire)[::-1].to_pil()
+
+         # # Create the Plotly figure with a mapbox layer
+         # fig = px.scatter_mapbox(df[:1], lat=lat_col, lon=long_col, zoom=10)  # Adjust zoom level as needed
+         # fig.update_layout(mapbox_style="carto-darkmatter",
+         #                   mapbox_layers=[
+         #                       {
+         #                           "sourcetype": "image",
+         #                           "source": img,
+         #                           "coordinates": coordinates
+         #                       }
+         #                   ])
+
+         # # Display the figure in Streamlit
+         # st.plotly_chart(fig)
+
+         # Create a scatter mapbox plot
+         if st.button("Proceed to plot map"):
+             fig = px.scatter_mapbox(df, lat=lat_col, lon=long_col,
+                                     size_max=15,  # Max marker size
+                                     mapbox_style="open-street-map",  # open-street-map needs no access token
+                                     zoom=1,
+                                     title='Latitude and Longitude Plotting')
+
+             # fig.update_layout(mapbox_accesstoken='your_mapbox_access_token')  # only needed for Mapbox-hosted styles
+             st.write(fig)
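The skew/kurtosis flagging in eda.py boils down to two thresholds; a small standalone sketch on synthetic data (column names are illustrative):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({"a": rng.exponential(size=500), "b": rng.normal(size=500)})
stats = pd.DataFrame({"Skewness": df.skew(), "Kurtosis": df.kurtosis()})
# same thresholds as eda.py: |skewness| > 0.5 or |kurtosis| > 3 gets highlighted
stats["flagged"] = (stats["Skewness"].abs() > 0.5) | (stats["Kurtosis"].abs() > 3)
print(stats)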
feature_selections.py CHANGED
@@ -8,12 +8,10 @@ import pandas as pd
 import numpy as np
 import evaluationer
 import streamlit as st
-# import root_mean_squared_error
+
+
 from sklearn.metrics import root_mean_squared_error
 def feature_selection(X_train, X_test,y_train,y_test,model_reg,alpha = 0.05):
-
-    st.write("dvsdv",y_train)
-    st.write("dvfssdv",X_train)
 
     model = sm.OLS(y_train, sm.add_constant(X_train))
     model_fit = model.fit()
@@ -100,5 +98,7 @@ def feature_selection(X_train, X_test,y_train,y_test,model_reg,alpha = 0.05):
     feature_cols_name = ["pval_cols","coef_cols","pval_and_coef_cols","mi_cols","corr_u_cols","corr_l_cols","vif_cols","lasso_cols"]
     st.write("feature_cols", vif_cols)
     for i,j in enumerate(feature_cols):
-        evaluationer.evaluation(f"{feature_cols_name[i]} dropped" ,X_train.drop(columns = j),X_test.drop(columns = j),y_train,y_test,model_reg,method = root_mean_squared_error,eva = "reg")
-        return evaluationer.reg_evaluation_df
+        evaluationer.evaluation(f"{feature_cols_name[i]}" ,X_train.drop(columns = j),X_test.drop(columns = j),y_train,y_test,model_reg,method = root_mean_squared_error,eva = "reg")
+    return evaluationer.reg_evaluation_df,feature_cols,feature_cols_name
+
+
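A hypothetical caller for the new three-value return (the old version returned only the evaluation frame, and did so from inside the loop, so only the first strategy was ever evaluated):

import feature_selections

eval_df, feature_cols, feature_cols_name = feature_selections.feature_selection(
    X_train, X_test, y_train, y_test, model_reg)
for name, cols in zip(feature_cols_name, feature_cols):
    print(name, "drops", list(cols))  # which columns each selection strategy removed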
grid_search_cv.py ADDED
@@ -0,0 +1,284 @@
+ from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, Lasso, ElasticNet
+ from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
+ from sklearn.neighbors import KNeighborsRegressor
+ from sklearn.tree import DecisionTreeRegressor
+ from sklearn.svm import SVR
+ from xgboost import XGBRegressor, XGBRFRegressor
+ from sklearn.neural_network import MLPRegressor
+ from lightgbm import LGBMRegressor
+ from sklearn.naive_bayes import GaussianNB
+ from sklearn.model_selection import GridSearchCV
+ import streamlit as st
+ import evaluationer
+
+ from sklearn.metrics import root_mean_squared_error
+
+ from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
+ from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
+ from sklearn.neighbors import KNeighborsClassifier
+ from sklearn.tree import DecisionTreeClassifier
+ from sklearn.svm import SVC
+ from xgboost import XGBClassifier, XGBRFClassifier
+ from sklearn.neural_network import MLPClassifier
+ from lightgbm import LGBMClassifier
+ from sklearn.naive_bayes import MultinomialNB, CategoricalNB
+
+ param_grids_class = {
+     "Logistic Regression": {
+         'penalty': ['l1', 'l2', 'elasticnet', None],  # the string 'none' was removed in scikit-learn 1.4; use None
+         'C': [0.01, 0.1, 1, 10],
+         'solver': ['lbfgs', 'liblinear', 'saga']
+     },
+
+     "SGD Classifier": {
+         'loss': ['hinge', 'log_loss', 'modified_huber', 'squared_hinge'],  # 'log' was renamed to 'log_loss'
+         'penalty': ['l2', 'l1', 'elasticnet'],
+         'alpha': [0.0001, 0.001, 0.01],
+         'max_iter': [1000, 5000, 10000]
+     },
+
+     "Ridge Classifier": {
+         'alpha': [0.1, 1, 10, 100]
+     },
+
+     "Random Forest Classifier": {
+         'n_estimators': [100, 200, 300],
+         'max_depth': [None, 10, 20, 30],
+         'min_samples_split': [2, 5, 10],
+         'min_samples_leaf': [1, 2, 4]
+     },
+
+     "AdaBoost Classifier": {
+         'n_estimators': [50, 100, 200],
+         'learning_rate': [0.01, 0.1, 1]
+     },
+
+     "Gradient Boosting Classifier": {
+         'n_estimators': [100, 200, 300],
+         'learning_rate': [0.01, 0.1, 0.2],
+         'max_depth': [3, 5, 7]
+     },
+
+     "Hist Gradient Boosting Classifier": {
+         'learning_rate': [0.01, 0.1, 0.2],
+         'max_depth': [None, 10, 20],
+         'min_samples_leaf': [20, 50, 100]
+     },
+
+     "K Neighbors Classifier": {
+         'n_neighbors': [3, 5, 7],
+         'weights': ['uniform', 'distance'],
+         'metric': ['euclidean', 'manhattan']
+     },
+
+     "Decision Tree Classifier": {
+         'max_depth': [None, 10, 20, 30],
+         'min_samples_split': [2, 5, 10],
+         'min_samples_leaf': [1, 2, 4]
+     },
+
+     "SVC": {
+         'C': [0.1, 1, 10],
+         'kernel': ['linear', 'poly', 'rbf'],
+         'degree': [3, 4, 5],
+         'gamma': ['scale', 'auto']
+     },
+
+     "XGB Classifier": {
+         'n_estimators': [100, 200, 300],
+         'learning_rate': [0.01, 0.1, 0.2],
+         'max_depth': [3, 5, 7]
+     },
+
+     "XGBRF Classifier": {
+         'n_estimators': [100, 200, 300],
+         'learning_rate': [0.01, 0.1, 0.2],
+         'max_depth': [3, 5, 7]
+     },
+
+     "MLP Classifier": {
+         'hidden_layer_sizes': [(50,), (100,), (50, 50)],
+         'activation': ['tanh', 'relu'],
+         'solver': ['adam', 'sgd'],
+         'alpha': [0.0001, 0.001, 0.01],
+         'learning_rate': ['constant', 'adaptive']
+     },
+
+     "LGBM Classifier": {
+         'n_estimators': [100, 200, 300],
+         'learning_rate': [0.01, 0.1, 0.2],
+         'max_depth': [-1, 10, 20]
+     },
+
+     "Multinomial Naive Bayes": {
+         'alpha': [0.1, 0.5, 1.0]
+     },
+
+     "Categorical Naive Bayes": {
+         'alpha': [0.1, 0.5, 1.0]
+     }
+ }
+
+ param_grids_reg = {
+     "Linear Regression": {},
+
+     "SGD Regressor": {
+         'loss': ['squared_error', 'huber'],  # 'squared_loss' was renamed to 'squared_error'
+         'penalty': ['l2', 'l1', 'elasticnet'],
+         'alpha': [0.0001, 0.001, 0.01],
+         'max_iter': [1000, 5000, 10000]
+     },
+
+     "Ridge Regressor": {
+         'alpha': [0.1, 1, 10, 100],
+         'solver': ['auto', 'svd', 'cholesky', 'lsqr']
+     },
+
+     "Lasso Regressor": {
+         'alpha': [0.1, 1, 10, 100]
+     },
+
+     "ElasticNet Regressor": {
+         'alpha': [0.1, 1, 10, 100],
+         'l1_ratio': [0.1, 0.5, 0.9]
+     },
+
+     "Random Forest Regressor": {
+         'n_estimators': [100, 200, 300],
+         'max_depth': [None, 10, 20, 30],
+         'min_samples_split': [2, 5, 10],
+         'min_samples_leaf': [1, 2, 4]
+     },
+
+     "AdaBoost Regressor": {
+         'n_estimators': [50, 100, 200],
+         'learning_rate': [0.01, 0.1, 1]
+     },
+
+     "Gradient Boosting Regressor": {
+         'n_estimators': [100, 200, 300],
+         'learning_rate': [0.01, 0.1, 0.2],
+         'max_depth': [3, 5, 7]
+     },
+
+     "Hist Gradient Boosting Regressor": {
+         'learning_rate': [0.01, 0.1, 0.2],
+         'max_depth': [None, 10, 20],
+         'min_samples_leaf': [20, 50, 100]
+     },
+
+     "K Neighbors Regressor": {
+         'n_neighbors': [3, 5, 7],
+         'weights': ['uniform', 'distance'],
+         'metric': ['euclidean', 'manhattan']
+     },
+
+     "Decision Tree Regressor": {
+         'max_depth': [None, 10, 20, 30],
+         'min_samples_split': [2, 5, 10],
+         'min_samples_leaf': [1, 2, 4]
+     },
+
+     "SVR": {
+         'C': [0.1, 1, 10],
+         'kernel': ['linear', 'poly', 'rbf'],
+         'degree': [3, 4, 5],
+         'gamma': ['scale', 'auto']
+     },
+
+     "XGB Regressor": {
+         'n_estimators': [100, 200, 300],
+         'learning_rate': [0.01, 0.1, 0.2],
+         'max_depth': [3, 5, 7]
+     },
+
+     "XGBRF Regressor": {
+         'n_estimators': [100, 200, 300],
+         'learning_rate': [0.01, 0.1, 0.2],
+         'max_depth': [3, 5, 7]
+     },
+
+     "MLP Regressor": {
+         'hidden_layer_sizes': [(50,), (100,), (50, 50)],
+         'activation': ['tanh', 'relu'],
+         'solver': ['adam', 'sgd'],
+         'alpha': [0.0001, 0.001, 0.01],
+         'learning_rate': ['constant', 'adaptive']
+     },
+
+     "LGBM Regressor": {
+         'n_estimators': [100, 200, 300],
+         'learning_rate': [0.01, 0.1, 0.2],
+         'max_depth': [-1, 10, 20]
+     },
+
+     "Gaussian Naive Bayes": {
+         'var_smoothing': [1e-9, 1e-8, 1e-7]
+     }
+ }
+
+ # Define the regressors
+ regressors = {
+     "Linear Regression": LinearRegression(),
+     "SGD Regressor": SGDRegressor(),
+     "Ridge Regressor": Ridge(),
+     "Lasso Regressor": Lasso(),
+     "ElasticNet Regressor": ElasticNet(),
+     "Random Forest Regressor": RandomForestRegressor(),
+     "AdaBoost Regressor": AdaBoostRegressor(),
+     "Gradient Boosting Regressor": GradientBoostingRegressor(),
+     "Hist Gradient Boosting Regressor": HistGradientBoostingRegressor(),
+     "K Neighbors Regressor": KNeighborsRegressor(),
+     "Decision Tree Regressor": DecisionTreeRegressor(),
+     "SVR": SVR(),
+     "XGB Regressor": XGBRegressor(),
+     "XGBRF Regressor": XGBRFRegressor(),
+     "MLP Regressor": MLPRegressor(),
+     "LGBM Regressor": LGBMRegressor(),
+     "Gaussian Naive Bayes": GaussianNB()
+ }
+
+ classifiers = {
+     "Logistic Regression": LogisticRegression(),
+     "SGD Classifier": SGDClassifier(),
+     "Ridge Classifier": RidgeClassifier(),
+     "Random Forest Classifier": RandomForestClassifier(),
+     "AdaBoost Classifier": AdaBoostClassifier(),
+     "Gradient Boosting Classifier": GradientBoostingClassifier(),
+     "Hist Gradient Boosting Classifier": HistGradientBoostingClassifier(),
+     "K Neighbors Classifier": KNeighborsClassifier(),
+     "Decision Tree Classifier": DecisionTreeClassifier(),
+     "SVC": SVC(),
+     "XGB Classifier": XGBClassifier(),
+     "XGBRF Classifier": XGBRFClassifier(),
+     "MLP Classifier": MLPClassifier(),
+     "LGBM Classifier": LGBMClassifier(),
+     "Multinomial Naive Bayes": MultinomialNB(),
+     "Categorical Naive Bayes": CategoricalNB()
+ }
+
+ def perform_grid_search(model,model_name,X_train,X_test,y_train,y_test,eva):
+     if eva == "reg":
+         regressor = regressors[model_name]
+
+         param_grid_reg = param_grids_reg[model_name]
+
+         grid_search = GridSearchCV(estimator=regressor, param_grid=param_grid_reg, cv=5, scoring='neg_mean_squared_error')
+         grid_search.fit(X_train,y_train)
+         st.write(f"Best Parameters for {model_name}: {grid_search.best_params_}")
+         st.write(f"Best Score for {model_name}: {grid_search.best_score_}")
+         best_model = grid_search.best_estimator_
+         # evaluate the tuned estimator, not the untuned `model` that was passed in
+         evaluationer.evaluation("best hyperparams",X_train,X_test,y_train,y_test,best_model,root_mean_squared_error,eva)
+     elif eva == "class":
+         classifier = classifiers[model_name]
+         param_grid_class = param_grids_class[model_name]
+
+         grid_search = GridSearchCV(estimator=classifier, param_grid=param_grid_class, cv=5, scoring='accuracy')
+         grid_search.fit(X_train,y_train)
+         st.write(f"Best Parameters for {model_name}: {grid_search.best_params_}")
+         st.write(f"Best Score for {model_name}: {grid_search.best_score_}")
+         best_model = grid_search.best_estimator_
+         evaluationer.evaluation("best hyperparams",X_train,X_test,y_train,y_test,best_model,root_mean_squared_error,eva)
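A hypothetical regression call into this module; `model_name` must be a key of the `regressors` dict above, and `eva` picks the branch:

import grid_search_cv
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
grid_search_cv.perform_grid_search(model, "Random Forest Regressor",
                                   X_train, X_test, y_train, y_test, eva="reg")

Note that with cv=5 the search refits the estimator once per parameter combination and fold, so the larger grids (e.g. Random Forest: 3*4*3*3 = 108 combinations * 5 folds) can take a while on big data.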
models.py CHANGED
@@ -23,6 +23,8 @@ from sklearn.neural_network import MLPRegressor
 from lightgbm import LGBMRegressor
 from sklearn.naive_bayes import GaussianNB
 
+
+
 # dictionary where keys are name of algorithm and values are algorithm for classifier
 algos_class = {
     "Logistic Regression": LogisticRegression(),
requirements.txt CHANGED
@@ -1,10 +1,15 @@
+
 streamlit==1.34.0
 joblib==1.4.2
 numpy==1.26.4
 pandas==2.2.2
 scikit-learn==1.4.2
-seaborn==0.13.2
+datashader==0.16.2
+colorcet==3.1.0
+plotly==5.22.0
 matplotlib==3.9.0
-xgboost==2.0.3
-lightgbm==4.3.0
-statsmodels==0.14.2
+seaborn==0.13.2
+# xgboost, lightgbm and statsmodels are still imported by models.py, grid_search_cv.py and feature_selections.py
+xgboost==2.0.3
+lightgbm==4.3.0
+statsmodels==0.14.2
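With the pinned versions above, a fresh environment is set up with:

pip install -r requirements.txt

(xgboost, lightgbm and statsmodels are kept in the list because models.py, grid_search_cv.py and feature_selections.py still import them; dropping them, as the original commit did, would break those modules at import time.)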