|
import pandas as pd |
|
import numpy as np |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
from scipy.stats.mstats import winsorize |
|
from sklearn.preprocessing import StandardScaler,MinMaxScaler |
|
from sklearn.metrics import root_mean_squared_error |
|
from scipy.stats import yeojohnson |
|
import evaluationer |
|
from sklearn.model_selection import train_test_split as tts |
|
def detect_outliers(df, num_cols=None):
    """Flag outlier rows column by column, choosing the rule from the skewness.

    Columns whose sample skewness lies in ``[-0.5, 0.5]`` are checked with the
    z-score rule (mean +/- 3 standard deviations); every other column
    (including columns whose skewness is NaN) is checked with the IQR rule
    (Q1 - 1.5*IQR, Q3 + 1.5*IQR).

    Args:
        df: DataFrame to inspect.
        num_cols: Column labels to check. May be a list, a pandas Index or
            any other sized iterable of labels, or a single label given as
            a string. ``None`` or an empty collection means "every column
            whose dtype is not object".

    Returns:
        A tuple ``(outlier_df, outlier_indexes)``:

        * ``outlier_df`` has one row per column that contains at least one
          outlier, with the columns ``method``, ``columns name``,
          ``upper limit``, ``lower limit``, ``no of Rows`` and
          ``percentage outlier`` (limits and percentage rounded to 2 dp).
        * ``outlier_indexes`` is a de-duplicated list of the index labels of
          all flagged rows (order is not guaranteed).

    Side effects:
        Prints a notice when no column falls under one of the two rules, and
        rebinds the module-level names ``outlier_df``, ``zscore_cols``,
        ``iqr_cols`` and ``outlier_indexes``.
    """
    # Kept for backward compatibility: these results have always been
    # published as module globals in addition to being returned.
    global outlier_df, zscore_cols, outlier_indexes, iqr_cols

    if isinstance(num_cols, str):
        num_cols = [num_cols]
    elif num_cols is None or len(num_cols) == 0:
        num_cols = df.select_dtypes(exclude="object").columns.tolist()
    else:
        num_cols = list(num_cols)

    # Roughly symmetric columns get the z-score rule, skewed ones the IQR rule.
    zscore_cols = []
    iqr_cols = []
    for col in num_cols:
        if -0.5 <= df[col].skew() <= 0.5:
            zscore_cols.append(col)
        else:
            iqr_cols.append(col)

    summary_rows = []
    outlier_indexes = []

    if zscore_cols:
        for col in zscore_cols:
            mean = df[col].mean()
            spread = 3 * df[col].std()
            _record_outliers(df, col, "ZScore", mean - spread, mean + spread,
                             summary_rows, outlier_indexes)
    else:
        print("No columns for Zscore method")

    if iqr_cols:
        for col in iqr_cols:
            q1 = df[col].quantile(.25)
            q3 = df[col].quantile(.75)
            whisker = 1.5 * (q3 - q1)
            _record_outliers(df, col, "IQR", q1 - whisker, q3 + whisker,
                             summary_rows, outlier_indexes)
    else:
        print("No columns for IQR method")

    # Build the summary once instead of concatenating onto an empty frame
    # per column; this keeps proper dtypes and avoids pandas' empty-concat
    # deprecation warning.
    outlier_df = pd.DataFrame(
        summary_rows,
        columns=["method", "columns name", "upper limit", "lower limit",
                 "no of Rows", "percentage outlier"],
    )

    # A row can be an outlier in several columns; report each label once.
    outlier_indexes = list(set(outlier_indexes))

    return outlier_df, outlier_indexes


def _record_outliers(df, col, method, lower, upper, summary_rows, flagged):
    """Collect rows of ``df`` whose ``col`` value lies outside [lower, upper].

    Extends ``flagged`` with the index labels of those rows and, when there
    is at least one, appends a summary dict for the column to ``summary_rows``.
    """
    outside = df[(df[col] < lower) | (df[col] > upper)]
    flagged.extend(outside.index.tolist())
    if len(outside) > 0:
        summary_rows.append({
            "method": method,
            "columns name": col,
            "upper limit": round(upper, 2),
            "lower limit": round(lower, 2),
            "no of Rows": len(outside),
            "percentage outlier": round(len(outside) * 100 / len(df), 2),
        })
|
|
|
|
|
def outlier_handling(df, y, model, outlier_indexes=None, outlier_cols=None,
                     method=root_mean_squared_error, test_size=0.2,
                     random_state=42, eva="reg"):
    """Build several outlier-treated copies of ``df`` and evaluate ``model`` on each.

    For ``eva == "reg"`` the following variants are produced (each is a copy
    of ``df`` in which only the selected columns are altered):

    * standard-scaled, min-max-scaled, rows in ``outlier_indexes`` dropped,
      Yeo-Johnson transformed, rank transformed, winsorized (5% per tail);
    * additionally, only when the selected columns contain no negative
      values: log and sqrt transformed (with a 1e-5 offset), and the
      winsorized log/sqrt values mapped back with ``exp`` / squaring.

    For ``eva == "class"`` only standard scaling, min-max scaling and rank
    transformation are produced, applied to every column of ``df``.

    Every variant is split with ``train_test_split`` and handed to
    ``evaluationer.evaluation``.
    NOTE(review): ``evaluationer`` is a project module not visible here;
    presumably it fits ``model`` and records scores in
    ``reg_evaluation_df`` / ``classification_evaluation_df`` -- confirm.

    Args:
        df: Feature DataFrame.
        y: Target, indexed with ``y[frame.index]`` for each variant.
        model: Estimator forwarded to ``evaluationer.evaluation``.
        outlier_indexes: Index labels of rows to drop for the
            "outliers dropped" variant. ``None`` or empty keeps all rows.
        outlier_cols: Columns to transform (list-like of labels or a single
            label). ``None`` or empty means every non-object column of ``df``.
        method: Metric callable forwarded to ``evaluationer.evaluation``.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.
        eva: ``"reg"`` or ``"class"``.

    Returns:
        ``(evaluation_df, frames, names)`` where ``evaluation_df`` is
        ``evaluationer.reg_evaluation_df`` or
        ``evaluationer.classification_evaluation_df``, ``frames`` is the list
        of variant DataFrames and ``names`` the matching list of labels.

    Raises:
        ValueError: If ``eva`` is neither ``"reg"`` nor ``"class"``.

    Side effects:
        Rebinds the module-level ``*_df`` names declared ``global`` below
        (only those variants that are actually built on this call) and
        prints progress notices.
    """
    # Kept for backward compatibility: every variant has always been exposed
    # as a module global in addition to being returned.
    global outliers_dropped_df, log_transformed_df, sqrt_transformed_df
    global yeo_johnson_transformed_df, rank_transformed_df
    global std_scaler_df, winsorize_transformed_df, minmaxscaler_df
    global inverse_log_transformed_winsorize_df
    global inverse_sqrt_transformed_winsorize_df

    if eva == "reg":
        cols = _columns_to_transform(df, outlier_cols)
        winsor_limits = [0.05, 0.05]

        if outlier_indexes is None or len(outlier_indexes) == 0:
            print("no outlier indexes passed")
            outliers_dropped_df = df.copy()
        else:
            outliers_dropped_df = df.drop(index=outlier_indexes)

        # log / sqrt are only defined for non-negative input.
        non_negative = not (df[cols] < 0).any().any()

        if non_negative:
            log_transformed_df = df.copy()
            log_transformed_df[cols] = np.log(log_transformed_df[cols] + 1e-5)
            sqrt_transformed_df = df.copy()
            sqrt_transformed_df[cols] = np.sqrt(sqrt_transformed_df[cols] + 1e-5)

            # Winsorize in the transformed space, then map back to the
            # original scale.
            inverse_log_transformed_winsorize_df = log_transformed_df.copy()
            inverse_sqrt_transformed_winsorize_df = sqrt_transformed_df.copy()
            for column in cols:
                inverse_log_transformed_winsorize_df[column] = np.exp(
                    winsorize(inverse_log_transformed_winsorize_df[column],
                              limits=winsor_limits))
                inverse_sqrt_transformed_winsorize_df[column] = (
                    winsorize(inverse_sqrt_transformed_winsorize_df[column],
                              limits=winsor_limits)) ** 2
        else:
            print("df have values less than zero")

        std_scaler_df = df.copy()
        std_scaler_df[cols] = StandardScaler().fit_transform(std_scaler_df[cols])

        minmaxscaler_df = df.copy()
        minmaxscaler_df[cols] = MinMaxScaler().fit_transform(minmaxscaler_df[cols])

        yeo_johnson_transformed_df = df.copy()
        for column in cols:
            try:
                transformed, _lmbda = yeojohnson(yeo_johnson_transformed_df[column])
            except Exception:
                # Best effort: leave the column untouched when the fit fails.
                print(f"Yeo-Johnson transformation failed for column '{column}'. Original data used.")
            else:
                yeo_johnson_transformed_df[column] = transformed

        rank_transformed_df = df.copy()
        rank_transformed_df[cols] = rank_transformed_df[cols].rank()

        winsorize_transformed_df = df.copy()
        for column in cols:
            winsorize_transformed_df[column] = winsorize(
                winsorize_transformed_df[column], limits=winsor_limits)

        variants = [
            ("std_scaler_df", std_scaler_df),
            ("minmaxscaler_df", minmaxscaler_df),
            ("outliers_dropped_df", outliers_dropped_df),
        ]
        if non_negative:
            variants += [
                ("log_transformed_df", log_transformed_df),
                ("sqrt_transformed_df", sqrt_transformed_df),
            ]
        variants += [
            ("yeo_johnson_transformed_df", yeo_johnson_transformed_df),
            ("rank_transformed_df", rank_transformed_df),
            ("winsorize_transformed_df", winsorize_transformed_df),
        ]
        if non_negative:
            variants += [
                ("inverse_log_transformed_winsorize_df",
                 inverse_log_transformed_winsorize_df),
                ("inverse_sqrt_transformed_winsorize_df",
                 inverse_sqrt_transformed_winsorize_df),
            ]

        outlier_handled_df_name = [name for name, _ in variants]
        outlier_handled_df = [frame for _, frame in variants]

        for name, frame in variants:
            # y is re-aligned per variant because the "dropped" frame has
            # fewer rows than df.
            X_train, X_test, y_train, y_test = tts(
                frame, y[frame.index],
                test_size=test_size, random_state=random_state)
            evaluationer.evaluation(name, X_train, X_test, y_train, y_test,
                                    model, method, eva)

        return evaluationer.reg_evaluation_df, outlier_handled_df, outlier_handled_df_name

    elif eva == "class":
        # Build fresh frames from the scaler output rather than writing
        # floats into possibly-integer columns through .loc[:, :].
        std_scaler_df = pd.DataFrame(StandardScaler().fit_transform(df),
                                     index=df.index, columns=df.columns)
        minmaxscaler_df = pd.DataFrame(MinMaxScaler().fit_transform(df),
                                       index=df.index, columns=df.columns)
        rank_transformed_df = df.rank()

        outlier_handled_df = [std_scaler_df, minmaxscaler_df, rank_transformed_df]
        outlier_handled_df_name = ["std_scaler_df", "minmaxscaler_df", "rank_transformed_df"]

        for name, frame in zip(outlier_handled_df_name, outlier_handled_df):
            X_train, X_test, y_train, y_test = tts(
                frame, y[frame.index],
                test_size=test_size, random_state=random_state)
            evaluationer.evaluation(name, X_train, X_test, y_train, y_test,
                                    model, method, eva="class")

        return evaluationer.classification_evaluation_df, outlier_handled_df, outlier_handled_df_name

    raise ValueError(f"eva must be 'reg' or 'class', got {eva!r}")


def _columns_to_transform(df, outlier_cols):
    """Return the list of column labels the "reg" transformations act on.

    A single string label is wrapped in a list; ``None`` or an empty
    collection falls back to every non-object column of ``df``.
    """
    if isinstance(outlier_cols, str):
        return [outlier_cols]
    if outlier_cols is not None:
        chosen = list(outlier_cols)
        if chosen:
            return chosen
    return df.select_dtypes(exclude="O").columns.tolist()
|
|
|
|
|
|