##### RUN SCRIPT #######
# Train a degree-2 polynomial regression on the salary dataset and print the
# R^2 score on the training and test splits.
#
# Pipeline: load CSV -> select feature columns -> mean-impute numeric gaps ->
# convert the date column to a Unix timestamp -> one-hot / label encode the
# categorical columns -> train/test split -> standardize the target ->
# expand features to degree-2 polynomials -> fit LinearRegression -> score.
import os

import numpy as np
import pandas as pd
from joblib import dump
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (
    LabelEncoder,
    OneHotEncoder,
    PolynomialFeatures,
    StandardScaler,
)

data = pd.read_csv("Salary Prediction of Data Professions.csv")

# The target is the column at position 7; every other column is a candidate
# feature. Positions 0, 1 and 3 of that feature matrix are then dropped
# (presumably identifier-like columns -- TODO confirm against the CSV header).
X = data.iloc[:, data.columns != data.columns[7]].values
X = X[:, [i for i in range(X.shape[1]) if i not in [0, 1, 3]]]
Y = data.iloc[:, 7].values
print("initial array",X[0])

# Replace NaNs in feature columns 3, 5, 6 and 7 with the column mean.
# NOTE(review): the imputer is fitted on the full dataset, before the
# train/test split, so test rows contribute to the imputed means.
imputer = SimpleImputer(missing_values=np.nan,strategy='mean')
imputer.fit(X[:, [3, 5, 6, 7]])
X[:, [3, 5, 6, 7]] = imputer.transform(X[:, [3, 5, 6, 7]])
print("after managing missing values",X[0])

# Column 1 holds dates formatted as day-month-year; convert them to integer
# seconds since the Unix epoch (nanoseconds // 10**9).
X[:, 1] = pd.to_datetime(X[:, 1], format='%d-%m-%Y').astype('int64') // 10**9
print("after modifying date to timestamp",X[0])

# One-hot encode column 0; the remaining columns are passed through after the
# encoded columns, which shifts their positions.
ct = ColumnTransformer(transformers=[('encode',OneHotEncoder(),[0])],remainder='passthrough')
X = ct.fit_transform(X)
X = np.array(X)
print("encoding the sex M and F",X[0])

# Integer-encode the two remaining categorical columns (positions 3 and 5 of
# the one-hot-expanded matrix).
label_encoder_2 = LabelEncoder()
label_encoder_4 = LabelEncoder()
X[:, 3] = label_encoder_2.fit_transform(X[:, 3])
X[:, 5] = label_encoder_4.fit_transform(X[:, 5])
print("encoding position and departement",X[0])

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Standardize the target using statistics from the training split only; the
# test target is transformed with the same fitted scaler.
scaler = StandardScaler()
Y_train = scaler.fit_transform(Y_train.reshape(-1, 1)).ravel()
Y_test = scaler.transform(Y_test.reshape(-1, 1)).ravel()
print("Y train",Y_train[0])

polynomial = LinearRegression()
poly_reg = PolynomialFeatures(degree=2)
X_train_poly = poly_reg.fit_transform(X_train)
# Fit on the training split only; the test split is merely transformed so the
# fitted `poly_reg` stays tied to the training data (it was previously re-fit
# on the test split here).
X_test_poly = poly_reg.transform(X_test)

polynomial_model = polynomial.fit(X_train_poly,Y_train)

# `score` returns the coefficient of determination (R^2); the variable names
# and printed labels call it "accuracy".
poly_train_accuracy = polynomial_model.score(X_train_poly,Y_train)
poly_test_accuracy = polynomial_model.score(X_test_poly,Y_test)
print('poly_train_accuracy',poly_train_accuracy)
print('poly_test_accuracy',poly_test_accuracy)