Would Your Loan Be Approved?

yn = input("Would you like to install package dependencies? (y/n)")

if yn == "y":
    print("Installing dependencies")
    !pip3 install pandas
    !pip3 install numpy
    !pip3 install scikit-learn
else:
    print("Moving on")

Moving on

print("Importing packages")
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import pdb
import pickle  # Import pickle for model saving

Importing packages


/usr/lib/python3/dist-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.17.3 and <1.25.0 is required for this version of SciPy (detected version 1.25.2
  warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"

# Announce the stage, then read the raw loan records from the CSV file
# located next to the notebook.
print("Loading, pre-processing, and splitting data")

data_train = pd.read_csv(filepath_or_buffer='./trainLoan.csv')

Loading, pre-processing, and splitting data

# Peek at the first rows, then at the raw labels of the education column.
# Note the column name carries a leading space, as do its string values.
data_train.head(n=5)
data_train.loc[:, ' education'].values

array([' Graduate', ' Not Graduate', ' Graduate', ..., ' Not Graduate',
       ' Not Graduate', ' Graduate'], dtype=object)

# Report how many missing values each column contains.
print(data_train.isnull().sum())


def _encode_binary(column, mapping):
    """Replace the text labels in ``data_train[column]`` with integer codes.

    ``mapping`` maps whitespace-stripped labels to their integer code.
    The column is left untouched when it already holds only those codes,
    so re-running this cell does not wipe it out. A ``ValueError`` is raised
    for labels missing from ``mapping``; a bare ``Series.map`` would silently
    turn them into NaN.
    """
    values = data_train[column]
    if values.isin(list(mapping.values())).all():
        return  # already encoded by an earlier run of this cell
    encoded = values.astype(str).str.strip().map(mapping)
    unmapped = encoded.isnull()
    if unmapped.any():
        unknown = sorted(set(values[unmapped].astype(str)))
        raise ValueError(f"Unexpected values in column {column!r}: {unknown}")
    data_train[column] = encoded


# Encode the categorical columns as 0/1. The column names keep the leading
# space they are addressed with throughout this notebook.
_encode_binary(' education', {'Graduate': 1, 'Not Graduate': 0})
_encode_binary(' self_employed', {'Yes': 1, 'No': 0})
_encode_binary(' loan_status', {'Approved': 1, 'Rejected': 0})

# Select features and target variable
X = data_train[[
    ' no_of_dependents',
    ' education',
    ' self_employed',
    ' income_annum',
    ' loan_amount',
    ' loan_term',
    ' cibil_score',
    ' residential_assets_value',
    ' commercial_assets_value',
    ' luxury_assets_value',
    ' bank_asset_value',
]]
y = data_train[' loan_status']

# Split the data into training and testing sets (80% / 20%, fixed seed so
# the split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

loan_id                      0
 no_of_dependents            0
 education                   0
 self_employed               0
 income_annum                0
 loan_amount                 0
 loan_term                   0
 cibil_score                 0
 residential_assets_value    0
 commercial_assets_value     0
 luxury_assets_value         0
 bank_asset_value            0
 loan_status                 0
dtype: int64

print("Training model")

# Initialize and train the logistic regression model on the training split.
log_reg = LogisticRegression(solver='liblinear')
log_reg.fit(X_train, y_train)

# The leftover pdb.set_trace() breakpoint was removed here: it paused every
# run at an interactive debugger prompt before the model could be saved.
print("Training completed")

save_path = "logistic_regression_model_loan.pkl"
print(f"Saving model to: {save_path}")
# Save the model to a file using pickle
with open(save_path, 'wb') as model_file:
    pickle.dump(log_reg, model_file)

# Load the model back from the file to confirm the pickle round-trips.
# NOTE: unpickling can execute arbitrary code; only load files you trust
# (this one was written a few lines above).
with open(save_path, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

print("Finished saving trained model")

Training model
--Return--
None
> [0;32m/tmp/ipykernel_4065/3141272518.py[0m(7)[0;36m<module>[0;34m()[0m
[0;32m      5 [0;31m[0mlog_reg[0m[0;34m.[0m[0mfit[0m[0;34m([0m[0mX_train[0m[0;34m,[0m [0my_train[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      6 [0;31m[0;34m[0m[0m
[0m[0;32m----> 7 [0;31m[0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      8 [0;31m[0;34m[0m[0m
[0m[0;32m      9 [0;31m[0mprint[0m[0;34m([0m[0;34m"Training completed"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m


Training completed
Saving model to: logistic_regression_model_loan.pkl
Finished saving trained model

# Count missing values in the held-out features (the result is not stored
# or printed here).
X_test.isna().sum()

# Score the trained classifier on the held-out split and print its accuracy.
y_pred = log_reg.predict(X_test)
score = accuracy_score(y_true=y_test, y_pred=y_pred)
print(score)

---------------------------------------------------------------------------

KeyError                                  Traceback (most recent call last)

~/.local/lib/python3.10/site-packages/pandas/core/indexes/base.py in get_loc(self, key)
   3789         try:
-> 3790             return self._engine.get_loc(casted_key)
   3791         except KeyError as err:

index.pyx in pandas._libs.index.IndexEngine.get_loc()

index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'gender'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)

/tmp/ipykernel_4065/582419892.py in <module>
----> 1 X_test['gender'].fillna(1, inplace=True)
      2 
      3 X_test.isna().sum()
      4 
      5 y_pred = log_reg.predict(X_test)

~/.local/lib/python3.10/site-packages/pandas/core/frame.py in __getitem__(self, key)
   3894             if self.columns.nlevels > 1:
   3895                 return self._getitem_multilevel(key)
-> 3896             indexer = self.columns.get_loc(key)
   3897             if is_integer(indexer):
   3898                 indexer = [indexer]

~/.local/lib/python3.10/site-packages/pandas/core/indexes/base.py in get_loc(self, key)
   3795             ):
   3796                 raise InvalidIndexError(key)
-> 3797             raise KeyError(key) from err
   3798         except TypeError:
   3799             # If we have a listlike key, _check_indexing_error will raise

KeyError: 'gender'