# Optionally install the third-party packages this notebook relies on.
yn = input("Would you like to install package dependencies? (y/n)")

# Normalise the answer so "Y", "yes" or stray whitespace are also accepted.
if yn.strip().lower() in ("y", "yes"):
    print("Installing dependencies")
    import subprocess
    import sys

    # Run pip through the interpreter that is executing this code so the
    # packages land in the environment the imports below will actually use
    # (a bare `pip3` on PATH may belong to a different Python).
    install_result = subprocess.run(
        [sys.executable, "-m", "pip", "install",
         "pandas", "numpy", "scikit-learn"],
        check=False,
    )
    # Stay best-effort like the original shell calls: report, don't abort.
    if install_result.returncode != 0:
        print("pip exited with status", install_result.returncode)
else:
    print("Moving on")
Installing dependencies
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: pandas in /home/dudeamabobby/.local/lib/python3.10/site-packages (2.1.0)
Requirement already satisfied: numpy>=1.22.4 in /home/dudeamabobby/.local/lib/python3.10/site-packages (from pandas) (1.25.2)
Requirement already satisfied: python-dateutil>=2.8.2 in /home/dudeamabobby/.local/lib/python3.10/site-packages (from pandas) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/lib/python3/dist-packages (from pandas) (2022.1)
Requirement already satisfied: tzdata>=2022.1 in /home/dudeamabobby/.local/lib/python3.10/site-packages (from pandas) (2023.3)
Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: numpy in /home/dudeamabobby/.local/lib/python3.10/site-packages (1.25.2)
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: scikit-learn in /home/dudeamabobby/.local/lib/python3.10/site-packages (1.3.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in /home/dudeamabobby/.local/lib/python3.10/site-packages (from scikit-learn) (3.2.0)
Requirement already satisfied: joblib>=1.1.1 in /home/dudeamabobby/.local/lib/python3.10/site-packages (from scikit-learn) (1.3.2)
Requirement already satisfied: scipy>=1.5.0 in /usr/lib/python3/dist-packages (from scikit-learn) (1.8.0)
Requirement already satisfied: numpy>=1.17.3 in /home/dudeamabobby/.local/lib/python3.10/site-packages (from scikit-learn) (1.25.2)
print("Importing packages")

# Standard library
import pdb
import pickle  # serialises the trained model to disk

# Third-party: data handling
import numpy as np
import pandas as pd

# Third-party: modelling, evaluation and preprocessing helpers
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
Importing packages


/usr/lib/python3/dist-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.17.3 and <1.25.0 is required for this version of SciPy (detected version 1.25.2
  warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
# Announce the stage, then read the training table from the CSV that sits
# next to this notebook.
print("Loading, pre-processing, and splitting data")

data_train = pd.read_csv("./trainStroke.csv")
Loading, pre-processing, and splitting data
# Preview the first five rows of the loaded table.
data_train.head(n=5)
id gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
0 9046 1.0 67.0 0 1 1 Private Urban 228.69 36.600000 formerly smoked 1
1 51676 0.0 61.0 0 0 1 Self-employed Rural 202.21 28.893237 never smoked 1
2 31112 1.0 80.0 0 1 1 Private Rural 105.92 32.500000 never smoked 1
3 60182 0.0 49.0 0 0 1 Private Urban 171.23 34.400000 smokes 1
4 1665 0.0 79.0 1 0 1 Self-employed Rural 174.12 24.000000 never smoked 1
# Fill missing values in the numeric 'age' and 'bmi' columns with the column
# mean. The filled Series is assigned back to the frame rather than calling
# fillna(inplace=True) on a column selection: that form is chained assignment,
# which pandas deprecates and which does not update the parent frame once
# copy-on-write is enabled.
for numeric_column in ('age', 'bmi'):
    data_train[numeric_column] = data_train[numeric_column].fillna(
        data_train[numeric_column].mean()
    )
print(data_train.isnull().sum())

# Encode the 'gender' column as 1 (Male) / 0 (Female).
# NOTE(review): Series.map yields NaN for any value that is not a key of the
# mapping, so any other gender label becomes NaN here -- confirm that this is
# handled before the features reach the model.
data_train['gender'] = data_train['gender'].map({'Male': 1, 'Female': 0})

# Encode the 'ever_married' column as 1 (Yes) / 0 (No).
data_train['ever_married'] = data_train['ever_married'].map({'Yes': 1, 'No': 0})

# Select features and target variable
X = data_train[['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'avg_glucose_level','bmi']]
y = data_train['stroke']


# Split the data into training and testing sets (80% / 20%); the fixed
# random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64
print("Training model")

# Initialize and train the logistic regression model on the training split.
log_reg = LogisticRegression(solver='liblinear')
log_reg.fit(X_train, y_train)

print("Training completed")

save_path = "logistic_regression_model_stroke.pkl"
print("Saving model to: " + save_path)
# Save the model to a file using pickle
with open(save_path, 'wb') as model_file:
    pickle.dump(log_reg, model_file)

# Load the model back from the file so the saved artefact is known to
# round-trip. This file was written just above; never unpickle a file from a
# source you do not trust, since unpickling can execute arbitrary code.
with open(save_path, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

print("Finished saving trained model")


Training model
--Return--
None
> /tmp/ipykernel_1603/4206476194.py(7)<module>()
      5 log_reg.fit(X_train, y_train)
      6 
----> 7 pdb.set_trace()
      8 
      9 print("Training completed")



Training completed
Saving model to: logistic_regression_model_stroke.pkl
Finished saving trained model
# Replace missing 'gender' values in the test features with 1 before
# predicting. fillna returns a new frame that is bound back to X_test;
# calling fillna(inplace=True) on a selected column is chained assignment and
# is not guaranteed to modify X_test itself.
X_test = X_test.fillna({'gender': 1})

# Score the trained model on the held-out split and report plain accuracy.
y_pred = log_reg.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(score)
0.9393346379647749