from sklearn import tree
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Read CSV file in pandas
df_bp = pd.read_csv("BloodPressure_train.csv")
df_bp.head()
# Check number of rows and columns in the dataframe
df_bp.shape
# Encode dichotomous/categorical strings as 0 and 1
# Assigning the result back avoids chained inplace replace, which is deprecated in recent pandas
df_bp['gender'] = df_bp['gender'].replace({'Male': 1, 'Female': 0, 'Other': 0})
df_bp['ever_married'] = df_bp['ever_married'].replace({'Yes': 1, 'No': 0})
df_bp['Residence_type'] = df_bp['Residence_type'].replace({'Urban': 1, 'Rural': 0})
df_bp['smoking_status'] = df_bp['smoking_status'].replace({'smokes': 1, 'formerly smoked': 1, 'never smoked': 0})
# Preview the dataframe after encoding
df_bp.head()
# Drop columns that are not relevant
BPdata = df_bp.drop(["id","work_type"], axis=1)
feature_names = BPdata.columns
BPdata.head()
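# If work_type (dropped above) were to be kept instead, one option is one-hot encoding;
# a sketch assuming it is a multi-category string column (not used in the rest of this pipeline):
work_type_dummies = pd.get_dummies(df_bp['work_type'], prefix='work_type')
BPdata_with_work = pd.concat([BPdata, work_type_dummies], axis=1)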
# Replace NaNs with zeros
BPdata=BPdata.fillna(0.0)
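# Alternative sketch (not applied in this pipeline): impute with the column median rather
# than zero, assuming 'bmi' is the numeric column that actually carries the NaNs;
# zero-filling can distort that column's distribution.
BPdata_median = df_bp.drop(["id", "work_type"], axis=1)
BPdata_median['bmi'] = BPdata_median['bmi'].fillna(BPdata_median['bmi'].median())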
# Check number of non-zeros in each column
BPdata.astype(bool).sum(axis=0)
# Number of rows and columns in the cleaned dataframe
BPdata.shape
# Check data types of columns
print(BPdata.dtypes)
# Get statistical details of each column in the dataframe
BPdata.describe()
# Show pairwise scatter plot matrix
from pandas.plotting import scatter_matrix
plt.rcParams["figure.figsize"] = (16,16)
scatter_matrix(BPdata)
plt.show()
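# A numeric correlation matrix complements the scatter plots; after the encoding above
# the remaining columns should all be numeric, so DataFrame.corr() applies directly (sketch)
print(BPdata.corr())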
# Select X and y data from the dataset
X = BPdata.iloc[:, 0:9].values
y = BPdata.iloc[:, 9].values
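# Stroke-style outcome labels are usually highly imbalanced, which makes plain accuracy
# look optimistic; a quick class count makes that visible (sketch, assuming y holds 0/1 labels)
unique_classes, class_counts = np.unique(y, return_counts=True)
print("Class counts:", dict(zip(unique_classes, class_counts)))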
# Import model selection, metrics, and classifier modules from scikit-learn
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# Build the list of candidate models to compare
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
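# KNN and SVM are sensitive to feature scale; one option (a sketch, not added to the
# comparison below) is to keep a StandardScaler inside a pipeline so scaling happens
# within each cross-validation fold.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
scaled_svm = make_pipeline(StandardScaler(), SVC())  # e.g. models.append(('SVM-scaled', scaled_svm))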
# Evaluate each model in turn
validation_size = 0.20
seed = 7
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y,test_size=validation_size, random_state=seed)
scoring = 'accuracy'
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    msg = "%s: CV mean %f (std %f), test accuracy %f" % (name, cv_results.mean(), cv_results.std(), accuracy_score(y_test, predictions))
    print(msg)
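# Per-class detail for the last model fitted in the loop (illustrative sketch; the
# classification_report and confusion_matrix imports above are otherwise unused).
# On an imbalanced label, per-class precision/recall is more informative than accuracy alone.
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))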
# Compare ML model results(scores) using Matplotlib
plt.rcParams["figure.figsize"] = (8,6)
fig = plt.figure()
fig.suptitle('Model Comparison (43400 Records)')
ax = fig.add_subplot(111)
plt.boxplot(results)
plt.ylabel('Avg. Score')
plt.xlabel('Model')
ax.set_xticklabels(names)
plt.show()
# Prediction for a single new input using logistic regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
# gender, age, hypertension, heart disease, ever married, residence type, glucose level, bmi, smoking status
X_new = [[1, 80, 1, 0, 1, 1, 300, 30, 1]]
predictions = lr.predict(X_new)
print("Predicted stroke class (0: low risk, 1: high risk):", predictions)