Exploratory Analysis on the Heart Disease Dataset

29. Exploratory Analysis on the Heart Disease Dataset#

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import pandas as pd
#Alternatively: download data from https://www.kaggle.com/datasets/redwankarimsony/heart-disease-data
df = pd.read_csv('/kaggle/input/heart-disease-data/heart_disease_uci.csv')
df.head(10)

	id	age	sex	dataset	cp	trestbps	chol	fbs	restecg	thalch	exang	oldpeak	slope	ca	thal	num
0	1	63	Male	Cleveland	typical angina	145.0	233.0	True	lv hypertrophy	150.0	False	2.3	downsloping	0.0	fixed defect	0
1	2	67	Male	Cleveland	asymptomatic	160.0	286.0	False	lv hypertrophy	108.0	True	1.5	flat	3.0	normal	2
2	3	67	Male	Cleveland	asymptomatic	120.0	229.0	False	lv hypertrophy	129.0	True	2.6	flat	2.0	reversable defect	1
3	4	37	Male	Cleveland	non-anginal	130.0	250.0	False	normal	187.0	False	3.5	downsloping	0.0	normal	0
4	5	41	Female	Cleveland	atypical angina	130.0	204.0	False	lv hypertrophy	172.0	False	1.4	upsloping	0.0	normal	0
5	6	56	Male	Cleveland	atypical angina	120.0	236.0	False	normal	178.0	False	0.8	upsloping	0.0	normal	0
6	7	62	Female	Cleveland	asymptomatic	140.0	268.0	False	lv hypertrophy	160.0	False	3.6	downsloping	2.0	normal	3
7	8	57	Female	Cleveland	asymptomatic	120.0	354.0	False	normal	163.0	True	0.6	upsloping	0.0	normal	0
8	9	63	Male	Cleveland	asymptomatic	130.0	254.0	False	lv hypertrophy	147.0	False	1.4	flat	1.0	reversable defect	2
9	10	53	Male	Cleveland	asymptomatic	140.0	203.0	True	lv hypertrophy	155.0	True	3.1	downsloping	0.0	reversable defect	1

# Use the `id` column as the row index so it is no longer treated as a data column.
df = df.set_index('id')
df

	age	sex	dataset	cp	trestbps	chol	fbs	restecg	thalch	exang	oldpeak	slope	ca	thal	num
id
1	63	Male	Cleveland	typical angina	145.0	233.0	True	lv hypertrophy	150.0	False	2.3	downsloping	0.0	fixed defect	0
2	67	Male	Cleveland	asymptomatic	160.0	286.0	False	lv hypertrophy	108.0	True	1.5	flat	3.0	normal	2
3	67	Male	Cleveland	asymptomatic	120.0	229.0	False	lv hypertrophy	129.0	True	2.6	flat	2.0	reversable defect	1
4	37	Male	Cleveland	non-anginal	130.0	250.0	False	normal	187.0	False	3.5	downsloping	0.0	normal	0
5	41	Female	Cleveland	atypical angina	130.0	204.0	False	lv hypertrophy	172.0	False	1.4	upsloping	0.0	normal	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
916	54	Female	VA Long Beach	asymptomatic	127.0	333.0	True	st-t abnormality	154.0	False	0.0	NaN	NaN	NaN	1
917	62	Male	VA Long Beach	typical angina	NaN	139.0	False	st-t abnormality	NaN	NaN	NaN	NaN	NaN	NaN	0
918	55	Male	VA Long Beach	asymptomatic	122.0	223.0	True	st-t abnormality	100.0	False	0.0	NaN	NaN	fixed defect	2
919	58	Male	VA Long Beach	asymptomatic	NaN	385.0	True	lv hypertrophy	NaN	NaN	NaN	NaN	NaN	NaN	0
920	62	Male	VA Long Beach	atypical angina	120.0	254.0	False	lv hypertrophy	93.0	True	0.0	NaN	NaN	NaN	1

920 rows × 15 columns

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        299 non-null    float64
 12  thal      301 non-null    float64
 13  num       303 non-null    int64  
dtypes: float64(3), int64(11)
memory usage: 33.3 KB

29.1. Descriptive Analysis#

Let’s look at the summary statistics:

df.describe()

	id	age	trestbps	chol	thalch	oldpeak	ca	num
count	920.000000	920.000000	861.000000	890.000000	865.000000	858.000000	309.000000	920.000000
mean	460.500000	53.510870	132.132404	199.130337	137.545665	0.878788	0.676375	0.995652
std	265.725422	9.424685	19.066070	110.780810	25.926276	1.091226	0.935653	1.142693
min	1.000000	28.000000	0.000000	0.000000	60.000000	-2.600000	0.000000	0.000000
25%	230.750000	47.000000	120.000000	175.000000	120.000000	0.000000	0.000000	0.000000
50%	460.500000	54.000000	130.000000	223.000000	140.000000	0.500000	0.000000	1.000000
75%	690.250000	60.000000	140.000000	268.000000	157.000000	1.500000	1.000000	2.000000
max	920.000000	77.000000	200.000000	603.000000	202.000000	6.200000	3.000000	4.000000

df['dataset'].unique()

array(['Cleveland', 'Hungary', 'Switzerland', 'VA Long Beach'],
      dtype=object)

df['slope'].unique()

array(['downsloping', 'flat', 'upsloping', nan], dtype=object)

Let’s visualize the numeric columns as box plots, each on its own scale:

from matplotlib import pyplot as plt
# Draw one box plot per numeric column, side by side in a single row, so that
# every variable gets its own y-axis scale.
# Select the numeric columns. Nothing is dropped on this line: `id` is absent
# only because the earlier `set_index('id')` call moved it into the index.
numeric_columns = df.select_dtypes(include='number').columns
n = len(numeric_columns)
plt.figure(figsize=(12,4))
for i,var in enumerate(numeric_columns):
    # Subplot positions are 1-based, hence i+1.
    plt.subplot(1,n,i+1)
    df[var].plot.box()

../_images/1252c6d019a19c070f24ba0552b3be9bbb1a566cbc36b184f9354d7087ece0fb.png

Age looks well distributed. trestbps, chol, thalch, oldpeak have outliers.

import seaborn as sns
sns.pairplot(df.select_dtypes(include='number'))

/opt/conda/lib/python3.10/site-packages/seaborn/axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)

<seaborn.axisgrid.PairGrid at 0x7f263033dc90>

../_images/12bf06fdf6beefa282d18c96a7a236c224305dec89d7d5d10a50feb2df8e630a.png

df.select_dtypes(include='number').columns

Index(['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca', 'num'], dtype='object')

columns = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca', 'num','sex']
sns.pairplot(df[columns], hue='sex')

/opt/conda/lib/python3.10/site-packages/seaborn/axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)

<seaborn.axisgrid.PairGrid at 0x7f260a140640>

../_images/5efab1259b11dcfd6666e3ba0b17fae45c09c9ed8a4793d2aaf06653d8bcdbb5.png

columns = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca', 'num']
sns.pairplot(df[columns], hue='num')

/opt/conda/lib/python3.10/site-packages/seaborn/axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)

<seaborn.axisgrid.PairGrid at 0x7f26081bdc60>

../_images/92df524033e8efbc4eb61b0810970b4297a0fe7272127cdd7d0a2442a44bb78d.png

columns = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca', 'num','dataset']
sns.pairplot(df[columns], hue='dataset')

/opt/conda/lib/python3.10/site-packages/seaborn/axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)

<seaborn.axisgrid.PairGrid at 0x7f260696f490>

../_images/c708cbc1d1bc930bc4022edfaf6059ec1137b1d0d6879ab1ef9af35233a932c5.png

df['sex'].value_counts().plot.bar()

<Axes: xlabel='sex'>

../_images/781b8d10c5f1218aec6ea6b5fce8d9afa1deb7fbbb785784f0351ca8396c61a8.png

29.2. Is the dataset balanced across sites?#

# One semi-transparent age histogram per `dataset` group, drawn on the same
# axes so the distributions can be compared.
df.groupby('dataset')['age'].plot.hist(alpha=0.5)
plt.legend()

<matplotlib.legend.Legend at 0x7f262028e6e0>

../_images/39c9b37df13416bcaf318fe00ba1f412999c5109bda20a02ca31ded780ec9741.png

df.boxplot(column='age', by='dataset', grid=False)

<Axes: title={'center': 'age'}, xlabel='dataset'>

../_images/0b5d81bc8ed341dff97fe3bd85dd1ffa16d692ab331aac149bd00e00d93bb017.png

df.boxplot(column='num', by='dataset', grid=False)

<Axes: title={'center': 'num'}, xlabel='dataset'>

../_images/66c3de4298d141a7f17e7fb16ce6d0a8d50471878e1e182c562161bcd3295866.png

# normalize=1 scales each `dataset` column to sum to 1; after the transpose
# every site becomes one stacked bar showing its Female/Male proportions.
pd.crosstab(df['sex'],df['dataset'],normalize=1).T.plot.bar(stacked=True)

<Axes: xlabel='dataset'>

../_images/25272cd589aca2c4858deb173c999ddb212de45401b467e4a5fb57123adfe5ac.png

The data is not balanced with respect to sex and age across the four sites.

29.3. Is sex a risk factor for heart disease?#

pd.crosstab(df['num'],df['sex'], margins=True)

sex	Female	Male	All
num
0	144	267	411
1	30	235	265
2	10	99	109
3	8	99	107
4	2	26	28
All	194	726	920

pd.crosstab(df['num'],df['sex'], margins=True, normalize=1)

sex	Female	Male	All
num
0	0.742268	0.367769	0.446739
1	0.154639	0.323691	0.288043
2	0.051546	0.136364	0.118478
3	0.041237	0.136364	0.116304
4	0.010309	0.035813	0.030435

pd.crosstab(df['num'],df['sex'], margins=True, normalize=1).T.plot.bar(stacked=True)

<Axes: xlabel='sex'>

../_images/0b48b5164fc40e94b2eb73ca46f69b5a079526e04e378f79c5a5ae4817141668.png

It looks like there is an association between the sex and num variables. Let’s measure it with Pearson’s chi-squared statistic:

from scipy.stats import chi2_contingency
# Chi-squared test of independence on the num-by-sex table of counts; only the
# test statistic is kept from the result object.
chi2_contingency(pd.crosstab(df['num'],df['sex'])).statistic

87.72950473296471

Let’s compare the expected frequencies with the observed ones:

pd.crosstab(df['num'],df['sex'])

sex	Female	Male
num
0	144	267
1	30	235
2	10	99
3	8	99
4	2	26

chi2_contingency(pd.crosstab(df['num'],df['sex'])).expected_freq

array([[ 86.6673913 , 324.3326087 ],
       [ 55.88043478, 209.11956522],
       [ 22.98478261,  86.01521739],
       [ 22.56304348,  84.43695652],
       [  5.90434783,  22.09565217]])

Let’s compute Cramér’s V, a normalized version of the chi-squared statistic that ranges from 0 to 1:

from scipy.stats.contingency import association
# No `method` is passed, so scipy's default measure is used.
# NOTE(review): the default is documented as method="cramer" (Cramér's V) — confirm
# against the installed scipy version.
association(pd.crosstab(df['num'],df['sex']))

0.30880116145902026

Let’s compute the relative risk, which is easier to interpret:

# Collapse `num` into a binary indicator: case = 1 when num > 0, otherwise 0.
df['case'] = (df['num']>0).astype(int)
# Bar chart of how many rows fall into each of the two classes.
df['case'].value_counts().plot.bar()

<Axes: xlabel='case'>

../_images/a521a807b92714f7b44f3f53a555858533b356cac5f7051b5088cbff2ec921cd.png

# Count table with `case` (0/1) as rows and `sex` as columns.
contingency = pd.crosstab(df['case'], df['sex'])
# Display only: show the case=1 row first and the Male column first.
# `contingency` itself keeps its original row/column order.
contingency.iloc[[1,0]][['Male','Female']]

sex	Male	Female
case
1	459	50
0	267	144

# Number of females with case == 1 (column 'Female', row label 1).
contingency['Female'][1]

from scipy.stats.contingency import relative_risk
# Argument order is: exposed cases, exposed total, control cases, control total.
# Males are passed as the exposed group and females as the control group, so the
# result is P(case | Male) / P(case | Female).
relative_risk(contingency['Male'][1],contingency['Male'].sum(),contingency['Female'][1],contingency['Female'].sum()).relative_risk

2.4530578512396697

Let’s compute the odds ratio:

from scipy.stats.contingency import odds_ratio
# The table is reordered to [[Male case, Female case], [Male no-case, Female no-case]],
# so the statistic is the odds of being a case for males relative to females.
# NOTE(review): no `kind` is passed; scipy's documented default is the conditional
# maximum-likelihood estimate, which can differ slightly from the plain
# cross-product ratio of the four counts — confirm if the exact value matters.
odds_ratio(contingency.iloc[[1,0]][['Male','Female']])

OddsRatioResult(statistic=4.942016985387771)

29.4. Are there correlations among the numerical variables?#

pd.crosstab(df['num'],df['ca'])

ca	0.0	1.0	2.0	3.0
num
0	133	21	8	3
1	28	20	7	3
2	9	14	9	4
3	8	9	15	5
4	3	3	2	5

# Annotated heatmap of the pairwise correlations (DataFrame.corr with its default
# method) among the numeric columns, leaving out `case`, `num` and `ca`.
sns.heatmap(df.select_dtypes(include='number').drop(['case','num','ca'],axis=1).corr(), annot=True)

<Axes: >

../_images/495e94461adb0fd9b79bde409dfaee4d70e61e1caf683a67973f75ac928780d6.png

df.plot.scatter(x='age',y='thalch')

<Axes: xlabel='age', ylabel='thalch'>

../_images/be7e87c428ab8319f2a6bddf92787bd304e647208e160f6fb29838df5da9b75a.png

df.plot.scatter(x='chol',y='thalch')

<Axes: xlabel='chol', ylabel='thalch'>

../_images/263674ae02b178a194d944d59c5184160567d60741a9d0ec8dc23a77fccaa2a4.png

29.5. Is age a risk factor?#

df.groupby('case')['age'].plot.density()
plt.legend()

<matplotlib.legend.Legend at 0x7f26080cf5b0>

../_images/22bfcd2aab4ff6b8aaf2b76105fb35097c7bd68dbf6888d40391ee50d7a60ca9.png

# Split the rows by the binary `case` flag.
df_case = df[df['case']==1]
df_nocase = df[df['case']==0]

# Empirical cumulative distribution of age for each group: relative frequency
# of every age value, ordered by age, then accumulated.
df_case['age'].value_counts(normalize=True).sort_index().cumsum().plot(label='Case=1')
df_nocase['age'].value_counts(normalize=True).sort_index().cumsum().plot(label='Case=0')
plt.legend()
plt.grid()

../_images/bfeea59943d6d4e4bfae9453754193697af7355de59ad39a58492b5ab065482e.png

pd.cut(df['age'], bins=5)

id
      (57.4, 67.2]
      (57.4, 67.2]
      (57.4, 67.2]
    (27.951, 37.8]
      (37.8, 47.6]
            ...      
    (47.6, 57.4]
    (57.4, 67.2]
    (47.6, 57.4]
    (57.4, 67.2]
    (57.4, 67.2]
Name: age, Length: 920, dtype: category
Categories (5, interval[float64, right]): [(27.951, 37.8] < (37.8, 47.6] < (47.6, 57.4] < (57.4, 67.2] < (67.2, 77.0]]

# normalize=1 scales each age-bin column to sum to 1; after the transpose each
# bar is one age bin showing its share of case=0 versus case=1.
pd.crosstab(df['case'], pd.cut(df['age'], bins=5), normalize=1).T.plot.bar(stacked=True)

<Axes: xlabel='age'>

../_images/57f09d39d57206a7870860a31575d1aa0b4b7b8eeadf48e7f649e6323a06afb0.png

# normalize=0 scales each `case` row to sum to 1, so each bar is one case class
# showing how its members are distributed over the five age bins.
pd.crosstab(df['case'], pd.cut(df['age'], bins=5), normalize=0).plot.bar(stacked=True)

<Axes: xlabel='case'>

../_images/935a3f2ee64f6a1fa6b6f6ce95183a4e962887d8817990ae6f96ee795e04ca7c.png

pd.crosstab(df['case'], pd.cut(df['age'], bins=2), normalize=0).plot.bar(stacked=True)

<Axes: xlabel='case'>

../_images/7c7f8a347ac1acc075a2f85d7c73de04a0607a50818ae1df73196d8b69e98ee4.png

# 2x2 count table: `case` (rows 0/1) by age cut into two bins (younger/older columns).
# This reassigns `contingency`, replacing the case-by-sex table built earlier.
contingency = pd.crosstab(df['case'], pd.cut(df['age'], bins=2))
# With rows (0, 1) and columns (younger, older), the statistic is the odds of
# being a case in the older bin relative to the younger bin.
odds_ratio(contingency)

OddsRatioResult(statistic=2.766499032279526)

contingency

age	(27.951, 52.5]	(52.5, 77.0]
case
0	229	182
1	159	350