This textbook was written for the clinical research community at Johns Hopkins leveraging the precision medicine analytics platform (PMAP). These notebooks are available in html form on the Precision Medicine portal as well as in computational form in the CAMP-share folder on Crunchr (Crunchr.pm.jh.edu).
Last updated: April 1st, 2019. Please go here to submit comments and report problems.
Publication Attribution:
This example looks again at blood pressure values, classifying them into clinical blood-pressure categories and searching the data for outliers — that is, improbable values.
#Libraries needed
import os
import sqlalchemy
import urllib.parse
import pandas as pd
import getpass
from SciServer import Authentication
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
# Authenticate against SciServer and collect database credentials interactively.
myUserName = Authentication.getKeystoneUserWithToken(Authentication.getToken()).userName
passwd = getpass.getpass('Password for ' + myUserName + ': ')
user = "win\\" + myUserName  # JHU Windows-domain login

# SQL driver settings
driver = "FreeTDS"
tds_ver = "8.0"

# Database server details
host_ip = "ESMPMDBPR4.WIN.AD.JHU.EDU"  # Update this accordingly
db_port = "1433"
db = "CAMP_PMCoe_Projection"  # Update this accordingly

# Create Connection String (raw ODBC form, passed through as a single value)
conn_str = ("DRIVER={};Server={};PORT={};DATABASE={};UID={};PWD={};TDS_VERSION={}"
            .format(driver, host_ip, db_port, db, user, passwd, tds_ver)
            )

# Create Engine.
# Fix: use quote_plus, not quote — the SQLAlchemy docs prescribe quote_plus for
# odbc_connect. Plain quote leaves '/' unescaped (its default safe='/'), so a
# password containing '/' or a space would corrupt the connection string.
engine = sqlalchemy.create_engine('mssql+pyodbc:///?odbc_connect=' +
                                  urllib.parse.quote_plus(conn_str)
                                  )
# Pull the entire blood-pressure vitals table into a DataFrame.
query = "SELECT * FROM vitals_BP;"
df = pd.read_sql_query(query, engine)
# Peek at the last few rows to confirm the load worked.
df.tail()
osler_id | encounter_id | encounter_type | admission_date | discharge_date | bp_systolic_diastolic | bp_date | |
---|---|---|---|---|---|---|---|
1061679 | 564bd039-e957-4d12-8188-39d9ad158ee6 | 26213 | Office Visit | 2016-04-02 | NaT | 130/75 | 2016-04-02 13:26:00 |
1061680 | 564bd039-e957-4d12-8188-39d9ad158ee6 | 666028 | Office Visit | 2017-12-23 | NaT | 136/74 | 2017-12-23 11:27:00 |
1061681 | 564bd039-e957-4d12-8188-39d9ad158ee6 | 608048 | Office Visit | 2017-07-31 | NaT | 126/72 | 2017-07-31 11:28:00 |
1061682 | 564bd039-e957-4d12-8188-39d9ad158ee6 | 528254 | Office Visit | 2017-07-15 | NaT | 131/74 | 2017-07-15 11:30:00 |
1061683 | 564bd039-e957-4d12-8188-39d9ad158ee6 | 462753 | Office Visit | 2017-04-30 | NaT | 132/72 | 2017-04-30 10:39:00 |
Remove the # symbol from each line below to see what these different commands do.
# General exploration of the data -- uncomment any line below to run it.
#df.head()                                  # first five rows
#df.tail()                                  # last five rows
#df.info()                                  # dtypes and non-null counts per column
#df.columns                                 # column labels
#df.describe(include='all')                 # summary statistics for every column
#df['encounter_type']                       # select a single column as a Series
#df.encounter_type.value_counts()           # frequency of each encounter type
#df.discharge_date.isna().sum()             # count of missing discharge dates
# Masking: filter rows with a boolean condition.
#df[df['encounter_type']=='Hospital Encounter']
#df[df['admission_date']<df['discharge_date']]
# Inspect the runtime type of one admission_date value
# (expect a pandas Timestamp, as the output below shows).
type(df.admission_date[0])
pandas._libs.tslibs.timestamps.Timestamp
Dimension | Definition | Example |
---|---|---|
Completeness | Degree to which data is populated | Blanks or Null Values |
Conformity | Corresponds to expected formats | Dates, Zip Codes |
Consistency | Relational integrity | Discharge after admission |
Synchronization | Consistency with external reference data | Stale reference tables |
Uniqueness | Duplication of records | Are any records identical to each other? |
Timeliness | When was the data last updated | Data Latency |
Accuracy | Degree to which data is verified as correct | Validating email addresses |
https://vita.had.co.nz/papers/tidy-data.pdf
# Split the combined "systolic/diastolic" string column into two numeric columns.
# Fix: split once (the original split the same column twice) and select the
# column by name rather than the fragile positional index iloc[:, 5].
bp_parts = df['bp_systolic_diastolic'].str.split('/', expand=True)
df['systolic'] = pd.to_numeric(bp_parts[0])
df['diastolic'] = pd.to_numeric(bp_parts[1])
df.head()
osler_id | encounter_id | encounter_type | admission_date | discharge_date | bp_systolic_diastolic | bp_date | systolic | diastolic | |
---|---|---|---|---|---|---|---|---|---|
0 | 5303550b-8ed2-42fd-885a-d32b308b05f3 | 631974 | Office Visit | 2017-05-08 00:00:00 | NaT | 88/58 | 2017-05-08 18:23:00 | 88.0 | 58.0 |
1 | 5303550b-8ed2-42fd-885a-d32b308b05f3 | 483944 | Office Visit | 2016-12-10 00:00:00 | NaT | 88/40 | 2016-12-10 13:39:00 | 88.0 | 40.0 |
2 | 39026546-483b-4959-a889-82a4e3bbaa58 | 48732 | Hospital Encounter | 2016-05-05 01:38:00 | 2016-05-16 15:45:00 | 106/68 | 2016-05-16 14:00:00 | 106.0 | 68.0 |
3 | 39026546-483b-4959-a889-82a4e3bbaa58 | 58881 | Hospital Encounter | 2016-05-16 15:53:00 | 2016-06-15 16:20:00 | 90/66 | 2016-06-04 13:00:00 | 90.0 | 66.0 |
4 | 39026546-483b-4959-a889-82a4e3bbaa58 | 58881 | Hospital Encounter | 2016-05-16 15:53:00 | 2016-06-15 16:20:00 | 122/79 | 2016-05-24 08:29:00 | 122.0 | 79.0 |
# Keep only the two measurement columns and drop rows with missing values.
tidy_df = df.loc[:, ['diastolic', 'systolic']].copy()
tidy_df = tidy_df.dropna()
# Sanity check: no NaNs should remain after dropna (expect 0).
tidy_df.diastolic.isna().sum()
tidy_df.head()
diastolic | systolic | |
---|---|---|
0 | 58.0 | 88.0 |
1 | 40.0 | 88.0 |
2 | 68.0 | 106.0 |
3 | 66.0 | 90.0 |
4 | 79.0 | 122.0 |
# Classify each reading into a severity band (later rules overwrite earlier
# ones, so order matters):
#   4  Very high: systolic > 180 or diastolic > 110
#   3  High:      systolic > 130 or diastolic > 80
#   2  Elevated:  systolic > 120
#   1  Low:       systolic < 90  or diastolic < 60
#   0  Normal:    everything else
# Fix: removed the dead `diastolic > 80 -> 2` assignment -- the identical mask
# was immediately reassigned to 3 below, so every such row ended up as 3 anyway.
tidy_df['bp_type'] = 0
tidy_df.loc[tidy_df.systolic < 90, 'bp_type'] = 1
tidy_df.loc[tidy_df.diastolic < 60, 'bp_type'] = 1
tidy_df.loc[tidy_df.systolic > 120, 'bp_type'] = 2
tidy_df.loc[tidy_df.systolic > 130, 'bp_type'] = 3
tidy_df.loc[tidy_df.diastolic > 80, 'bp_type'] = 3
tidy_df.loc[tidy_df.systolic > 180, 'bp_type'] = 4
tidy_df.loc[tidy_df.diastolic > 110, 'bp_type'] = 4
tidy_df.bp_type.value_counts().sort_values()
4 22166 2 142882 1 180757 0 289283 3 420764 Name: bp_type, dtype: int64
tidy_df.head()  # preview: each reading now carries its bp_type class label
diastolic | systolic | bp_type | |
---|---|---|---|
0 | 58.0 | 88.0 | 0 |
1 | 40.0 | 88.0 | 0 |
2 | 68.0 | 106.0 | 1 |
3 | 66.0 | 90.0 | 1 |
4 | 79.0 | 122.0 | 2 |
# Scatter the first 2000 readings (diastolic on x, systolic on y),
# colored by the assigned blood-pressure class.
plt.scatter(tidy_df.iloc[:2000, 0], tidy_df.iloc[:2000, 1],
            alpha=.5, cmap='jet', c=tidy_df.iloc[:2000].bp_type)
plt.title('Blood Pressure analysis')
plt.xlabel('Diastolic')  # fix: the x-axis was previously unlabeled
plt.ylabel('Systolic')
plt.show()
The plot below isolates only the most severe (very high, class 4) readings:
# Highlight only the "very high" (class 4) readings in red.
severe = tidy_df[tidy_df['bp_type'] == 4]
plt.scatter(severe.diastolic, severe.systolic, alpha=1, c='red')
plt.title('Blood Pressure analysis')
plt.xlabel('Diastolic')  # fix: axes were previously unlabeled
plt.ylabel('Systolic')
plt.show()
from sklearn.decomposition import PCA

# Fit PCA on the two raw measurement columns only.
# Fix: the original fit on all of tidy_df, which includes the derived
# categorical bp_type label -- that third dimension distorts the principal
# directions that are later drawn over the 2-D diastolic/systolic scatter.
model = PCA()
model.fit(tidy_df[['diastolic', 'systolic']])

# Bar chart of the variance explained by each principal component.
features = range(model.n_components_)
plt.bar(features, model.explained_variance_)
plt.xticks(features)
plt.ylabel('variance')
plt.xlabel('PCA feature')
plt.show()
# Overlay the first two principal directions on the raw scatter.
sample = tidy_df.iloc[:2000]
plt.scatter(sample.iloc[:, 0], sample.iloc[:, 1], alpha=0.1)
# Arrows start at the fitted mean; the first component is scaled up (x20)
# for visibility, the second less so (x5).
cx, cy = model.mean_[0], model.mean_[1]
plt.arrow(cx, cy, model.components_[0, 0] * 20, model.components_[0, 1] * 20,
          color='red', width=1.0)
plt.arrow(cx, cy, model.components_[1, 0] * 5, model.components_[1, 1] * 5,
          color='red', width=1.0)
plt.title('PCA analysis')
plt.show()
from scipy import stats
import numpy as np

# Ordinary least-squares fit: systolic as a function of diastolic.
# Fix: pass x and y explicitly -- stats.linregress(tidy_df) fails because
# tidy_df now has three columns (linregress accepts only an (N, 2) array).
slope, intercept, r_value, p_value, std_err = stats.linregress(
    tidy_df.diastolic, tidy_df.systolic)
plt.scatter(tidy_df.iloc[:2000, 0], tidy_df.iloc[:2000, 1], alpha=0.1)
plt.title('Linear regression analysis')
# Fix: the scatter referenced an undefined name `pca_df`; use tidy_df.
# Draw the fitted line across a plausible diastolic range (40-120 mmHg).
plt.plot([40, 120], [40 * slope + intercept, 120 * slope + intercept], color='red')
0.0011627838480833914