This textbook was written for the clinical research community at Johns Hopkins leveraging the precision medicine analytics platform (PMAP). These notebooks are available in html form on the Precision Medicine portal as well as in computational form in the CAMP-share folder on Crunchr (Crunchr.pm.jh.edu).
Last updated: April 1st, 2019. Please go here to submit comments and report problems.
Publication Attribution:
This example looks again at blood pressure values, classifying them into clinical blood-pressure categories and searching the data for outliers — that is, improbable values.
#Libraries needed
import os
import sqlalchemy
import urllib.parse
import pandas as pd
import getpass
from SciServer import Authentication
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
# Authenticate against SciServer and collect database credentials interactively.
myUserName = Authentication.getKeystoneUserWithToken(Authentication.getToken()).userName
passwd = getpass.getpass('Password for ' + myUserName + ': ')
user = "win\\" + myUserName  # JHU Windows-domain login

# SQL driver settings
driver = "FreeTDS"
tds_ver = "8.0"

# Database server details
host_ip = "ESMPMDBPR4.WIN.AD.JHU.EDU"  # Update this accordingly
db_port = "1433"
db = "CAMP_PMCoe_Projection"  # Update this accordingly

# Create Connection String (raw ODBC form, passed through as a single value)
conn_str = ("DRIVER={};Server={};PORT={};DATABASE={};UID={};PWD={};TDS_VERSION={}"
            .format(driver, host_ip, db_port, db, user, passwd, tds_ver)
            )

# Create Engine.
# Fix: use quote_plus, not quote — the SQLAlchemy docs prescribe quote_plus for
# odbc_connect. Plain quote leaves '/' unescaped (its default safe='/'), so a
# password containing '/' or a space would corrupt the connection string.
engine = sqlalchemy.create_engine('mssql+pyodbc:///?odbc_connect=' +
                                  urllib.parse.quote_plus(conn_str)
                                  )
# Pull the entire blood-pressure vitals table into a DataFrame.
query = "SELECT * FROM vitals_BP;"
df = pd.read_sql_query(query, engine)
# Peek at the last few rows to confirm the load worked.
df.tail()
osler_id | encounter_id | encounter_type | admission_date | discharge_date | bp_systolic_diastolic | bp_date | |
---|---|---|---|---|---|---|---|
1061679 | 564bd039-e957-4d12-8188-39d9ad158ee6 | 26213 | Office Visit | 2016-04-02 | NaT | 130/75 | 2016-04-02 13:26:00 |
1061680 | 564bd039-e957-4d12-8188-39d9ad158ee6 | 666028 | Office Visit | 2017-12-23 | NaT | 136/74 | 2017-12-23 11:27:00 |
1061681 | 564bd039-e957-4d12-8188-39d9ad158ee6 | 608048 | Office Visit | 2017-07-31 | NaT | 126/72 | 2017-07-31 11:28:00 |
1061682 | 564bd039-e957-4d12-8188-39d9ad158ee6 | 528254 | Office Visit | 2017-07-15 | NaT | 131/74 | 2017-07-15 11:30:00 |
1061683 | 564bd039-e957-4d12-8188-39d9ad158ee6 | 462753 | Office Visit | 2017-04-30 | NaT | 132/72 | 2017-04-30 10:39:00 |
Remove the # symbol from each line below to see what these different commands do.
# General exploration of the data -- uncomment any line below to run it.
#df.head()                                  # first five rows
#df.tail()                                  # last five rows
#df.info()                                  # dtypes and non-null counts per column
#df.columns                                 # column labels
#df.describe(include='all')                 # summary statistics for every column
#df['encounter_type']                       # select a single column as a Series
#df.encounter_type.value_counts()           # frequency of each encounter type
#df.discharge_date.isna().sum()             # count of missing discharge dates
# Masking: filter rows with a boolean condition.
#df[df['encounter_type']=='Hospital Encounter']
#df[df['admission_date']<df['discharge_date']]
# Inspect the runtime type of one admission_date value
# (expect a pandas Timestamp, as the output below shows).
type(df.admission_date[0])
pandas._libs.tslibs.timestamps.Timestamp
Dimension | Definition | Example |
---|---|---|
Completeness | Degree to which data is populated | Blanks or Null Values |
Conformity | Corresponds to expected formats | Dates, Zip Codes |
Consistency | Relational integrity | Discharge after admission |
Synchronization | Consistency with external reference data | Stale reference tables |
Uniqueness | Duplication of records | Are any records identical to each other? |
Timeliness | When was the data last updated | Data Latency |
Accuracy | Degree to which data is verified as correct | Validating email addresses |
https://vita.had.co.nz/papers/tidy-data.pdf
# Split the combined "systolic/diastolic" string column into two numeric columns.
# Fix: split once (the original split the same column twice) and select the
# column by name rather than the fragile positional index iloc[:, 5].
bp_parts = df['bp_systolic_diastolic'].str.split('/', expand=True)
df['systolic'] = pd.to_numeric(bp_parts[0])
df['diastolic'] = pd.to_numeric(bp_parts[1])
df.head()
osler_id | encounter_id | encounter_type | admission_date | discharge_date | bp_systolic_diastolic | bp_date | systolic | diastolic | |
---|---|---|---|---|---|---|---|---|---|
0 | 5303550b-8ed2-42fd-885a-d32b308b05f3 | 631974 | Office Visit | 2017-05-08 00:00:00 | NaT | 88/58 | 2017-05-08 18:23:00 | 88.0 | 58.0 |
1 | 5303550b-8ed2-42fd-885a-d32b308b05f3 | 483944 | Office Visit | 2016-12-10 00:00:00 | NaT | 88/40 | 2016-12-10 13:39:00 | 88.0 | 40.0 |
2 | 39026546-483b-4959-a889-82a4e3bbaa58 | 48732 | Hospital Encounter | 2016-05-05 01:38:00 | 2016-05-16 15:45:00 | 106/68 | 2016-05-16 14:00:00 | 106.0 | 68.0 |
3 | 39026546-483b-4959-a889-82a4e3bbaa58 | 58881 | Hospital Encounter | 2016-05-16 15:53:00 | 2016-06-15 16:20:00 | 90/66 | 2016-06-04 13:00:00 | 90.0 | 66.0 |
4 | 39026546-483b-4959-a889-82a4e3bbaa58 | 58881 | Hospital Encounter | 2016-05-16 15:53:00 | 2016-06-15 16:20:00 | 122/79 | 2016-05-24 08:29:00 | 122.0 | 79.0 |
# Keep only the two measurement columns and drop rows with missing values.
tidy_df = df.loc[:, ['diastolic', 'systolic']].copy()
tidy_df = tidy_df.dropna()
# Sanity check: no NaNs should remain after dropna (expect 0).
tidy_df.diastolic.isna().sum()
tidy_df.head()
diastolic | systolic | |
---|---|---|
0 | 58.0 | 88.0 |
1 | 40.0 | 88.0 |
2 | 68.0 | 106.0 |
3 | 66.0 | 90.0 |
4 | 79.0 | 122.0 |
# Classify each reading into a severity band (later rules overwrite earlier
# ones, so order matters):
#   4  Very high: systolic > 180 or diastolic > 110
#   3  High:      systolic > 130 or diastolic > 80
#   2  Elevated:  systolic > 120
#   1  Low:       systolic < 90  or diastolic < 60
#   0  Normal:    everything else
# Fix: removed the dead `diastolic > 80 -> 2` assignment -- the identical mask
# was immediately reassigned to 3 below, so every such row ended up as 3 anyway.
tidy_df['bp_type'] = 0
tidy_df.loc[tidy_df.systolic < 90, 'bp_type'] = 1
tidy_df.loc[tidy_df.diastolic < 60, 'bp_type'] = 1
tidy_df.loc[tidy_df.systolic > 120, 'bp_type'] = 2
tidy_df.loc[tidy_df.systolic > 130, 'bp_type'] = 3
tidy_df.loc[tidy_df.diastolic > 80, 'bp_type'] = 3
tidy_df.loc[tidy_df.systolic > 180, 'bp_type'] = 4
tidy_df.loc[tidy_df.diastolic > 110, 'bp_type'] = 4
tidy_df.bp_type.value_counts().sort_values()
4 22166 2 142882 1 180757 0 289283 3 420764 Name: bp_type, dtype: int64
tidy_df.head()  # preview: each reading now carries its bp_type class label
diastolic | systolic | bp_type | |
---|---|---|---|
0 | 58.0 | 88.0 | 0 |
1 | 40.0 | 88.0 | 0 |
2 | 68.0 | 106.0 | 1 |
3 | 66.0 | 90.0 | 1 |
4 | 79.0 | 122.0 | 2 |
# Scatter the first 2000 readings (diastolic on x, systolic on y),
# colored by the assigned blood-pressure class.
plt.scatter(tidy_df.iloc[:2000, 0], tidy_df.iloc[:2000, 1],
            alpha=.5, cmap='jet', c=tidy_df.iloc[:2000].bp_type)
plt.title('Blood Pressure analysis')
plt.xlabel('Diastolic')  # fix: the x-axis was previously unlabeled
plt.ylabel('Systolic')
plt.show()
The plot below isolates only the most severe (very high, class 4) readings:
# Highlight only the "very high" (class 4) readings in red.
severe = tidy_df[tidy_df['bp_type'] == 4]
plt.scatter(severe.diastolic, severe.systolic, alpha=1, c='red')
plt.title('Blood Pressure analysis')
plt.xlabel('Diastolic')  # fix: axes were previously unlabeled
plt.ylabel('Systolic')
plt.show()
from sklearn.decomposition import PCA

# Fit PCA on the two raw measurement columns only.
# Fix: the original fit on all of tidy_df, which includes the derived
# categorical bp_type label -- that third dimension distorts the principal
# directions that are later drawn over the 2-D diastolic/systolic scatter.
model = PCA()
model.fit(tidy_df[['diastolic', 'systolic']])

# Bar chart of the variance explained by each principal component.
features = range(model.n_components_)
plt.bar(features, model.explained_variance_)
plt.xticks(features)
plt.ylabel('variance')
plt.xlabel('PCA feature')
plt.show()
# Overlay the first two principal directions on the raw scatter.
sample = tidy_df.iloc[:2000]
plt.scatter(sample.iloc[:, 0], sample.iloc[:, 1], alpha=0.1)
# Arrows start at the fitted mean; the first component is scaled up (x20)
# for visibility, the second less so (x5).
cx, cy = model.mean_[0], model.mean_[1]
plt.arrow(cx, cy, model.components_[0, 0] * 20, model.components_[0, 1] * 20,
          color='red', width=1.0)
plt.arrow(cx, cy, model.components_[1, 0] * 5, model.components_[1, 1] * 5,
          color='red', width=1.0)
plt.title('PCA analysis')
plt.show()
from scipy import stats
import numpy as np

# Ordinary least-squares fit: systolic as a function of diastolic.
# Fix: pass x and y explicitly -- stats.linregress(tidy_df) fails because
# tidy_df now has three columns (linregress accepts only an (N, 2) array).
slope, intercept, r_value, p_value, std_err = stats.linregress(
    tidy_df.diastolic, tidy_df.systolic)
plt.scatter(tidy_df.iloc[:2000, 0], tidy_df.iloc[:2000, 1], alpha=0.1)
plt.title('Linear regression analysis')
# Fix: the scatter referenced an undefined name `pca_df`; use tidy_df.
# Draw the fitted line across a plausible diastolic range (40-120 mmHg).
plt.plot([40, 120], [40 * slope + intercept, 120 * slope + intercept], color='red')
0.0011627838480833914