This textbook was written for the clinical research community at Johns Hopkins, leveraging the Precision Medicine Analytics Platform (PMAP). These notebooks are available in HTML form on the Precision Medicine portal, as well as in computational form in the CAMP-share folder on Crunchr (Crunchr.pm.jh.edu).
This notebook covers three Python libraries:
- Pandas
- Seaborn
- scikit-learn
Radiomics is the study of extracting quantitative measurements from medical imaging. An example is RECIST (Response Evaluation Criteria in Solid Tumours), which measures the long and short axes of solid nodules to assess tumor size and its response to treatment.
In 1995, Dr. Wolberg collected 569 cases to diagnose breast masses from fine needle aspiration (FNA) images. He recorded morphologic characteristics of the nuclei such as radius, texture, perimeter, area, smoothness, etc.
Each record in this dataset includes:
1) ID number
2) Diagnosis (M = malignant, B = benign)
and ten real-valued features computed for each cell nucleus:
a) radius (mean of distances from center to points on the perimeter)
b) texture (standard deviation of gray-scale values)
c) perimeter
d) area
e) smoothness (local variation in radius lengths)
f) compactness (perimeter^2 / area - 1.0)
g) concavity (severity of concave portions of the contour)
h) concave points (number of concave portions of the contour)
i) symmetry
j) fractal dimension ("coastline approximation" - 1)
Each of these ten measures contributes three features: the mean, the standard error, and the "worst" or largest (the mean of the three largest values) were computed for each image, giving 30 features in all.
Pandas is a very useful data manipulation library for Python. Here are some useful parameters when loading comma separated delimited files with read_csv.

file
- Name of the csv file to load
names
- Accepts a list of column names
header
- Row number to use as the column names; pass header=None when the file has no header row
skiprows
- Accepts an integer number of rows to skip
index_col
- Specifies which column to use as the index for the dataframe
na_values
- Assigns a list of strings to treat as null values, i.e. ['.']

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from IPython.display import HTML, display
import tabulate
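Before loading the real dataset below, here is a minimal hedged sketch combining several of the read_csv parameters listed above; the file name 'example.csv' and its column names are hypothetical, purely for illustration.

# Hypothetical example file and column names, for illustration only
example = pd.read_csv('example.csv',
                      names=['id', 'value'],   # supply our own column names
                      header=None,             # the file has no header row of its own
                      skiprows=1,              # skip one leading row (e.g. a comment line)
                      index_col='id',          # use the id column as the dataframe index
                      na_values=['.'])         # treat '.' as a null value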
# URL Link to the Dataset
file='https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'
# Create the list of column names; each morphology property has three measurements tied to it.
cols=['id','diagnosis']
newcols=['radius','texture','perimeter','area','smoothness','compactness','concavity','concave_points','symmetry','fractal dimension']
col_mod=['_mean','_standard_error','_largest']
# Build the remaining column names by pairing each property with each of the three suffixes
cols += [col_name + suffix for col_name in newcols for suffix in col_mod]
# Loading the dataset into variable df as a Pandas dataframe object and naming the columns
df=pd.read_csv(file,names=cols)
# Create a new numeric field mapped from the diagnosis field (M -> 1, B -> 0)
df['diag']=df['diagnosis'].map({'M':1,'B':0})
# Print out of the first five rows
df.head()
| | id | diagnosis | radius_mean | radius_standard_error | radius_largest | texture_mean | texture_standard_error | texture_largest | perimeter_mean | perimeter_standard_error | ... | concave_points_mean | concave_points_standard_error | concave_points_largest | symmetry_mean | symmetry_standard_error | symmetry_largest | fractal dimension_mean | fractal dimension_standard_error | fractal dimension_largest | diag |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 842302 | M | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | ... | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 | 1 |
| 1 | 842517 | M | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | ... | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 | 1 |
| 2 | 84300903 | M | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | ... | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 | 1 |
| 3 | 84348301 | M | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | ... | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 | 1 |
| 4 | 84358402 | M | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | ... | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 | 1 |

5 rows × 33 columns
For a Pandas dataframe named df, here are some useful ways to explore the dataframe.

df.shape
- The dimensions of the dataframe as (rows, cols)
df.size
- The number of data elements in the dataframe
df.columns
- A list of the names of the columns
df.dtypes
- A list of the data types for each column
df.info()
- A list of the columns with their data types as well as the number of non-null values
df.describe(include='all')
- Descriptive statistics for each column such as min, mean, median, and max
df.head(n)
- Returns the first n records; defaults to 5
len(df)
- The len function returns the number of rows in the dataframe

# Try some of the commands here
print ('Shape of dataframe ',df.shape)
print ('Size of dataframe ',df.size)
print ('Number of records ',len(df))
Shape of dataframe  (569, 33)
Size of dataframe  18777
Number of records  569
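The remaining inspection commands from the list above work the same way; a quick sketch (output omitted):

# Column names, per-column data types, and summary statistics
print(df.columns)
print(df.dtypes)
df.info()
print(df.describe(include='all'))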
Boolean indexing and chaining within Pandas provide an effective way to filter datasets.

df['diagnosis']
- Selects the diagnosis column from the df dataframe
df['diagnosis']=='M'
- The == comparison operator returns a Series of booleans
df[df['diagnosis']=='M']
- Returns all the records that match the boolean operation
(df['diagnosis']=='M').sum()
- Adding the sum() method returns the count of True values
df.isnull()
- Returns a boolean test for each element in the dataframe (True if null)
df.isnull().sum()
- Returns the number of nulls by column
df.isnull().sum().sum()
- Returns the number of nulls across all rows and columns

A really useful command for exploring a field is .value_counts()

df['diagnosis'].value_counts()
- Returns a count of records by the values of the field

# Using Pandas to help characterize the dataset
print('The number of (records, columns) in the dataset',df.shape)
print('The number of patients with a malignant result',(df['diagnosis']=='M').sum())
print('The number of patients with a benign result',(df['diagnosis']=='B').sum())
print('The number of cells in total',df.size)
#isnull comes back with a boolean (True-1/False-0) which can be summed.
print('The number of cells across all columns that are null',df.isnull().sum().sum())
df['diagnosis'].value_counts()
The number of (records, columns) in the dataset (569, 33)
The number of patients with a malignant result 212
The number of patients with a benign result 357
The number of cells in total 18777
The number of cells across all columns that are null 0

B    357
M    212
Name: diagnosis, dtype: int64
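Boolean indexing also composes naturally with aggregation. As a small illustrative sketch (not part of the original analysis), the average radius_mean can be compared between the two diagnosis groups:

# Compare the average radius_mean between malignant and benign cases
print('Malignant mean radius:', df[df['diagnosis']=='M']['radius_mean'].mean())
print('Benign mean radius:   ', df[df['diagnosis']=='B']['radius_mean'].mean())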
Visualizing data is an important part of exploring data, identifying outliers, and looking for relationships. Seaborn is a visualization library built on top of the matplotlib library.
Seaborn supports many ways of graphing data.
We are going to graph two histograms on the same graph looking at the values of radius_mean for the benign and malignant cells.
sns.distplot(df[df['diagnosis']=='B']['radius_mean'],color='blue')
sns.distplot(df[df['diagnosis']=='M']['radius_mean'],color='red')
plt.title('Histogram of radius_mean by Diagnosis')
plt.show()
Seaborn comes with a very useful pairplot function, which provides a scatter plot of every field in a dataset against each other.

Key things to look for: the goal is to find pairs of variables that best separate malignant from benign cells.
#Select only the columns with a mean measurement
cols=[x for x in df.columns if x[-4:]=='mean']
sns.pairplot(df,vars=cols,hue='diagnosis')
plt.show()
Another way is to perform a correlation analysis between each pair of fields. Try running df.corr() separately and look at its output.
plt.figure(figsize=(14,10))
sns.heatmap(df.corr(),cmap="BuGn")
plt.show()
df[cols]
- Select just the mean columns
df[cols].corr()
- Compute the pairwise correlation matrix
df[cols].corr().abs()
- Take absolute values so strong negative correlations also rank highly
df[cols].corr().abs().unstack()
- Flatten the matrix into a Series indexed by column pairs
df[cols].corr().abs().unstack().sort_values(ascending=False)
- Sort the pairs from most to least correlated
df[cols].corr().abs().unstack().sort_values(ascending=False).head(20)
- Keep the top 20 pairs
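To see what unstack() is doing, here is a tiny self-contained sketch on a hypothetical 2x2 correlation matrix:

# unstack() turns the matrix into a Series keyed by (column, row) pairs
toy = pd.DataFrame([[1.0, 0.8], [0.8, 1.0]], index=['a', 'b'], columns=['a', 'b'])
print(toy.unstack())
# a  a    1.0
#    b    0.8
# b  a    0.8
#    b    1.0
# dtype: float64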
# Top correlated pairs among the mean columns
df[cols].corr().abs().unstack().sort_values(ascending=False).head(20)
fractal dimension_mean  fractal dimension_mean    1.000000
symmetry_mean           symmetry_mean             1.000000
texture_mean            texture_mean              1.000000
perimeter_mean          perimeter_mean            1.000000
area_mean               area_mean                 1.000000
smoothness_mean         smoothness_mean           1.000000
compactness_mean        compactness_mean          1.000000
concavity_mean          concavity_mean            1.000000
concave_points_mean     concave_points_mean       1.000000
radius_mean             radius_mean               1.000000
                        texture_mean              0.987357
texture_mean            radius_mean               0.987357
perimeter_mean          fractal dimension_mean    0.861323
fractal dimension_mean  perimeter_mean            0.861323
radius_mean             fractal dimension_mean    0.744214
fractal dimension_mean  radius_mean               0.744214
smoothness_mean         texture_mean              0.726628
texture_mean            smoothness_mean           0.726628
fractal dimension_mean  texture_mean              0.722017
texture_mean            fractal dimension_mean    0.722017
dtype: float64
# What correlates with diagnosis
print(df.iloc[:,2:].corr().diag.sort_values(ascending=False).head(10))
diag                             1.000000
fractal dimension_mean           0.793566
concave_points_standard_error    0.782914
perimeter_standard_error         0.776614
concavity_largest                0.776454
radius_largest                   0.742636
concave_points_largest           0.733825
radius_mean                      0.730029
texture_mean                     0.708984
perimeter_mean                   0.696360
Name: diag, dtype: float64
fractal dimension_mean is our most highly correlated variable. What is a fractal dimension measurement? Check out How to measure a coastline with fractal geometry; a small box-counting sketch of the idea follows below.
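As a hedged illustration of the concept (this is a generic box-counting estimator, not the "coastline approximation" procedure used to generate this dataset's feature), the fractal dimension of a binary boundary image can be approximated by counting how many boxes of shrinking size the boundary touches:

import numpy as np

def box_count_dimension(mask, sizes=(2, 4, 8, 16)):
    # mask is assumed to be a 2-D boolean numpy array marking boundary pixels
    h, w = mask.shape
    counts = []
    for s in sizes:
        # Count the s x s boxes that contain at least one boundary pixel
        n = sum(mask[i:i+s, j:j+s].any()
                for i in range(0, h, s)
                for j in range(0, w, s))
        counts.append(n)
    # The fractal dimension is the slope of log(count) versus log(1/size)
    slope, _ = np.polyfit(np.log(1.0 / np.array(sizes)), np.log(counts), 1)
    return slope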
# taking the highest correlate as a classifier
sns.distplot(df[df['diagnosis']=='B']['fractal dimension_mean'],color='blue')
sns.distplot(df[df['diagnosis']=='M']['fractal dimension_mean'],color='red')
plt.axvline(x=0.125, color='black', linestyle='--')
plt.title('Histogram of fractal dimension_mean by Diagnosis (Malignant-Red)')
plt.annotate('TP',(0.17,6))
plt.annotate('TN',(0.06,8))
plt.annotate('FP',(0.1,0.5))
plt.annotate('FN',(0.13,0.5))
plt.show()
# 1 variable linear threshold classifier test
def likelihood(dataframe,label_name,field_name,threshold):
    val_key={}
    # Class counts
    val_key['benign_ct']=len(dataframe[dataframe[label_name]==0][field_name])
    val_key['mal_ct']=len(dataframe[dataframe[label_name]==1][field_name])
    # Confusion matrix counts: values at or above the threshold are called positive
    val_key['true_pos']=(dataframe[dataframe[label_name]==1][field_name]>=threshold).sum()
    val_key['false_pos']=(dataframe[dataframe[label_name]==0][field_name]>=threshold).sum()
    val_key['true_neg']=(dataframe[dataframe[label_name]==0][field_name]<threshold).sum()
    val_key['false_neg']=(dataframe[dataframe[label_name]==1][field_name]<threshold).sum()
    # Summary metrics
    val_key['precision']=round(val_key['true_pos']/(val_key['true_pos']+val_key['false_pos']),3)
    val_key['recall']=round(val_key['true_pos']/val_key['mal_ct'],3)
    val_key['fall-out']=round(val_key['false_pos']/val_key['benign_ct'],3)
    # F1 is the harmonic mean of precision and recall
    val_key['f1_score']=round(2/((1/val_key['recall'])+(1/val_key['precision'])),3)
    return val_key
test=likelihood(df,'diag','fractal dimension_mean',0.16025)
table=[['','Condition Positive','Condition Negative'],
['Total Population',test['mal_ct'],test['benign_ct']],
['Predicted condition positive',test['true_pos'],test['false_pos']],
['Predicted condition negative',test['false_neg'],test['true_neg']],
['Precision',test['precision'],''],['Recall',test['recall'],''],
['Fall-out',test['fall-out'],''],['F1 score',test['f1_score'],'']]
display(HTML(tabulate.tabulate(table,tablefmt='html')))
|                              | Condition Positive | Condition Negative |
|------------------------------|--------------------|--------------------|
| Total Population             | 212                | 357                |
| Predicted condition positive | 145                | 2                  |
| Predicted condition negative | 67                 | 355                |
| Precision                    | 0.986              |                    |
| Recall                       | 0.684              |                    |
| Fall-out                     | 0.006              |                    |
| F1 score                     | 0.808              |                    |
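As a cross-check (an optional sketch, not part of the original notebook), scikit-learn's confusion_matrix reproduces these counts from the same threshold rule:

from sklearn.metrics import confusion_matrix
# Apply the same threshold rule, then tabulate; rows are true classes, columns are predictions
y_hat = (df['fractal dimension_mean'] >= 0.16025).astype(int)
print(confusion_matrix(df['diag'], y_hat))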
# Sweep the threshold from 0.00 to 0.29 and record the true/false positive rates
tpr_1,fpr_1=[],[]
for x in range(30):
    test=likelihood(df,'diag','fractal dimension_mean',round(x/100,2))
    tpr_1.append(test['recall'])
    fpr_1.append(test['fall-out'])
plt.plot([0,1],[0,1],'k--')
plt.plot(fpr_1,tpr_1,label='fractal dimension_mean')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Linear 1 variable ROC Curve')
plt.show()
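The area under this hand-built curve can be approximated with numpy's trapezoid rule; a small sketch using the fpr_1 and tpr_1 lists from the loop above (points are sorted by false positive rate before integrating):

# Sort the swept (fpr, tpr) points by fpr, then integrate with the trapezoid rule
order = np.argsort(fpr_1)
auc_1 = np.trapz(np.array(tpr_1)[order], np.array(fpr_1)[order])
print('Approximate 1-variable AUC:', round(auc_1, 3))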
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
logreg=LogisticRegression(solver='liblinear')
# All the variables (features)
# Convention note: since it is multiple columns (features), it is capitalized
X=df.iloc[:,2:-1]
#just the label field (diagnosis)
y=df.iloc[:,-1]
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)
logreg.fit(X_train,y_train)
y_pred=logreg.predict(X_test)
print(classification_report(y_test,y_pred))
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       108
           1       0.97      0.94      0.95        63

   micro avg       0.96      0.96      0.96       171
   macro avg       0.97      0.96      0.96       171
weighted avg       0.96      0.96      0.96       171
from sklearn.metrics import roc_curve
y_pred_prob=logreg.predict_proba(X_test)[:,1]
fpr_logreg,tpr_logreg,thresholds=roc_curve(y_test,y_pred_prob)
plt.plot([0,1],[0,1],'k--')
plt.plot(fpr_logreg,tpr_logreg,label='Logistic Regression')
plt.plot(fpr_1,tpr_1,label='1 Variable Regression')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Logistic Regression ROC Curve')
plt.legend()
plt.show()
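scikit-learn can also summarize the curve as a single number; a brief sketch using roc_auc_score on the same predicted probabilities:

from sklearn.metrics import roc_auc_score
# AUC computed directly from the predicted probabilities
print('Logistic regression AUC:', round(roc_auc_score(y_test, y_pred_prob), 3))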
from sklearn.svm import SVC
svm=SVC(kernel='linear')
svm.fit(X_train,y_train)
y_pred=svm.predict(X_test)
# Note: passing hard 0/1 predictions to roc_curve yields only a single operating point
fpr_svm,tpr_svm,thresholds=roc_curve(y_test,y_pred)
plt.plot([0,1],[0,1],'k--')
plt.plot(fpr_svm,tpr_svm,label='Support Vector Model')
plt.plot(fpr_logreg,tpr_logreg,label='Logistic Regression')
plt.plot(fpr_1,tpr_1,label='1 Variable Regression')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('SVM ROC Curve')
plt.legend()
plt.show()
print(classification_report(y_test,y_pred))
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       108
           1       0.97      0.94      0.95        63

   micro avg       0.96      0.96      0.96       171
   macro avg       0.97      0.96      0.96       171
weighted avg       0.96      0.96      0.96       171
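Because predict() returns hard 0/1 labels, the SVM ROC curve above collapses to a single operating point. A smoother curve can be drawn from the classifier's continuous decision scores; a hedged sketch (variable names here are new):

# decision_function returns each sample's signed distance from the separating hyperplane
svm_scores = svm.decision_function(X_test)
fpr_svm_scores, tpr_svm_scores, _ = roc_curve(y_test, svm_scores)
plt.plot(fpr_svm_scores, tpr_svm_scores, label='SVM (decision scores)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()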
# Min-max normalization rescales each feature to the range [0, 1]
X_normed=(X-X.min())/(X.max()-X.min())
X_train,X_test,y_train,y_test=train_test_split(X_normed,y,test_size=0.3,random_state=42)
svm=SVC(kernel='linear')
svm.fit(X_train,y_train)
y_pred=svm.predict(X_test)
fpr_svm,tpr_svm,thresholds=roc_curve(y_test,y_pred)
plt.plot([0,1],[0,1],'k--')
plt.plot(fpr_svm,tpr_svm,label='Support Vector Model (Normed)')
plt.plot(fpr_logreg,tpr_logreg,label='Logistic Regression')
plt.plot(fpr_1,tpr_1,label='1 Variable Regression')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('SVM ROC Curve')
plt.legend()
plt.show()
print(classification_report(y_test,y_pred))
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       108
           1       1.00      0.97      0.98        63

   micro avg       0.99      0.99      0.99       171
   macro avg       0.99      0.98      0.99       171
weighted avg       0.99      0.99      0.99       171
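The manual min-max normalization above can equivalently be done with scikit-learn's MinMaxScaler. A hedged sketch; note that fitting the scaler on the training split only (rather than on all of X, as above) avoids leaking test-set information:

from sklearn.preprocessing import MinMaxScaler
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = MinMaxScaler()
# Fit the [0, 1] scaling on the training data, then apply the same mapping to the test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)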