A machine learning classifier is an algorithm that takes input (here, transformed documents) and assigns labels. Using a pre-labeled dataset, the algorithm learns a set of parameters such that, when it encounters previously unseen data (a new document), it can label the document appropriately. Ideally, we would create labels for a small subset of documents, perhaps a few hundred, and ask the classifier to label many more: several thousand or even tens of thousands of documents.
In our case, a document is a piece of free text from the EHR: for example, a progress note, a surgical note, or a pathology report.
We are interested in ascribing labels corresponding to phenotypes. So the question is: given a piece of text, can we label the patient as having diabetes, epilepsy, or CHF? Can we determine whether the patient is a smoker, has a previous history of smoking, or has a family history of cancer? Such labels can be useful for identifying candidates for studies.
This is an interactive notebook, in which documentation and code are presented together. You can hit the Run button above to run the code in a code cell. Let's first import some libraries; these are tools, or bits of code, that you will be using.
from gensim.parsing.preprocessing import stem_text, preprocess_string, remove_stopwords, strip_multiple_whitespaces
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import cross_validate
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import numpy as np
import csv
Let's first look at the data we'll be using. It is a version of the Obesity dataset from i2b2 and is the reason you signed the data use agreement. The data consist of discharge summaries from the Partners HealthCare Research Patient Data Repository, and they were the subject of a 2009 challenge to identify obesity-related comorbidities in discharge summaries. Please cite the following if you use this data for research:
Uzuner Ö. (2009). "Recognizing Obesity and Co-morbidities in Sparse Data". Journal of the American Medical Informatics Association. July 2009; 16(4): 561-570. http://jamia.bmj.com/content/16/4/561.full.pdf.
We will be using a subset of this data that has been collected and slightly processed to make using it easier within the allotted time. See the publication for details on the raw data.
The data file we will be using is called "obesity_data.csv". Its columns are listed below in the variable "headers". We will load the file into memory and get labels for the comorbidity corresponding to a column number in headers.
headers = ['text', 'CAD', 'Gout', 'Venous Insufficiency', 'PVD', 'Hypercholesterolemia',
'Hypertension', 'Asthma', 'Hypertriglyceridemia', 'OSA', 'Gallstones',
'Depression', 'Obesity', 'GERD', 'OA', 'Diabetes', 'CHF']
def load_data(filename, label_col):
    labels = []
    text = []
    with open(filename, 'rt') as file:
        # Note: the file is tab-delimited despite the .csv extension
        reader = csv.reader(file, delimiter='\t')
        # Skip the header row
        next(reader)
        for row in reader:
            val = int(row[label_col])
            # Keep only documents explicitly labeled positive (1) or negative (-1)
            if val == 1 or val == -1:
                text.append(row[0])
                if val == -1:
                    labels.append(0)
                else:
                    labels.append(1)
    return text, labels
# Column 1 of headers corresponds to the comorbidity CAD
col = 1
print("Label we are classifying:", headers[col])
text, labels = load_data('obesity_data.csv', col)
Label we are classifying: CAD
Looking at the data, we can see how many documents were loaded, along with the number of positive (patient has CAD) and negative labels. We can also look at an individual patient record.
print("Number of documents:", len(text), "Num CAD:", np.count_nonzero(labels), "Num without CAD:", (len(labels)-np.count_nonzero(labels)))
Number of documents: 619 Num CAD: 343 Num without CAD: 276
print(text[512])
658449513 | AH | 79268417 | | 790454 | 6/23/1998 12:00:00 AM | DEGENERATIVE JOINT DISEASE RT. KNEE | Signed | DIS | Admission Date: 4/18/1998 Report Status: Signed Discharge Date: 9/22/1998 DIAGNOSIS: END-STAGE OSTEOARTHRITIS , RIGHT KNEE. HISTORY OF PRESENT ILLNESS: Ms. Denault is a 75-year-old woman who was scheduled for a right total knee replacement for end-stage osteoarthritis. The patient has had a long-standing history of progressive knee pain , which has become disabling over the last six months. The patient has two units of autologous blood available for this procedure. PAST MEDICAL HISTORY: Significant for osteoarthritis , borderline diabetes mellitus , glucose-6-phosphate dehydrogenase deficiency , and glaucoma. PAST SURGICAL HISTORY: Significant for cesarean section times three. She is also status post dental extractions. ALLERGIES: Glucose-6-phosphate dehydrogenase deficiency and erythromycin. ADMISSION MEDICATIONS: Timoptic-XE one q.h. and q.a.m. in each eye , Indocin 25 mg p.o. p.r.n. , and Tylenol p.r.n. SOCIAL HISTORY: The patient denies smoking currently. She quit 25 years ago. She reports drinking alcohol occasionally. She is on an American Diabetic Association 1 , 300 calorie low sodium diet. The patient is a widow who resides in Lina Rd. , Po Raco REVIEW OF SYSTEMS: Significant for glasses , history of glaucoma , and early cataracts. She has full dentures on uppers and partial dentures on the bottom. She is deaf in the right ear. There are no pulmonary problems. Cardiovascular , the patient denies coronary artery disease , hypertension , chest pain , congestive heart failure , or deep venous thrombosis. The patient has a history of borderline diabetes mellitus with baseline blood sugars in the 140s. PHYSICAL EXAMINATION: Her blood pressure was 140/80. She is 4'10" tall and 168 lb. In general , she ambulated with an antalgic gait on the right with a cane. She required assistance to get onto the examination table. Head , eyes , ears , nose , and throat were significant for pupils being equal and reactive to light with extraocular muscles intact , and normal pharynx. There was no evidence of carotid bruits. Her lungs were clear to auscultation bilaterally. Her heart was regular in rate and rhythm with a normal S1 and S2. There was no murmur. Her abdomen was protuberant , but nontender with no organomegaly. There was a well healed midline incision consistent with cesarean section. There were no focal neurological deficits. Examination of her extremities revealed bilateral varus with the right greater than the left. Active range of motion of the right knee was from a 15 degree extension deficit to approximately 70 degrees. There was palpable crepitus. She had positive medial and lateral joint line tenderness. The knee appeared stable to valgus and varus stresses. She had a palpable dorsalis pedis and posterior fullness with no evidence of ulceration. She was neurovascularly intact. She received 5 mg of Coumadin preoperatively and was instructed to discontinue use of Indocin. HOSPITAL COURSE: The patient was brought to the Operating Room on 2/29/98 where she underwent a right total knee arthroplasty with a Kinemax system. Estimated blood loss was 300 cc. The tourniquet time was 90 minutes. She received perioperative antibiotics and was continued on Coumadin in the postoperative period. Her pain was well controlled with the use of a PCA pump provided by the anesthesiology department. Her postoperative course was , for the most part , uncomplicated. 
She did have some low grade temperatures with a T max up to 101.8 postoperatively. This seemed to be related mostly to atelectasis. Her hematocrit was kept greater than 30 with the use of autologous blood transfusions. On postoperative day two , her white blood cell count was 13.7 , but had decreased by postoperative day three to 12. Her electrolytes were well controlled. She was made therapeutic on Coumadin by postoperative day two with a PT of 16.4 and an INR of 2.0. She worked with the physical therapy department along the total knee arthroplasty pathway. Her skin dressings were taken down and the wound was noted to be clean , dry , and intact with no evidence of erythema or discharge. The patient was felt to be doing well , although a little bit slow in terms of her physical therapy. It was felt that she would benefit from a short stay at a rehabilitation hospital and a consult was placed with the Maldharp Drugties Bamayhost Memorial Hospital . The patient was discharged to Tion Pidesf Medical Center pending bed availability on 10/10/98 . DISCHARGE MEDICATIONS: Colace 100 mg p.o. b.i.d. , iron sulfate 300 mg p.o. t.i.d. times a total of five days , Folate 1 mg p.o. q.day , insulin regular ( human ) sliding scale subcu q.i.d. , multivitamin one tab p.o. q.day , Timoptic 0.25% one drop each eye q.a.m. , Coumadin to keep PT/INR between 1.5 and 2.0 , Tylenol 650-1 , 000 mg p.o. q.4h. p.r.n. , Tylenol no. 3 one to two tabs p.o. q.4h. p.r.n. pain , and Benadryl 25-50 mg p.o. q.h.s. p.r.n. for sleep. DISPOSITION: The patient is discharged to a rehabilitation hospital. DISCHARGE INSTRUCTIONS: She is instructed to continue physical therapy for increased range of motion of her right knee. She is further instructed to continue taking Coumadin for a total of six weeks. She is instructed to follow up with Dr. Danilo Mincks as an outpatient in five weeks , to call his office for an appointment. Dictated By: DION TORGRIMSON , M.D. XN309 Attending: JAMEL SARKAR , M.D. CH.B IB54 EB500/1920 Batch: 8560 Index No. Q7RATAYES D: 10/10/98 T: 10/10/98
print(labels)
[1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]
The machine learning classifier does not take in text directly; the text must first be preprocessed into a machine-computable form. Often this is a vector where each dimension is a feature, and for text classification each feature typically corresponds to a word. This can lead to large, unwieldy vectors that are very sparse. For example, if we include all words in all documents, a dimension could correspond to a misspelling or a number that occurs only once.
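To make this concrete, here is a minimal sketch (using toy sentences, not the EHR data) of how text becomes a count vector. Each column of the resulting matrix corresponds to one vocabulary word, and most entries are zero:

docs = ["the cat sat on the mat", "the dog chased the cat"]
v = CountVectorizer()
X = v.fit_transform(docs)  # sparse matrix with one row per document
print(v.vocabulary_)       # maps each word to its column index
print(X.shape)             # (2 documents, 7 unique words)
print(X.toarray())         # dense view: per-document word counts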
We need to perform preprocessing to reduce some of this dimensionality, which can improve classification performance as well as the speed of learning and of classifying new documents. There are many kinds of preprocessing, for example converting text to all lower case.
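As a quick illustration of that first step, the gensim helpers imported above can normalize case and whitespace (a toy string, not from the dataset):

raw = "The  Patient   DENIES chest pain"
print(strip_multiple_whitespaces(raw).lower())  # the patient denies chest pain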
We can remove stopwords: words that are very frequent in English but carry little information. These include word classes such as pronouns (I, me, my, she, he, it, us, you, your) and determiners, i.e., definite/indefinite articles and possessive, interrogative, and demonstrative words (the, a, their, what, whose, this, those, that). Some stopwords, such as "no" or "not", might be worth keeping if you are using ngram features. Ngrams are multiword phrases where n is usually 2 or 3; they can be useful for capturing negation, as in "not have diabetes" or "no CHF symptoms".
sentences = ["An apple a day keeps the doctor away.","The patient doesn't have CHF"]
v = CountVectorizer(stop_words='english')
print('Removing stopwords:', v.fit(sentences).vocabulary_.keys())
print()
v = CountVectorizer(ngram_range=(1, 3), stop_words='english')
print('Removing stopwords and ngrams size 1-3:', v.fit(sentences).vocabulary_.keys())
print()
v = CountVectorizer(ngram_range=(1, 3))
print('Ngrams size 1-3:', v.fit(sentences).vocabulary_.keys())
Removing stopwords: dict_keys(['apple', 'day', 'keeps', 'doctor', 'away', 'patient', 'doesn', 'chf'])

Removing stopwords and ngrams size 1-3: dict_keys(['apple', 'day', 'keeps', 'doctor', 'away', 'apple day', 'day keeps', 'keeps doctor', 'doctor away', 'apple day keeps', 'day keeps doctor', 'keeps doctor away', 'patient', 'doesn', 'chf', 'patient doesn', 'doesn chf', 'patient doesn chf'])

Ngrams size 1-3: dict_keys(['an', 'apple', 'day', 'keeps', 'the', 'doctor', 'away', 'an apple', 'apple day', 'day keeps', 'keeps the', 'the doctor', 'doctor away', 'an apple day', 'apple day keeps', 'day keeps the', 'keeps the doctor', 'the doctor away', 'patient', 'doesn', 'have', 'chf', 'the patient', 'patient doesn', 'doesn have', 'have chf', 'the patient doesn', 'patient doesn have', 'doesn have chf'])
Stemming is often a heuristic process of reducing inflected forms to a common base: for example organize, organizes, organized, and organizing. Similarly, there are derivationally related words with similar meanings that can be reduced, such as democracy, democratic, and democratization.
The goal is to reduce sparsity, which can hurt classifier performance. However, stemming can also increase polysemy: operate, operating, operates, operation, operative, operatives, and operational all stem to "oper" under the Porter stemming algorithm.
print(stem_text("Is it universally true? Some say the university is the best in the Universe."))
is it univers true? some sai the univers is the best in the universe.
Different kinds of preprocessing can be combined to generate a feature vector. These feature vectors are used as input to the classifier. The preprocessing steps, the classification algorithm, and the algorithm's tunable parameters will all impact performance.
There are many kinds of classification algorithms, such as Support Vector Machines (SVMs), Random Forests, and Decision Trees. We will not go into their specifics and will instead treat them as black boxes. It is important to know that each algorithm comes with default parameters, but these can be changed. The crucial takeaway is understanding the metrics a classifier produces in order to judge its performance.
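As a small illustration, every scikit-learn estimator exposes its parameters, so you can inspect the defaults and override them at construction time (the values below are arbitrary examples, not recommendations):

clf = SVC()                          # all default parameters
print(clf.get_params())              # inspect the defaults, e.g. C=1.0
clf = SVC(C=10.0, kernel='linear')   # override selected defaults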
There are many different metrics available for evaluating classifier performance. The appropriate metric depends on the underlying dataset, for example on the number of labels employed (in our case, 2) and the proportion of each label within the dataset. It also depends on the desired application of the classifier.
Consider the confusion matrix of a binary classifier. Because we are using binary labels (positive and negative), the metrics will be described in terms of true positives (a document is classified as positive and its label is positive), false positives (a document is classified as positive but its label is negative), false negatives (a document is classified as negative but its label is positive), and true negatives (both the classification of a document and its label are negative).
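Here is a minimal sketch of these four counts, using sklearn's confusion_matrix on toy labels (not the CAD data):

y_true = [1, 1, 1, 0, 0, 0]  # gold-standard labels
y_pred = [1, 1, 0, 1, 0, 0]  # hypothetical classifier output
# Rows are true labels, columns are predicted labels:
# [[TN, FP],
#  [FN, TP]]
print(metrics.confusion_matrix(y_true, y_pred))
# [[2 1]
#  [1 2]]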
Accuracy is the number of documents correctly labeled by the classifier divided by the total number of documents the classifier has labeled. For example, accuracy is 90% if the classifier correctly labels 90 out of 100 documents. Another way to think of this is (true positives + true negatives) / (total number of documents classified). This metric is only useful for balanced datasets, where the proportion of each label is roughly equal: a classifier can obtain 90% accuracy by always assigning one label if that label represents 90% of the documents.
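To see that pitfall concretely, here is a small sketch (toy data, not the CAD labels) using scikit-learn's DummyClassifier, which always predicts the majority class and still reaches 90% accuracy without learning anything:

from sklearn.dummy import DummyClassifier

X_toy = np.zeros((100, 1))   # features are irrelevant for this demonstration
y_toy = [1] * 90 + [0] * 10  # 90% positive, 10% negative

clf = DummyClassifier(strategy='most_frequent')
clf.fit(X_toy, y_toy)
print(clf.score(X_toy, y_toy))  # 0.9 accuracy from always predicting 1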
Precision is a metric of how often the classifier is correct when it labels a document as positive. Another way to think of it is true positives / (true positives + false positives).
This metric is important when the goal of the task is to make sure that everything you label as positive really is positive.
Recall is also called the true positive rate or sensitivity. It is true positives / (true positives + false negatives); in other words, the number of truly positive documents you label as positive, out of all positive documents.
This metric is important when the goal of the task is to make sure we don't miss any potential positives.
Precision and recall are not mutually exclusive, but in practice, owing to how the algorithms are implemented, there are often trade-offs between them: optimizing one usually comes at the expense of the other. It is important to understand which matters most for your task.
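One way to see the trade-off is to sweep the decision threshold over classifier confidence scores; this sketch uses sklearn's precision_recall_curve on made-up scores:

from sklearn.metrics import precision_recall_curve

y_true = [1, 1, 1, 0, 0, 0]                # gold-standard labels
y_scores = [0.9, 0.7, 0.4, 0.6, 0.3, 0.1]  # hypothetical confidence scores

# Each point corresponds to a different threshold; raising the threshold
# generally increases precision at the cost of recall
precision, recall, thresholds = precision_recall_curve(y_true, y_scores)
for p, r in zip(precision, recall):
    print("precision=%.2f recall=%.2f" % (p, r))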
The F measure combines the precision and recall scores. It is their harmonic mean: F1 = 2 * ((precision * recall) / (precision + recall)). It condenses precision and recall into a single number of overall performance that can be used to compare classifiers. In F1, precision and recall are evenly weighted; this weighting can be changed if precision or recall is more important than the other.
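Putting the metrics together, this short sketch computes all four on the toy predictions from the confusion matrix example above, and checks F1 against the harmonic-mean formula:

y_true = [1, 1, 1, 0, 0, 0]
y_pred = [1, 1, 0, 1, 0, 0]

acc = metrics.accuracy_score(y_true, y_pred)  # (TP + TN) / total = 4/6
p = metrics.precision_score(y_true, y_pred)   # TP / (TP + FP) = 2/3
r = metrics.recall_score(y_true, y_pred)      # TP / (TP + FN) = 2/3
f1 = metrics.f1_score(y_true, y_pred)
print(acc, p, r, f1)
print(2 * (p * r) / (p + r))  # matches f1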
# Scoring metrics we're interested in
scoring = {'acc': 'accuracy',
           'precision': 'precision',
           'recall': 'recall',
           'f1': 'f1'}
# Initialize the random number generator with the same seed for repeatable experiments
random_state = 198273
# Our preprocessing steps: lowercase, collapse whitespace, remove stopwords, stem
def preprocess(X):
    return preprocess_string(X, filters=[lambda x: x.lower(), strip_multiple_whitespaces,
                                         remove_stopwords, stem_text])
# Notice the count vectorizer/tfidf transformer. We are using a pipeline here: the steps
# in preprocess are applied first (converting the string to lowercase, removing multiple
# spaces, removing stopwords, then stemming). This is followed by the count vectorizer and
# then the tfidf transformer, which reweights raw counts so that words common in a document
# but rare across the corpus receive higher weight.
preprocess_pipeline = Pipeline([
    ('vect', CountVectorizer(analyzer=preprocess)),
    ('tfidf', TfidfTransformer()),
])
features = preprocess_pipeline.fit_transform(text)
for clf in [
    RandomForestClassifier(),
    MultinomialNB(),
    SVC(kernel='linear', class_weight='balanced'),
    DecisionTreeClassifier(criterion='entropy', class_weight='balanced', random_state=random_state)
]:
    scores = cross_validate(clf, features, labels, scoring=scoring,
                            cv=5, return_train_score=False)
    print("Classifier:", clf)
    #print(scores)
    print("Accuracy:", scores['test_acc'].mean())
    print("Precision:", scores['test_precision'].mean())
    print("Recall:", scores['test_recall'].mean())
    print("F1:", scores['test_f1'].mean())
    print()
Classifier: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False)
Accuracy: 0.7011713611329663
Precision: 0.7491675809328949
Recall: 0.699914748508099
F1: 0.7214798001620777

Classifier: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Accuracy: 0.5557448728035668
Precision: 0.5550251606884128
Recall: 1.0
F1: 0.7138446756316068

Classifier: SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
Accuracy: 0.8109183320220298
Precision: 0.8406277772773674
Recall: 0.8193094629156011
F1: 0.8280801096851441

Classifier: DecisionTreeClassifier(class_weight='balanced', criterion='entropy', max_depth=None, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort=False, random_state=198273, splitter='best')
Accuracy: 0.8529993181222135
Precision: 0.8653339376465888
Recall: 0.8718243819266837
F1: 0.8679300117076794
You have now seen an example of document classification from start to finish. In the box below write a few paragraphs comparing the performance of the various algorithms. Some questions to think about:
Is accuracy an appropriate metric given the number of documents with CAD vs without?
In what context would precision be more important than recall?
When would the opposite be true?
Write your analysis here
Now that you have seen a classification example and have modified the CountVectorizer in a previous homework, use at least a couple of the modifications from that homework, run the classifier, and compare/contrast the effects of your various modifications on classification performance in the boxes below; a starter sketch is provided after the comment line below.
#Copy and modify code from above and the previous homework here.
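As a hedged starting point (your modifications from the homework may differ), one possible variation swaps in different CountVectorizer settings, for example adding bigrams and dropping words that appear in only one document. The ngram_range and min_df values below are illustrative choices, not required settings:

modified_pipeline = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2), min_df=2,
                             stop_words='english', lowercase=True)),
    ('tfidf', TfidfTransformer()),
])
modified_features = modified_pipeline.fit_transform(text)

clf = SVC(kernel='linear', class_weight='balanced')
scores = cross_validate(clf, modified_features, labels, scoring=scoring,
                        cv=5, return_train_score=False)
print("F1:", scores['test_f1'].mean())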
Analysis answer here