from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


# Toy corpus: four short one-line documents used to demonstrate the
# vectorizers below. Each string is treated as one document.
corpus = [
    "It was the best of times,",
    "it was the worst of times,",
    "it was the age of wisdom,",
    "it was the age of foolishness,",
]



# Bag-of-words model: learn the vocabulary of `corpus` and count how often
# each term occurs in each document (one row per document, one column per term).
vectorizer = CountVectorizer()

# fit_transform() learns the vocabulary and builds the document-term matrix
# in one step; the result is a sparse matrix.
X = vectorizer.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Its replacement, get_feature_names_out() (scikit-learn >= 1.0), returns a
# NumPy array, so convert it to a plain list to keep the printed format.
print(vectorizer.get_feature_names_out().tolist())

# Densify only for display; acceptable for a corpus this small.
print(X.toarray())

['age', 'best', 'foolishness', 'it', 'of', 'the', 'times', 'was', 'wisdom', 'worst']
[[0 1 0 1 1 1 1 1 0 0]
 [0 0 0 1 1 1 1 1 0 1]
 [1 0 0 1 1 1 0 1 1 0]
 [1 0 1 1 1 1 0 1 0 0]]


#Make your changes here. Explain what you have done and the results. 
#X = vectorizer.fit_transform(corpus)
#print(vectorizer.get_feature_names())
#print(X.toarray())


#Make your changes here. Explain what you have done and the results. 
#X = vectorizer.fit_transform(corpus)
#print(vectorizer.get_feature_names())
#print(X.toarray())


#Make your changes here. Explain what you have done and the results. 

# TF-IDF model: same vocabulary as the count model, but each term count is
# re-weighted by inverse document frequency, so terms that occur in every
# document get lower weights than terms that occur in only one.
vectorizer2 = TfidfVectorizer()
Y = vectorizer2.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Its replacement, get_feature_names_out() (scikit-learn >= 1.0), returns a
# NumPy array, so convert it to a plain list to keep the printed format.
print(vectorizer2.get_feature_names_out().tolist())

# Densify only for display; acceptable for a corpus this small.
print(Y.toarray())

['age', 'best', 'foolishness', 'it', 'of', 'the', 'times', 'was', 'wisdom', 'worst']
[[0.         0.60735961 0.         0.31694544 0.31694544 0.31694544
  0.4788493  0.31694544 0.         0.        ]
 [0.         0.         0.         0.31694544 0.31694544 0.31694544
  0.4788493  0.31694544 0.         0.60735961]
 [0.4788493  0.         0.         0.31694544 0.31694544 0.31694544
  0.         0.31694544 0.60735961 0.        ]
 [0.4788493  0.         0.60735961 0.31694544 0.31694544 0.31694544
  0.         0.31694544 0.         0.        ]]


import pandas as pd
from pandas import read_csv
#Let's load the obesity dataset
# Read the tab-separated file 'obesity_data.csv' into a DataFrame.
df = pd.read_csv('obesity_data.csv', sep='\t')

# Select the 'text' column; the examples below vectorize it as the documents.
notes = df['text']
# Working on "notes" instead of the toy corpus
#X = vectorizer.fit_transform(notes)
#print(vectorizer.get_feature_names())
#print(X.toarray())
#vectorizer2 = TfidfVectorizer()
#Y = vectorizer2.fit_transform(notes)
#print(vectorizer2.get_feature_names())
#print(Y.toarray())
#Modify this code as necessary to check different vectorizer functions.

Question 1: ngrams

Question 2: Reducing variation

Question 3: Term weighting

Question 4: Working on larger datasets

Question 5: Check previous work