IMDB Recommender System
**Recall that TF-IDF is used in the setting where words are represented as vectors and documents serve as the dimensions.**
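For reference, the log-scaled variant we implement by hand below uses a base-10 log for term frequency and a natural log for inverse document frequency:

$$
\mathrm{tf}(t, d) = \log_{10}\bigl(1 + \mathrm{count}(t, d)\bigr),
\qquad
\mathrm{idf}(t) = \ln\frac{N}{\mathrm{df}(t)},
\qquad
\mathrm{tfidf}(t, d) = \mathrm{tf}(t, d)\cdot\mathrm{idf}(t),
$$

where $N$ is the number of documents and $\mathrm{df}(t)$ is the number of documents containing the term $t$.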
from datasets import load_dataset
from collections import OrderedDict
from rich.pretty import pprint
import numpy as np
import pandas as pd
# # Load the IMDb movie reviews dataset
# dataset = load_dataset("imdb")
# train_dataset = dataset["train"]
# print(f"Number of training examples: {len(train_dataset)}")
# test_dataset = dataset["test"]
# print(f"Number of test examples: {len(test_dataset)}")
# X_train, y_train = train_dataset["text"], train_dataset["label"]
# X_test, y_test = dataset["test"]["text"], dataset["test"]["label"]
We will be less pedantic here and not split out a validation set.
corpus = [
"The sun is the largest celestial body in the solar system",
"The solar system consists of the sun and eight revolving planets",
"Ra was the Egyptian Sun God",
"The Pyramids were the pinnacle of Egyptian architecture",
"The quick brown fox jumps over the lazy dog",
]
# Use the classic scikit-learn toy corpus instead of the one above
corpus = [
    "this is the first document",
    "this document is the second document",
    "and this is the third one",
    "is this the first document",
]
corpus_names = ["doc_1", "doc_2", "doc_3", "doc_4"] #, "doc_5"]
num_documents = len(corpus)
all_words = " ".join(corpus).split()
all_words = [word.lower() for word in all_words] # lower case
num_vocabs = len(set(all_words))
print(num_vocabs)
print(f"Number of documents: {num_documents}")
print(f"Number of unique words: {num_vocabs}")
word_freq = OrderedDict() # {word: {doc_1: freq, doc_2: freq, ...}, ...}
for doc, doc_name in zip(corpus, corpus_names):
    for word in doc.lower().split():
        if word not in word_freq:
            word_freq[word] = {}
        if doc_name not in word_freq[word]:
            word_freq[word][doc_name] = 0
        word_freq[word][doc_name] += 1
pprint(dict(word_freq.items()))
9
Number of documents: 4
Number of unique words: 9
{
    'this': {'doc_1': 1, 'doc_2': 1, 'doc_3': 1, 'doc_4': 1},
    'is': {'doc_1': 1, 'doc_2': 1, 'doc_3': 1, 'doc_4': 1},
    'the': {'doc_1': 1, 'doc_2': 1, 'doc_3': 1, 'doc_4': 1},
    'first': {'doc_1': 1, 'doc_4': 1},
    'document': {'doc_1': 1, 'doc_2': 2, 'doc_4': 1},
    'second': {'doc_2': 1},
    'and': {'doc_3': 1},
    'third': {'doc_3': 1},
    'one': {'doc_3': 1}
}
df = pd.DataFrame(word_freq).fillna(0).astype(int).T
df
| | doc_1 | doc_2 | doc_3 | doc_4 |
|---|---|---|---|---|
| this | 1 | 1 | 1 | 1 |
| is | 1 | 1 | 1 | 1 |
| the | 1 | 1 | 1 | 1 |
| first | 1 | 0 | 0 | 1 |
| document | 1 | 2 | 0 | 1 |
| second | 0 | 1 | 0 | 0 |
| and | 0 | 0 | 1 | 0 |
| third | 0 | 0 | 1 | 0 |
| one | 0 | 0 | 1 | 0 |
This dataframe recovers the words-as-vectors format, with documents as the dimensions.
Now each row is one sample and each column is one feature: that is, each row represents a unique word, and each column represents a unique document.
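For example, the count vector for the word "document" can be read off a row of the dataframe:

```python
print(df.loc["document"].values)  # [1 2 0 1]: its counts in doc_1 through doc_4
```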
X = df.values # turn corpus/df into a matrix
X.shape, X.T
((9, 4),
array([[1, 1, 1, 1, 1, 0, 0, 0, 0],
[1, 1, 1, 0, 2, 1, 0, 0, 0],
[1, 1, 1, 0, 0, 0, 1, 1, 1],
[1, 1, 1, 1, 1, 0, 0, 0, 0]]))
X_tf = np.empty_like(X, dtype=float)
for row_index, each_row in enumerate(X):
    tf = np.log10(each_row + 1)  # log-scaled term frequency: log10(1 + count)
    X_tf[row_index, :] = tf
The intuition for using the log is that a word appearing 100 times in a document is not 100 times more significant than a word appearing once; the log dampens the effect of raw counts.
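For instance, applying the same $\log_{10}(1 + \text{count})$ scaling to raw counts of 1, 10, and 100 shows how the log compresses large counts:

```python
import numpy as np

counts = np.array([1, 10, 100])
print(np.log10(counts + 1))  # [0.30103 1.04139 2.00432]
# 100 occurrences get roughly 6.7x the weight of a single occurrence, not 100x.
```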
num_documents = X.shape[1]  # number of documents: the dimensionality of each word vector
X_idf = np.empty_like(X, dtype=float)
print(X_idf)
print(X)
for row_index, each_row in enumerate(X):
    df = np.count_nonzero(each_row)  # document frequency: how many documents contain this word (note: shadows the dataframe `df` above)
    idf = np.log(num_documents / df)  # e.g., with 4 documents and a word appearing in 2 of them, idf = ln(4/2) ≈ 0.693
    print(f"df: {df}, idf: {idf}")
    X_idf[row_index, :] = idf
X_idf
[[4.63830132e-310 0.00000000e+000 0.00000000e+000 0.00000000e+000]
[0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000]
[0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000]
[0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000]
[0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000]
[0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000]
[0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000]
[0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000]
[0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000]]
[[1 1 1 1]
[1 1 1 1]
[1 1 1 1]
[1 0 0 1]
[1 2 0 1]
[0 1 0 0]
[0 0 1 0]
[0 0 1 0]
[0 0 1 0]]
df: 4, idf: 0.0
df: 4, idf: 0.0
df: 4, idf: 0.0
df: 2, idf: 0.6931471805599453
df: 3, idf: 0.28768207245178085
df: 1, idf: 1.3862943611198906
df: 1, idf: 1.3862943611198906
df: 1, idf: 1.3862943611198906
df: 1, idf: 1.3862943611198906
array([[0. , 0. , 0. , 0. ],
[0. , 0. , 0. , 0. ],
[0. , 0. , 0. , 0. ],
[0.69314718, 0.69314718, 0.69314718, 0.69314718],
[0.28768207, 0.28768207, 0.28768207, 0.28768207],
[1.38629436, 1.38629436, 1.38629436, 1.38629436],
[1.38629436, 1.38629436, 1.38629436, 1.38629436],
[1.38629436, 1.38629436, 1.38629436, 1.38629436],
[1.38629436, 1.38629436, 1.38629436, 1.38629436]])
X_tfidf = np.multiply(X_tf, X_idf)  # elementwise product: tf * idf
X_tfidf.T
array([[0. , 0. , 0. , 0.20865809, 0.08660093,
0. , 0. , 0. , 0. ],
[0. , 0. , 0. , 0. , 0.13725923,
0.41731619, 0. , 0. , 0. ],
[0. , 0. , 0. , 0. , 0. ,
0. , 0.41731619, 0.41731619, 0.41731619],
[0. , 0. , 0. , 0.20865809, 0.08660093,
0. , 0. , 0. , 0. ]])
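As a sanity check of one entry: the word "first" appears once in doc_1 and in two of the four documents, so its weight in doc_1 should be $\log_{10}(1+1)\cdot\ln(4/2)$:

```python
import numpy as np

tf = np.log10(1 + 1)   # ≈ 0.30103
idf = np.log(4 / 2)    # ≈ 0.69315
print(tf * idf)        # ≈ 0.20866, matching the 'first' entries of X_tfidf.T above
```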
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
corpus = [
"this is the first document",
"this document is the second document",
"and this is the third one",
"is this the first document",
]
vocabulary = ["this", "is", "the", "first", "document", "second", "and", "third", "one"]
pipe = Pipeline(
    [
        ("count", CountVectorizer(vocabulary=vocabulary)),
        ("tfid", TfidfTransformer(norm=None, smooth_idf=False, sublinear_tf=False)),
    ]
).fit(corpus)
count = pipe["count"].transform(corpus).toarray()
print(count)
idf = pipe["tfid"].idf_
print(idf)
a = pipe.transform(corpus)
print(a.toarray().T)
[[1 1 1 1 1 0 0 0 0]
[1 1 1 0 2 1 0 0 0]
[1 1 1 0 0 0 1 1 1]
[1 1 1 1 1 0 0 0 0]]
[1. 1. 1. 1.69314718 1.28768207 2.38629436
2.38629436 2.38629436 2.38629436]
[[1. 1. 1. 1. ]
[1. 1. 1. 1. ]
[1. 1. 1. 1. ]
[1.69314718 0. 0. 1.69314718]
[1.28768207 2.57536414 0. 1.28768207]
[0. 2.38629436 0. 0. ]
[0. 0. 2.38629436 0. ]
[0. 0. 2.38629436 0. ]
[0. 0. 2.38629436 0. ]]
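Note that scikit-learn's `idf_` vector is our natural-log idf plus one: with `smooth_idf=False`, `TfidfTransformer` computes $\mathrm{idf}(t) = \ln(n/\mathrm{df}(t)) + 1$, and with `norm=None` and `sublinear_tf=False` the transformed matrix is simply `count * idf`. A quick check against the output above:

```python
import numpy as np

print(np.log(4 / 2) + 1)        # ≈ 1.69314718, the idf_ entry for 'first'
print(2 * (np.log(4 / 3) + 1))  # ≈ 2.57536414, 'document' (df=3) appearing twice in doc_2
```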
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
# Initialize an instance of tf-idf Vectorizer
tfidf_vectorizer = TfidfVectorizer()
# Generate the tf-idf vectors for the corpus
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
print(tfidf_matrix.shape)
(4, 9)
tfidf_matrix.toarray().T
array([[0. , 0. , 0.51184851, 0. ],
[0.46979139, 0.6876236 , 0. , 0.46979139],
[0.58028582, 0. , 0. , 0.58028582],
[0.38408524, 0.28108867, 0.26710379, 0.38408524],
[0. , 0. , 0.51184851, 0. ],
[0. , 0.53864762, 0. , 0. ],
[0.38408524, 0.28108867, 0.26710379, 0.38408524],
[0. , 0. , 0.51184851, 0. ],
[0.38408524, 0.28108867, 0.26710379, 0.38408524]])
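These values differ from our manual computation because `TfidfVectorizer` builds its vocabulary in alphabetical order and, by default, uses `smooth_idf=True` and L2-normalizes each row. As a sketch, passing our settings explicitly should reproduce the pipeline output above (the name `unsmoothed_vectorizer` is our own):

```python
unsmoothed_vectorizer = TfidfVectorizer(vocabulary=vocabulary, norm=None, smooth_idf=False)
print(unsmoothed_vectorizer.fit_transform(corpus).toarray().T)  # should match the Pipeline output
```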
# compute and print the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(cosine_sim)
[[1. 0.64692568 0.30777187 1. ]
[0.64692568 1. 0.22523955 0.64692568]
[0.30777187 0.22523955 1. 0.30777187]
[1. 0.64692568 0.30777187 1. ]]
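With the cosine-similarity matrix in hand, a minimal recommender simply ranks the other documents by similarity to a query document. A sketch (the helper `recommend` is our own):

```python
import numpy as np

def recommend(query_index: int, sim_matrix: np.ndarray, top_k: int = 2) -> list:
    """Return indices of the top_k documents most similar to the query document."""
    scores = sim_matrix[query_index].copy()
    scores[query_index] = -np.inf  # exclude the query document itself
    return list(np.argsort(scores)[::-1][:top_k])

print(recommend(0, cosine_sim))  # [3, 1]: doc_4 has the same bag of words as doc_1, doc_2 is next closest
```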