Movie Recommender System
Recall that TF-IDF turns a corpus into a term-document matrix, where each document is represented as a vector whose dimensions are the terms.
In this section, we will use TF-IDF and cosine similarity to build a recommender system for movies.
from collections import OrderedDict
from datetime import datetime

import numpy as np
import pandas as pd
from datasets import load_dataset
from rich.pretty import pprint
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
Let’s load the data and take a look at it.
# Load the movie dataset from the Hugging Face Hub
dataset = load_dataset("SandipPalit/Movie_Dataset")
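Since pprint from rich is already imported, we can take a quick look at the structure of the returned DatasetDict (a small sanity check, assuming the download succeeded):
# Inspect the splits and features of the loaded dataset
pprint(dataset)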
We filter the dataset, keeping only movies released after the year 2000.
YEAR = 2000
dataset_cutoff = dataset.filter(lambda example: datetime.strptime(example["Release Date"], "%Y-%m-%d").year > YEAR)
dataset_cutoff
To get the training data, we index the dataset with the key train, obtaining our train_dataset. We then convert train_dataset to a dataframe and take a look at it.
train_dataset = dataset_cutoff["train"]
print(f"Number of training examples: {len(train_dataset)}")
df_train = train_dataset.to_pandas()
df_train.head()
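The columns we rely on below are Title, Release Date and Overview; listing the columns is a quick way to confirm they are present:
# List the available columns in the training dataframe
print(df_train.columns.tolist())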
We are interested in the Overview column, which contains each movie's description. We define X_train to be the array containing all the overviews (the Overview column). We will be less pedantic here and not split off a validation set.
X_train = train_dataset["Overview"]
X_train = np.array(X_train)
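Before vectorizing, it helps to peek at one raw overview (purely illustrative; the exact text depends on the dataset):
# Show the first few hundred characters of the first overview
print(X_train[0][:300])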
We will use the TfidfVectorizer from sklearn to convert the text to a matrix of TF-IDF features. This process can be treated as a feature extraction step.
# Initialize an instance of tf-idf Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words="english")
# Generate the tf-idf vectors for the corpus
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
print(X_train_tfidf.shape)
The shape tells us we have \(D = 34,552\) documents and \(T = 61,460\) unique words.
len(tfidf_vectorizer.vocabulary_)
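To make the document-term orientation concrete, here is a minimal sketch on a toy corpus (the three sentences below are made up purely for illustration):
# Each row of the TF-IDF matrix is one document, each column one vocabulary term
toy_corpus = [
    "a dark knight protects the city",
    "a detective hunts a serial killer",
    "the city celebrates its hero",
]
toy_vectorizer = TfidfVectorizer(stop_words="english")
toy_tfidf = toy_vectorizer.fit_transform(toy_corpus)
print(toy_tfidf.shape)  # (number of documents, number of unique non-stopword terms)
pprint(toy_vectorizer.vocabulary_)  # maps each term to its column index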
We will use the cosine_similarity function from sklearn.metrics.pairwise to compute the cosine similarity between all movies. This means computing the cosine similarity between each document and every other document in the corpus. Note that cosine_similarity takes in a matrix of n_samples by n_features and returns a matrix of n_samples by n_samples. So in our example, the documents should correspond to the rows and the features (terms) to the columns.
# %%time
# compute and print the cosine similarity matrix
cosine_sim = cosine_similarity(X_train_tfidf, X_train_tfidf)
print(cosine_sim.shape)
# %%time
# compute and print the cosine similarity matrix
cosine_sim_linear_kernel = linear_kernel(X_train_tfidf, X_train_tfidf)
print(cosine_sim_linear_kernel.shape)
It is also worth knowing that linear_kernel is faster for very large and sparse TF-IDF matrices: TfidfVectorizer L2-normalizes each row by default, so the plain dot product already equals the cosine similarity and the extra normalization step in cosine_similarity can be skipped. Both functions therefore produce the same results here.
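We can verify this numerically (a quick check, not required for the pipeline):
# The two similarity matrices should agree up to floating-point error
print(np.allclose(cosine_sim, cosine_sim_linear_kernel))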
Next, how do we interpret the cosine similarity matrix computed on the TF-IDF matrix? As mentioned earlier, cosine_similarity assumes the input has shape n_samples by n_features, corresponding to the number of documents and the number of unique words respectively. It returns a matrix of shape n_samples by n_samples, where the value at the \(i\)-th row and \(j\)-th column is the cosine similarity between the \(i\)-th document and the \(j\)-th document:

\[
\operatorname{cosine\_similarity}(\mathbf{x}_i, \mathbf{x}_j) = \frac{\mathbf{x}_i \cdot \mathbf{x}_j}{\lVert \mathbf{x}_i \rVert \, \lVert \mathbf{x}_j \rVert},
\]

where \(\mathbf{x}_i\) and \(\mathbf{x}_j\) are the TF-IDF vectors of documents \(i\) and \(j\).
Consequently, the matrix’s diagonal is \(1\) since the cosine similarity between a document and itself is \(1\).
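A quick sanity check on the diagonal:
# Every document has cosine similarity (approximately) 1 with itself
print(np.allclose(np.diag(cosine_sim), 1.0))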
The recommender function below is adapted from here.
def recommender(
    title: str, df: pd.DataFrame, cosine_similarity: np.ndarray, top_k: int = 10
) -> pd.DataFrame:
    """Recommends movies based on the cosine similarity matrix.

    Args:
        title (str): Title of the movie.
        df (pd.DataFrame): DataFrame containing the movie dataset.
        cosine_similarity (np.ndarray): Cosine similarity matrix.
        top_k (int, optional): Number of top recommendations to return.
            Defaults to 10.

    Returns:
        pd.DataFrame: DataFrame containing the top-k recommendations.
    """
    # Get the index of the movie that matches the title
    idx = df[df["Title"] == title].index[0]

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = cosine_similarity[idx]
    sim_scores = list(enumerate(sim_scores))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the top-k most similar movies, skipping the movie itself
    top_k_sim_scores = sim_scores[1 : top_k + 1]
    print(f"Top-k most similar movies: {top_k_sim_scores}")

    # Get the indices of the top-k most similar movies
    movie_indices = [i[0] for i in top_k_sim_scores]

    # Return the top-k most similar movies
    return df.iloc[movie_indices]
recommender(title="Batman: The Dark Knight Returns, Part 1", df=df_train, cosine_similarity=cosine_sim_linear_kernel)
With just TF-IDF and the cosine similarity metric, we can already build a somewhat naive recommender system.
df_train[df_train["Title"].str.contains("Batman")]
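As a closing note, materializing the full \(D \times D\) similarity matrix needs quadratic memory. A memory-friendlier variant, sketched below with a hypothetical helper recommend_on_demand, computes the similarities for a single query movie only when they are requested:
def recommend_on_demand(
    title: str, df: pd.DataFrame, tfidf_matrix, top_k: int = 10
) -> pd.DataFrame:
    """Compute similarities for one query movie instead of the full matrix."""
    # Get the row index of the query movie
    idx = df[df["Title"] == title].index[0]

    # 1 x D row of similarities between the query movie and every movie;
    # linear_kernel suffices because the TF-IDF rows are L2-normalized
    scores = linear_kernel(tfidf_matrix[idx], tfidf_matrix).ravel()

    # Sort in descending order and skip the query movie itself
    top_indices = np.argsort(-scores)[1 : top_k + 1]
    return df.iloc[top_indices]

recommend_on_demand(
    title="Batman: The Dark Knight Returns, Part 1",
    df=df_train,
    tfidf_matrix=X_train_tfidf,
)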