Movie Recommender System#

TF-IDF represents each document as a vector of term weights over the vocabulary; stacking these vectors gives a document-term matrix, with documents as rows (samples) and terms as columns (features).

In this section, we will use TF-IDF and cosine similarity to build a recommender system for movies.

from collections import OrderedDict
from datetime import datetime

import numpy as np
import pandas as pd
from datasets import load_dataset
from rich.pretty import pprint
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

Let’s load the data and take a look at it.

# Load the movie dataset from the Hugging Face Hub
dataset = load_dataset("SandipPalit/Movie_Dataset")
Next, we filter the dataset to movies released after the year 2000.

YEAR = 2000

dataset_cutoff = dataset.filter(
    lambda example: datetime.strptime(example["Release Date"], "%Y-%m-%d").year > YEAR
)
dataset_cutoff

To get the actual data, we index the dataset with the key train, obtaining our train_dataset.

We will convert the train_dataset to a dataframe and take a look at it.

train_dataset = dataset_cutoff["train"]
print(f"Number of training examples: {len(train_dataset)}")

df_train = train_dataset.to_pandas()
df_train.head()
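
Before vectorizing, it is worth a quick sanity check that the Overview column has no missing or empty entries, since TfidfVectorizer expects strings. This is an optional check, not part of the original pipeline.

# Optional: check for missing or empty overviews before vectorizing.
print(df_train["Overview"].isna().sum())
print((df_train["Overview"].str.strip() == "").sum())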

We are interested in the Overview column, which contains the movie description.

We define X_train to be the array containing all the reviews (Overview column).

We will be less pedantic and not split off a validation set.

X_train = train_dataset["Overview"]
X_train = np.array(X_train)

We will use the TfidfVectorizer from sklearn to convert the text to a matrix of TF-IDF features. This process can be treated as a feature extraction step.

# Initialize an instance of tf-idf Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words="english")

# Generate the tf-idf vectors for the corpus
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
print(X_train_tfidf.shape)

The shape tells us we have \(D = 34,552\) documents and \(T = 61,460\) unique words.

len(tfidf_vectorizer.vocabulary_)
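
To build some intuition for these features, we can peek at the highest-weighted terms of a single document. This is a minimal sketch (it assumes a recent scikit-learn with get_feature_names_out); the exact terms you see will depend on the dataset snapshot you downloaded.

# Inspect the top TF-IDF terms of the first document as a rough sanity check.
feature_names = tfidf_vectorizer.get_feature_names_out()
first_doc = X_train_tfidf[0].toarray().ravel()
top_term_indices = first_doc.argsort()[::-1][:10]
for t in top_term_indices:
    print(f"{feature_names[t]:<20} {first_doc[t]:.3f}")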

We will use the cosine_similarity function from sklearn.metrics.pairwise to compute the cosine similarity between all movies.

This means computing the cosine similarity between each document and all other documents in the corpus.
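
Recall that the cosine similarity between two TF-IDF vectors \(\mathbf{u}\) and \(\mathbf{v}\) is the cosine of the angle between them:

\[ \text{cosine similarity}(\mathbf{u}, \mathbf{v}) = \frac{\mathbf{u} \cdot \mathbf{v}}{\lVert \mathbf{u} \rVert \lVert \mathbf{v} \rVert} \]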

Note that cosine_similarity takes in a matrix of n_samples by n_features and returns a matrix of n_samples by n_samples. So in our example, the documents should correspond to the rows and the features should correspond to the columns.

# %%time
# Compute and print the cosine similarity matrix
cosine_sim = cosine_similarity(X_train_tfidf, X_train_tfidf)
print(cosine_sim.shape)

# %%time
# Compute the same matrix via the linear kernel (plain dot products)
cosine_sim_linear_kernel = linear_kernel(X_train_tfidf, X_train_tfidf)
print(cosine_sim_linear_kernel.shape)

It is also known that linear_kernel is faster on very large, sparse TF-IDF matrices, since it computes plain dot products and skips the normalization step. Because TfidfVectorizer L2-normalizes its rows by default, the two produce the same results.
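
As a quick sanity check (a small sketch, not part of the original pipeline), we can confirm that the two matrices agree; comparing only a slice keeps the check cheap on such a large matrix.

# The TF-IDF rows are L2-normalized, so the dot product already equals the cosine similarity.
print(np.allclose(cosine_sim[:100], cosine_sim_linear_kernel[:100]))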

Next, how do we interpret the cosine similarity matrix computed on the TF-IDF matrix?

As mentioned earlier, cosine_similarity assumes your input is of shape n_samples by n_features, corresponding to the number of documents and the number of unique words, respectively.

It returns a matrix of shape n_samples by n_samples. The value at the \(i\)-th row and \(j\)-th column is the cosine similarity between the \(i\)-th document and the \(j\)-th document, denoted by:

\[ \text{cosine similarity}_{i, j} \]

Consequently, the matrix’s diagonal is \(1\) since the cosine similarity between a document and itself is \(1\).
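
We can verify this directly. One caveat (my assumption, not checked in the original): if an overview becomes an all-zero vector after stop-word removal, its self-similarity will be 0 rather than 1, so we print the result instead of asserting it.

# Diagonal entries are self-similarities and should be (numerically close to) 1.
print(np.allclose(np.diag(cosine_sim), 1.0))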

The recommender function below is adapted from here.

def recommender(
    title: str, df: pd.DataFrame, cosine_similarity: np.ndarray, top_k: int = 10
) -> pd.DataFrame:
    """Recommends movies based on the cosine similarity matrix.

    Args:
        title (str): Title of the movie.
        df (pd.DataFrame): DataFrame containing the movie dataset.
        cosine_similarity (np.ndarray): Cosine similarity matrix.
        top_k (int, optional): Number of top recommendations to return.
            Defaults to 10.

    Returns:
        pd.DataFrame: DataFrame containing the top-k recommendations
    """
    # Get the index of the movie that matches the title
    idx = df[df["Title"] == title].index[0]

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = cosine_similarity[idx]
    sim_scores = list(enumerate(sim_scores))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the top-k most similar movies, skipping the first
    # entry, which is the movie itself
    top_k_sim_scores = sim_scores[1 : top_k + 1]
    print(f"Top-k most similar movies: {top_k_sim_scores}")

    # Get the indices of the top-k most similar movies
    movie_indices = [i[0] for i in top_k_sim_scores]

    # Return the top-k most similar movies
    return df.iloc[movie_indices]

recommender(
    title="Batman: The Dark Knight Returns, Part 1",
    df=df_train,
    cosine_similarity=cosine_sim_linear_kernel,
)
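
The returned DataFrame contains the full rows of the top-k matches. For a cleaner display, you can subset the columns, for example Title and Release Date (column names as they appear in this dataset):

# Keep only a couple of columns of the recommendations for readability.
recommender(
    title="Batman: The Dark Knight Returns, Part 1",
    df=df_train,
    cosine_similarity=cosine_sim_linear_kernel,
)[["Title", "Release Date"]]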

With just TF-IDF and the cosine similarity metric, we can already build a somewhat naive recommender system.

df_train[df_train["Title"].str.contains("Batman")]

References and Further Readings#