Movie Recommender System
Recall that TF-IDF turns a corpus into a term-document matrix, where each document is represented as a vector whose dimensions are the terms.
In this section, we will use TF-IDF and cosine similarity to build a recommender system for movies.
from collections import OrderedDict
from datetime import datetime

import numpy as np
import pandas as pd
from datasets import load_dataset
from rich.pretty import pprint
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
Let’s load the data and take a look at it.
# Load the movie dataset from the Hugging Face Hub
dataset = load_dataset("SandipPalit/Movie_Dataset")
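Since pprint from rich is already imported, we can take a quick look at the structure of the returned DatasetDict (a small sanity check, assuming the download succeeded):
# Inspect the splits and features of the loaded dataset
pprint(dataset)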
We filter the dataset, keeping only movies released after the year 2000.
YEAR = 2000
dataset_cutoff = dataset.filter(lambda example: datetime.strptime(example["Release Date"], "%Y-%m-%d").year > YEAR)
dataset_cutoff
To get the training data, we index the dataset with the key train, obtaining our train_dataset. We then convert train_dataset to a dataframe and take a look at it.
train_dataset = dataset_cutoff["train"]
print(f"Number of training examples: {len(train_dataset)}")
df_train = train_dataset.to_pandas()
df_train.head()
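The columns we rely on below are Title, Release Date and Overview; listing the columns is a quick way to confirm they are present:
# List the available columns in the training dataframe
print(df_train.columns.tolist())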
We are interested in the Overview column, which contains each movie's description. We define X_train to be the array containing all the overviews (the Overview column). We will be less pedantic here and not split off a validation set.
X_train = train_dataset["Overview"]
X_train = np.array(X_train)
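Before vectorizing, it helps to peek at one raw overview (purely illustrative; the exact text depends on the dataset):
# Show the first few hundred characters of the first overview
print(X_train[0][:300])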
We will use the TfidfVectorizer from sklearn to convert the text to a matrix of TF-IDF features. This process can be treated as a feature extraction step.
# Initialize an instance of tf-idf Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words="english")
# Generate the tf-idf vectors for the corpus
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
print(X_train_tfidf.shape)
The shape tells us we have \(D = 34,552\) documents and \(T = 61,460\) unique words.
len(tfidf_vectorizer.vocabulary_)
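To make the document-term orientation concrete, here is a minimal sketch on a toy corpus (the three sentences below are made up purely for illustration):
# Each row of the TF-IDF matrix is one document, each column one vocabulary term
toy_corpus = [
    "a dark knight protects the city",
    "a detective hunts a serial killer",
    "the city celebrates its hero",
]
toy_vectorizer = TfidfVectorizer(stop_words="english")
toy_tfidf = toy_vectorizer.fit_transform(toy_corpus)
print(toy_tfidf.shape)  # (number of documents, number of unique non-stopword terms)
pprint(toy_vectorizer.vocabulary_)  # maps each term to its column index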
We will use the cosine_similarity function from sklearn.metrics.pairwise to compute the cosine similarity between all movies. This means computing the cosine similarity between each document and every other document in the corpus. Note that cosine_similarity takes in a matrix of n_samples by n_features and returns a matrix of n_samples by n_samples. So in our example, the documents should correspond to the rows and the features (terms) to the columns.
# %%time
# compute and print the cosine similarity matrix
cosine_sim = cosine_similarity(X_train_tfidf, X_train_tfidf)
print(cosine_sim.shape)
# %%time
# compute and print the cosine similarity matrix
cosine_sim_linear_kernel = linear_kernel(X_train_tfidf, X_train_tfidf)
print(cosine_sim_linear_kernel.shape)
It is also worth knowing that linear_kernel is faster for very large and sparse TF-IDF matrices: TfidfVectorizer L2-normalizes each row by default, so the plain dot product already equals the cosine similarity and the extra normalization step in cosine_similarity can be skipped. Both functions therefore produce the same results here.
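We can verify this numerically (a quick check, not required for the pipeline):
# The two similarity matrices should agree up to floating-point error
print(np.allclose(cosine_sim, cosine_sim_linear_kernel))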
Next, how do we interpret the cosine similarity matrix computed on the TF-IDF matrix? As mentioned earlier, cosine_similarity assumes the input has shape n_samples by n_features, corresponding to the number of documents and the number of unique words respectively. It returns a matrix of shape n_samples by n_samples, where the value at the \(i\)-th row and \(j\)-th column is the cosine similarity between the \(i\)-th document and the \(j\)-th document:

\[
\operatorname{cosine\_similarity}(\mathbf{x}_i, \mathbf{x}_j) = \frac{\mathbf{x}_i \cdot \mathbf{x}_j}{\lVert \mathbf{x}_i \rVert \, \lVert \mathbf{x}_j \rVert},
\]

where \(\mathbf{x}_i\) and \(\mathbf{x}_j\) are the TF-IDF vectors of documents \(i\) and \(j\).
Consequently, the matrix’s diagonal is \(1\) since the cosine similarity between a document and itself is \(1\).
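A quick sanity check on the diagonal:
# Every document has cosine similarity (approximately) 1 with itself
print(np.allclose(np.diag(cosine_sim), 1.0))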
The recommender function below is adapted from here.
def recommender(
    title: str, df: pd.DataFrame, cosine_similarity: np.ndarray, top_k: int = 10
) -> pd.DataFrame:
    """Recommends movies based on the cosine similarity matrix.

    Args:
        title (str): Title of the movie.
        df (pd.DataFrame): DataFrame containing the movie dataset.
        cosine_similarity (np.ndarray): Cosine similarity matrix.
        top_k (int, optional): Number of top recommendations to return.
            Defaults to 10.

    Returns:
        pd.DataFrame: DataFrame containing the top-k recommendations.
    """
    # Get the index of the movie that matches the title
    idx = df[df["Title"] == title].index[0]

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = cosine_similarity[idx]
    sim_scores = list(enumerate(sim_scores))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the top-k most similar movies, skipping the movie itself
    top_k_sim_scores = sim_scores[1 : top_k + 1]
    print(f"Top-k most similar movies: {top_k_sim_scores}")

    # Get the indices of the top-k most similar movies
    movie_indices = [i[0] for i in top_k_sim_scores]

    # Return the top-k most similar movies
    return df.iloc[movie_indices]
recommender(title="Batman: The Dark Knight Returns, Part 1", df=df_train, cosine_similarity=cosine_sim_linear_kernel)
With just TF-IDF and the cosine similarity metric, we can already build a somewhat naive recommender system.
df_train[df_train["Title"].str.contains("Batman")]
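As a closing note, materializing the full \(D \times D\) similarity matrix needs quadratic memory. A memory-friendlier variant, sketched below with a hypothetical helper recommend_on_demand, computes the similarities for a single query movie only when they are requested:
def recommend_on_demand(
    title: str, df: pd.DataFrame, tfidf_matrix, top_k: int = 10
) -> pd.DataFrame:
    """Compute similarities for one query movie instead of the full matrix."""
    # Get the row index of the query movie
    idx = df[df["Title"] == title].index[0]

    # 1 x D row of similarities between the query movie and every movie;
    # linear_kernel suffices because the TF-IDF rows are L2-normalized
    scores = linear_kernel(tfidf_matrix[idx], tfidf_matrix).ravel()

    # Sort in descending order and skip the query movie itself
    top_indices = np.argsort(-scores)[1 : top_k + 1]
    return df.iloc[top_indices]

recommend_on_demand(
    title="Batman: The Dark Knight Returns, Part 1",
    df=df_train,
    tfidf_matrix=X_train_tfidf,
)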