# Source code for pysuggestify.PMF

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

class PMF:
    """
    Probabilistic Matrix Factorization class for building recommendation models.

    The rating matrix ``R`` (users x items) is approximated by ``U.T @ V``,
    where ``U`` has shape ``(n_dims, n_users)`` and ``V`` has shape
    ``(n_dims, n_items)``. Training alternates closed-form, L2-regularized
    least-squares updates of ``U`` and ``V``.

    Note:
        Throughout the class an entry of ``R`` counts as "observed" only when
        it is strictly greater than zero (``R > 0``). Ratings that are zero or
        negative are therefore treated as missing.

    Args:
        n_dims (int): Number of latent dimensions. Defaults to 50.
        lambda_U (float): Regularization weight for the user factors. It is
            also used as ``1 / lambda_U`` for the standard deviation of the
            random initialization, so it must be non-zero. Defaults to 0.3.
        lambda_V (float): Regularization weight for the item factors, used
            analogously to ``lambda_U``. Defaults to 0.3.
    """

    def __init__(self, n_dims=50, lambda_U=0.3, lambda_V=0.3):
        self.n_dims = n_dims
        self.lambda_U = lambda_U
        self.lambda_V = lambda_V

    def prepare_data(self, df: pd.DataFrame, row_id_name: str, col_id_name: str, rating_name: str) -> None:
        """
        Prepares the data for building a PMF recommendation model.

        Stores the data frame and column names on the instance, records the
        observed rating range and builds the dense rating matrix together
        with the ID-to-index dictionaries.

        Args:
            df (pandas.DataFrame): The input data containing user-item interactions.
                It should have the following columns: user ID, item ID and rating.
            row_id_name (str): Name of the column holding user IDs (matrix rows).
            col_id_name (str): Name of the column holding item IDs (matrix columns).
            rating_name (str): Name of the column holding ratings.

        Returns:
            None

        Raises:
            KeyError: If any of the given column names is not present in ``df``.

        Examples:
            # Example: Loading data from a CSV file
            PMF_model.prepare_data(ratings, 'userId', 'movieId', 'rating')
        """
        # Fail early with a readable message instead of deep inside pandas.
        missing = [name for name in (row_id_name, col_id_name, rating_name) if name not in df.columns]
        if missing:
            raise KeyError(f'Columns not found in data frame: {missing}')

        self.df = df
        self.row_id_name = row_id_name
        self.col_id_name = col_id_name
        self.rating_name = rating_name
        self.rating_range = (df[rating_name].min(), df[rating_name].max())
        self.R, self.n_users, self.n_items, self.user_to_row, self.item_to_column = self.get_rating_matrix()
        print('Rating matrix prepared!')
        print('Number of users: ', self.n_users)
        print('Number of items: ', self.n_items)

    def get_rating_matrix(self) -> tuple:
        """
        Builds the rating matrix R representing user-item interactions
        together with the bookkeeping needed to index it.

        Rows and columns follow the sorted order of the unique user and item
        IDs. Cells without a rating are left at zero. When the same
        (user, item) pair occurs more than once, the last occurrence wins.

        Args:
            None

        Returns:
            R: rating matrix representing user-item interactions
            n_users: number of unique users
            n_items: number of unique items
            _user_to_row: dictionary which translates user ID to row number
            _item_to_column: dictionary which translates item ID to column number
        """
        # Keep only the last rating per (user, item) pair so that the
        # vectorized assignment below is deterministic.
        pairs = self.df.drop_duplicates(subset=[self.row_id_name, self.col_id_name], keep='last')

        # ``return_inverse`` yields, for every row of ``pairs``, the position
        # of its ID in the sorted unique array, i.e. the matrix index.
        uniq_users, row_idx = np.unique(pairs[self.row_id_name].values, return_inverse=True)
        uniq_items, col_idx = np.unique(pairs[self.col_id_name].values, return_inverse=True)

        n_users = len(uniq_users)
        n_items = len(uniq_items)

        _user_to_row = dict(zip(uniq_users, range(n_users)))
        _item_to_column = dict(zip(uniq_items, range(n_items)))

        R = np.zeros((n_users, n_items))
        R[row_idx, col_idx] = pairs[self.rating_name].values

        return R, n_users, n_items, _user_to_row, _item_to_column

    def transpose_dict(self, dictionary='user') -> dict:
        """
        Transposes the given dictionary, swapping keys and values.

        Args:
            dictionary (str): Specifies the dictionary to transpose. Should be either 'user' or 'item'.
                If 'user' is provided, the user-to-row dictionary will be transposed, swapping user IDs
                with their corresponding row indices. If 'item' is provided, the item-to-column dictionary
                will be transposed, swapping item IDs with their corresponding column indices.
                Defaults to 'user'.

        Returns:
            dict: The transposed dictionary, where keys and values are swapped.

        Raises:
            ValueError: If ``dictionary`` is neither 'user' nor 'item'.

        Examples:
            # Example 1: Transposing the user-to-row dictionary
            PMF_model = PMF(n_dims = 30, lambda_U = 0.3, lambda_V = 0.3)
            PMF_model.prepare_data(ratings, 'userId', 'movieId', 'rating')
            transposed_user_dict = PMF_model.transpose_dict(dictionary='user')

            # Example 2: Transposing the item-to-column dictionary
            PMF_model = PMF(n_dims = 30, lambda_U = 0.3, lambda_V = 0.3)
            PMF_model.prepare_data(ratings, 'userId', 'movieId', 'rating')
            transposed_item_dict = PMF_model.transpose_dict(dictionary='item')
        """
        if dictionary == 'user':
            mapping = self.user_to_row
        elif dictionary == 'item':
            mapping = self.item_to_column
        else:
            raise ValueError(f"dictionary must be 'user' or 'item', got {dictionary!r}")
        return {index: key for key, index in mapping.items()}

    def initialize_parameters(self) -> dict:
        """
        Initializes the parameters for the recommendation model.

        Returns:
            dict: Dictionary with keys 'U' (shape ``(n_dims, n_users)``) and
            'V' (shape ``(n_dims, n_items)``), both drawn from a zero-mean
            normal distribution with standard deviation ``1 / lambda``.
        """
        U = np.random.normal(0.0, 1.0/self.lambda_U, (self.n_dims, self.n_users))
        V = np.random.normal(0.0, 1.0/self.lambda_V, (self.n_dims, self.n_items))
        return {'U': U, 'V': V}

    def update_parameters(self) -> None:
        """
        Performs one alternating update of the latent matrices U and V.

        Every user column of U is set to the regularized least-squares
        solution given the current V and that user's observed ratings; then
        every item column of V is updated the same way using the new U.
        The arrays stored in ``self.parameters`` are modified in place.
        """
        U = self.parameters['U']
        V = self.parameters['V']
        observed = self.R > 0

        # Loop-invariant ridge terms.
        ridge_U = self.lambda_U * np.identity(self.n_dims)
        ridge_V = self.lambda_V * np.identity(self.n_dims)

        for i in range(self.n_users):
            rated = observed[i, :]
            V_j = V[:, rated]
            # Solve the normal equations directly rather than forming an inverse.
            U[:, i] = np.linalg.solve(V_j @ V_j.T + ridge_U, V_j @ self.R[i, rated])

        for j in range(self.n_items):
            raters = observed[:, j]
            U_i = U[:, raters]
            V[:, j] = np.linalg.solve(U_i @ U_i.T + ridge_V, U_i @ self.R[raters, j])

        self.parameters['U'] = U
        self.parameters['V'] = V

    def log_a_posteriori(self):
        """
        Returns the log a-posteriori probability (up to an additive constant).

        Computed as ``-0.5 * (SSE + lambda_U * ||U||^2 + lambda_V * ||V||^2)``
        where SSE is the sum of squared errors over observed ratings and
        ``||.||^2`` is the squared Frobenius norm.
        """
        U = self.parameters['U']
        V = self.parameters['V']

        observed = self.R > 0
        residuals = self.R[observed] - (U.T @ V)[observed]

        # The squared Frobenius norm is the sum of squared entries; summing
        # all entries of U @ U.T would wrongly include the cross terms.
        sq_error = np.sum(residuals ** 2)
        penalty_U = self.lambda_U * np.sum(U ** 2)
        penalty_V = self.lambda_V * np.sum(V ** 2)
        return -0.5 * (sq_error + penalty_U + penalty_V)

    def update_max_min_ratings(self) -> None:
        """
        Update min and max ratings in predicted matrix.

        Stores the extremes of the raw (unclamped) ``U.T @ V`` under the
        'min_rating' and 'max_rating' keys of ``self.parameters``.
        """
        raw_pred = self.parameters['U'].T @ self.parameters['V']
        self.parameters['min_rating'] = np.min(raw_pred)
        self.parameters['max_rating'] = np.max(raw_pred)

    def predict(self):
        """
        Generates the predicted rating matrix using the trained model parameters.

        Predictions above the observed maximum rating are clamped to it.
        Predictions below the observed minimum are clamped to the minimum,
        except values that are exactly zero, which are left unchanged.

        Returns:
            R_pred: The predicted rating matrix, where rows correspond to users and
            columns correspond to items. Each element of the matrix represents the predicted
            rating for a user-item pair.

        Examples:
            # Example:
            PMF_model = PMF(n_dims = 30, lambda_U = 0.3, lambda_V = 0.3)
            PMF_model.prepare_data(ratings, 'userId', 'movieId', 'rating')
            PMF_model.fit(n_epochs=50)
            PMF_model.predict()

        """
        lowest, highest = self.rating_range
        R_pred = self.parameters['U'].T @ self.parameters['V']
        R_pred[R_pred > highest] = highest
        R_pred[(R_pred < lowest) & (R_pred != 0)] = lowest
        return R_pred

    def predict_one(self, user_id, item_id):
        """
        Predicts the rating for a specific user-item pair using the trained model parameters.

        The raw prediction is clamped to the observed rating range.

        Args:
            user_id: The ID of the user for whom the rating is predicted.
            item_id: The ID of the item for which the rating is predicted.

        Returns:
            float: The predicted rating for the specified user-item pair.

        Raises:
            KeyError: If the user ID or item ID is not found in the respective dictionaries.

        Examples:
            # Example:
            predicted_rating = PMF_model.predict_one(user_id=42, item_id=101)
        """
        try:
            row = self.user_to_row[user_id]
        except KeyError:
            raise KeyError(f'Unknown user ID: {user_id!r}') from None
        try:
            col = self.item_to_column[item_id]
        except KeyError:
            raise KeyError(f'Unknown item ID: {item_id!r}') from None

        lowest, highest = self.rating_range
        raw = self.parameters['U'][:, row] @ self.parameters['V'][:, col]
        return np.clip(raw, lowest, highest)

    def evaluate(self):
        """
        Evaluates the performance of the recommendation model using root mean squared error (RMSE).

        Only observed entries of the rating matrix (``R > 0``) are compared
        against the clamped predictions returned by ``predict``.

        Returns:
            float: The root mean squared error (RMSE) value indicating the model's performance.

        Examples:
            # Example:
            rmse_score = PMF_model.evaluate()
        """
        ratings_mask = self.R > 0
        errors = self.R[ratings_mask] - self.predict()[ratings_mask]
        return np.sqrt(np.mean(np.square(errors)))

    def fit(self, n_epochs=10) -> None:
        """
        Trains the recommendation model using the specified number of epochs.

        Parameters are re-initialized on every call. After each epoch the log
        a-posteriori value and the RMSE are appended to ``self.history`` and a
        progress line is printed.

        Args:
            n_epochs (int): The number of epochs (iterations) for training the model.
                Defaults to 10.

        Returns:
            None

        Examples:
            # Example:
            PMF_model.fit(n_epochs=20)
        """
        self.parameters = self.initialize_parameters()
        self.history = {'log_p': [], 'rmse': []}

        # Ensures min/max are populated even when n_epochs is 0.
        self.update_max_min_ratings()

        for k in range(n_epochs):
            start_time = time.time()
            self.update_parameters()
            log_ap = self.log_a_posteriori()
            self.history['log_p'].append(log_ap)

            self.update_max_min_ratings()
            new_rmse = self.evaluate()
            self.history['rmse'].append(new_rmse)
            end_time = time.time()

            print(f'Epoch {k+1}/{n_epochs} \n  Time: {round(end_time - start_time, 1)}s, Log p posteriori: {round(log_ap, 4)}, RMSE: {round(new_rmse, 4)}')

    def plot_history(self) -> None:
        """
        Plots the history of the training process, including the log a-posteriori probability
        and root mean square error (RMSE) values over the epochs.

        Returns:
            None

        Examples:
            # Example:
            PMF_model.plot_history()
        """
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
        ax1.set_title('Log a-posteriori probability')
        ax1.plot(np.arange(1, len(self.history['log_p'])+1), self.history['log_p'], label='log_p')
        ax1.set_xlabel('epoch')
        ax1.set_ylabel('log_p')
        ax1.legend()

        ax2.set_title('Root Mean Square Error')
        ax2.plot(np.arange(1, len(self.history['rmse'])+1), self.history['rmse'], label='RMSE')
        ax2.set_xlabel('epoch')
        ax2.set_ylabel('RMSE')
        ax2.legend()
        plt.show()

    def get_masked_preds(self):
        """
        Returns the masked predictions where the original rating matrix has zero values.

        Returns:
            numpy.ndarray: The masked predictions, where predictions are shown only for positions
                where the original rating matrix has zero values. Positions with non-zero values
                in the original matrix are masked (set to zero) in the predictions.

        Examples:
            # Example: Getting the masked predictions
            PMF_model.get_masked_preds()
        """
        return self.predict()*(self.R == 0)