content based commit

e1451ea1 · Adrien Payen · 9ec41711 · e1451ea1 · e1451ea1
--- a/analytics_small.ipynb
+++ b/analytics_small.ipynb
@@ -6,15 +6,274 @@
   "metadata": {},
   "outputs": [
    {
-     "ename": "ImportError",
+     "name": "stdout",
-     "evalue": "cannot import name 'Constant' from 'constants' (/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/constants.py)",
+     "output_type": "stream",
-     "output_type": "error",
+     "text": [
-     "traceback": [
+      "Display The Movies : \n"
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+     ]
-      "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
+    },
-      "Cell \u001b[0;32mIn[1], line 12\u001b[0m\n\u001b[1;32m      9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mscipy\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msparse\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m csr_matrix\n\u001b[1;32m     11\u001b[0m \u001b[38;5;66;03m# Constants and functions\u001b[39;00m\n\u001b[0;32m---> 12\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mconstants\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Constant \u001b[38;5;28;01mas\u001b[39;00m C\n\u001b[1;32m     13\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mloaders\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m load_ratings\n\u001b[1;32m     14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mloaders\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m load_items\n",
+    {
-      "\u001b[0;31mImportError\u001b[0m: cannot import name 'Constant' from 'constants' (/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/constants.py)"
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>title</th>\n",
+       "      <th>genres</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>movieId</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Grumpier Old Men (1995)</td>\n",
+       "      <td>Comedy|Romance</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15</th>\n",
+       "      <td>Cutthroat Island (1995)</td>\n",
+       "      <td>Action|Adventure|Romance</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>34</th>\n",
+       "      <td>Babe (1995)</td>\n",
+       "      <td>Children|Drama</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>59</th>\n",
+       "      <td>Confessional, The (Confessionnal, Le) (1995)</td>\n",
+       "      <td>Drama|Mystery</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>64</th>\n",
+       "      <td>Two if by Sea (1996)</td>\n",
+       "      <td>Comedy|Romance</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>148652</th>\n",
+       "      <td>The Ridiculous 6 (2015)</td>\n",
+       "      <td>Comedy|Western</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>151307</th>\n",
+       "      <td>The Lovers and the Despot</td>\n",
+       "      <td>(no genres listed)</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>152173</th>\n",
+       "      <td>Michael Jackson's Thriller (1983)</td>\n",
+       "      <td>Horror</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>160440</th>\n",
+       "      <td>The Maid's Room (2014)</td>\n",
+       "      <td>Thriller</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>160656</th>\n",
+       "      <td>Tallulah (2016)</td>\n",
+       "      <td>Drama</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>912 rows × 2 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                title  \\\n",
+       "movieId                                                 \n",
+       "3                             Grumpier Old Men (1995)   \n",
+       "15                            Cutthroat Island (1995)   \n",
+       "34                                        Babe (1995)   \n",
+       "59       Confessional, The (Confessionnal, Le) (1995)   \n",
+       "64                               Two if by Sea (1996)   \n",
+       "...                                               ...   \n",
+       "148652                        The Ridiculous 6 (2015)   \n",
+       "151307                      The Lovers and the Despot   \n",
+       "152173              Michael Jackson's Thriller (1983)   \n",
+       "160440                         The Maid's Room (2014)   \n",
+       "160656                                Tallulah (2016)   \n",
+       "\n",
+       "                           genres  \n",
+       "movieId                            \n",
+       "3                  Comedy|Romance  \n",
+       "15       Action|Adventure|Romance  \n",
+       "34                 Children|Drama  \n",
+       "59                  Drama|Mystery  \n",
+       "64                 Comedy|Romance  \n",
+       "...                           ...  \n",
+       "148652             Comedy|Western  \n",
+       "151307         (no genres listed)  \n",
+       "152173                     Horror  \n",
+       "160440                   Thriller  \n",
+       "160656                      Drama  \n",
+       "\n",
+       "[912 rows x 2 columns]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Display The Ratings : \n"
     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>userId</th>\n",
+       "      <th>movieId</th>\n",
+       "      <th>rating</th>\n",
+       "      <th>timestamp</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>15</td>\n",
+       "      <td>34</td>\n",
+       "      <td>3.0</td>\n",
+       "      <td>997938310</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>15</td>\n",
+       "      <td>95</td>\n",
+       "      <td>1.5</td>\n",
+       "      <td>1093028331</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>15</td>\n",
+       "      <td>101</td>\n",
+       "      <td>4.0</td>\n",
+       "      <td>1134522072</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>15</td>\n",
+       "      <td>123</td>\n",
+       "      <td>4.0</td>\n",
+       "      <td>997938358</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>15</td>\n",
+       "      <td>125</td>\n",
+       "      <td>3.5</td>\n",
+       "      <td>1245362506</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5291</th>\n",
+       "      <td>665</td>\n",
+       "      <td>3908</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1046967201</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5292</th>\n",
+       "      <td>665</td>\n",
+       "      <td>4052</td>\n",
+       "      <td>4.0</td>\n",
+       "      <td>992838277</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5293</th>\n",
+       "      <td>665</td>\n",
+       "      <td>4351</td>\n",
+       "      <td>4.0</td>\n",
+       "      <td>992837743</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5294</th>\n",
+       "      <td>665</td>\n",
+       "      <td>4643</td>\n",
+       "      <td>4.0</td>\n",
+       "      <td>997239207</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5295</th>\n",
+       "      <td>665</td>\n",
+       "      <td>5502</td>\n",
+       "      <td>4.0</td>\n",
+       "      <td>1046967596</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5296 rows × 4 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      userId  movieId  rating   timestamp\n",
+       "0         15       34     3.0   997938310\n",
+       "1         15       95     1.5  1093028331\n",
+       "2         15      101     4.0  1134522072\n",
+       "3         15      123     4.0   997938358\n",
+       "4         15      125     3.5  1245362506\n",
+       "...      ...      ...     ...         ...\n",
+       "5291     665     3908     1.0  1046967201\n",
+       "5292     665     4052     4.0   992838277\n",
+       "5293     665     4351     4.0   992837743\n",
+       "5294     665     4643     4.0   997239207\n",
+       "5295     665     5502     4.0  1046967596\n",
+       "\n",
+       "[5296 rows x 4 columns]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
    }
   ],
   "source": [
@@ -52,7 +311,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
@@ -71,7 +330,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
@@ -96,7 +355,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
@@ -148,7 +407,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
@@ -167,7 +426,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
@@ -186,7 +445,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
@@ -206,7 +465,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
@@ -229,7 +488,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
@@ -252,7 +511,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
@@ -286,7 +545,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
@@ -316,7 +575,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
@@ -349,7 +608,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -366,7 +625,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
@@ -429,7 +688,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {

 %% Cell type:code id: tags:
 ``` python
 # Reload modules automatically before entering the execution of code
 %load_ext autoreload
 %autoreload 2
 # Third-party imports
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
 from scipy.sparse import csr_matrix
 # Constants and functions
 from constants import Constant as C
 from loaders import load_ratings
 from loaders import load_items
 from tabulate import tabulate
 # Call the load_items() function and create a variable df_items
 df_movies = load_items()
 # Display the DataFrame
 print("Display The Movies : ")
 display(df_movies)
 # Call the load_ratings() function and create a variable df_ratings
 df_ratings = load_ratings()
 # Display the DataFrame
 print("Display The Ratings : ")
 display(df_ratings)
 ```
 %% Output
-    ---------------------------------------------------------------------------
+    Display The Movies :
-    ImportError                               Traceback (most recent call last)
-Cell     In[1], line 12
-          9 from scipy.sparse import csr_matrix
+    Display The Ratings :
-         11 # Constants and functions
-    ---> 12 from constants import Constant as C
-         13 from loaders import load_ratings
-         14 from loaders import load_items
-    ImportError: cannot import name 'Constant' from 'constants' (/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/constants.py)
 %% Cell type:code id: tags:
 ``` python
 # NUMBER OF MOVIES
 n_movies = df_movies['title'].nunique()
 print(f"Number of movies: {n_movies}")
 ```
 %% Output
    Number of movies: 912
 %% Cell type:code id: tags:
 ``` python
 # THE YEAR RANGE
 # Pull the four-digit release year written in parentheses in each title.
 # Matching digits only (rather than any four characters) keeps a
 # parenthesised four-letter word from being captured instead of the year;
 # expand=False makes str.extract return a Series instead of a DataFrame.
 df_movies['annee'] = pd.to_numeric(
     df_movies['title'].str.extract(r'\((\d{4})\)', expand=False),
     errors='coerce',  # titles without a year become NaN and are skipped by min/max
 )
 min_range = int(df_movies['annee'].min())
 max_range = int(df_movies['annee'].max())
 print("Minimum range:", min_range)
 print("Maximum range:", max_range)
 ```
 %% Output
    Minimum range: 1921
    Maximum range: 2016
 %% Cell type:code id: tags:
 ``` python
 # LIST OF MOVIE GENRES
 def tabulate_genres(df_movies):
    """Print an alphabetically sorted, one-per-row table of all distinct genres.

    The ``genres`` column is split on ``|`` and exploded on a temporary
    Series, so the DataFrame passed in is not modified.

    Args:
        df_movies: DataFrame with a ``genres`` column of pipe-separated
            strings (for example ``"Comedy|Romance"``).
    """
    # Do not assign the split lists back to df_movies['genres']: that would
    # replace the caller's strings with lists and corrupt the column for any
    # later use (including calling this function a second time).
    genre_series = df_movies['genres'].str.split('|').explode()
    unique_genres = sorted(genre_series.unique())
    # Tabulate
    print("\nList of all genres:")
    genres_table = [[genre, "|"] for genre in unique_genres]
    print(tabulate(genres_table, tablefmt="plain", numalign="left"))
 # Call the tabulate_genres function
 tabulate_genres(df_movies)
 ```
 %% Output
    List of all genres:
    (no genres listed)  |
    Action              |
    Adventure           |
    Animation           |
    Children            |
    Comedy              |
    Crime               |
    Documentary         |
    Drama               |
    Fantasy             |
    Film-Noir           |
    Horror              |
    IMAX                |
    Musical             |
    Mystery             |
    Romance             |
    Sci-Fi              |
    Thriller            |
    War                 |
    Western             |
 %% Cell type:code id: tags:
 ``` python
 # THE TOTAL NUMBER OF RATINGS
 n_ratings = df_ratings['rating'].count()
 print(f"Number of ratings: {n_ratings}")
 ```
 %% Output
    Number of ratings: 5296
 %% Cell type:code id: tags:
 ``` python
 # THE NUMBER OF UNIQUE USERS
 n_users = df_ratings['userId'].nunique()
 print(f"Number of users: {n_users}")
 ```
 %% Output
    Number of users: 107
 %% Cell type:code id: tags:
 ``` python
 # THE NUMBER OF UNIQUE MOVIES (IN THE RATING MATRIX)
 unique_movies = df_ratings["movieId"].unique()
 num_unique_movies = len(unique_movies)
 print(f"Number of unique movies : {num_unique_movies}")
 ```
 %% Output
    Number of unique movies : 834
 %% Cell type:code id: tags:
 ``` python
 # THE NUMBER OF RATINGS OF THE MOST RATED MOVIES
 def most_rated_movies_ratings_count(df_ratings):
    """Print how many ratings the most-rated movie(s) received.

    Args:
        df_ratings: DataFrame with ``movieId`` and ``rating`` columns.
    """
    ratings_per_movie = df_ratings.groupby('movieId')['rating'].count()
    # Every movie tied for first place shares this same count.
    highest_count = ratings_per_movie.max()
    print(f"Number of ratings of the most rated movie(s): {highest_count}")
 most_rated_movies_ratings_count(df_ratings)
 ```
 %% Output
    Number of ratings of the most rated movie(s): 75
 %% Cell type:code id: tags:
 ``` python
 # THE NUMBER OF RATINGS OF THE LEAST RATED MOVIES
 def least_rated_movies_ratings_count(df_ratings):
    """Print how many ratings the least-rated movie(s) received.

    Args:
        df_ratings: DataFrame with ``movieId`` and ``rating`` columns.
    """
    ratings_per_movie = df_ratings.groupby('movieId')['rating'].count()
    # Every movie tied for last place shares this same count.
    lowest_count = ratings_per_movie.min()
    print("Number of ratings of the least rated movie(s):", lowest_count)
 least_rated_movies_ratings_count(df_ratings)
 ```
 %% Output
    Number of ratings of the least rated movie(s): 1
 %% Cell type:code id: tags:
 ``` python
 # ALL THE POSSIBLE RATING VALUES, FROM THE SMALLEST TO THE HIGHEST
 def all_possible_ratings(df_ratings):
    """Print each distinct rating value on its own line, in ascending order.

    Args:
        df_ratings: DataFrame with a ``rating`` column.
    """
    distinct_ratings = sorted(df_ratings['rating'].unique())
    print("All possible rating values, from smallest to highest:")
    if distinct_ratings:
        # One value per line, exactly as individual print() calls would emit.
        print("\n".join(str(value) for value in distinct_ratings))
 all_possible_ratings(df_ratings)
 ```
 %% Output
    All possible rating values, from smallest to highest:
    0.5
    1.0
    1.5
    2.0
    2.5
    3.0
    3.5
    4.0
    4.5
    5.0
 %% Cell type:code id: tags:
 ``` python
 # THE NUMBER OF MOVIES THAT WERE NOT RATED AT ALL
 def unrated_movies_count(df_ratings, df_movies):
    """Print how many movies of ``df_movies`` have no rating in ``df_ratings``.

    Movies are identified by the index of ``df_movies``, which is compared
    with the ``movieId`` column of ``df_ratings``. When that column is
    absent, every movie counts as unrated.

    Args:
        df_ratings: ratings DataFrame (``movieId`` column optional).
        df_movies: movies DataFrame indexed by movie id.
    """
    if 'movieId' in df_ratings.columns:
        rated_ids = df_ratings['movieId'].unique()
    else:
        rated_ids = []
    has_rating = df_movies.index.isin(rated_ids)
    n_unrated = len(df_movies[~has_rating])
    print("Number of movies that were not rated at all:", n_unrated)
 unrated_movies_count(df_ratings, df_movies)
 ```
 %% Output
    Number of movies that were not rated at all: 78
 %% Cell type:markdown id: tags:
 LONG-TAIL PROPERTY
 %% Cell type:code id: tags:
 ``` python
 # Rating Frequency Distribution
 # Join ratings with the movie table on movieId, count the ratings each movie
 # received, then count how many movies share each of those rating counts.
 merged_df = pd.merge(df_ratings,df_movies, on='movieId')
 rating_counts = merged_df['movieId'].value_counts()
 value_counts = rating_counts.value_counts().sort_index()
 plt.figure(figsize=(20, 6))
 plt.plot(value_counts.values, value_counts.index, marker='o', color='skyblue', linestyle='-')  # x: how many movies, y: ratings per movie
 plt.title('Rating Frequency Distribution')
 plt.xlabel('Number of Movies')  # matches value_counts.values on the x-axis
 plt.ylabel('Number of Ratings')  # matches value_counts.index on the y-axis
 plt.xticks(rotation=45)
 plt.grid(axis='x', linestyle='--', alpha=0.7)  # dashed grid lines on the x-axis only
 plt.tight_layout()
 plt.show()
 ```
 %% Output
 %% Cell type:code id: tags:
 ``` python
 # Build the user x movie rating matrix inline.
 # M and N are the numbers of distinct users and movies in df_ratings.
 M = df_ratings['userId'].nunique()
 N = df_ratings['movieId'].nunique()
 # Map each distinct id to a 0-based position (np.unique returns sorted ids),
 # and build the reverse position -> id lookups.
 user_mapper = dict(zip(np.unique(df_ratings["userId"]), list(range(M))))
 movie_mapper = dict(zip(np.unique(df_ratings["movieId"]), list(range(N))))
 user_inv_mapper = dict(zip(list(range(M)), np.unique(df_ratings["userId"])))
 movie_inv_mapper = dict(zip(list(range(N)), np.unique(df_ratings["movieId"])))
 # Row and column position of every rating, in the order of df_ratings.
 user_index = [user_mapper[i] for i in df_ratings['userId']]
 item_index = [movie_mapper[i] for i in df_ratings['movieId']]
 # Sparse matrix of shape (M, N) holding the ratings at those positions.
 X = csr_matrix((df_ratings["rating"], (user_index,item_index)), shape=(M,N))
 ```
 %% Cell type:code id: tags:
 ``` python
 def create_X(df):
    """
    Generates a sparse user-item matrix from a ratings dataframe.

    Args:
        df: pandas dataframe containing 3 columns (userId, movieId, rating)

    Returns (in this order):
        X: sparse matrix with one row per user and one column per movie
        user_mapper: dict that maps user id's to user indices
        movie_mapper: dict that maps movie id's to movie indices
        user_inv_mapper: dict that maps user indices to user id's
        movie_inv_mapper: dict that maps movie indices to movie id's
    """
    # np.unique returns the sorted distinct ids; return_inverse additionally
    # gives, for every rating row, the position of its id in that sorted
    # array, which is exactly the row/column index needed below.
    user_ids, user_index = np.unique(df["userId"], return_inverse=True)
    movie_ids, item_index = np.unique(df["movieId"], return_inverse=True)
    M = len(user_ids)
    N = len(movie_ids)

    user_mapper = dict(zip(user_ids, range(M)))
    movie_mapper = dict(zip(movie_ids, range(N)))
    user_inv_mapper = dict(zip(range(M), user_ids))
    movie_inv_mapper = dict(zip(range(N), movie_ids))

    X = csr_matrix((df["rating"], (user_index, item_index)), shape=(M, N))
    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper
 # Assuming df_ratings contains your ratings dataframe
 X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_X(df_ratings)
 # Extract the 100 first users and 100 first items
 X_sub = X[:100, :100]
 # Plot the non-zero values of the sparse matrix
 plt.figure(figsize=(8, 6))
 plt.spy(X_sub, markersize=1)
 plt.title('Non-zero values of a sparse matrix')
 plt.xlabel('Movie Index')
 plt.ylabel('User Index')
 plt.show()
 ```
 %% Output
 %% Cell type:code id: tags:
 ``` python
 # Share of user-item cells that actually hold a rating.
 # NOTE(review): nnz / total is the density of the matrix (fraction of stored
 # entries); sparsity in the usual sense would be 1 - this value. Confirm
 # which of the two the printed label is meant to report.
 # NOTE(review): this rebinds n_ratings, which an earlier cell set to the
 # row count of df_ratings.
 n_total = X.shape[0]*X.shape[1]
 n_ratings = X.nnz
 sparsity = n_ratings/n_total
 print(f"Matrix sparsity: {round(sparsity*100,2)}%")
 ```
 %% Output
    Matrix sparsity: 5.93%

--- a/content_based.ipynb
+++ b/content_based.ipynb
@@ -10,20 +10,16 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 9,
   "id": "277473a3",
   "metadata": {},
   "outputs": [
    {
-     "ename": "ImportError",
+     "name": "stdout",
-     "evalue": "cannot import name 'Constant' from 'constants' (/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/constants.py)",
+     "output_type": "stream",
-     "output_type": "error",
+     "text": [
-     "traceback": [
+      "The autoreload extension is already loaded. To reload it, use:\n",
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "  %reload_ext autoreload\n"
-      "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[1], line 10\u001b[0m\n\u001b[1;32m      7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msurprise\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AlgoBase\n\u001b[1;32m      8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msurprise\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mprediction_algorithms\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpredictions\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m PredictionImpossible\n\u001b[0;32m---> 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mloaders\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m load_ratings\n\u001b[1;32m     11\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mloaders\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m load_items\n\u001b[1;32m     12\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mconstants\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Constant \u001b[38;5;28;01mas\u001b[39;00m C\n",
-      "File \u001b[0;32m~/vscodeworkspace/recomsys/loaders.py:7\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mos\u001b[39;00m\n\u001b[1;32m      6\u001b[0m \u001b[38;5;66;03m# Local imports\u001b[39;00m\n\u001b[0;32m----> 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mconstants\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Constant \u001b[38;5;28;01mas\u001b[39;00m C\n\u001b[1;32m      8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msurprise\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Reader, Dataset\n\u001b[1;32m     10\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload_ratings\u001b[39m(surprise_format\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m):\n",
-      "\u001b[0;31mImportError\u001b[0m: cannot import name 'Constant' from 'constants' (/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/constants.py)"
     ]
    }
   ],
@@ -54,7 +50,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 10,
   "id": "e8378976",
   "metadata": {},
   "outputs": [
@@ -147,7 +143,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 11,
   "id": "16b0a602",
   "metadata": {},
   "outputs": [],
@@ -184,14 +180,15 @@
    "        elif self.regressor_method == 'random_sample':\n",
    "            for u in self.user_profile:\n",
    "                self.user_profile[u] = [rating for _, rating in self.trainset.ur[u]]\n",
-    "        else:\n",
+    "        elif self.regressor_method == 'linear_regression' :\n",
    "            for u in self.user_profile:\n",
    "\n",
-    "                user_ratings = [(trainset.to_raw_iid(iid), rating) for (iid, rating) in trainset.ur[u]]\n",
+    "                user_ratings = [rating for _, rating in trainset.ur[u]]\n",
+    "                item_ids = [iid for iid, _ in trainset.ur[u]]\n",
    "\n",
-    "                df_user = pd.DataFrame(user_ratings, columns = [\"item_id\", \"user_ratings\"])\n",
+    "                df_user = pd.DataFrame({'item_id': item_ids, 'user_ratings': user_ratings})\n",
    "\n",
-    "                df_user[\"item_id\"] = df_user['item_id'].map(trainset.to_raw_idd)\n",
+    "                df_user[\"item_id\"] = df_user[\"item_id\"].map(trainset.to_raw_iid)\n",
    "\n",
    "                df_user = df_user.merge(self.content_features, left_on = \"item_id\", right_index = True, how = 'left')\n",
    "\n",
@@ -205,6 +202,8 @@
    "                \n",
    "                # Store the computed user profile\n",
    "                self.user_profile[u] = linear_regressor\n",
+    "        else : \n",
+    "            pass\n",
    "\n",
    "            # (implement here the regressor fitting)  \n",
    "        \n",
@@ -223,7 +222,7 @@
    "            rd.seed()\n",
    "            score = rd.choice(self.user_profile[u])\n",
    "        \n",
-    "        else:\n",
+    "        elif self.regressor_method == 'linear_regression':\n",
    "\n",
    "            raw_item_id = self.trainset.to_raw_iid(i)\n",
    "\n",
@@ -232,11 +231,12 @@
    "            linear_regressor = self.user_profile[u]\n",
    "\n",
    "            score= linear_regressor.predict(item_features)[0]\n",
-    "\n",
+    "        else : \n",
+    "            score = None\n",
    "\n",
    "            # (implement here the regressor prediction)\n",
    "\n",
-    "        return score\n"
+    "        return score"
   ]
  },
  {
@@ -249,7 +249,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
   "id": "69d12f7d",
   "metadata": {},
   "outputs": [
@@ -257,8 +257,8 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "user: 15         item: 942        r_ui = None   est = 3.59   {'was_impossible': False}\n",
+      "user: 15         item: 942        r_ui = None   est = 3.79   {'was_impossible': False}\n",
-      "user: 15         item: 942        r_ui = None   est = 3.00   {'was_impossible': False}\n"
+      "user: 15         item: 942        r_ui = None   est = 4.00   {'was_impossible': False}\n"
     ]
    }
   ],

 %% Cell type:markdown id:82d5ca82 tags:
 # Packages
 %% Cell type:code id:277473a3 tags:
 ``` python
 %load_ext autoreload
 %autoreload 2
 import numpy as np
 import pandas as pd
 import random as rd
 from surprise import AlgoBase
 from surprise.prediction_algorithms.predictions import PredictionImpossible
 from loaders import load_ratings
 from loaders import load_items
 from constants import Constant as C
 from sklearn.linear_model import LinearRegression
 ```
 %% Output
-    ---------------------------------------------------------------------------
+    The autoreload extension is already loaded. To reload it, use:
-    ImportError                               Traceback (most recent call last)
+      %reload_ext autoreload
-Cell     In[1], line 10
-          7 from surprise import AlgoBase
-          8 from surprise.prediction_algorithms.predictions import PredictionImpossible
-    ---> 10 from loaders import load_ratings
-         11 from loaders import load_items
-         12 from constants import Constant as C
-File     ~/vscodeworkspace/recomsys/loaders.py:7
-          3 import os
-          6 # Local imports
-    ----> 7 from constants import Constant as C
-          8 from surprise import Reader, Dataset
-         10 def load_ratings(surprise_format=False):
-    ImportError: cannot import name 'Constant' from 'constants' (/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/constants.py)
 %% Cell type:markdown id:a42c16bf tags:
 # Explore and select content features
 %% Cell type:code id:e8378976 tags:
 ``` python
 df_items = load_items()
 df_ratings = load_ratings()
 # Example 1 : create title_length features
 df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')
 display(df_features.head())
 # (explore here other features)
 ```
 %% Output
 %% Cell type:markdown id:a2c9a2b6 tags:
 # Build a content-based model
 When ready, move the following class into the *models.py* script
 %% Cell type:code id:16b0a602 tags:
 ``` python
 class ContentBased(AlgoBase):
    def __init__(self, features_method, regressor_method):
        """Remember the regressor choice and build the item feature table.

        Args:
            features_method: name of the feature construction passed to
                ``create_content_features`` (or None).
            regressor_method: name of the profile learner used by ``fit``
                and ``estimate``.
        """
        super().__init__()
        self.regressor_method = regressor_method
        # Features are computed once here and reused by fit() and estimate().
        item_features = self.create_content_features(features_method)
        self.content_features = item_features
    def create_content_features(self, features_method):
        """Content Analyzer: build the item feature table for ``features_method``.

        Returns None when ``features_method`` is None, a one-column frame
        named 'n_character_title' for "title_length", and raises
        NotImplementedError for any other value.
        """
        items = load_items()
        if features_method is None:
            return None
        if features_method == "title_length":
            # Naive method: a single feature, the character length of the title.
            title_lengths = items[C.LABEL_COL].apply(len)
            return title_lengths.to_frame('n_character_title')
        # (implement other feature creations here)
        raise NotImplementedError(f'Feature method {features_method} not yet implemented')
    def fit(self, trainset):
        """Profile Learner

        Fills ``self.user_profile`` with one entry per user returned by
        ``trainset.all_users()``. What is stored depends on
        ``self.regressor_method``:

        - 'random_score': a float drawn with ``rd.uniform(0.5, 5)``
        - 'random_sample': the list of that user's ratings in the trainset
        - 'linear_regression': a ``LinearRegression`` (no intercept) fitted
          on the 'n_character_title' feature of the items the user rated
        - anything else: the profile stays None
        """
        AlgoBase.fit(self, trainset)
        # Preallocate user profiles
        self.user_profile = {u: None for u in trainset.all_users()}
        if self.regressor_method == 'random_score':
            for u in self.user_profile :
                self.user_profile[u] = rd.uniform(0.5,5)
        elif self.regressor_method == 'random_sample':
            for u in self.user_profile:
                self.user_profile[u] = [rating for _, rating in self.trainset.ur[u]]
-        else:
+        elif self.regressor_method == 'linear_regression' :
            for u in self.user_profile:
-                user_ratings = [(trainset.to_raw_iid(iid), rating) for (iid, rating) in trainset.ur[u]]
+                user_ratings = [rating for _, rating in trainset.ur[u]]
+                item_ids = [iid for iid, _ in trainset.ur[u]]
-                df_user = pd.DataFrame(user_ratings, columns = ["item_id", "user_ratings"])
+                df_user = pd.DataFrame({'item_id': item_ids, 'user_ratings': user_ratings})
-                df_user["item_id"] = df_user['item_id'].map(trainset.to_raw_idd)
+                df_user["item_id"] = df_user["item_id"].map(trainset.to_raw_iid)
                # Attach the content features of each rated item; the feature
                # table is joined on its index, so item_id must hold the same
                # kind of id as that index.
                df_user = df_user.merge(self.content_features, left_on = "item_id", right_index = True, how = 'left')
                # NOTE(review): the feature column name is hard-coded, so this
                # branch only works with the "title_length" features.
                X = df_user['n_character_title'].values.reshape(-1,1)
                y = df_user['user_ratings'].values
                linear_regressor = LinearRegression(fit_intercept = False)
                linear_regressor.fit(X,y)
                # Store the computed user profile
                self.user_profile[u] = linear_regressor
+        else :
+            pass
            # (implement here the regressor fitting)
            # NOTE(review): an unrecognised regressor_method is accepted
            # silently and leaves every profile as None.
            # NOTE(review): there is no return statement, so fit() returns
            # None - confirm whether callers expect the fitted object back.
    def estimate(self, u, i):
        """Scoring component used for item filtering

        Returns the score for user ``u`` and item ``i`` according to
        ``self.regressor_method``; raises ``PredictionImpossible`` when the
        trainset does not know the user or the item.
        """
        # First, handle cases for unknown users and items
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unkown.')
        # NOTE(review): this branch draws a fresh random value and does not
        # read the value stored in self.user_profile by fit().
        if self.regressor_method == 'random_score':
            rd.seed()
            score = rd.uniform(0.5,5)
        elif self.regressor_method == 'random_sample':
            # Pick one of the ratings stored for this user by fit().
            rd.seed()
            score = rd.choice(self.user_profile[u])
-        else:
+        elif self.regressor_method == 'linear_regression':
            raw_item_id = self.trainset.to_raw_iid(i)
            # Slicing with loc[a:a, :] keeps a 2-D result, the shape that is
            # passed to predict() below.
            item_features = self.content_features.loc[raw_item_id:raw_item_id, :].values
            linear_regressor = self.user_profile[u]
            score= linear_regressor.predict(item_features)[0]
+        else :
+            score = None
            # (implement here the regressor prediction)
            # NOTE(review): an unrecognised regressor_method makes this
            # method return None instead of a numeric score.
        return score
 ```
 %% Cell type:markdown id:ffd75b7e tags:
 The following script tests the ContentBased class
 %% Cell type:code id:69d12f7d tags:
 ``` python
 def test_contentbased_class(feature_method, regressor_method):
    """Smoke-test the ContentBased class on a single (user, item) pair.

    Fits the model on the full trainset, then predicts and prints the result
    for the first (user, item) tuple of the anti-testset.
    """
    ratings_data = load_ratings(surprise_format=True)
    full_trainset = ratings_data.build_full_trainset()
    model = ContentBased(feature_method, regressor_method)
    model.fit(full_trainset)
    first_entry = full_trainset.build_anti_testset()[0]
    user_id, item_id = first_entry[0], first_entry[1]
    print(model.predict(user_id, item_id))
 # (call here the test functions with different regressor methods)
 test_contentbased_class(feature_method = "title_length" , regressor_method = "random_score")
 test_contentbased_class(feature_method = "title_length" , regressor_method = "random_sample")
 ```
 %% Output
-    user: 15         item: 942        r_ui = None   est = 3.59   {'was_impossible': False}
+    user: 15         item: 942        r_ui = None   est = 3.79   {'was_impossible': False}
-    user: 15         item: 942        r_ui = None   est = 3.00   {'was_impossible': False}
+    user: 15         item: 942        r_ui = None   est = 4.00   {'was_impossible': False}