commit feature methods

25cec0bf · Adrien Payen · 861461d5 · 25cec0bf · 25cec0bf
--- a/constants.py
+++ b/constants.py
@@ -16,6 +16,10 @@ class Constant:
    LABEL_COL = 'title' # Column name for item labels
    GENRES_COL = 'genres'  # Column name for item genres
+    TAGS_FILENAME = "tags.csv"
+    TAG = 'tag'
    # Evidence
    EVIDENCE_PATH = DATA_PATH / 'evidence' # Path to evidence data
    # - ratings

--- a/content_based.ipynb
+++ b/content_based.ipynb
@@ -10,7 +10,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 41,
   "id": "277473a3",
   "metadata": {},
   "outputs": [
@@ -50,7 +50,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 42,
   "id": "e8378976",
   "metadata": {},
   "outputs": [
@@ -84,24 +84,24 @@
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
-       "      <th>3</th>\n",
+       "      <th>4993</th>\n",
-       "      <td>23</td>\n",
+       "      <td>57</td>\n",
       "    </tr>\n",
       "    <tr>\n",
-       "      <th>15</th>\n",
+       "      <th>5952</th>\n",
-       "      <td>23</td>\n",
+       "      <td>45</td>\n",
       "    </tr>\n",
       "    <tr>\n",
-       "      <th>34</th>\n",
+       "      <th>527</th>\n",
-       "      <td>11</td>\n",
+       "      <td>23</td>\n",
       "    </tr>\n",
       "    <tr>\n",
-       "      <th>59</th>\n",
+       "      <th>2028</th>\n",
-       "      <td>44</td>\n",
+       "      <td>26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
-       "      <th>64</th>\n",
+       "      <th>4308</th>\n",
-       "      <td>20</td>\n",
+       "      <td>19</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
@@ -110,11 +110,25 @@
      "text/plain": [
       "         n_character_title\n",
       "movieId                   \n",
-       "3                       23\n",
+       "4993                    57\n",
-       "15                      23\n",
+       "5952                    45\n",
-       "34                      11\n",
+       "527                     23\n",
-       "59                      44\n",
+       "2028                    26\n",
-       "64                      20"
+       "4308                    19"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "0         long\n",
+       "1       boring\n",
+       "2         long\n",
+       "3      romance\n",
+       "4    stupidity\n",
+       "Name: tag, dtype: object"
      ]
     },
     "metadata": {},
@@ -129,6 +143,10 @@
    "df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')\n",
    "display(df_features.head())\n",
    "\n",
+    "df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)\n",
+    "df_features = df_tag[C.TAG]\n",
+    "display(df_features.head())\n",
+    "\n",
    "# (explore here other features)\n"
   ]
  },
@@ -143,7 +161,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 43,
   "id": "16b0a602",
   "metadata": {},
   "outputs": [],
@@ -161,6 +179,28 @@
    "            df_features = None\n",
    "        elif features_method == \"title_length\": # a naive method that creates only 1 feature based on title length\n",
    "            df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')\n",
+    "\n",
+    "        elif features_method == \"movie_year\" :\n",
+    "            df_features = df_items['movie_year'] = df_items['title'].str.extract(r'\\((\\d{4})\\)', expand=False)\n",
+    "\n",
+    "        elif features_method == \"genres\" :\n",
+    "            genres_list = df_items['genres'].str.split('|').explode().unique()\n",
+    "            for genre in genres_list:\n",
+    "                df_features = df_items['genres'].str.contains(genre).astype(int)\n",
+    "\n",
+    "        elif features_method == \"rating\" :\n",
+    "            df_features = df_ratings.groupby('movieId')['rating'].transform('mean').to_frame('avg_rating')\n",
+    "\n",
+    "        elif features_method == \"tags\" :\n",
+    "            df_features = df_tag['tag'].apply(lambda x: len(x.split(',')))\n",
+    "\n",
+    "        elif features_method == \"tags_length\" :\n",
+    "             df_features = df_tag['tag'].apply(lambda x: sum(len(tag) for tag in x.split(',')))\n",
+    "\n",
+    "        elif features_method == \"timestamp\" :\n",
+    "            df_features =  df_ratings['timestamp_sin'] = np.sin(2 * np.pi * df_ratings['timestamp'] / 86400)\n",
+    "            df_features =  df_ratings['timestamp_cos'] = np.cos(2 * np.pi * df_ratings['timestamp'] / 86400)\n",
+    "\n",
    "        else: # (implement other feature creations here)\n",
    "            raise NotImplementedError(f'Feature method {features_method} not yet implemented')\n",
    "        return df_features\n",
@@ -249,7 +289,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 44,
   "id": "69d12f7d",
   "metadata": {},
   "outputs": [
@@ -257,8 +297,40 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "user: 15         item: 942        r_ui = None   est = 3.79   {'was_impossible': False}\n",
+      "user: 11         item: 1214       r_ui = None   est = 0.86   {'was_impossible': False}\n",
-      "user: 15         item: 942        r_ui = None   est = 4.00   {'was_impossible': False}\n"
+      "user: 11         item: 1214       r_ui = None   est = 1.00   {'was_impossible': False}\n",
+      "user: 11         item: 1214       r_ui = None   est = 4.42   {'was_impossible': False}\n",
+      "user: 11         item: 1214       r_ui = None   est = 3.00   {'was_impossible': False}\n",
+      "user: 11         item: 1214       r_ui = None   est = 4.53   {'was_impossible': False}\n",
+      "user: 11         item: 1214       r_ui = None   est = 3.00   {'was_impossible': False}\n",
+      "user: 11         item: 1214       r_ui = None   est = 0.72   {'was_impossible': False}\n",
+      "user: 11         item: 1214       r_ui = None   est = 4.00   {'was_impossible': False}\n",
+      "user: 11         item: 1214       r_ui = None   est = 3.33   {'was_impossible': False}\n",
+      "user: 11         item: 1214       r_ui = None   est = 3.00   {'was_impossible': False}\n"
+     ]
+    },
+    {
+     "ename": "KeyError",
+     "evalue": "'timestamp'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
+      "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/indexes/base.py:3791\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m   3790\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3791\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   3792\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n",
+      "File \u001b[0;32mindex.pyx:152\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
+      "File \u001b[0;32mindex.pyx:181\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
+      "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7080\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
+      "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7088\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
+      "\u001b[0;31mKeyError\u001b[0m: 'timestamp'",
+      "\nThe above exception was the direct cause of the following exception:\n",
+      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[44], line 33\u001b[0m\n\u001b[1;32m     30\u001b[0m test_contentbased_class(feature_method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtags_length\u001b[39m\u001b[38;5;124m\"\u001b[39m, regressor_method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrandom_score\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m     31\u001b[0m test_contentbased_class(feature_method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtags_length\u001b[39m\u001b[38;5;124m\"\u001b[39m, regressor_method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrandom_sample\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 33\u001b[0m \u001b[43mtest_contentbased_class\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfeature_method\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtimestamp\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mregressor_method\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrandom_score\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m     34\u001b[0m test_contentbased_class(feature_method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtimestamp\u001b[39m\u001b[38;5;124m\"\u001b[39m, regressor_method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrandom_sample\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
+      "Cell \u001b[0;32mIn[44], line 7\u001b[0m, in \u001b[0;36mtest_contentbased_class\u001b[0;34m(feature_method, regressor_method)\u001b[0m\n\u001b[1;32m      5\u001b[0m sp_ratings \u001b[38;5;241m=\u001b[39m load_ratings(surprise_format\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m      6\u001b[0m train_set \u001b[38;5;241m=\u001b[39m sp_ratings\u001b[38;5;241m.\u001b[39mbuild_full_trainset()\n\u001b[0;32m----> 7\u001b[0m content_algo \u001b[38;5;241m=\u001b[39m \u001b[43mContentBased\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfeature_method\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mregressor_method\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m      8\u001b[0m content_algo\u001b[38;5;241m.\u001b[39mfit(train_set)\n\u001b[1;32m      9\u001b[0m anti_test_set_first \u001b[38;5;241m=\u001b[39m train_set\u001b[38;5;241m.\u001b[39mbuild_anti_testset()[\u001b[38;5;241m0\u001b[39m]\n",
+      "Cell \u001b[0;32mIn[43], line 5\u001b[0m, in \u001b[0;36mContentBased.__init__\u001b[0;34m(self, features_method, regressor_method)\u001b[0m\n\u001b[1;32m      3\u001b[0m AlgoBase\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m)\n\u001b[1;32m      4\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mregressor_method \u001b[38;5;241m=\u001b[39m regressor_method\n\u001b[0;32m----> 5\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontent_features \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcreate_content_features\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfeatures_method\u001b[49m\u001b[43m)\u001b[49m\n",
+      "Cell \u001b[0;32mIn[43], line 33\u001b[0m, in \u001b[0;36mContentBased.create_content_features\u001b[0;34m(self, features_method)\u001b[0m\n\u001b[1;32m     30\u001b[0m      df_features \u001b[38;5;241m=\u001b[39m df_tag[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtag\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: \u001b[38;5;28msum\u001b[39m(\u001b[38;5;28mlen\u001b[39m(tag) \u001b[38;5;28;01mfor\u001b[39;00m tag \u001b[38;5;129;01min\u001b[39;00m x\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m,\u001b[39m\u001b[38;5;124m'\u001b[39m)))\n\u001b[1;32m     32\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m features_method \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtimestamp\u001b[39m\u001b[38;5;124m\"\u001b[39m :\n\u001b[0;32m---> 33\u001b[0m     df_features \u001b[38;5;241m=\u001b[39m  df_items[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtimestamp_sin\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39msin(\u001b[38;5;241m2\u001b[39m \u001b[38;5;241m*\u001b[39m np\u001b[38;5;241m.\u001b[39mpi \u001b[38;5;241m*\u001b[39m \u001b[43mdf_items\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtimestamp\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m \u001b[38;5;241m/\u001b[39m \u001b[38;5;241m86400\u001b[39m)\n\u001b[1;32m     34\u001b[0m     df_features \u001b[38;5;241m=\u001b[39m  df_items[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtimestamp_cos\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mcos(\u001b[38;5;241m2\u001b[39m \u001b[38;5;241m*\u001b[39m np\u001b[38;5;241m.\u001b[39mpi \u001b[38;5;241m*\u001b[39m df_items[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtimestamp\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m/\u001b[39m \u001b[38;5;241m86400\u001b[39m)\n\u001b[1;32m     36\u001b[0m 
\u001b[38;5;28;01melse\u001b[39;00m: \u001b[38;5;66;03m# (implement other feature creations here)\u001b[39;00m\n",
+      "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/frame.py:3893\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m   3891\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m   3892\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 3893\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   3894\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m   3895\u001b[0m     indexer \u001b[38;5;241m=\u001b[39m [indexer]\n",
+      "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/indexes/base.py:3798\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m   3793\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m   3794\u001b[0m         \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m   3795\u001b[0m         \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m   3796\u001b[0m     ):\n\u001b[1;32m   3797\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3798\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m   3799\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m   3800\u001b[0m     \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m   3801\u001b[0m     \u001b[38;5;66;03m#  InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m   3802\u001b[0m     \u001b[38;5;66;03m#  the TypeError.\u001b[39;00m\n\u001b[1;32m   3803\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n",
+      "\u001b[0;31mKeyError\u001b[0m: 'timestamp'"
     ]
    }
   ],
@@ -278,7 +350,25 @@
    "# (call here the test functions with different regressor methods)\n",
    "\n",
    "test_contentbased_class(feature_method = \"title_length\" , regressor_method = \"random_score\")\n",
-    "test_contentbased_class(feature_method = \"title_length\" , regressor_method = \"random_sample\")"
+    "test_contentbased_class(feature_method = \"title_length\" , regressor_method = \"random_sample\")\n",
+    "\n",
+    "test_contentbased_class(feature_method=\"movie_year\", regressor_method=\"random_score\")\n",
+    "test_contentbased_class(feature_method=\"movie_year\", regressor_method=\"random_sample\")\n",
+    "\n",
+    "test_contentbased_class(feature_method=\"genres\", regressor_method=\"random_score\")\n",
+    "test_contentbased_class(feature_method=\"genres\", regressor_method=\"random_sample\")\n",
+    "\n",
+    "test_contentbased_class(feature_method=\"rating\", regressor_method=\"random_score\")\n",
+    "test_contentbased_class(feature_method=\"rating\", regressor_method=\"random_sample\")\n",
+    "\n",
+    "test_contentbased_class(feature_method=\"tags\", regressor_method=\"random_score\")\n",
+    "test_contentbased_class(feature_method=\"tags\", regressor_method=\"random_sample\")\n",
+    "\n",
+    "test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"random_score\")\n",
+    "test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"random_sample\")\n",
+    "\n",
+    "test_contentbased_class(feature_method=\"timestamp\", regressor_method=\"random_score\")\n",
+    "test_contentbased_class(feature_method=\"timestamp\", regressor_method=\"random_sample\")"
   ]
  }
 ],

 %% Cell type:markdown id:82d5ca82 tags:
 # Packages
 %% Cell type:code id:277473a3 tags:
 ``` python
 %load_ext autoreload
 %autoreload 2
 import numpy as np
 import pandas as pd
 import random as rd
 from surprise import AlgoBase
 from surprise.prediction_algorithms.predictions import PredictionImpossible
 from loaders import load_ratings
 from loaders import load_items
 from constants import Constant as C
 from sklearn.linear_model import LinearRegression
 ```
 %% Output
    The autoreload extension is already loaded. To reload it, use:
      %reload_ext autoreload
 %% Cell type:markdown id:a42c16bf tags:
 # Explore and select content features
 %% Cell type:code id:e8378976 tags:
 ``` python
 df_items = load_items()  # item table; the displayed feature frame shows it is indexed by movieId
 df_ratings = load_ratings()  # ratings table, kept around for feature exploration
 # Example 1: a single numeric feature per item -- the character count of its title
 df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')
 display(df_features.head())
+df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)  # raw tags file read from the content folder
+df_features = df_tag[C.TAG]  # NOTE(review): rebinds df_features to the raw tag column (a row-indexed Series, not one row per item)
+display(df_features.head())
 # (explore other candidate features here)
 ```
 %% Output
 %% Cell type:markdown id:a2c9a2b6 tags:
 # Build a content-based model
 When ready, move the following class into the *models.py* script.
 %% Cell type:code id:16b0a602 tags:
 ``` python
 class ContentBased(AlgoBase):
     """Content-based rating predictor built on Surprise's ``AlgoBase``.
     Item features are computed once, at construction time, by
     ``create_content_features``; ``fit`` learns one profile per user and
     ``estimate`` scores a (user, item) pair from that profile.
     Args:
         features_method: name of the feature recipe, or ``None`` for no features.
             Supported: "title_length", "movie_year", "genres", "rating",
             "tags", "tags_length", "timestamp".
         regressor_method: "random_score", "random_sample" or "linear_regression".
     """
     def __init__(self, features_method, regressor_method):
         AlgoBase.__init__(self)
         self.regressor_method = regressor_method
         self.content_features = self.create_content_features(features_method)
     def create_content_features(self, features_method):
         """Content Analyzer: build the item-feature table.
         Every recipe returns a numeric DataFrame with one row per item, indexed
         like ``load_items()``, so that ``fit`` can merge it on the raw item id
         and ``estimate`` can look an item up with ``.loc``.
         Returns:
             The feature DataFrame, or ``None`` when ``features_method`` is ``None``.
         Raises:
             NotImplementedError: if ``features_method`` is not a known recipe.
         """
         df_items = load_items()
         if features_method is None:
             return None
         if features_method == "title_length": # a naive method that creates only 1 feature based on title length
             df_features = df_items[C.LABEL_COL].apply(len).to_frame('n_character_title')
         elif features_method == "movie_year":
             # First 4-digit group in parentheses, e.g. "Toy Story (1995)" -> 1995.
             # Converted to numbers so a regressor can use it; titles without a year give NaN.
             year = df_items[C.LABEL_COL].str.extract(r'\((\d{4})\)', expand=False)
             df_features = pd.to_numeric(year).to_frame('movie_year')
         elif features_method == "genres":
             # One 0/1 column per genre. get_dummies splits on the literal '|', so genre
             # names are never interpreted as regular expressions.
             df_features = df_items[C.GENRES_COL].str.get_dummies(sep='|')
         elif features_method == "rating":
             # One mean rating per movie (not one value per rating row).
             # Loaded here so the class does not depend on notebook globals.
             avg_rating = load_ratings().groupby('movieId')['rating'].mean()
             df_features = avg_rating.reindex(df_items.index).to_frame('avg_rating')
         elif features_method in ("tags", "tags_length"):
             # NOTE(review): assumes tags.csv has a 'movieId' column -- confirm against the file.
             df_tags = pd.read_csv(C.CONTENT_PATH / C.TAGS_FILENAME).dropna(subset=[C.TAG])
             pieces = df_tags[C.TAG].astype(str).str.split(',')
             if features_method == "tags":
                 # number of comma-separated tags in each row
                 per_row, column = pieces.apply(len), 'n_tags'
             else:
                 # total characters of the comma-separated tags in each row
                 per_row, column = pieces.apply(lambda ps: sum(len(p) for p in ps)), 'tags_length'
             # Aggregate tag rows to one value per movie; untagged movies get 0.
             per_item = per_row.groupby(df_tags['movieId']).sum()
             df_features = per_item.reindex(df_items.index, fill_value=0).to_frame(column)
         elif features_method == "timestamp":
             # Cyclical (sin, cos) encoding of the rating timestamp modulo 86400 seconds,
             # averaged per movie. Both columns are kept and the ratings table is not mutated.
             # NOTE(review): assumes load_ratings() exposes a 'timestamp' column -- confirm.
             df_ratings = load_ratings()
             angle = 2 * np.pi * df_ratings['timestamp'] / 86400
             cyclical = pd.DataFrame({
                 'movieId': df_ratings['movieId'],
                 'timestamp_sin': np.sin(angle),
                 'timestamp_cos': np.cos(angle),
             })
             df_features = cyclical.groupby('movieId').mean().reindex(df_items.index)
         else: # (implement other feature creations here)
             raise NotImplementedError(f'Feature method {features_method} not yet implemented')
         return df_features
     def fit(self, trainset):
         """Profile Learner: build ``self.user_profile`` for every user of ``trainset``.
         - 'random_score': stores one uniform draw in [0.5, 5] per user.
         - 'random_sample': stores the list of the user's ratings.
         - 'linear_regression': fits a no-intercept ``LinearRegression`` of the user's
           ratings on all columns of ``self.content_features``; rated items with
           missing features are ignored, and a user left with no usable item keeps
           a ``None`` profile.
         Returns:
             ``self``, following the Surprise ``fit`` convention.
         """
         AlgoBase.fit(self, trainset)
         # Preallocate user profiles
         self.user_profile = {u: None for u in trainset.all_users()}
         if self.regressor_method == 'random_score':
             for u in self.user_profile:
                 self.user_profile[u] = rd.uniform(0.5, 5)
         elif self.regressor_method == 'random_sample':
             for u in self.user_profile:
                 self.user_profile[u] = [rating for _, rating in trainset.ur[u]]
         elif self.regressor_method == 'linear_regression':
             # Use every feature column, not only the title-length one.
             feature_cols = list(self.content_features.columns)
             for u in self.user_profile:
                 user_ratings = [rating for _, rating in trainset.ur[u]]
                 item_ids = [iid for iid, _ in trainset.ur[u]]
                 df_user = pd.DataFrame({'item_id': item_ids, 'user_ratings': user_ratings})
                 # The feature table is indexed by raw ids, so convert inner ids first.
                 df_user["item_id"] = df_user["item_id"].map(trainset.to_raw_iid)
                 df_user = df_user.merge(self.content_features, left_on="item_id", right_index=True, how='left')
                 # The regressor cannot handle NaN: drop items whose features are missing.
                 df_user = df_user.dropna(subset=feature_cols)
                 if df_user.empty:
                     continue
                 X = df_user[feature_cols].values
                 y = df_user['user_ratings'].values
                 linear_regressor = LinearRegression(fit_intercept=False)
                 linear_regressor.fit(X, y)
                 # Store the computed user profile
                 self.user_profile[u] = linear_regressor
         else:
             pass
             # (implement here the regressor fitting)
         return self
     def estimate(self, u, i):
         """Scoring component used for item filtering.
         Args:
             u: inner user id.
             i: inner item id.
         Returns:
             The estimated rating (``None`` for an unimplemented regressor method).
         Raises:
             PredictionImpossible: if the user or item is unknown to the trainset, or,
                 for 'linear_regression', if the user has no fitted profile or the
                 item has no usable features.
         """
         # First, handle cases for unknown users and items
         if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
             raise PredictionImpossible('User and/or item is unkown.')
         # The module RNG is deliberately not re-seeded here, so a seed set by the
         # caller stays in effect across predictions.
         if self.regressor_method == 'random_score':
             score = rd.uniform(0.5, 5)
         elif self.regressor_method == 'random_sample':
             score = rd.choice(self.user_profile[u])
         elif self.regressor_method == 'linear_regression':
             linear_regressor = self.user_profile[u]
             raw_item_id = self.trainset.to_raw_iid(i)
             if linear_regressor is None or raw_item_id not in self.content_features.index:
                 raise PredictionImpossible('No usable features for this user and/or item.')
             item_features = self.content_features.loc[[raw_item_id]].values
             if pd.isna(item_features).any():
                 raise PredictionImpossible('No usable features for this user and/or item.')
             score = linear_regressor.predict(item_features)[0]
         else:
             score = None
             # (implement here the regressor prediction)
         return score
 ```
 %% Cell type:markdown id:ffd75b7e tags:
 The following script tests the ContentBased class.
 %% Cell type:code id:69d12f7d tags:
 ``` python
 def test_contentbased_class(feature_method, regressor_method):
     """Smoke-test the ContentBased class.
     Fits the model on the full trainset and prints the prediction for the
     first (user, item) pair of the anti-testset.
     """
     full_trainset = load_ratings(surprise_format=True).build_full_trainset()
     model = ContentBased(feature_method, regressor_method)
     model.fit(full_trainset)
     # Only the raw user id and raw item id of the first entry are needed.
     first_entry = full_trainset.build_anti_testset()[0]
     raw_uid, raw_iid = first_entry[0], first_entry[1]
     print(model.predict(raw_uid, raw_iid))
 # Run the smoke test for each feature method with both random regressors
 # (add calls for other regressor methods here).
 test_contentbased_class(feature_method = "title_length" , regressor_method = "random_score")
 test_contentbased_class(feature_method = "title_length" , regressor_method = "random_sample")
+test_contentbased_class(feature_method="movie_year", regressor_method="random_score")
+test_contentbased_class(feature_method="movie_year", regressor_method="random_sample")
+test_contentbased_class(feature_method="genres", regressor_method="random_score")
+test_contentbased_class(feature_method="genres", regressor_method="random_sample")
+test_contentbased_class(feature_method="rating", regressor_method="random_score")
+test_contentbased_class(feature_method="rating", regressor_method="random_sample")
+test_contentbased_class(feature_method="tags", regressor_method="random_score")
+test_contentbased_class(feature_method="tags", regressor_method="random_sample")
+test_contentbased_class(feature_method="tags_length", regressor_method="random_score")
+test_contentbased_class(feature_method="tags_length", regressor_method="random_sample")
+test_contentbased_class(feature_method="timestamp", regressor_method="random_score")  # NOTE(review): the recorded output shows this call raising KeyError: 'timestamp'
+test_contentbased_class(feature_method="timestamp", regressor_method="random_sample")
 ```
 %% Output
-    user: 15         item: 942        r_ui = None   est = 3.79   {'was_impossible': False}
+    user: 11         item: 1214       r_ui = None   est = 0.86   {'was_impossible': False}
-    user: 15         item: 942        r_ui = None   est = 4.00   {'was_impossible': False}
+    user: 11         item: 1214       r_ui = None   est = 1.00   {'was_impossible': False}
+    user: 11         item: 1214       r_ui = None   est = 4.42   {'was_impossible': False}
+    user: 11         item: 1214       r_ui = None   est = 3.00   {'was_impossible': False}
+    user: 11         item: 1214       r_ui = None   est = 4.53   {'was_impossible': False}
+    user: 11         item: 1214       r_ui = None   est = 3.00   {'was_impossible': False}
+    user: 11         item: 1214       r_ui = None   est = 0.72   {'was_impossible': False}
+    user: 11         item: 1214       r_ui = None   est = 4.00   {'was_impossible': False}
+    user: 11         item: 1214       r_ui = None   est = 3.33   {'was_impossible': False}
+    user: 11         item: 1214       r_ui = None   est = 3.00   {'was_impossible': False}
+    ---------------------------------------------------------------------------
+    KeyError                                  Traceback (most recent call last)
+File     /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/indexes/base.py:3791, in Index.get_loc(self, key)
+       3790 try:
+    -> 3791     return self._engine.get_loc(casted_key)
+       3792 except KeyError as err:
+File     index.pyx:152, in pandas._libs.index.IndexEngine.get_loc()
+File     index.pyx:181, in pandas._libs.index.IndexEngine.get_loc()
+File     pandas/_libs/hashtable_class_helper.pxi:7080, in pandas._libs.hashtable.PyObjectHashTable.get_item()
+File     pandas/_libs/hashtable_class_helper.pxi:7088, in pandas._libs.hashtable.PyObjectHashTable.get_item()
+    KeyError: 'timestamp'
+The above exception was the direct cause of the following exception:
+    KeyError                                  Traceback (most recent call last)
+Cell     In[44], line 33
+         30 test_contentbased_class(feature_method="tags_length", regressor_method="random_score")
+         31 test_contentbased_class(feature_method="tags_length", regressor_method="random_sample")
+    ---> 33 test_contentbased_class(feature_method="timestamp", regressor_method="random_score")
+         34 test_contentbased_class(feature_method="timestamp", regressor_method="random_sample")
+Cell     In[44], line 7, in test_contentbased_class(feature_method, regressor_method)
+          5 sp_ratings = load_ratings(surprise_format=True)
+          6 train_set = sp_ratings.build_full_trainset()
+    ----> 7 content_algo = ContentBased(feature_method, regressor_method)
+          8 content_algo.fit(train_set)
+          9 anti_test_set_first = train_set.build_anti_testset()[0]
+Cell     In[43], line 5, in ContentBased.__init__(self, features_method, regressor_method)
+          3 AlgoBase.__init__(self)
+          4 self.regressor_method = regressor_method
+    ----> 5 self.content_features = self.create_content_features(features_method)
+Cell     In[43], line 33, in ContentBased.create_content_features(self, features_method)
+         30      df_features = df_tag['tag'].apply(lambda x: sum(len(tag) for tag in x.split(',')))
+         32 elif features_method == "timestamp" :
+    ---> 33     df_features =  df_items['timestamp_sin'] = np.sin(2 * np.pi * df_items['timestamp'] / 86400)
+         34     df_features =  df_items['timestamp_cos'] = np.cos(2 * np.pi * df_items['timestamp'] / 86400)
+         36 else: # (implement other feature creations here)
+File     /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/frame.py:3893, in DataFrame.__getitem__(self, key)
+       3891 if self.columns.nlevels > 1:
+       3892     return self._getitem_multilevel(key)
+    -> 3893 indexer = self.columns.get_loc(key)
+       3894 if is_integer(indexer):
+       3895     indexer = [indexer]
+File     /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/indexes/base.py:3798, in Index.get_loc(self, key)
+       3793     if isinstance(casted_key, slice) or (
+       3794         isinstance(casted_key, abc.Iterable)
+       3795         and any(isinstance(x, slice) for x in casted_key)
+       3796     ):
+       3797         raise InvalidIndexError(key)
+    -> 3798     raise KeyError(key) from err
+       3799 except TypeError:
+       3800     # If we have a listlike key, _check_indexing_error will raise
+       3801     #  InvalidIndexError. Otherwise we fall through and re-raise
+       3802     #  the TypeError.
+       3803     self._check_indexing_error(key)
+    KeyError: 'timestamp'