Commit c9d719a3 authored by Adrien Payen

update content based

parent 8cdb6fca
%% Cell type:markdown id:82d5ca82 tags:
# Packages
%% Cell type:code id:277473a3 tags:
``` python
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import random as rd
from surprise import AlgoBase
from surprise.prediction_algorithms.predictions import PredictionImpossible
from loaders import load_ratings
from loaders import load_items
from constants import Constant as C
from sklearn.base import clone  # used below so each user gets an independent regressor instance
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
```
%% Output
The autoreload extension is already loaded. To reload it, use:
%reload_ext autoreload
%% Cell type:markdown id:a42c16bf tags:
# Explore and select content features
%% Cell type:code id:e8378976 tags:
``` python
# All the dataframes
df_items = load_items()
df_ratings = load_ratings()
df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)
# df_genome_score = pd.read_csv("data/hackathon/content/genome-scores.csv")
# df_genome_tag = pd.read_csv("data/hackathon/content/genome-tags.csv")

# Example 1: create a title_length feature
df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')
display(df_features.head())

df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)
df_features = df_tag[C.TAG]
display(df_features.head())

# (explore other features here; see the tag-count sketch after the output below)
```
%% Output
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Cell In[16], line 2
1 # All the dataframes
----> 2 df_items = load_items()
3 df_ratings = load_ratings()
4 df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)
File ~/Desktop/Université/Recommender Systems/recomsys/loaders.py:34, in load_items()
28 def load_items():
29 """Loads items data.
30
31 Returns:
32 DataFrame: Items data.
33 """
---> 34 df_items = pd.read_csv(C.CONTENT_PATH / C.ITEMS_FILENAME) # ce qui se trouve dans le movie csv
35 df_items = df_items.set_index(C.ITEM_ID_COL) # movie id
36 return df_items
File ~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
1013 kwds_defaults = _refine_defaults_read(
1014 dialect,
1015 delimiter,
(...)
1022 dtype_backend=dtype_backend,
1023 )
1024 kwds.update(kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)
File ~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/parsers/readers.py:620, in _read(filepath_or_buffer, kwds)
617 _validate_names(kwds.get("names", None))
619 # Create the parser.
--> 620 parser = TextFileReader(filepath_or_buffer, **kwds)
622 if chunksize or iterator:
623 return parser
File ~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1620, in TextFileReader.__init__(self, f, engine, **kwds)
1617 self.options["has_index_names"] = kwds["has_index_names"]
1619 self.handles: IOHandles | None = None
-> 1620 self._engine = self._make_engine(f, self.engine)
File ~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1880, in TextFileReader._make_engine(self, f, engine)
1878 if "b" not in mode:
1879 mode += "b"
-> 1880 self.handles = get_handle(
1881 f,
1882 mode,
1883 encoding=self.options.get("encoding", None),
1884 compression=self.options.get("compression", None),
1885 memory_map=self.options.get("memory_map", False),
1886 is_text=is_text,
1887 errors=self.options.get("encoding_errors", "strict"),
1888 storage_options=self.options.get("storage_options", None),
1889 )
1890 assert self.handles is not None
1891 f = self.handles.handle
File ~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/common.py:873, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
868 elif isinstance(handle, str):
869 # Check whether the filename is to be opened in binary mode.
870 # Binary mode does not support 'encoding' and 'newline'.
871 if ioargs.encoding and "b" not in ioargs.mode:
872 # Encoding
--> 873 handle = open(
874 handle,
875 ioargs.mode,
876 encoding=ioargs.encoding,
877 errors=errors,
878 newline="",
879 )
880 else:
881 # Binary mode
882 handle = open(handle, ioargs.mode)
FileNotFoundError: [Errno 2] No such file or directory: 'data/test/content/movies.csv'
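%% Cell type:markdown tags:
The cell above leaves room for exploring other candidate features. As a minimal sketch (an addition, assuming a MovieLens-style `movieId` column in `df_tag` and the `df_items` index loaded above), one further feature could be the number of tags attached to each movie, aligned with the item index so it can be concatenated with the other feature columns:
%% Cell type:code tags:
``` python
# Hypothetical exploration sketch (not part of the original notebook):
# count how many tags each movie received and align the result with df_items.
df_tag_count = (
    df_tag
    .groupby('movieId')[C.TAG]
    .count()
    .rename('n_tags')
)
df_features = df_tag_count.reindex(df_items.index, fill_value=0).to_frame()
display(df_features.head())
```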
%% Cell type:markdown id:a2c9a2b6 tags:
# Build a content-based model
When ready, move the following class into the *models.py* script.
%% Cell type:code id:16b0a602 tags:
``` python
# ContentBased
class ContentBased(AlgoBase):
    def __init__(self, features_method, regressor_method):
        AlgoBase.__init__(self)
        self.regressor_method = regressor_method
        self.features_methods = features_method
        self.content_features = self.create_content_features(features_method)
        self.user_profile = {}
        self.user_profile_explain = {}

    def create_content_features(self, features_methods):
        """Content Analyzer"""
        df_items = load_items()
        df_ratings = load_ratings()
        df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)
        # Note: the genome files are loaded here but not used by the feature methods below.
        df_genome_score = pd.read_csv("data/hackathon/content/genome-scores.csv")
        df_genome_tag = pd.read_csv("data/hackathon/content/genome-tags.csv")

        df_features = pd.DataFrame(index=df_items.index)
        for method in features_methods:
            if method == "title_length":
                df_title_length = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('title_length')
                df_features = pd.concat([df_features, df_title_length], axis=1)
            elif method == "movie_year":
                df_movie_year = df_items['title'].str.extract(r'\((\d{4})\)', expand=False).to_frame('movie_year')
                df_features = pd.concat([df_features, df_movie_year.astype(float).fillna(0)], axis=1)
            elif method == "genre":
                tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split('|'), token_pattern=None)
                tfidf_matrix = tfidf_vectorizer.fit_transform(df_items['genres'])
                df_tfidf_genres = pd.DataFrame(tfidf_matrix.toarray(), index=df_items.index, columns=tfidf_vectorizer.get_feature_names_out())
                df_features = pd.concat([df_features, df_tfidf_genres], axis=1)
            elif method == "avg_rating":
                df_avg_rating = df_ratings.groupby('movieId')['rating'].mean().to_frame('avg_rating')
                df_features = df_features.join(df_avg_rating, on='movieId')
            else:
                raise NotImplementedError(f'Feature method {method} not yet implemented')

        # Handle missing values in df_features
        df_features.fillna(0, inplace=True)
        return df_features

    def fit(self, trainset):
        """Profile Learner"""
        AlgoBase.fit(self, trainset)

        # Preallocate user profiles
        self.user_profile = {u: None for u in trainset.all_users()}
        self.user_profile_explain = {}

        epsilon = 1e-10  # Small value to prevent division by zero

        for u in trainset.all_users():
            raw_user_id = trainset.to_raw_uid(u)
            self.user_profile_explain[raw_user_id] = {}

            user_ratings = np.array([rating for (_, rating) in trainset.ur[u]])
            item_ids = [iid for (iid, _) in trainset.ur[u]]
            raw_item_ids = [trainset.to_raw_iid(iid) for iid in item_ids]

            feature_values = self.content_features.loc[raw_item_ids].values
            norms = np.linalg.norm(feature_values, axis=0) + epsilon
            weighted_features = feature_values / norms
            feature_importance = weighted_features.T @ user_ratings
            feature_importance /= np.sum(user_ratings)

            self.user_profile_explain[raw_user_id] = dict(zip(self.content_features.columns, feature_importance))

        if self.regressor_method == 'random_score':
            for u in self.user_profile:
                self.user_profile[u] = rd.uniform(0.5, 5)
        elif self.regressor_method == 'random_sample':
            for u in self.user_profile:
                self.user_profile[u] = [rating for (_, rating) in trainset.ur[u]]
        else:
            regressor_models = {
                'linear_regression': LinearRegression(fit_intercept=False),
                'svr_regression': SVR(kernel='rbf', C=10, epsilon=0.2),
                'gradient_boosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),
                'random_forest': RandomForestRegressor(n_estimators=100),
                'lasso_regression': Lasso(alpha=0.1),
                'ridge_regression': Ridge(alpha=1.0),
                'elastic_net': ElasticNet(alpha=1.0, l1_ratio=0.5),
                'knn_regression': KNeighborsRegressor(n_neighbors=1),
                'decision_tree': DecisionTreeRegressor(max_depth=5),
                'adaboost': AdaBoostRegressor(n_estimators=50),
                'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),
                'lightgbm': LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
            }

            if self.regressor_method not in regressor_models:
                raise NotImplementedError(f'Regressor method {self.regressor_method} not yet implemented')

            for u in self.user_profile:
                user_ratings = [rating for (_, rating) in trainset.ur[u]]
                item_ids = [iid for (iid, _) in trainset.ur[u]]
                raw_item_ids = [trainset.to_raw_iid(iid) for iid in item_ids]

                df_user = pd.DataFrame({'item_id': raw_item_ids, 'user_ratings': user_ratings})
                df_user = df_user.merge(self.content_features, left_on="item_id", right_index=True, how='left')

                X = df_user.drop(columns=['item_id', 'user_ratings'])
                y = df_user['user_ratings']

                # Clone so that each user gets an independently fitted regressor
                # (reusing the same instance would leave every profile pointing to the last fit).
                regressor = clone(regressor_models[self.regressor_method])
                regressor.fit(X, y)

                self.user_profile[u] = regressor

    def estimate(self, u, i):
        """Scoring component used for item filtering"""
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown.')

        if self.regressor_method == 'random_score':
            return rd.uniform(0.5, 5)
        elif self.regressor_method == 'random_sample':
            return rd.choice(self.user_profile[u])
        else:
            raw_item_id = self.trainset.to_raw_iid(i)
            item_features = self.content_features.loc[raw_item_id, :].values.reshape(1, -1)
            regressor = self.user_profile[u]
            item_features_df = pd.DataFrame(item_features, columns=self.content_features.columns)
            return regressor.predict(item_features_df)[0]

    def explain(self, u):
        """Return the feature-importance profile of a raw user id, or None if unknown."""
        if u in self.user_profile_explain:
            return self.user_profile_explain[u]
        else:
            return None


# Example usage:
cb = ContentBased(["title_length", "movie_year", "genre", "avg_rating"], "ridge_regression")
surprise_data = load_ratings(surprise_format=True)
trainset = surprise_data.build_full_trainset()
testset = trainset.build_anti_testset()
cb.fit(trainset)

# print("RMSE: ", cb.rmse(testset))

# Example explanations for users:
print(cb.explain(11))
print(cb.explain(13))
print(cb.explain(17))
print(cb.explain(23))
print(cb.explain(27))
print(cb.explain(73))
```
%% Output
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[3], line 147
    145 trainset = surprise_data.build_full_trainset()
    146 testset = trainset.build_anti_testset()
--> 147 cb.fit(trainset)
    150 #print("RMSE: ", cb.rmse(testset))
    151
    152
    153 #Example explanations for users:
    154 print(cb.explain(11))
Cell In[3], line 88, in ContentBased.fit(self, trainset)
     80     self.user_profile[u] = [rating for (_, rating) in trainset.ur[u]]
     82 else:
     83     regressor_models = {
     84         'linear_regression': LinearRegression(fit_intercept=False),
     85         'svr_regression': SVR(kernel='rbf', C=10, epsilon=0.2),
     86         'gradient_boosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),
     87         'random_forest': RandomForestRegressor(n_estimators=100),
---> 88         'lasso_regression': Lasso(alpha=0.1),
     89         'ridge_regression': Ridge(alpha=1.0),
     90         'elastic_net': ElasticNet(alpha=1.0, l1_ratio=0.5),
     91         'knn_regression': KNeighborsRegressor(n_neighbors=1),
     92         'decision_tree': DecisionTreeRegressor(max_depth=5),
     93         'adaboost': AdaBoostRegressor(n_estimators=50),
     94         'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),
     95         'lightgbm': LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
     96     }
     98 if self.regressor_method not in regressor_models:
     99     raise NotImplementedError(f'Regressor method {self.regressor_method} not yet implemented')
NameError: name 'Lasso' is not defined
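%% Cell type:markdown tags:
The commented-out `cb.rmse(testset)` call above would not run as written: Surprise's `AlgoBase` does not provide an `rmse` method, and the anti-test-set carries no true ratings to score against. A minimal evaluation sketch using Surprise's own utilities (an addition, reusing the `ContentBased` class and `load_ratings` from above) could look like this:
%% Cell type:code tags:
``` python
# Evaluation sketch (not part of the original notebook):
# hold out a rating test set, fit the model, and compute RMSE with surprise.accuracy.
from surprise import accuracy
from surprise.model_selection import train_test_split

sp_ratings = load_ratings(surprise_format=True)
eval_trainset, eval_testset = train_test_split(sp_ratings, test_size=0.25)

cb_eval = ContentBased(["title_length", "movie_year", "genre", "avg_rating"], "ridge_regression")
cb_eval.fit(eval_trainset)
predictions = cb_eval.test(eval_testset)
accuracy.rmse(predictions)
```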
%% Cell type:markdown id:ffd75b7e tags:
The following script tests the ContentBased class.
%% Cell type:code id:69d12f7d tags:
``` python
def test_contentbased_class(feature_method, regressor_method):
    """Test the ContentBased class.

    Tries to make a prediction on the first (user, item) tuple of the anti_test_set.
    """
    sp_ratings = load_ratings(surprise_format=True)
    train_set = sp_ratings.build_full_trainset()
    content_algo = ContentBased(feature_method, regressor_method)
    content_algo.fit(train_set)
    anti_test_set_first = train_set.build_anti_testset()[0]
    prediction = content_algo.predict(anti_test_set_first[0], anti_test_set_first[1])
    print(prediction)


test_contentbased_class(["title_length", "movie_year", "genre", "avg_rating"], "ridge_regression")
```
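%% Cell type:markdown tags:
The same class can also be compared across several `regressor_method` choices with Surprise's cross-validation helper. A short sketch (an addition, assuming the regressor names registered in `fit` above):
%% Cell type:code tags:
``` python
# Comparison sketch (not part of the original notebook):
# cross-validate a few regressor choices and report their mean RMSE.
from surprise.model_selection import cross_validate

features = ["title_length", "movie_year", "genre", "avg_rating"]
sp_ratings = load_ratings(surprise_format=True)

for regressor in ["linear_regression", "ridge_regression", "random_forest"]:
    algo = ContentBased(features, regressor)
    results = cross_validate(algo, sp_ratings, measures=["RMSE", "MAE"], cv=3, verbose=False)
    print(regressor, results["test_rmse"].mean())
```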