update files

ad331907 · Adrien Payen · aa44f259 · ad331907 · ad331907
--- a/content_based copy.ipynb
+++ b/content_based copy.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "82d5ca82",
+   "metadata": {},
+   "source": [
+    "# Packages"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "277473a3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The autoreload extension is already loaded. To reload it, use:\n",
+      "  %reload_ext autoreload\n"
+     ]
+    }
+   ],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "\n",
+    "# third parties imports\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import random as rd\n",
+    "from surprise import AlgoBase, SVD\n",
+    "from surprise import PredictionImpossible\n",
+    "\n",
+    "# import local\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from loaders import load_items, load_ratings\n",
+    "from constants import Constant as C\n",
+    "from sklearn.linear_model import LinearRegression\n",
+    "from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor\n",
+    "from sklearn.svm import SVR\n",
+    "\n",
+    "from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet\n",
+    "from sklearn.svm import SVR\n",
+    "from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor\n",
+    "from sklearn.tree import DecisionTreeRegressor\n",
+    "from sklearn.neighbors import KNeighborsRegressor\n",
+    "from xgboost import XGBRegressor\n",
+    "from lightgbm import LGBMRegressor\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a42c16bf",
+   "metadata": {},
+   "source": [
+    "# Explore and select content features"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "e8378976",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>n_character_title</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>movieId</th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>16</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>14</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>23</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>24</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>34</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         n_character_title\n",
+       "movieId                   \n",
+       "1                       16\n",
+       "2                       14\n",
+       "3                       23\n",
+       "4                       24\n",
+       "5                       34"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "0    sandra 'boring' bullock\n",
+       "1                    dentist\n",
+       "2                   Cambodia\n",
+       "3                    Russian\n",
+       "4                forgettable\n",
+       "Name: tag, dtype: object"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# All the dataframes\n",
+    "df_items = load_items()\n",
+    "df_ratings = load_ratings()\n",
+    "df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)\n",
+    "#df_genome_score = pd.read_csv(\"data/hackathon/content/genome-scores.csv\")\n",
+    "# df_genome_tag = pd.read_csv(\"data/hackathon/content/genome-tags.csv\")\n",
+    "\n",
+    "\n",
+    "# Example 1 : create title_length features\n",
+    "df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')\n",
+    "display(df_features.head())\n",
+    "\n",
+    "df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)\n",
+    "df_features = df_tag[C.TAG]\n",
+    "display(df_features.head())\n",
+    "\n",
+    "# (explore here other features)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a2c9a2b6",
+   "metadata": {},
+   "source": [
+    "# Build a content-based model\n",
+    "When ready, move the following class into the *models.py* script"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "16b0a602",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'title_length': 0.1497645139703848, 'movie_year': 0.16218667420100635, '(no genres listed)': 0.0, 'action': 0.09449072815753193, 'adventure': 0.08778978776313201, 'animation': 0.0, 'children': 0.038431411145366176, 'comedy': 0.07268129109348041, 'crime': 0.09469516433772891, 'documentary': 0.0611428358670058, 'drama': 0.10494783392380302, 'fantasy': 0.025806451608591505, 'film-noir': 0.025806451609512046, 'horror': 0.018342712153336858, 'imax': 0.06947533670577526, 'musical': 0.0, 'mystery': 0.06234903350217154, 'romance': 0.036771716124540825, 'sci-fi': 0.059571001735546115, 'thriller': 0.0993122803165238, 'war': 0.04002978709072218, 'western': 0.04547648227079719, 'avg_rating': 0.16263357553020436}\n",
+      "{'title_length': 0.12975573389578626, 'movie_year': 0.13738555574364605, '(no genres listed)': 0.0, 'action': 0.0640388318396414, 'adventure': 0.0827515664964472, 'animation': 0.05686854568650957, 'children': 0.06799492283569505, 'comedy': 0.07354182680364503, 'crime': 0.05543740962624167, 'documentary': 0.0, 'drama': 0.09170589087803577, 'fantasy': 0.061481521263689595, 'film-noir': 0.0, 'horror': 0.015113350123518238, 'imax': 0.04592205020685974, 'musical': 0.03201459126079391, 'mystery': 0.03412706135338736, 'romance': 0.05989121250223656, 'sci-fi': 0.04370793816378273, 'thriller': 0.045800659191095036, 'war': 0.04907194751877139, 'western': 0.027287416762806844, 'avg_rating': 0.13740560847192132}\n",
+      "{'title_length': 0.04702378569892371, 'movie_year': 0.052440003628289225, '(no genres listed)': 0.0, 'action': 0.020439581335728367, 'adventure': 0.015593308332521032, 'animation': 0.004256286923052558, 'children': 0.003520723090188317, 'comedy': 0.018972762464944913, 'crime': 0.028340544273099223, 'documentary': 0.005823989517206729, 'drama': 0.037415345194166824, 'fantasy': 0.013643903080149476, 'film-noir': 0.015390183296279798, 'horror': 0.01926898253629829, 'imax': 0.0014716703456143566, 'musical': 0.0061519348279224124, 'mystery': 0.02847033164163413, 'romance': 0.019827342468818163, 'sci-fi': 0.022573488552024915, 'thriller': 0.03522231545147593, 'war': 0.010339617301415098, 'western': 0.005663885036293055, 'avg_rating': 0.05327750989412312}\n",
+      "{'title_length': 0.033402138126294736, 'movie_year': 0.03710065977291947, '(no genres listed)': 0.0, 'action': 0.014528522669579273, 'adventure': 0.013963913494241694, 'animation': 0.005764814103226412, 'children': 0.006513197483932152, 'comedy': 0.017763201411495646, 'crime': 0.016002513666599556, 'documentary': 0.004292962983778595, 'drama': 0.027458210593047847, 'fantasy': 0.009302633945770895, 'film-noir': 0.006823368830454359, 'horror': 0.007391689869010394, 'imax': 0.004855154663168369, 'musical': 0.0058909467772061425, 'mystery': 0.012191560732760487, 'romance': 0.01723631022081761, 'sci-fi': 0.010817269433255231, 'thriller': 0.01658593988724716, 'war': 0.010193212979882352, 'western': 0.0052038255339472966, 'avg_rating': 0.03742403427834079}\n",
+      "{'title_length': 0.20154225634108316, 'movie_year': 0.20848962267389695, '(no genres listed)': 0.0, 'action': 0.04545454544645529, 'adventure': 0.04545454544730129, 'animation': 0.0, 'children': 0.0, 'comedy': 0.07177284969293253, 'crime': 0.1145252645738102, 'documentary': 0.0, 'drama': 0.16778172557550536, 'fantasy': 0.0, 'film-noir': 0.0, 'horror': 0.06315936177961773, 'imax': 0.0, 'musical': 0.0, 'mystery': 0.08510520557533159, 'romance': 0.09754755529442835, 'sci-fi': 0.045454545449454146, 'thriller': 0.12542163704872258, 'war': 0.08035304331050673, 'western': 0.0, 'avg_rating': 0.21152969571139305}\n",
+      "{'title_length': 0.021927486954368552, 'movie_year': 0.02488786702116846, '(no genres listed)': 0.0007363092498113207, 'action': 0.013836432470735639, 'adventure': 0.011610617815573265, 'animation': 0.007520799115717832, 'children': 0.006287966766754299, 'comedy': 0.012951125615087338, 'crime': 0.011084119744598393, 'documentary': 0.0018287715645832062, 'drama': 0.015221252640276463, 'fantasy': 0.008631010164284143, 'film-noir': 0.0024629052522566544, 'horror': 0.008816299251739122, 'imax': 0.005347204099216887, 'musical': 0.0038827346462235236, 'mystery': 0.0068652812039576095, 'romance': 0.008086664541950757, 'sci-fi': 0.010304269379559203, 'thriller': 0.013200133984104478, 'war': 0.005127335699821772, 'western': 0.0036215200349232765, 'avg_rating': 0.025470698706944836}\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "# ContentBased\n",
+    "class ContentBased(AlgoBase):\n",
+    "    def __init__(self, features_method, regressor_method):\n",
+    "        AlgoBase.__init__(self)\n",
+    "        self.regressor_method = regressor_method\n",
+    "        self.features_methods = features_method\n",
+    "        self.content_features = self.create_content_features(features_method)\n",
+    "        self.user_profile = {}\n",
+    "        self.user_profile_explain = {}\n",
+    "\n",
+    "    def create_content_features(self, features_methods):\n",
+    "        \"\"\"Content Analyzer\"\"\"\n",
+    "        df_items = load_items()\n",
+    "        df_ratings = load_ratings()\n",
+    "        df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)\n",
+    "        df_genome_score = pd.read_csv(\"data/hackathon/content/genome-scores.csv\")\n",
+    "        df_genome_tag = pd.read_csv(\"data/hackathon/content/genome-tags.csv\")\n",
+    "\n",
+    "        df_features = pd.DataFrame(index=df_items.index)\n",
+    "\n",
+    "        for method in features_methods:\n",
+    "            if method == \"title_length\":\n",
+    "                df_title_length = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('title_length')\n",
+    "                df_features = pd.concat([df_features, df_title_length], axis=1)\n",
+    "            \n",
+    "            elif method == \"movie_year\":\n",
+    "                df_movie_year = df_items['title'].str.extract(r'\\((\\d{4})\\)', expand=False).to_frame('movie_year')\n",
+    "                df_features = pd.concat([df_features, df_movie_year.astype(float).fillna(0)], axis=1)\n",
+    "            \n",
+    "            elif method == \"genre\":\n",
+    "                tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split('|'), token_pattern=None)\n",
+    "                tfidf_matrix = tfidf_vectorizer.fit_transform(df_items['genres'])\n",
+    "                df_tfidf_genres = pd.DataFrame(tfidf_matrix.toarray(), index=df_items.index, columns=tfidf_vectorizer.get_feature_names_out())\n",
+    "                df_features = pd.concat([df_features, df_tfidf_genres], axis=1)\n",
+    "\n",
+    "            elif method == \"avg_rating\":\n",
+    "                df_avg_rating = df_ratings.groupby('movieId')['rating'].mean().to_frame('avg_rating')\n",
+    "                df_features = df_features.join(df_avg_rating, on='movieId')\n",
+    "\n",
+    "            else:\n",
+    "                raise NotImplementedError(f'Feature method {method} not yet implemented')\n",
+    "\n",
+    "        # Handle missing values in df_features\n",
+    "        df_features.fillna(0, inplace=True)\n",
+    "\n",
+    "        return df_features\n",
+    "\n",
+    "    def fit(self, trainset):\n",
+    "        \"\"\"Profile Learner\"\"\"\n",
+    "        AlgoBase.fit(self, trainset)\n",
+    "\n",
+    "        # Preallocate user profiles\n",
+    "        self.user_profile = {u: None for u in trainset.all_users()}\n",
+    "        self.user_profile_explain = {}\n",
+    "\n",
+    "        epsilon = 1e-10  # Small value to prevent division by zero\n",
+    "\n",
+    "        for u in trainset.all_users():\n",
+    "            raw_user_id = trainset.to_raw_uid(u)\n",
+    "            self.user_profile_explain[raw_user_id] = {}\n",
+    "\n",
+    "            user_ratings = np.array([rating for (_, rating) in trainset.ur[u]])\n",
+    "            item_ids = [iid for (iid, _) in trainset.ur[u]]\n",
+    "            raw_item_ids = [trainset.to_raw_iid(iid) for iid in item_ids]\n",
+    "\n",
+    "            feature_values = self.content_features.loc[raw_item_ids].values\n",
+    "            norms = np.linalg.norm(feature_values, axis=0) + epsilon\n",
+    "            weighted_features = feature_values / norms\n",
+    "            feature_importance = weighted_features.T @ user_ratings\n",
+    "            feature_importance /= np.sum(user_ratings)\n",
+    "\n",
+    "            self.user_profile_explain[raw_user_id] = dict(zip(self.content_features.columns, feature_importance))\n",
+    "\n",
+    "        if self.regressor_method == 'random_score':\n",
+    "            for u in self.user_profile:\n",
+    "                self.user_profile[u] = rd.uniform(0.5, 5)\n",
+    "\n",
+    "        elif self.regressor_method == 'random_sample':\n",
+    "            for u in self.user_profile:\n",
+    "                self.user_profile[u] = [rating for (_, rating) in trainset.ur[u]]\n",
+    "\n",
+    "        else:\n",
+    "            regressor_models = {\n",
+    "                'linear_regression': LinearRegression(fit_intercept=False),\n",
+    "                'svr_regression': SVR(kernel='rbf', C=10, epsilon=0.2),\n",
+    "                'gradient_boosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),\n",
+    "                'random_forest': RandomForestRegressor(n_estimators=100),\n",
+    "                'lasso_regression': Lasso(alpha=0.1),\n",
+    "                'ridge_regression': Ridge(alpha=1.0),\n",
+    "                'elastic_net': ElasticNet(alpha=1.0, l1_ratio=0.5),\n",
+    "                'knn_regression': KNeighborsRegressor(n_neighbors=1),\n",
+    "                'decision_tree': DecisionTreeRegressor(max_depth=5),\n",
+    "                'adaboost': AdaBoostRegressor(n_estimators=50),\n",
+    "                'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),\n",
+    "                'lightgbm': LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)\n",
+    "            }\n",
+    "\n",
+    "            if self.regressor_method not in regressor_models:\n",
+    "                raise NotImplementedError(f'Regressor method {self.regressor_method} not yet implemented')\n",
+    "\n",
+    "            for u in self.user_profile:\n",
+    "                user_ratings = [rating for (_, rating) in trainset.ur[u]]\n",
+    "                item_ids = [iid for (iid, _) in trainset.ur[u]]\n",
+    "                raw_item_ids = [trainset.to_raw_iid(iid) for iid in item_ids]\n",
+    "\n",
+    "                df_user = pd.DataFrame({'item_id': raw_item_ids, 'user_ratings': user_ratings})\n",
+    "                df_user = df_user.merge(self.content_features, left_on=\"item_id\", right_index=True, how='left')\n",
+    "\n",
+    "                X = df_user.drop(columns=['item_id', 'user_ratings'])\n",
+    "                y = df_user['user_ratings']\n",
+    "\n",
+    "                regressor = regressor_models[self.regressor_method]\n",
+    "                regressor.fit(X, y)\n",
+    "\n",
+    "                self.user_profile[u] = regressor\n",
+    "\n",
+    "    def estimate(self, u, i):\n",
+    "        \"\"\"Scoring component used for item filtering\"\"\"\n",
+    "        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):\n",
+    "            raise PredictionImpossible('User and/or item is unknown.')\n",
+    "\n",
+    "        if self.regressor_method == 'random_score':\n",
+    "            return rd.uniform(0.5, 5)\n",
+    "\n",
+    "        elif self.regressor_method == 'random_sample':\n",
+    "            return rd.choice(self.user_profile[u])\n",
+    "\n",
+    "        else:\n",
+    "            raw_item_id = self.trainset.to_raw_iid(i)\n",
+    "            item_features = self.content_features.loc[raw_item_id, :].values.reshape(1, -1)\n",
+    "            regressor = self.user_profile[u]\n",
+    "            item_features_df = pd.DataFrame(item_features, columns=self.content_features.columns)\n",
+    "            return regressor.predict(item_features_df)[0]\n",
+    "\n",
+    "    def explain(self, u):\n",
+    "        if u in self.user_profile_explain:\n",
+    "            return self.user_profile_explain[u]\n",
+    "        else:\n",
+    "            return None\n",
+    "\n",
+    "\n",
+    "#Example usage:\n",
+    "cb = ContentBased([\"title_length\", \"movie_year\",\"genre\",\"avg_rating\"], \"ridge_regression\")\n",
+    "surprise_data = load_ratings(surprise_format=True)\n",
+    "trainset = surprise_data.build_full_trainset()\n",
+    "testset = trainset.build_anti_testset()\n",
+    "cb.fit(trainset)\n",
+    "\n",
+    "\n",
+    "#print(\"RMSE: \", cb.rmse(testset))\n",
+    "\n",
+    "\n",
+    "#Example explanations for users:\n",
+    "print(cb.explain(11))\n",
+    "\n",
+    "print(cb.explain(13))\n",
+    "\n",
+    "print(cb.explain(17))\n",
+    "\n",
+    "print(cb.explain(23))\n",
+    "\n",
+    "print(cb.explain(27))\n",
+    "\n",
+    "print(cb.explain(73))\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ffd75b7e",
+   "metadata": {},
+   "source": [
+    "The following script tests the ContentBased class"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "69d12f7d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "user: 1          item: 10         r_ui = None   est = 0.72   {'was_impossible': False}\n"
+     ]
+    }
+   ],
+   "source": [
+    "def test_contentbased_class(feature_method, regressor_method):\n",
+    "    \"\"\"Test the ContentBased class.\n",
+    "    Tries to make a prediction on the first (user,item ) tuple of the anti_test_set\n",
+    "    \"\"\"\n",
+    "    sp_ratings = load_ratings(surprise_format=True)\n",
+    "    train_set = sp_ratings.build_full_trainset()\n",
+    "    content_algo = ContentBased(feature_method, regressor_method)\n",
+    "    content_algo.fit(train_set)\n",
+    "    anti_test_set_first = train_set.build_anti_testset()[0]\n",
+    "    prediction = content_algo.predict(anti_test_set_first[0], anti_test_set_first[1])\n",
+    "    print(prediction)\n",
+    "\n",
+    "test_contentbased_class([\"title_length\", \"movie_year\",\"genre\",\"avg_rating\"], \"ridge_regression\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
+%% Cell type:markdown id:82d5ca82 tags:
+# Packages
+%% Cell type:code id:277473a3 tags:
+``` python
+%load_ext autoreload
+%autoreload 2
+# third parties imports
+import pandas as pd
+import numpy as np
+import random as rd
+from surprise import AlgoBase, SVD
+from surprise import PredictionImpossible
+# import local
+from sklearn.feature_extraction.text import TfidfVectorizer
+from loaders import load_items, load_ratings
+from constants import Constant as C
+from sklearn.linear_model import LinearRegression
+from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
+from sklearn.svm import SVR
+from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
+from sklearn.svm import SVR
+from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor
+from sklearn.tree import DecisionTreeRegressor
+from sklearn.neighbors import KNeighborsRegressor
+from xgboost import XGBRegressor
+from lightgbm import LGBMRegressor
+```
+%% Output
+    The autoreload extension is already loaded. To reload it, use:
+      %reload_ext autoreload
+%% Cell type:markdown id:a42c16bf tags:
+# Explore and select content features
+%% Cell type:code id:e8378976 tags:
+``` python
+# All the dataframes
+df_items = load_items()
+df_ratings = load_ratings()
+df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)
+#df_genome_score = pd.read_csv("data/hackathon/content/genome-scores.csv")
+# df_genome_tag = pd.read_csv("data/hackathon/content/genome-tags.csv")
+# Example 1 : create title_length features
+df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')
+display(df_features.head())
+df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)
+df_features = df_tag[C.TAG]
+display(df_features.head())
+# (explore here other features)
+```
+%% Output
+%% Cell type:markdown id:a2c9a2b6 tags:
+# Build a content-based model
+When ready, move the following class into the *models.py* script
+%% Cell type:code id:16b0a602 tags:
+``` python
+# ContetnBased
+class ContentBased(AlgoBase):
+    def __init__(self, features_method, regressor_method):
+        AlgoBase.__init__(self)
+        self.regressor_method = regressor_method
+        self.features_methods = features_method
+        self.content_features = self.create_content_features(features_method)
+        self.user_profile = {}
+        self.user_profile_explain = {}
+    def create_content_features(self, features_methods):
+        """Content Analyzer"""
+        df_items = load_items()
+        df_ratings = load_ratings()
+        df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)
+        df_genome_score = pd.read_csv("data/hackathon/content/genome-scores.csv")
+        df_genome_tag = pd.read_csv("data/hackathon/content/genome-tags.csv")
+        df_features = pd.DataFrame(index=df_items.index)
+        for method in features_methods:
+            if method == "title_length":
+                df_title_length = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('title_length')
+                df_features = pd.concat([df_features, df_title_length], axis=1)
+            elif method == "movie_year":
+                df_movie_year = df_items['title'].str.extract(r'\((\d{4})\)', expand=False).to_frame('movie_year')
+                df_features = pd.concat([df_features, df_movie_year.astype(float).fillna(0)], axis=1)
+            elif method == "genre":
+                tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split('|'), token_pattern=None)
+                tfidf_matrix = tfidf_vectorizer.fit_transform(df_items['genres'])
+                df_tfidf_genres = pd.DataFrame(tfidf_matrix.toarray(), index=df_items.index, columns=tfidf_vectorizer.get_feature_names_out())
+                df_features = pd.concat([df_features, df_tfidf_genres], axis=1)
+            elif method == "avg_rating":
+                df_avg_rating = df_ratings.groupby('movieId')['rating'].mean().to_frame('avg_rating')
+                df_features = df_features.join(df_avg_rating, on='movieId')
+            else:
+                raise NotImplementedError(f'Feature method {method} not yet implemented')
+        # Handle missing values in df_features
+        df_features.fillna(0, inplace=True)
+        return df_features
+    def fit(self, trainset):
+        """Profile Learner"""
+        AlgoBase.fit(self, trainset)
+        # Preallocate user profiles
+        self.user_profile = {u: None for u in trainset.all_users()}
+        self.user_profile_explain = {}
+        epsilon = 1e-10  # Small value to prevent division by zero
+        for u in trainset.all_users():
+            raw_user_id = trainset.to_raw_uid(u)
+            self.user_profile_explain[raw_user_id] = {}
+            user_ratings = np.array([rating for (_, rating) in trainset.ur[u]])
+            item_ids = [iid for (iid, _) in trainset.ur[u]]
+            raw_item_ids = [trainset.to_raw_iid(iid) for iid in item_ids]
+            feature_values = self.content_features.loc[raw_item_ids].values
+            norms = np.linalg.norm(feature_values, axis=0) + epsilon
+            weighted_features = feature_values / norms
+            feature_importance = weighted_features.T @ user_ratings
+            feature_importance /= np.sum(user_ratings)
+            self.user_profile_explain[raw_user_id] = dict(zip(self.content_features.columns, feature_importance))
+        if self.regressor_method == 'random_score':
+            for u in self.user_profile:
+                self.user_profile[u] = rd.uniform(0.5, 5)
+        elif self.regressor_method == 'random_sample':
+            for u in self.user_profile:
+                self.user_profile[u] = [rating for (_, rating) in trainset.ur[u]]
+        else:
+            regressor_models = {
+                'linear_regression': LinearRegression(fit_intercept=False),
+                'svr_regression': SVR(kernel='rbf', C=10, epsilon=0.2),
+                'gradient_boosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),
+                'random_forest': RandomForestRegressor(n_estimators=100),
+                'lasso_regression': Lasso(alpha=0.1),
+                'ridge_regression': Ridge(alpha=1.0),
+                'elastic_net': ElasticNet(alpha=1.0, l1_ratio=0.5),
+                'knn_regression': KNeighborsRegressor(n_neighbors=1),
+                'decision_tree': DecisionTreeRegressor(max_depth=5),
+                'adaboost': AdaBoostRegressor(n_estimators=50),
+                'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),
+                'lightgbm': LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
+            }
+            if self.regressor_method not in regressor_models:
+                raise NotImplementedError(f'Regressor method {self.regressor_method} not yet implemented')
+            for u in self.user_profile:
+                user_ratings = [rating for (_, rating) in trainset.ur[u]]
+                item_ids = [iid for (iid, _) in trainset.ur[u]]
+                raw_item_ids = [trainset.to_raw_iid(iid) for iid in item_ids]
+                df_user = pd.DataFrame({'item_id': raw_item_ids, 'user_ratings': user_ratings})
+                df_user = df_user.merge(self.content_features, left_on="item_id", right_index=True, how='left')
+                X = df_user.drop(columns=['item_id', 'user_ratings'])
+                y = df_user['user_ratings']
+                regressor = regressor_models[self.regressor_method]
+                regressor.fit(X, y)
+                self.user_profile[u] = regressor
+    def estimate(self, u, i):
+        """Scoring component used for item filtering"""
+        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
+            raise PredictionImpossible('User and/or item is unknown.')
+        if self.regressor_method == 'random_score':
+            return rd.uniform(0.5, 5)
+        elif self.regressor_method == 'random_sample':
+            return rd.choice(self.user_profile[u])
+        else:
+            raw_item_id = self.trainset.to_raw_iid(i)
+            item_features = self.content_features.loc[raw_item_id, :].values.reshape(1, -1)
+            regressor = self.user_profile[u]
+            item_features_df = pd.DataFrame(item_features, columns=self.content_features.columns)
+            return regressor.predict(item_features_df)[0]
+    def explain(self, u):
+        if u in self.user_profile_explain:
+            return self.user_profile_explain[u]
+        else:
+            return None
+#Example usage:
+cb = ContentBased(["title_length", "movie_year","genre","avg_rating"], "ridge_regression")
+surprise_data = load_ratings(surprise_format=True)
+trainset = surprise_data.build_full_trainset()
+testset = trainset.build_anti_testset()
+cb.fit(trainset)
+#print("RMSE: ", cb.rmse(testset))
+#Example explanations for users:
+print(cb.explain(11))
+print(cb.explain(13))
+print(cb.explain(17))
+print(cb.explain(23))
+print(cb.explain(27))
+print(cb.explain(73))
+```
+%% Output
+    {'title_length': 0.1497645139703848, 'movie_year': 0.16218667420100635, '(no genres listed)': 0.0, 'action': 0.09449072815753193, 'adventure': 0.08778978776313201, 'animation': 0.0, 'children': 0.038431411145366176, 'comedy': 0.07268129109348041, 'crime': 0.09469516433772891, 'documentary': 0.0611428358670058, 'drama': 0.10494783392380302, 'fantasy': 0.025806451608591505, 'film-noir': 0.025806451609512046, 'horror': 0.018342712153336858, 'imax': 0.06947533670577526, 'musical': 0.0, 'mystery': 0.06234903350217154, 'romance': 0.036771716124540825, 'sci-fi': 0.059571001735546115, 'thriller': 0.0993122803165238, 'war': 0.04002978709072218, 'western': 0.04547648227079719, 'avg_rating': 0.16263357553020436}
+    {'title_length': 0.12975573389578626, 'movie_year': 0.13738555574364605, '(no genres listed)': 0.0, 'action': 0.0640388318396414, 'adventure': 0.0827515664964472, 'animation': 0.05686854568650957, 'children': 0.06799492283569505, 'comedy': 0.07354182680364503, 'crime': 0.05543740962624167, 'documentary': 0.0, 'drama': 0.09170589087803577, 'fantasy': 0.061481521263689595, 'film-noir': 0.0, 'horror': 0.015113350123518238, 'imax': 0.04592205020685974, 'musical': 0.03201459126079391, 'mystery': 0.03412706135338736, 'romance': 0.05989121250223656, 'sci-fi': 0.04370793816378273, 'thriller': 0.045800659191095036, 'war': 0.04907194751877139, 'western': 0.027287416762806844, 'avg_rating': 0.13740560847192132}
+    {'title_length': 0.04702378569892371, 'movie_year': 0.052440003628289225, '(no genres listed)': 0.0, 'action': 0.020439581335728367, 'adventure': 0.015593308332521032, 'animation': 0.004256286923052558, 'children': 0.003520723090188317, 'comedy': 0.018972762464944913, 'crime': 0.028340544273099223, 'documentary': 0.005823989517206729, 'drama': 0.037415345194166824, 'fantasy': 0.013643903080149476, 'film-noir': 0.015390183296279798, 'horror': 0.01926898253629829, 'imax': 0.0014716703456143566, 'musical': 0.0061519348279224124, 'mystery': 0.02847033164163413, 'romance': 0.019827342468818163, 'sci-fi': 0.022573488552024915, 'thriller': 0.03522231545147593, 'war': 0.010339617301415098, 'western': 0.005663885036293055, 'avg_rating': 0.05327750989412312}
+    {'title_length': 0.033402138126294736, 'movie_year': 0.03710065977291947, '(no genres listed)': 0.0, 'action': 0.014528522669579273, 'adventure': 0.013963913494241694, 'animation': 0.005764814103226412, 'children': 0.006513197483932152, 'comedy': 0.017763201411495646, 'crime': 0.016002513666599556, 'documentary': 0.004292962983778595, 'drama': 0.027458210593047847, 'fantasy': 0.009302633945770895, 'film-noir': 0.006823368830454359, 'horror': 0.007391689869010394, 'imax': 0.004855154663168369, 'musical': 0.0058909467772061425, 'mystery': 0.012191560732760487, 'romance': 0.01723631022081761, 'sci-fi': 0.010817269433255231, 'thriller': 0.01658593988724716, 'war': 0.010193212979882352, 'western': 0.0052038255339472966, 'avg_rating': 0.03742403427834079}
+    {'title_length': 0.20154225634108316, 'movie_year': 0.20848962267389695, '(no genres listed)': 0.0, 'action': 0.04545454544645529, 'adventure': 0.04545454544730129, 'animation': 0.0, 'children': 0.0, 'comedy': 0.07177284969293253, 'crime': 0.1145252645738102, 'documentary': 0.0, 'drama': 0.16778172557550536, 'fantasy': 0.0, 'film-noir': 0.0, 'horror': 0.06315936177961773, 'imax': 0.0, 'musical': 0.0, 'mystery': 0.08510520557533159, 'romance': 0.09754755529442835, 'sci-fi': 0.045454545449454146, 'thriller': 0.12542163704872258, 'war': 0.08035304331050673, 'western': 0.0, 'avg_rating': 0.21152969571139305}
+    {'title_length': 0.021927486954368552, 'movie_year': 0.02488786702116846, '(no genres listed)': 0.0007363092498113207, 'action': 0.013836432470735639, 'adventure': 0.011610617815573265, 'animation': 0.007520799115717832, 'children': 0.006287966766754299, 'comedy': 0.012951125615087338, 'crime': 0.011084119744598393, 'documentary': 0.0018287715645832062, 'drama': 0.015221252640276463, 'fantasy': 0.008631010164284143, 'film-noir': 0.0024629052522566544, 'horror': 0.008816299251739122, 'imax': 0.005347204099216887, 'musical': 0.0038827346462235236, 'mystery': 0.0068652812039576095, 'romance': 0.008086664541950757, 'sci-fi': 0.010304269379559203, 'thriller': 0.013200133984104478, 'war': 0.005127335699821772, 'western': 0.0036215200349232765, 'avg_rating': 0.025470698706944836}
+%% Cell type:markdown id:ffd75b7e tags:
+The following script tests the ContentBased class
+%% Cell type:code id:69d12f7d tags:
+``` python
+def test_contentbased_class(feature_method, regressor_method):
+    """Test the ContentBased class.
+    Tries to make a prediction on the first (user,item ) tuple of the anti_test_set
+    """
+    sp_ratings = load_ratings(surprise_format=True)
+    train_set = sp_ratings.build_full_trainset()
+    content_algo = ContentBased(feature_method, regressor_method)
+    content_algo.fit(train_set)
+    anti_test_set_first = train_set.build_anti_testset()[0]
+    prediction = content_algo.predict(anti_test_set_first[0], anti_test_set_first[1])
+    print(prediction)
+test_contentbased_class(["title_length", "movie_year","genre","avg_rating"], "ridge_regression")
+```
+%% Output
+    user: 1          item: 10         r_ui = None   est = 0.72   {'was_impossible': False}
--- a/recommender.py
+++ b/recommender.py
@@ -683,9 +683,8 @@ def compare_similarity_measures(trainset,testset):
    results['KNN_Pearson_RMSE'] = rmse_pearson
    results['KNN_Pearson_MAE'] = mae_pearson
    # Train and evaluate KNN model with Jaccard similarity
-    sim_options_jaccard = {'name': 'jaccard','user_based': True}
+    sim_options_jaccard = {'name': '','user_based': True}
    user_based_jaccard = KNNWithMeans(sim_options=sim_options_jaccard)
    user_based_jaccard.fit(trainset)
    predictions_jaccard = user_based_jaccard.test(testset)
@@ -771,33 +770,33 @@ def evaluate_inter_user_diversity(user_based_model, ratings_path, other_user_bas
    inter_user_diversity_scores['UserBased'] = user_based_model.inter_user_diversity(all_top_n_recommendations_ub)
-    # # #KNN model
+    # #KNN model
-    # knn_model = RecommenderSystem_KNN(ratings_path)
+    knn_model = RecommenderSystem_KNN(ratings_path)
-    # knn_model.train_knn_model()
+    knn_model.train_knn_model()
-    # all_top_n_recommendations_knn = {}
+    all_top_n_recommendations_knn = {}
-    # for user_id in range(knn_model.trainset.n_users):
+    for user_id in range(knn_model.trainset.n_users):
-    #     try:
+        try:
-    #         trainset_user_id = knn_model.trainset.to_raw_uid(user_id)
+            trainset_user_id = knn_model.trainset.to_raw_uid(user_id)
-    #         top_n_recommendations_knn = knn_model.get_top_n_recommendations(trainset_user_id, n=10)
+            top_n_recommendations_knn = knn_model.get_top_n_recommendations(trainset_user_id, n=10)
-    #         all_top_n_recommendations_knn[trainset_user_id] = top_n_recommendations_knn
+            all_top_n_recommendations_knn[trainset_user_id] = top_n_recommendations_knn
-    #     except ValueError:
+        except ValueError:
-    #         print(f"User {trainset_user_id} is not part of the training set for KNN model. Skipping...")
+            print(f"User {trainset_user_id} is not part of the training set for KNN model. Skipping...")
-    # inter_user_diversity_scores['KNN'] = knn_model.inter_user_diversity(all_top_n_recommendations_knn)
+    inter_user_diversity_scores['KNN'] = knn_model.inter_user_diversity(all_top_n_recommendations_knn)
-    # # Other user-based models
+    # Other user-based models
-    # for other_model in other_user_based_models:
+    for other_model in other_user_based_models:
-    #     other_model.load_model()
+        other_model.load_model()
-    #     all_top_n_recommendations_other = {}
+        all_top_n_recommendations_other = {}
-    #     # Get predictions for all users in the test set
+        # Get predictions for all users in the test set
-    #     all_user_ids = set(user for user, _, _ in testset)
+        all_user_ids = set(user for user, _, _ in testset)
-    #     for user_id in all_user_ids:
+        for user_id in all_user_ids:
-    #         other_model.user_id = user_id  # Update the user ID for the model
+            other_model.user_id = user_id  # Update the user ID for the model
-    #         top_n_predictions = other_model.get_top_n_predictions_for_user(ratings_path, n=10)
+            top_n_predictions = other_model.get_top_n_predictions_for_user(ratings_path, n=10)
-    #         all_top_n_recommendations_other[user_id] = top_n_predictions
+            all_top_n_recommendations_other[user_id] = top_n_predictions
-    #     inter_user_diversity_scores[f'Other_{other_model.user_name}'] = other_model.inter_user_diversity(all_top_n_recommendations_other)
+        inter_user_diversity_scores[f'Other_{other_model.user_name}'] = other_model.inter_user_diversity(all_top_n_recommendations_other)
    return inter_user_diversity_scores
@@ -983,8 +982,7 @@ class ContentBased(AlgoBase):
                'knn_regression': KNeighborsRegressor(n_neighbors=1),
                'decision_tree': DecisionTreeRegressor(max_depth=5),
                'adaboost': AdaBoostRegressor(n_estimators=50),
-                'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),
+                'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
-                'lightgbm': LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
            }
            if self.regressor_method not in regressor_models: