diff --git a/configs.py b/configs.py index 2fdbd42033f60f409b2029ee4868980916704a28..c0f669fba58487f879da2fa769b4afd7a276846f 100644 --- a/configs.py +++ b/configs.py @@ -8,32 +8,80 @@ class EvalConfig: # List of models to evaluate, each tuple containing model_name, model class, and model parameters (dict) + models = [ ("baseline_1", ModelBaseline1, {}), ("baseline_2", ModelBaseline2, {}), ("baseline_3", ModelBaseline3, {}), ("baseline_4", ModelBaseline4, {}), + + ("relevance_ContentBased_sample", ContentBased, {"features_method" : "relevance", "regressor_method" : "random_sample"}), + ("relevance_ContentBased_score", ContentBased, {"features_method" : "relevance", "regressor_method" : "random_score"}), + ("relevance_ContentBased_Lr", ContentBased, {"features_method" : "relevance", "regressor_method" : "linear_regression"}), + ("relevance_ContentBased_Svr", ContentBased, {"features_method" : "relevance", "regressor_method" : "svr_regression"}), + ("relevance_ContentBased_Gb", ContentBased, {"features_method" : "relevance", "regressor_method" : "gradient_boosting"}), + ("relevance_ContentBased_Rf", ContentBased, {"features_method" : "relevance", "regressor_method" : "random_forest"}), + + ("combination_ContentBased_sample", ContentBased, {"features_method" : "combination", "regressor_method" : "random_sample"}), + ("combination_ContentBased_score", ContentBased, {"features_method" : "combination", "regressor_method" : "random_score"}), + ("combination_ContentBased_Lr", ContentBased, {"features_method" : "combination", "regressor_method" : "linear_regression"}), + ("combination_ContentBased_Svr", ContentBased, {"features_method" : "combination", "regressor_method" : "svr_regression"}), + ("combination_ContentBased_Gb", ContentBased, {"features_method" : "combination", "regressor_method" : "gradient_boosting"}), + ("combination_ContentBased_Rf", ContentBased, {"features_method" : "combination", "regressor_method" : "random_forest"}), + ("title_length_ContentBased_sample", ContentBased, {"features_method" : "title_length", "regressor_method" : "random_sample"}), ("title_length_ContentBased_score", ContentBased, {"features_method" : "title_length", "regressor_method" : "random_score"}), ("title_length_ContentBased_Lr", ContentBased, {"features_method" : "title_length", "regressor_method" : "linear_regression"}), + ("title_length_ContentBased_Svr", ContentBased, {"features_method" : "title_length", "regressor_method" : "svr_regression"}), + ("title_length_ContentBased_Gb", ContentBased, {"features_method" : "title_length", "regressor_method" : "gradient_boosting"}), + ("title_length_ContentBased_Rf", ContentBased, {"features_method" : "title_length", "regressor_method" : "random_forest"}), + ("movie_year_ContentBased_sample", ContentBased, {"features_method" : "movie_year", "regressor_method" : "random_sample"}), ("movie_year_ContentBased_score", ContentBased, {"features_method" : "movie_year", "regressor_method" : "random_score"}), - #("movie_year_ContentBased_Lr", ContentBased, {"features_method" : "movie_year", "regressor_method" : "linear_regression"}) + #("movie_year_ContentBased_Lr", ContentBased, {"features_method" : "movie_year", "regressor_method" : "linear_regression"}), + #("movie_year_ContentBased_Svr", ContentBased, {"features_method" : "movie_year", "regressor_method" : "svr_regression"}), + #("movie_year_ContentBased_Gb", ContentBased, {"features_method" : "movie_year", "regressor_method" : "gradient_boosting"}), + #("movie_year_ContentBased_Rf", ContentBased, 
{"features_method" : "movie_year", "regressor_method" : "random_forest"}), + + ("genres_ContentBased_sample", ContentBased, {"features_method" : "genres", "regressor_method" : "random_sample"}), ("genres_ContentBased_score", ContentBased, {"features_method" : "genres", "regressor_method" : "random_score"}), #("genres_ContentBased_Lr", ContentBased, {"features_method" : "genres", "regressor_method" : "linear_regression"}), + #("genres_ContentBased_Lr", ContentBased, {"features_method" : "genres", "regressor_method" : "svr_regression"}), + #("genres_ContentBased_Lr", ContentBased, {"features_method" : "genres", "regressor_method" : "gradient_boosting"}), + #("genres_ContentBased_Lr", ContentBased, {"features_method" : "genres", "regressor_method" : "random_forest"}), + ("rating_ContentBased_sample", ContentBased, {"features_method" : "rating", "regressor_method" : "random_sample"}), ("rating_ContentBased_score", ContentBased, {"features_method" : "rating", "regressor_method" : "random_score"}), #("rating_ContentBased_Lr", ContentBased, {"features_method" : "rating", "regressor_method" : "linear_regression"}), + #("rating_ContentBased_Lr", ContentBased, {"features_method" : "rating", "regressor_method" : "svr_regression"}), + #("rating_ContentBased_Lr", ContentBased, {"features_method" : "rating", "regressor_method" : "gradient_boosting"}), + #("rating_ContentBased_Lr", ContentBased, {"features_method" : "rating", "regressor_method" : "random_forest"}), + + ("tags_ContentBased_sample", ContentBased, {"features_method" : "tags", "regressor_method" : "random_sample"}), ("tags_ContentBased_score", ContentBased, {"features_method" : "tags", "regressor_method" : "random_score"}), #("tags_ContentBased_Lr", ContentBased, {"features_method" : "tags", "regressor_method" : "linear_regression"}), + #("tags_ContentBased_Lr", ContentBased, {"features_method" : "tags", "regressor_method" : "svr_regression"}), + #("tags_ContentBased_Lr", ContentBased, {"features_method" : "tags", "regressor_method" : "gradient_boosting"}), + #("tags_ContentBased_Lr", ContentBased, {"features_method" : "tags", "regressor_method" : "random_forest"}), + + + ("tags_length_ContentBased_sample", ContentBased, {"features_method" : "tags_length", "regressor_method" : "random_sample"}), ("tags_length_ContentBased_score", ContentBased, {"features_method" : "tags_length", "regressor_method" : "random_score"}), #("tags_length_ContentBased_Lr", ContentBased, {"features_method" : "tags_length", "regressor_method" : "linear_regression"}), + #("tags_length_ContentBased_Lr", ContentBased, {"features_method" : "tags_length", "regressor_method" : "svr_regression"}), + #("tags_length_ContentBased_Lr", ContentBased, {"features_method" : "tags_length", "regressor_method" : "gradient_boosting"}), + #("tags_length_ContentBased_Lr", ContentBased, {"features_method" : "tags_length", "regressor_method" : "random_forest"}), + + ("timestamp_ContentBased_sample", ContentBased, {"features_method" : "timestamp", "regressor_method" : "random_sample"}), ("timestamp_ContentBased_score", ContentBased, {"features_method" : "timestamp", "regressor_method" : "random_score"}), #("timestamp_ContentBased_Lr", ContentBased, {"features_method" : "timestamp", "regressor_method" : "linear_regression"}) + #("timestamp_ContentBased_Lr", ContentBased, {"features_method" : "timestamp", "regressor_method" : "svr_regression"}) + #("timestamp_ContentBased_Lr", ContentBased, {"features_method" : "timestamp", "regressor_method" : "gradient_boosting"}) + 
#("timestamp_ContentBased_Lr", ContentBased, {"features_method" : "timestamp", "regressor_method" : "random_forest"}) # model_name, model class, model parameters (dict) ] diff --git a/constants.py b/constants.py index c67ccc27c854da09dac88ddf30eb8913c67c3782..e6125570df096d8edd74e9f50c99041779d73b03 100644 --- a/constants.py +++ b/constants.py @@ -6,7 +6,7 @@ class Constant: """Constants for dataset paths and column names.""" - DATA_PATH = Path('data/hackathon') # -- fill here the dataset size to use + DATA_PATH = Path('data/test') # -- fill here the dataset size to use # Content CONTENT_PATH = DATA_PATH / 'content' # Path to content data diff --git a/content_based.ipynb b/content_based.ipynb index 7c12361204ffd09164375fb170cd551108919f56..eebd7d8541ff2a49cfc9ad67ca63598390b2bf39 100644 --- a/content_based.ipynb +++ b/content_based.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 34, "id": "277473a3", "metadata": {}, "outputs": [ @@ -38,6 +38,8 @@ "from constants import Constant as C\n", "\n", "from sklearn.linear_model import LinearRegression\n", + "from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor\n", + "from sklearn.svm import SVR\n", "from sklearn.feature_extraction.text import TfidfVectorizer" ] }, @@ -51,7 +53,7 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 35, "id": "e8378976", "metadata": {}, "outputs": [ @@ -137,8 +139,13 @@ } ], "source": [ + "# All the dataframes\n", "df_items = load_items()\n", "df_ratings = load_ratings()\n", + "df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)\n", + "df_genome_score = pd.read_csv(\"data/hackathon/content/genome-scores.csv\")\n", + "df_genome_tag = pd.read_csv(\"data/hackathon/content/genome-tags.csv\")\n", + "\n", "\n", "# Example 1 : create title_length features\n", "df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')\n", @@ -162,7 +169,7 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 36, "id": "16b0a602", "metadata": {}, "outputs": [], @@ -176,31 +183,48 @@ " def create_content_features(self, features_method):\n", " \"\"\"Content Analyzer\"\"\"\n", " df_items = load_items()\n", + " df_ratings = load_ratings()\n", + " df_tag = df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)\n", + " df_genome_score = pd.read_csv(\"data/hackathon/content/genome-scores.csv\")\n", + " df_genome_tag = pd.read_csv(\"data/hackathon/content/genome-tags.csv\")\n", + "\n", " if features_method is None:\n", " df_features = None\n", + "\n", + " elif features_method == \"relevance\" :\n", + " df_features = df_genome_score.groupby('movieId')[\"relevance\"].transform('mean').to_frame('avg_relevance')\n", + "\n", " elif features_method == \"title_length\": # a naive method that creates only 1 feature based on title length\n", " df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')\n", - "\n", + " \n", " elif features_method == \"movie_year\" :\n", - " df_features = df_items['movie_year'] = df_items['title'].str.extract(r'\\((\\d{4})\\)', expand=False)\n", + " df_features = df_items['movie_year'] = df_items['title'].str.extract(r'\\((\\d{4})\\)', expand=False).to_frame('movie_year')\n", "\n", " elif features_method == \"genres\" :\n", " genres_list = df_items['genres'].str.split('|').explode().unique()\n", " for genre in genres_list:\n", - " df_features = df_items['genres'].str.contains(genre).astype(int)\n", - "\n", + " df_features = 
df_items['genres'].str.contains(genre).astype(int).to_frame('genres')\n", + " \n", + " elif features_method == \"combination\":\n", + " genres_list = df_items['genres'].str.split('|').explode().unique()\n", + " df_movie = df_items['title'].str.extract(r'\\((\\d{4})\\)', expand=False).to_frame('movie_year')\n", + " for genre in genres_list:\n", + " df_genre = df_items['genres'].str.contains(genre).astype(int).to_frame(genre)\n", + " \n", + " df_tag['tag_length'] = df_tag['tag'].apply(lambda x: sum(len(tag) for tag in x.split(',')) if isinstance(x, str) else 0)\n", + " df_features = pd.concat([df_genre, df_tag, df_movie], axis=1)\n", + " \n", " elif features_method == \"rating\" :\n", " df_features = df_ratings.groupby('movieId')['rating'].transform('mean').to_frame('avg_rating')\n", + " \n", "\n", " elif features_method == \"tags\" :\n", - " df_features = df_tag['tag'].apply(lambda x: len(x.split(',')))\n", + " df_features = df_tag['tag'].apply(lambda x: len(x.split(',')) if isinstance(x, str) else 0).to_frame('tags')\n", "\n", " elif features_method == \"tags_length\" :\n", - " df_features = df_tag['tag'].apply(lambda x: sum(len(tag) for tag in x.split(',')))\n", + " \n", + " df_features = df_tag['tag'].apply(lambda x: sum(len(tag) for tag in x.split(','))if isinstance(x, str) else 0).to_frame('n_character_tags')\n", "\n", - " elif features_method == \"timestamp\" :\n", - " df_features = df_ratings['timestamp_sin'] = np.sin(2 * np.pi * df_ratings['timestamp'] / 86400)\n", - " df_features = df_ratings['timestamp_cos'] = np.cos(2 * np.pi * df_ratings['timestamp'] / 86400)\n", "\n", " else: # (implement other feature creations here)\n", " raise NotImplementedError(f'Feature method {features_method} not yet implemented')\n", @@ -221,6 +245,7 @@ " elif self.regressor_method == 'random_sample':\n", " for u in self.user_profile:\n", " self.user_profile[u] = [rating for _, rating in self.trainset.ur[u]]\n", + "\n", " elif self.regressor_method == 'linear_regression' :\n", " for u in self.user_profile:\n", "\n", @@ -232,8 +257,42 @@ " df_user[\"item_id\"] = df_user[\"item_id\"].map(trainset.to_raw_iid)\n", "\n", " df_user = df_user.merge(self.content_features, left_on = \"item_id\", right_index = True, how = 'left')\n", + " \n", + " if 'n_character_title' in df_user.columns:\n", + " # Si 'n_character_title' est disponible comme caractéristique\n", + " X = df_user['n_character_title'].values.reshape(-1, 1)\n", + "\n", + " elif 'avg_relevance' in df_user.columns:\n", + " # Si 'n_character_title' est disponible comme caractéristique\n", + " X = df_user['avg_relevance'].values.reshape(-1, 1)\n", + " \n", + " elif 'movie_year' in df_user.columns:\n", + " # Si 'n_character_title' est disponible comme caractéristique\n", + " X = df_user['movie_year'].values.reshape(-1, 1)\n", + " \n", + " elif 'genres' in df_user.columns:\n", + " # Si 'n_character_title' est disponible comme caractéristique\n", + " X = df_user['genres'].values.reshape(-1, 1)\n", + " \n", + " elif 'combination' in df_user.columns:\n", + " # Si 'n_character_title' est disponible comme caractéristique\n", + " X = df_user['combination'].values.reshape(-1, 1)\n", + " \n", + " elif 'avg_rating' in df_user.columns:\n", + " # Si 'n_character_title' est disponible comme caractéristique\n", + " X = df_user['avg_rating'].values.reshape(-1, 1)\n", + "\n", + " elif 'tags' in df_user.columns:\n", + " # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n", + " X = df_user['tags'].values.reshape(-1, 
1)\n", + "\n", + " elif 'n_character_tags' in df_user.columns:\n", + " # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n", + " X = df_user['n_character_tags'].values.reshape(-1, 1)\n", "\n", - " X = df_user['n_character_title'].values.reshape(-1,1)\n", + " else:\n", + " # Si aucune caractéristique appropriée n'est disponible\n", + " continue # Ou gère le cas d'erreur/exception ici\n", "\n", " y = df_user['user_ratings'].values\n", "\n", @@ -243,6 +302,165 @@ " \n", " # Store the computed user profile\n", " self.user_profile[u] = linear_regressor\n", + "\n", + " elif self.regressor_method == 'svr_regression':\n", + " for u in self.user_profile:\n", + "\n", + " user_ratings = [rating for _, rating in trainset.ur[u]]\n", + " item_ids = [iid for iid, _ in trainset.ur[u]]\n", + "\n", + " df_user = pd.DataFrame({'item_id': item_ids, 'user_ratings': user_ratings})\n", + "\n", + " df_user[\"item_id\"] = df_user[\"item_id\"].map(trainset.to_raw_iid)\n", + "\n", + " df_user = df_user.merge(self.content_features, left_on = \"item_id\", right_index = True, how = 'left')\n", + "\n", + " if 'n_character_title' in df_user.columns:\n", + " # Si 'n_character_title' est disponible comme caractéristique\n", + " X = df_user['n_character_title'].values.reshape(-1, 1)\n", + "\n", + " elif 'avg_relevance' in df_user.columns:\n", + " # Si 'n_character_title' est disponible comme caractéristique\n", + " X = df_user['avg_relevance'].values.reshape(-1, 1)\n", + " \n", + " elif 'movie_year' in df_user.columns:\n", + " # Si 'n_character_title' est disponible comme caractéristique\n", + " X = df_user['movie_year'].values.reshape(-1, 1)\n", + " \n", + " elif 'genres' in df_user.columns:\n", + " # Si 'n_character_title' est disponible comme caractéristique\n", + " X = df_user['genres'].values.reshape(-1, 1)\n", + " \n", + " elif 'combination' in df_user.columns:\n", + " # Si 'n_character_title' est disponible comme caractéristique\n", + " X = df_user['combination'].values.reshape(-1, 1)\n", + " \n", + " elif 'avg_rating' in df_user.columns:\n", + " # Si 'n_character_title' est disponible comme caractéristique\n", + " X = df_user['avg_rating'].values.reshape(-1, 1)\n", + "\n", + " elif 'tags' in df_user.columns:\n", + " # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n", + " X = df_user['tags'].values.reshape(-1, 1)\n", + "\n", + " elif 'n_character_tags' in df_user.columns:\n", + " # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n", + " X = df_user['n_character_tags'].values.reshape(-1, 1)\n", + "\n", + " else:\n", + " # Si aucune caractéristique appropriée n'est disponible\n", + " continue # Ou gère le cas d'erreur/exception ici\n", + " y = df_user['user_ratings'].values\n", + " svr_regressor = SVR(kernel='rbf', C=10, epsilon=0.2)\n", + " svr_regressor.fit(X, y)\n", + " self.user_profile[u] = svr_regressor\n", + "\n", + " elif self.regressor_method == 'gradient_boosting':\n", + " for u in self.user_profile:\n", + "\n", + " user_ratings = [rating for _, rating in trainset.ur[u]]\n", + " item_ids = [iid for iid, _ in trainset.ur[u]]\n", + "\n", + " df_user = pd.DataFrame({'item_id': item_ids, 'user_ratings': user_ratings})\n", + "\n", + " df_user[\"item_id\"] = df_user[\"item_id\"].map(trainset.to_raw_iid)\n", + "\n", + " df_user = df_user.merge(self.content_features, left_on = \"item_id\", right_index = True, how = 'left')\n", + "\n", + " if 
'n_character_title' in df_user.columns:\n", + " # Si 'n_character_title' est disponible comme caractéristique\n", + " X = df_user['n_character_title'].values.reshape(-1, 1)\n", + "\n", + " elif 'avg_relevance' in df_user.columns:\n", + " # Si 'n_character_title' est disponible comme caractéristique\n", + " X = df_user['avg_relevance'].values.reshape(-1, 1)\n", + " \n", + " elif 'movie_year' in df_user.columns:\n", + " # Si 'n_character_title' est disponible comme caractéristique\n", + " X = df_user['movie_year'].values.reshape(-1, 1)\n", + " \n", + " elif 'genres' in df_user.columns:\n", + " # Si 'n_character_title' est disponible comme caractéristique\n", + " X = df_user['genres'].values.reshape(-1, 1)\n", + " \n", + " elif 'combination' in df_user.columns:\n", + " # Si 'n_character_title' est disponible comme caractéristique\n", + " X = df_user['combination'].values.reshape(-1, 1)\n", + " \n", + " elif 'avg_rating' in df_user.columns:\n", + " # Si 'n_character_title' est disponible comme caractéristique\n", + " X = df_user['avg_rating'].values.reshape(-1, 1)\n", + "\n", + " elif 'tags' in df_user.columns:\n", + " # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n", + " X = df_user['tags'].values.reshape(-1, 1)\n", + "\n", + " elif 'n_character_tags' in df_user.columns:\n", + " # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n", + " X = df_user['n_character_tags'].values.reshape(-1, 1)\n", + "\n", + " else:\n", + " # Si aucune caractéristique appropriée n'est disponible\n", + " continue # Ou gère le cas d'erreur/exception ici\n", + " \n", + " y = df_user['user_ratings'].values\n", + " gb_regressor = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)\n", + " gb_regressor.fit(X, y)\n", + " self.user_profile[u] = gb_regressor\n", + "\n", + "\n", + " elif self.regressor_method == 'random_forest':\n", + " for u in self.user_profile:\n", + "\n", + " user_ratings = [rating for _, rating in trainset.ur[u]]\n", + " item_ids = [iid for iid, _ in trainset.ur[u]]\n", + "\n", + " df_user = pd.DataFrame({'item_id': item_ids, 'user_ratings': user_ratings})\n", + "\n", + " df_user[\"item_id\"] = df_user[\"item_id\"].map(trainset.to_raw_iid)\n", + "\n", + " df_user = df_user.merge(self.content_features, left_on = \"item_id\", right_index = True, how = 'left')\n", + "\n", + " if 'n_character_title' in df_user.columns:\n", + " # Si 'n_character_title' est disponible comme caractéristique\n", + " X = df_user['n_character_title'].values.reshape(-1, 1)\n", + "\n", + " elif 'avg_relevance' in df_user.columns:\n", + " # Si 'n_character_title' est disponible comme caractéristique\n", + " X = df_user['avg_relevance'].values.reshape(-1, 1)\n", + " \n", + " elif 'movie_year' in df_user.columns:\n", + " # Si 'n_character_title' est disponible comme caractéristique\n", + " X = df_user['movie_year'].values.reshape(-1, 1)\n", + " \n", + " elif 'genres' in df_user.columns:\n", + " # Si 'n_character_title' est disponible comme caractéristique\n", + " X = df_user['genres'].values.reshape(-1, 1)\n", + " \n", + " elif 'combination' in df_user.columns:\n", + " # Si 'n_character_title' est disponible comme caractéristique\n", + " X = df_user['combination'].values.reshape(-1, 1)\n", + " \n", + " elif 'avg_rating' in df_user.columns:\n", + " # Si 'n_character_title' est disponible comme caractéristique\n", + " X = df_user['avg_rating'].values.reshape(-1, 1)\n", + "\n", + " elif 'tags' in 
df_user.columns:\n", + " # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n", + " X = df_user['tags'].values.reshape(-1, 1)\n", + "\n", + " elif 'n_character_tags' in df_user.columns:\n", + " # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n", + " X = df_user['n_character_tags'].values.reshape(-1, 1)\n", + "\n", + " else:\n", + " # Si aucune caractéristique appropriée n'est disponible\n", + " continue # Ou gère le cas d'erreur/exception ici\n", + " y = df_user['user_ratings'].values\n", + " rf_regressor = RandomForestRegressor(n_estimators=100)\n", + " rf_regressor.fit(X, y)\n", + " self.user_profile[u] = rf_regressor\n", + "\n", " else : \n", " pass\n", "\n", @@ -272,6 +490,34 @@ " linear_regressor = self.user_profile[u]\n", "\n", " score= linear_regressor.predict(item_features)[0]\n", + " \n", + " elif self.regressor_method == 'svr_regression':\n", + "\n", + " raw_item_id = self.trainset.to_raw_iid(i)\n", + "\n", + " item_features = self.content_features.loc[raw_item_id:raw_item_id, :].values\n", + "\n", + " svr_regressor = self.user_profile[u]\n", + " score = svr_regressor.predict(item_features)[0]\n", + " \n", + " elif self.regressor_method == 'gradient_boosting':\n", + "\n", + " raw_item_id = self.trainset.to_raw_iid(i)\n", + "\n", + " item_features = self.content_features.loc[raw_item_id:raw_item_id, :].values\n", + "\n", + " gradient_boosting = self.user_profile[u]\n", + " score = gradient_boosting.predict(item_features)[0]\n", + " \n", + " elif self.regressor_method == 'random_forest':\n", + "\n", + " raw_item_id = self.trainset.to_raw_iid(i)\n", + "\n", + " item_features = self.content_features.loc[raw_item_id:raw_item_id, :].values\n", + "\n", + " randomforest = self.user_profile[u]\n", + " score = randomforest.predict(item_features)[0]\n", + " \n", " else : \n", " score = None\n", "\n", @@ -282,7 +528,7 @@ }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 37, "id": "baab88b7", "metadata": {}, "outputs": [ @@ -587,7 +833,7 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 38, "id": "69d12f7d", "metadata": {}, "outputs": [ @@ -596,39 +842,54 @@ "output_type": "stream", "text": [ "title_length :\n", - "user: 11 item: 1214 r_ui = None est = 1.19 {'was_impossible': False}\n", - "user: 11 item: 1214 r_ui = None est = 4.50 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 3.28 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 1.50 {'was_impossible': False}\n", "user: 11 item: 1214 r_ui = None est = 1.09 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 0.80 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 1.00 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 1.41 {'was_impossible': False}\n", "\n", "\n", "movie_year : \n", - "user: 11 item: 1214 r_ui = None est = 1.24 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 1.67 {'was_impossible': False}\n", "user: 11 item: 1214 r_ui = None est = 4.50 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 3.15 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 1.75 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 1.50 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 1.56 {'was_impossible': False}\n", + "\n", + "\n", + "relevance : \n", + "user: 11 item: 1214 r_ui = None est = 
3.61 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 5.00 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 3.13 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 1.70 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 2.17 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 2.18 {'was_impossible': False}\n", "\n", "\n", "genres : \n", - "user: 11 item: 1214 r_ui = None est = 4.94 {'was_impossible': False}\n", - "user: 11 item: 1214 r_ui = None est = 4.00 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 3.95 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 1.50 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 0.50 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 3.50 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 3.17 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 3.17 {'was_impossible': False}\n", "\n", "\n", "rating : \n", - "user: 11 item: 1214 r_ui = None est = 1.76 {'was_impossible': False}\n", - "user: 11 item: 1214 r_ui = None est = 3.00 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 3.46 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 4.50 {'was_impossible': False}\n", "\n", "\n", "tags : \n", - "user: 11 item: 1214 r_ui = None est = 3.67 {'was_impossible': False}\n", - "user: 11 item: 1214 r_ui = None est = 4.00 {'was_impossible': False}\n", - "\n", - "\n", - "tags_length : \n", - "user: 11 item: 1214 r_ui = None est = 1.73 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 3.47 {'was_impossible': False}\n", "user: 11 item: 1214 r_ui = None est = 1.00 {'was_impossible': False}\n", "\n", "\n", - "timestamp : \n", - "user: 11 item: 1214 r_ui = None est = 3.65 {'was_impossible': False}\n", - "user: 11 item: 1214 r_ui = None est = 5.00 {'was_impossible': False}\n" + "tags_length : \n", + "user: 11 item: 1214 r_ui = None est = 2.29 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 4.00 {'was_impossible': False}\n" ] } ], @@ -645,41 +906,63 @@ " prediction = content_algo.predict(anti_test_set_first[0], anti_test_set_first[1])\n", " print(prediction)\n", "\n", - "# (call here the test functions with different regressor methods)\n", + "\n", + "\n", "print(\"title_length :\")\n", "test_contentbased_class(feature_method = \"title_length\" , regressor_method = \"random_score\")\n", "test_contentbased_class(feature_method = \"title_length\" , regressor_method = \"random_sample\")\n", "test_contentbased_class(feature_method = \"title_length\" , regressor_method = \"linear_regression\")\n", + "test_contentbased_class(feature_method= \"title_length\", regressor_method= \"svr_regression\")\n", + "test_contentbased_class(feature_method= \"title_length\", regressor_method= \"gradient_boosting\")\n", + "test_contentbased_class(feature_method= \"title_length\", regressor_method= \"random_forest\")\n", "print(\"\\n\")\n", "print(\"movie_year : \")\n", - "test_contentbased_class(feature_method=\"movie_year\", regressor_method=\"random_score\")\n", - "test_contentbased_class(feature_method=\"movie_year\", regressor_method=\"random_sample\")\n", - "#test_contentbased_class(feature_method=\"movie_year\", regressor_method=\"linear_regression\")\n", + "test_contentbased_class(feature_method= \"movie_year\", regressor_method= \"random_score\")\n", + 
"test_contentbased_class(feature_method= \"movie_year\", regressor_method= \"random_sample\")\n", + "test_contentbased_class(feature_method= \"movie_year\", regressor_method= \"linear_regression\")\n", + "test_contentbased_class(feature_method= \"movie_year\", regressor_method= \"svr_regression\")\n", + "test_contentbased_class(feature_method= \"movie_year\", regressor_method= \"gradient_boosting\")\n", + "test_contentbased_class(feature_method= \"movie_year\", regressor_method= \"random_forest\")\n", + "print(\"\\n\")\n", + "print(\"relevance : \") \n", + "test_contentbased_class(feature_method= \"relevance\", regressor_method= \"random_score\")\n", + "test_contentbased_class(feature_method= \"relevance\", regressor_method= \"random_sample\")\n", + "test_contentbased_class(feature_method= \"relevance\", regressor_method= \"linear_regression\")\n", + "test_contentbased_class(feature_method= \"relevance\", regressor_method= \"svr_regression\")\n", + "test_contentbased_class(feature_method= \"relevance\", regressor_method= \"gradient_boosting\")\n", + "test_contentbased_class(feature_method= \"relevance\", regressor_method= \"random_forest\")\n", "print(\"\\n\")\n", - "print(\"genres : \")\n", - "test_contentbased_class(feature_method=\"genres\", regressor_method=\"random_score\")\n", - "test_contentbased_class(feature_method=\"genres\", regressor_method=\"random_sample\")\n", - "#test_contentbased_class(feature_method=\"genres\", regressor_method=\"linear_regression\")\n", + "print(\"genres : \") \n", + "test_contentbased_class(feature_method= \"genres\", regressor_method= \"random_score\")\n", + "test_contentbased_class(feature_method= \"genres\", regressor_method= \"random_sample\")\n", + "test_contentbased_class(feature_method= \"genres\", regressor_method= \"linear_regression\")\n", + "test_contentbased_class(feature_method= \"genres\", regressor_method= \"svr_regression\")\n", + "test_contentbased_class(feature_method= \"genres\", regressor_method= \"gradient_boosting\")\n", + "test_contentbased_class(feature_method= \"genres\", regressor_method= \"random_forest\")\n", "print(\"\\n\")\n", "print(\"rating : \")\n", - "test_contentbased_class(feature_method=\"rating\", regressor_method=\"random_score\")\n", - "test_contentbased_class(feature_method=\"rating\", regressor_method=\"random_sample\")\n", - "#test_contentbased_class(feature_method=\"rating\", regressor_method=\"linear_regression\")\n", + "test_contentbased_class(feature_method= \"rating\", regressor_method=\"random_score\")\n", + "test_contentbased_class(feature_method= \"rating\", regressor_method=\"random_sample\")\n", + "#test_contentbased_class(feature_method= \"rating\", regressor_method=\"linear_regression\")\n", + "#test_contentbased_class(feature_method=\"rating\", regressor_method=\"svr_regression\")\n", + "#test_contentbased_class(feature_method=\"rating\", regressor_method=\"gradient_boosting\")\n", + "#test_contentbased_class(feature_method=\"rating\", regressor_method=\"random_forest\")\n", "print(\"\\n\")\n", "print(\"tags : \")\n", "test_contentbased_class(feature_method=\"tags\", regressor_method=\"random_score\")\n", "test_contentbased_class(feature_method=\"tags\", regressor_method=\"random_sample\")\n", "#test_contentbased_class(feature_method=\"tags\", regressor_method=\"linear_regression\")\n", + "# test_contentbased_class(feature_method=\"tags\", regressor_method=\"svr_regression\")\n", + "# test_contentbased_class(feature_method=\"tags\", regressor_method=\"gradient_boosting\")\n", + "# 
test_contentbased_class(feature_method=\"tags\", regressor_method=\"random_forest\")\n", "print(\"\\n\")\n", "print(\"tags_length : \")\n", "test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"random_score\")\n", "test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"random_sample\")\n", - "#test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"linear_regression\")\n", - "print(\"\\n\")\n", - "print(\"timestamp : \")\n", - "test_contentbased_class(feature_method=\"timestamp\", regressor_method=\"random_score\")\n", - "test_contentbased_class(feature_method=\"timestamp\", regressor_method=\"random_sample\")\n", - "#test_contentbased_class(feature_method=\"timestamp\", regressor_method=\"linear_regression\")" + "# test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"linear_regression\")\n", + "# test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"svr_regression\")\n", + "# test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"gradient_boosting\")\n", + "# test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"random_forest\")" ] } ], diff --git a/evaluator.ipynb b/evaluator.ipynb index efef1b7475bc2ec4598c9b8af197c29d5c358977..fac2fd4aac639618a6584cb667d973ec8415c5a4 100644 --- a/evaluator.ipynb +++ b/evaluator.ipynb @@ -13,19 +13,10 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 1, "id": "6aaf9140", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], + "outputs": [], "source": [ "# reloads modules automatically before entering the execution of code\n", "%load_ext autoreload\n", @@ -59,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 2, "id": "d6d82188", "metadata": {}, "outputs": [], @@ -201,7 +192,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 3, "id": "f1849e55", "metadata": {}, "outputs": [], @@ -257,7 +248,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 4, "id": "704f4d2a", "metadata": {}, "outputs": [ @@ -270,309 +261,23 @@ "- computing metric mae\n", "- computing metric rmse\n", "Training loo predictions\n", - "Training full predictions\n", - "Handling model baseline_2\n", - "Training split predictions\n", - "- computing metric mae\n", - "- computing metric rmse\n", - "Training loo predictions\n", - "Training full predictions\n", - "Handling model baseline_3\n", - "Training split predictions\n", - "- computing metric mae\n", - "- computing metric rmse\n", - "Training loo predictions\n", - "Training full predictions\n", - "Handling model baseline_4\n", - "Training split predictions\n", - "- computing metric mae\n", - "- computing metric rmse\n", - "Training loo predictions\n", - "Training full predictions\n", - "Handling model title_length_ContentBased_sample\n", - "Training split predictions\n", - "- computing metric mae\n", - "- computing metric rmse\n", - "Training loo predictions\n", - "Training full predictions\n", - "Handling model title_length_ContentBased_score\n", - "Training split predictions\n", - "- computing metric mae\n", - "- computing metric rmse\n", - "Training loo predictions\n", - "Training full predictions\n", - "Handling model title_length_ContentBased_Lr\n", - "Training split predictions\n", - "- computing metric mae\n", - "- computing metric 
rmse\n", - "Training loo predictions\n", - "Training full predictions\n", - "Handling model movie_year_ContentBased_sample\n", - "Training split predictions\n", - "- computing metric mae\n", - "- computing metric rmse\n", - "Training loo predictions\n", - "Training full predictions\n", - "Handling model movie_year_ContentBased_score\n", - "Training split predictions\n", - "- computing metric mae\n", - "- computing metric rmse\n", - "Training loo predictions\n", - "Training full predictions\n", - "Handling model genres_ContentBased_sample\n", - "Training split predictions\n", - "- computing metric mae\n", - "- computing metric rmse\n", - "Training loo predictions\n", - "Training full predictions\n", - "Handling model genres_ContentBased_score\n", - "Training split predictions\n", - "- computing metric mae\n", - "- computing metric rmse\n", - "Training loo predictions\n", - "Training full predictions\n", - "Handling model rating_ContentBased_sample\n", - "Training split predictions\n", - "- computing metric mae\n", - "- computing metric rmse\n", - "Training loo predictions\n", - "Training full predictions\n", - "Handling model rating_ContentBased_score\n", - "Training split predictions\n", - "- computing metric mae\n", - "- computing metric rmse\n", - "Training loo predictions\n", - "Training full predictions\n", - "Handling model tags_ContentBased_sample\n", - "Training split predictions\n", - "- computing metric mae\n", - "- computing metric rmse\n", - "Training loo predictions\n", - "Training full predictions\n", - "Handling model tags_ContentBased_score\n", - "Training split predictions\n", - "- computing metric mae\n", - "- computing metric rmse\n", - "Training loo predictions\n", - "Training full predictions\n", - "Handling model tags_length_ContentBased_sample\n", - "Training split predictions\n", - "- computing metric mae\n", - "- computing metric rmse\n", - "Training loo predictions\n", - "Training full predictions\n", - "Handling model tags_length_ContentBased_score\n", - "Training split predictions\n", - "- computing metric mae\n", - "- computing metric rmse\n", - "Training loo predictions\n", - "Training full predictions\n", - "Handling model timestamp_ContentBased_sample\n", - "Training split predictions\n", - "- computing metric mae\n", - "- computing metric rmse\n", - "Training loo predictions\n", - "Training full predictions\n", - "Handling model timestamp_ContentBased_score\n", - "Training split predictions\n", - "- computing metric mae\n", - "- computing metric rmse\n", - "Training loo predictions\n", - "Training full predictions\n", - "The data has been exported to the evaluation report\n" + "Training full predictions\n" ] }, { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>mae</th>\n", - " <th>rmse</th>\n", - " <th>hit_rate</th>\n", - " <th>novelty</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>baseline_1</th>\n", - " <td>1.312500</td>\n", - " <td>1.667708</td>\n", - " <td>1.0</td>\n", - " <td>4.0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>baseline_2</th>\n", - " <td>1.315250</td>\n", - " <td>1.572990</td>\n", - " <td>1.0</td>\n", - " 
<td>4.0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>baseline_3</th>\n", - " <td>1.318182</td>\n", - " <td>1.465689</td>\n", - " <td>1.0</td>\n", - " <td>4.0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>baseline_4</th>\n", - " <td>1.363953</td>\n", - " <td>1.523985</td>\n", - " <td>1.0</td>\n", - " <td>4.0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>title_length_ContentBased_sample</th>\n", - " <td>1.375000</td>\n", - " <td>1.750000</td>\n", - " <td>1.0</td>\n", - " <td>4.0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>title_length_ContentBased_score</th>\n", - " <td>1.556280</td>\n", - " <td>2.063469</td>\n", - " <td>1.0</td>\n", - " <td>4.0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>title_length_ContentBased_Lr</th>\n", - " <td>1.625729</td>\n", - " <td>1.773594</td>\n", - " <td>1.0</td>\n", - " <td>4.0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>movie_year_ContentBased_sample</th>\n", - " <td>2.250000</td>\n", - " <td>2.610077</td>\n", - " <td>1.0</td>\n", - " <td>4.0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>movie_year_ContentBased_score</th>\n", - " <td>1.866274</td>\n", - " <td>2.111422</td>\n", - " <td>1.0</td>\n", - " <td>4.0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>genres_ContentBased_sample</th>\n", - " <td>1.875000</td>\n", - " <td>2.271136</td>\n", - " <td>1.0</td>\n", - " <td>4.0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>genres_ContentBased_score</th>\n", - " <td>1.463388</td>\n", - " <td>1.793363</td>\n", - " <td>1.0</td>\n", - " <td>4.0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>rating_ContentBased_sample</th>\n", - " <td>1.289773</td>\n", - " <td>1.715759</td>\n", - " <td>1.0</td>\n", - " <td>4.0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>rating_ContentBased_score</th>\n", - " <td>2.482206</td>\n", - " <td>2.795490</td>\n", - " <td>1.0</td>\n", - " <td>4.0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>tags_ContentBased_sample</th>\n", - " <td>1.937500</td>\n", - " <td>2.128673</td>\n", - " <td>0.5</td>\n", - " <td>4.0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>tags_ContentBased_score</th>\n", - " <td>1.683499</td>\n", - " <td>1.782805</td>\n", - " <td>1.0</td>\n", - " <td>4.0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>tags_length_ContentBased_sample</th>\n", - " <td>1.187500</td>\n", - " <td>1.704773</td>\n", - " <td>1.0</td>\n", - " <td>4.0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>tags_length_ContentBased_score</th>\n", - " <td>1.564917</td>\n", - " <td>1.944345</td>\n", - " <td>0.5</td>\n", - " <td>4.0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>timestamp_ContentBased_sample</th>\n", - " <td>1.875000</td>\n", - " <td>2.277608</td>\n", - " <td>1.0</td>\n", - " <td>4.0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>timestamp_ContentBased_score</th>\n", - " <td>1.265317</td>\n", - " <td>1.512329</td>\n", - " <td>1.0</td>\n", - " <td>4.0</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " mae rmse hit_rate novelty\n", - "baseline_1 1.312500 1.667708 1.0 4.0\n", - "baseline_2 1.315250 1.572990 1.0 4.0\n", - "baseline_3 1.318182 1.465689 1.0 4.0\n", - "baseline_4 1.363953 1.523985 1.0 4.0\n", - "title_length_ContentBased_sample 1.375000 1.750000 1.0 4.0\n", - "title_length_ContentBased_score 1.556280 2.063469 1.0 4.0\n", - "title_length_ContentBased_Lr 1.625729 1.773594 1.0 4.0\n", - "movie_year_ContentBased_sample 2.250000 2.610077 1.0 4.0\n", - "movie_year_ContentBased_score 1.866274 2.111422 1.0 4.0\n", - "genres_ContentBased_sample 1.875000 2.271136 1.0 4.0\n", - "genres_ContentBased_score 1.463388 1.793363 1.0 4.0\n", - 
"rating_ContentBased_sample 1.289773 1.715759 1.0 4.0\n", - "rating_ContentBased_score 2.482206 2.795490 1.0 4.0\n", - "tags_ContentBased_sample 1.937500 2.128673 0.5 4.0\n", - "tags_ContentBased_score 1.683499 1.782805 1.0 4.0\n", - "tags_length_ContentBased_sample 1.187500 1.704773 1.0 4.0\n", - "tags_length_ContentBased_score 1.564917 1.944345 0.5 4.0\n", - "timestamp_ContentBased_sample 1.875000 2.277608 1.0 4.0\n", - "timestamp_ContentBased_score 1.265317 1.512329 1.0 4.0" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[4], line 16\u001b[0m\n\u001b[1;32m 14\u001b[0m sp_ratings \u001b[38;5;241m=\u001b[39m load_ratings(surprise_format\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 15\u001b[0m precomputed_dict \u001b[38;5;241m=\u001b[39m precomputed_information(pd\u001b[38;5;241m.\u001b[39mread_csv(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdata/tiny/evidence/ratings.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[0;32m---> 16\u001b[0m evaluation_report \u001b[38;5;241m=\u001b[39m \u001b[43mcreate_evaluation_report\u001b[49m\u001b[43m(\u001b[49m\u001b[43mEvalConfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msp_ratings\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mprecomputed_dict\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mAVAILABLE_METRICS\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 17\u001b[0m export_evaluation_report(evaluation_report)\n", + "Cell \u001b[0;32mIn[2], line 114\u001b[0m, in \u001b[0;36mcreate_evaluation_report\u001b[0;34m(eval_config, sp_ratings, precomputed_dict, available_metrics)\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(eval_config\u001b[38;5;241m.\u001b[39mfull_metrics) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 113\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mTraining full predictions\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m--> 114\u001b[0m anti_testset_top_n \u001b[38;5;241m=\u001b[39m \u001b[43mgenerate_full_top_n\u001b[49m\u001b[43m(\u001b[49m\u001b[43malgo\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msp_ratings\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43meval_config\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m metric \u001b[38;5;129;01min\u001b[39;00m eval_config\u001b[38;5;241m.\u001b[39mfull_metrics:\n\u001b[1;32m 116\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m metric \u001b[38;5;129;01min\u001b[39;00m available_metrics[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfull\u001b[39m\u001b[38;5;124m'\u001b[39m]\n", + "Cell \u001b[0;32mIn[2], line 44\u001b[0m, in \u001b[0;36mgenerate_full_top_n\u001b[0;34m(algo, ratings_dataset, eval_config)\u001b[0m\n\u001b[1;32m 42\u001b[0m algo\u001b[38;5;241m.\u001b[39mfit(full_trainset) \u001b[38;5;66;03m# Train the algorithm on the full training set\u001b[39;00m\n\u001b[1;32m 43\u001b[0m anti_testset \u001b[38;5;241m=\u001b[39m full_trainset\u001b[38;5;241m.\u001b[39mbuild_anti_testset() \u001b[38;5;66;03m# Build the anti test-set\u001b[39;00m\n\u001b[0;32m---> 44\u001b[0m predictions 
\u001b[38;5;241m=\u001b[39m \u001b[43malgo\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtest\u001b[49m\u001b[43m(\u001b[49m\u001b[43manti_testset\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# Get predictions on the anti test-set\u001b[39;00m\n\u001b[1;32m 45\u001b[0m top_n \u001b[38;5;241m=\u001b[39m {}\n\u001b[1;32m 46\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m uid, iid, _, est, _ \u001b[38;5;129;01min\u001b[39;00m predictions:\n", + "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/surprise/prediction_algorithms/algo_base.py:161\u001b[0m, in \u001b[0;36mAlgoBase.test\u001b[0;34m(self, testset, verbose)\u001b[0m\n\u001b[1;32m 142\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Test the algorithm on given testset, i.e. estimate all the ratings\u001b[39;00m\n\u001b[1;32m 143\u001b[0m \u001b[38;5;124;03min the given testset.\u001b[39;00m\n\u001b[1;32m 144\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 156\u001b[0m \u001b[38;5;124;03m that contains all the estimated ratings.\u001b[39;00m\n\u001b[1;32m 157\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 159\u001b[0m \u001b[38;5;66;03m# The ratings are translated back to their original scale.\u001b[39;00m\n\u001b[1;32m 160\u001b[0m predictions \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m--> 161\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpredict\u001b[49m\u001b[43m(\u001b[49m\u001b[43muid\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43miid\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mr_ui_trans\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 162\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m (uid, iid, r_ui_trans) \u001b[38;5;129;01min\u001b[39;00m testset\n\u001b[1;32m 163\u001b[0m ]\n\u001b[1;32m 164\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m predictions\n", + "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/surprise/prediction_algorithms/algo_base.py:119\u001b[0m, in \u001b[0;36mAlgoBase.predict\u001b[0;34m(self, uid, iid, r_ui, clip, verbose)\u001b[0m\n\u001b[1;32m 117\u001b[0m lower_bound, higher_bound \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtrainset\u001b[38;5;241m.\u001b[39mrating_scale\n\u001b[1;32m 118\u001b[0m est \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mmin\u001b[39m(higher_bound, est)\n\u001b[0;32m--> 119\u001b[0m est \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mmax\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mlower_bound\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mest\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 121\u001b[0m pred \u001b[38;5;241m=\u001b[39m Prediction(uid, iid, r_ui, est, details)\n\u001b[1;32m 123\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m verbose:\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] } ], "source": [ @@ -590,7 +295,7 @@ "}\n", "\n", "sp_ratings = load_ratings(surprise_format=True)\n", - "precomputed_dict = precomputed_information(pd.read_csv(\"data/tiny/evidence/ratings.csv\"))\n", + "precomputed_dict = precomputed_information(pd.read_csv(\"data/test/evidence/ratings.csv\"))\n", "evaluation_report = create_evaluation_report(EvalConfig, sp_ratings, precomputed_dict, AVAILABLE_METRICS)\n", "export_evaluation_report(evaluation_report)" ] diff --git a/models.py b/models.py index 
790d8feacef92ecbca42e92f7c233bad6e65d040..13fe7ae5d6835ff4324de063cba7eda470c41333 100644 --- a/models.py +++ b/models.py @@ -5,7 +5,7 @@ from collections import defaultdict import pandas as pd import numpy as np import random as rd -from surprise import AlgoBase, SVD, KNNWithMeans +from surprise import AlgoBase, SVD from surprise import PredictionImpossible # import local @@ -120,36 +120,32 @@ class ContentBased(AlgoBase): df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title') elif features_method == "movie_year" : - df_features = df_items['movie_year'] = df_items['title'].str.extract(r'\((\d{4})\)', expand=False) + df_features = df_items['title'].str.extract(r'\((\d{4})\)', expand=False).to_frame('movie_year') elif features_method == "genres" : genres_list = df_items['genres'].str.split('|').explode().unique() for genre in genres_list: - df_features = df_items['genres'].str.contains(genre).astype(int) + df_features = df_items['genres'].str.contains(genre).astype(int).to_frame('genres') - elif features_method == "combination" : + elif features_method == "combination": genres_list = df_items['genres'].str.split('|').explode().unique() - df_movie = df_items['movie_year'] = df_items['title'].str.extract(r'\((\d{4})\)', expand=False) + df_movie = df_items['title'].str.extract(r'\((\d{4})\)', expand=False).to_frame('movie_year') for genre in genres_list: - df_genre = df_items['genres'].str.contains(genre).astype(int) - df_tag = df_tag['tag'].apply(lambda x: sum(len(tag) for tag in x.split(','))if isinstance(x, str) else 0) - df_year = df_items['movie_year'] = df_items['title'].str.extract(r'\((\d{4})\)', expand=False) - + df_genre = df_items['genres'].str.contains(genre).astype(int).to_frame(genre) + + df_tag['tag_length'] = df_tag['tag'].apply(lambda x: sum(len(tag) for tag in x.split(',')) if isinstance(x, str) else 0) df_features = pd.concat([df_genre, df_tag, df_movie], axis=1) - + elif features_method == "rating" : df_features = df_ratings.groupby('movieId')['rating'].transform('mean').to_frame('avg_rating') elif features_method == "tags" : - df_features = df_tag['tag'].apply(lambda x: len(x.split(',')) if isinstance(x, str) else 0) + df_features = df_tag['tag'].apply(lambda x: len(x.split(',')) if isinstance(x, str) else 0).to_frame('tags') elif features_method == "tags_length" : - df_features = df_tag['tag'].apply(lambda x: sum(len(tag) for tag in x.split(','))if isinstance(x, str) else 0) + df_features = df_tag['tag'].apply(lambda x: sum(len(tag) for tag in x.split(',')) if isinstance(x, str) else 0).to_frame('n_character_tags') - elif features_method == "timestamp" : - df_features = df_ratings['timestamp_sin'] = np.sin(2 * np.pi * df_ratings['timestamp'] / 86400) - df_features = df_ratings['timestamp_cos'] = np.cos(2 * np.pi * df_ratings['timestamp'] / 86400) else: # (implement other feature creations here) raise NotImplementedError(f'Feature method {features_method} not yet implemented') @@ -170,6 +166,7 @@ class ContentBased(AlgoBase): elif self.regressor_method == 'random_sample': for u in self.user_profile: self.user_profile[u] = [rating for _, rating in self.trainset.ur[u]] + elif self.regressor_method == 'linear_regression' : for u in self.user_profile: @@ -181,8 +178,42 @@ class ContentBased(AlgoBase): df_user["item_id"] = df_user["item_id"].map(trainset.to_raw_iid) df_user = df_user.merge(self.content_features, left_on = "item_id", right_index = True, how = 'left') + + if 'n_character_title' in df_user.columns: + # If 'n_character_title' is available as a feature + X = df_user['n_character_title'].values.reshape(-1, 1) + + elif 'avg_relevance' in df_user.columns: + # If 'avg_relevance' is available as a feature + X = df_user['avg_relevance'].values.reshape(-1, 1) + + elif 'movie_year' in df_user.columns: + # If 'movie_year' is available as a feature + X = df_user['movie_year'].values.reshape(-1, 1) + + elif 'genres' in df_user.columns: + # If 'genres' is available as a feature + X = df_user['genres'].values.reshape(-1, 1) + + elif 'combination' in df_user.columns: + # If 'combination' is available as a feature + X = df_user['combination'].values.reshape(-1, 1) + + elif 'avg_rating' in df_user.columns: + # If 'avg_rating' is available as a feature + X = df_user['avg_rating'].values.reshape(-1, 1) - X = df_user['n_character_title'].values.reshape(-1,1) + elif 'tags' in df_user.columns: + # If 'tags' is available as a feature + X = df_user['tags'].values.reshape(-1, 1) + + elif 'n_character_tags' in df_user.columns: + # If 'n_character_tags' is available as a feature + X = df_user['n_character_tags'].values.reshape(-1, 1) + + else: + # No suitable feature column is available + continue # or handle the error/exception here y = df_user['user_ratings'].values @@ -205,9 +236,43 @@ class ContentBased(AlgoBase): df_user = df_user.merge(self.content_features, left_on = "item_id", right_index = True, how = 'left') - X = df_user['n_character_title'].values.reshape(-1, 1) + if 'n_character_title' in df_user.columns: + # If 'n_character_title' is available as a feature + X = df_user['n_character_title'].values.reshape(-1, 1) + + elif 'avg_relevance' in df_user.columns: + # If 'avg_relevance' is available as a feature + X = df_user['avg_relevance'].values.reshape(-1, 1) + + elif 'movie_year' in df_user.columns: + # If 'movie_year' is available as a feature + X = df_user['movie_year'].values.reshape(-1, 1) + + elif 'genres' in df_user.columns: + # If 'genres' is available as a feature + X = df_user['genres'].values.reshape(-1, 1) + + elif 'combination' in df_user.columns: + # If 'combination' is available as a feature + X = df_user['combination'].values.reshape(-1, 1) + + elif 'avg_rating' in df_user.columns: + # If 'avg_rating' is available as a feature + X = df_user['avg_rating'].values.reshape(-1, 1) + + elif 'tags' in df_user.columns: + # If 'tags' is available as a feature + X = df_user['tags'].values.reshape(-1, 1) + + elif 'n_character_tags' in df_user.columns: + # If 'n_character_tags' is available as a feature + X = df_user['n_character_tags'].values.reshape(-1, 1) + + else: + # No suitable feature column is available + continue # or handle the error/exception here y = df_user['user_ratings'].values - svr_regressor = SVR(kernel='rbf', C=0.00000000001, epsilon=0.2) + svr_regressor = SVR(kernel='rbf', C=10, epsilon=0.2) svr_regressor.fit(X, y) self.user_profile[u] = svr_regressor @@ -223,7 +288,42 @@ class ContentBased(AlgoBase): df_user = df_user.merge(self.content_features, left_on = "item_id", right_index = True, how = 'left') - X = df_user['n_character_title'].values.reshape(-1, 1) + if 'n_character_title' in df_user.columns: + # If 'n_character_title' is available as a feature + X = df_user['n_character_title'].values.reshape(-1, 1) + + elif 'avg_relevance' in df_user.columns: + # If 'avg_relevance' is available as a feature + X = df_user['avg_relevance'].values.reshape(-1, 1) + + elif 'movie_year' in df_user.columns: + # If 'movie_year' is available as a feature + X = df_user['movie_year'].values.reshape(-1, 1) + + elif 'genres' in df_user.columns: + # If 'genres' is available as a feature + X = df_user['genres'].values.reshape(-1, 1) + + elif 'combination' in df_user.columns: + # If 'combination' is available as a feature + X = df_user['combination'].values.reshape(-1, 1) + + elif 'avg_rating' in df_user.columns: + # If 'avg_rating' is available as a feature + X = df_user['avg_rating'].values.reshape(-1, 1) + + elif 'tags' in df_user.columns: + # If 'tags' is available as a feature + X = df_user['tags'].values.reshape(-1, 1) + + elif 'n_character_tags' in df_user.columns: + # If 'n_character_tags' is available as a feature + X = df_user['n_character_tags'].values.reshape(-1, 1) + + else: + # No suitable feature column is available + continue # or handle the error/exception here + y = df_user['user_ratings'].values gb_regressor = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3) gb_regressor.fit(X, y) @@ -242,12 +342,45 @@ class ContentBased(AlgoBase): df_user = df_user.merge(self.content_features, left_on = "item_id", right_index = True, how = 'left') - X = df_user['n_character_title'].values.reshape(-1, 1) + if 'n_character_title' in df_user.columns: + # If 'n_character_title' is available as a feature + X = df_user['n_character_title'].values.reshape(-1, 1) + + elif 'avg_relevance' in df_user.columns: + # If 'avg_relevance' is available as a feature + X = df_user['avg_relevance'].values.reshape(-1, 1) + + elif 'movie_year' in df_user.columns: + # If 'movie_year' is available as a feature + X = df_user['movie_year'].values.reshape(-1, 1) + + elif 'genres' in df_user.columns: + # If 'genres' is available as a feature + X = df_user['genres'].values.reshape(-1, 1) + + elif 'combination' in df_user.columns: + # If 'combination' is available as a feature + X = df_user['combination'].values.reshape(-1, 1) + + elif 'avg_rating' in df_user.columns: + # If 'avg_rating' is available as a feature + X = df_user['avg_rating'].values.reshape(-1, 1) + + elif 'tags' in df_user.columns: + # If 'tags' is available as a feature + X = df_user['tags'].values.reshape(-1, 1) + + elif 'n_character_tags' in df_user.columns: + # If 'n_character_tags' is available as a feature + X = df_user['n_character_tags'].values.reshape(-1, 1) + + else: + # No suitable feature column is available + continue # or handle the error/exception here y = df_user['user_ratings'].values rf_regressor = RandomForestRegressor(n_estimators=100) rf_regressor.fit(X, y) self.user_profile[u] = rf_regressor - else : pass @@ -326,39 +459,52 @@ def 
     prediction = content_algo.predict(anti_test_set_first[0], anti_test_set_first[1])
     print(prediction)
-"""
+
+
 print("title_length :")
 test_contentbased_class(feature_method = "title_length" , regressor_method = "random_score")
 test_contentbased_class(feature_method = "title_length" , regressor_method = "random_sample")
 test_contentbased_class(feature_method = "title_length" , regressor_method = "linear_regression")
+test_contentbased_class(feature_method="title_length", regressor_method="svr_regression")
+test_contentbased_class(feature_method="title_length", regressor_method="gradient_boosting")
+test_contentbased_class(feature_method="title_length", regressor_method="random_forest")
 print("\n")
 print("movie_year : ")
 test_contentbased_class(feature_method="movie_year", regressor_method="random_score")
 test_contentbased_class(feature_method="movie_year", regressor_method="random_sample")
-#test_contentbased_class(feature_method="movie_year", regressor_method="linear_regression")
+test_contentbased_class(feature_method="movie_year", regressor_method="linear_regression")
+test_contentbased_class(feature_method="movie_year", regressor_method="svr_regression")
+test_contentbased_class(feature_method="movie_year", regressor_method="gradient_boosting")
+test_contentbased_class(feature_method="movie_year", regressor_method="random_forest")
 print("\n")
 print("genres : ")
 test_contentbased_class(feature_method="genres", regressor_method="random_score")
 test_contentbased_class(feature_method="genres", regressor_method="random_sample")
-#test_contentbased_class(feature_method="genres", regressor_method="linear_regression")
+test_contentbased_class(feature_method="genres", regressor_method="linear_regression")
+test_contentbased_class(feature_method="genres", regressor_method="svr_regression")
+test_contentbased_class(feature_method="genres", regressor_method="gradient_boosting")
+test_contentbased_class(feature_method="genres", regressor_method="random_forest")
 print("\n")
 print("rating : ")
 test_contentbased_class(feature_method="rating", regressor_method="random_score")
 test_contentbased_class(feature_method="rating", regressor_method="random_sample")
-#test_contentbased_class(feature_method="rating", regressor_method="linear_regression")
+test_contentbased_class(feature_method="rating", regressor_method="linear_regression")
+# test_contentbased_class(feature_method="rating", regressor_method="svr_regression")
+# test_contentbased_class(feature_method="rating", regressor_method="gradient_boosting")
+# test_contentbased_class(feature_method="rating", regressor_method="random_forest")
 print("\n")
 print("tags : ")
 test_contentbased_class(feature_method="tags", regressor_method="random_score")
 test_contentbased_class(feature_method="tags", regressor_method="random_sample")
 #test_contentbased_class(feature_method="tags", regressor_method="linear_regression")
+# test_contentbased_class(feature_method="tags", regressor_method="svr_regression")
+# test_contentbased_class(feature_method="tags", regressor_method="gradient_boosting")
+# test_contentbased_class(feature_method="tags", regressor_method="random_forest")
 print("\n")
 print("tags_length : ")
 test_contentbased_class(feature_method="tags_length", regressor_method="random_score")
 test_contentbased_class(feature_method="tags_length", regressor_method="random_sample")
-#test_contentbased_class(feature_method="tags_length", regressor_method="linear_regression")
-print("\n")
-print("timestamp : ")
-test_contentbased_class(feature_method="timestamp", regressor_method="random_score")
-test_contentbased_class(feature_method="timestamp", regressor_method="random_sample")
-#test_contentbased_class(feature_method="timestamp", regressor_method="linear_regression")
-"""
\ No newline at end of file
+# test_contentbased_class(feature_method="tags_length", regressor_method="linear_regression")
+# test_contentbased_class(feature_method="tags_length", regressor_method="svr_regression")
+# test_contentbased_class(feature_method="tags_length", regressor_method="gradient_boosting")
+# test_contentbased_class(feature_method="tags_length", regressor_method="random_forest")
\ No newline at end of file
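
Note: each regressor branch in the hunks above repeats the same if/elif chain, which picks the first feature column present in df_user and reshapes it into the 2-D array scikit-learn expects. The sketch below shows how that selection logic could be factored into a single helper; the helper name select_feature_matrix, the column ordering, and the toy DataFrame are illustrative assumptions and are not part of this patch.

import pandas as pd

# Candidate feature columns, in the same priority order as the if/elif chain above.
FEATURE_COLUMNS = [
    "n_character_title", "avg_relevance", "movie_year", "genres",
    "combination", "avg_rating", "tags", "n_character_tags",
]

def select_feature_matrix(df_user):
    """Return the first available feature column as an (n, 1) array, or None."""
    for col in FEATURE_COLUMNS:
        if col in df_user.columns:
            # Non-numeric columns (e.g. raw 'genres' or 'tags' strings) would still
            # need encoding before being passed to a regressor.
            return df_user[col].values.reshape(-1, 1)
    return None  # no usable feature; the caller can skip this user (cf. `continue`)

# Tiny usage example with hypothetical data.
df_demo = pd.DataFrame({"movie_year": [1995, 2001, 2010], "user_ratings": [3.0, 4.5, 2.0]})
X = select_feature_matrix(df_demo)
y = df_demo["user_ratings"].values
print(X.shape, y.shape)  # (3, 1) (3,)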