diff --git a/content_based copy.ipynb b/content_based copy.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..fffea99ab7e327cbe56f9b267cd486ef0a2fd8c1 --- /dev/null +++ b/content_based copy.ipynb @@ -0,0 +1,427 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "82d5ca82", + "metadata": {}, + "source": [ + "# Packages" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "277473a3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "\n", + "# third parties imports\n", + "import pandas as pd\n", + "import numpy as np\n", + "import random as rd\n", + "from surprise import AlgoBase, SVD\n", + "from surprise import PredictionImpossible\n", + "\n", + "# import local\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from loaders import load_items, load_ratings\n", + "from constants import Constant as C\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor\n", + "from sklearn.svm import SVR\n", + "\n", + "from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet\n", + "from sklearn.svm import SVR\n", + "from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor\n", + "from sklearn.tree import DecisionTreeRegressor\n", + "from sklearn.neighbors import KNeighborsRegressor\n", + "from xgboost import XGBRegressor\n", + "from lightgbm import LGBMRegressor\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "a42c16bf", + "metadata": {}, + "source": [ + "# Explore and select content features" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "e8378976", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": 
[ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>n_character_title</th>\n", + " </tr>\n", + " <tr>\n", + " <th>movieId</th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>16</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>14</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>23</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>24</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>34</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " n_character_title\n", + "movieId \n", + "1 16\n", + "2 14\n", + "3 23\n", + "4 24\n", + "5 34" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0 sandra 'boring' bullock\n", + "1 dentist\n", + "2 Cambodia\n", + "3 Russian\n", + "4 forgettable\n", + "Name: tag, dtype: object" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# All the dataframes\n", + "df_items = load_items()\n", + "df_ratings = load_ratings()\n", + "df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)\n", + "#df_genome_score = pd.read_csv(\"data/hackathon/content/genome-scores.csv\")\n", + "# df_genome_tag = pd.read_csv(\"data/hackathon/content/genome-tags.csv\")\n", + "\n", + "\n", + "# Example 1 : create title_length features\n", + "df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')\n", + "display(df_features.head())\n", + "\n", + "df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)\n", + "df_features = 
df_tag[C.TAG]\n", + "display(df_features.head())\n", + "\n", + "# (explore here other features)\n" + ] + }, + { + "cell_type": "markdown", + "id": "a2c9a2b6", + "metadata": {}, + "source": [ + "# Build a content-based model\n", + "When ready, move the following class in the *models.py* script" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "16b0a602", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'title_length': 0.1497645139703848, 'movie_year': 0.16218667420100635, '(no genres listed)': 0.0, 'action': 0.09449072815753193, 'adventure': 0.08778978776313201, 'animation': 0.0, 'children': 0.038431411145366176, 'comedy': 0.07268129109348041, 'crime': 0.09469516433772891, 'documentary': 0.0611428358670058, 'drama': 0.10494783392380302, 'fantasy': 0.025806451608591505, 'film-noir': 0.025806451609512046, 'horror': 0.018342712153336858, 'imax': 0.06947533670577526, 'musical': 0.0, 'mystery': 0.06234903350217154, 'romance': 0.036771716124540825, 'sci-fi': 0.059571001735546115, 'thriller': 0.0993122803165238, 'war': 0.04002978709072218, 'western': 0.04547648227079719, 'avg_rating': 0.16263357553020436}\n", + "{'title_length': 0.12975573389578626, 'movie_year': 0.13738555574364605, '(no genres listed)': 0.0, 'action': 0.0640388318396414, 'adventure': 0.0827515664964472, 'animation': 0.05686854568650957, 'children': 0.06799492283569505, 'comedy': 0.07354182680364503, 'crime': 0.05543740962624167, 'documentary': 0.0, 'drama': 0.09170589087803577, 'fantasy': 0.061481521263689595, 'film-noir': 0.0, 'horror': 0.015113350123518238, 'imax': 0.04592205020685974, 'musical': 0.03201459126079391, 'mystery': 0.03412706135338736, 'romance': 0.05989121250223656, 'sci-fi': 0.04370793816378273, 'thriller': 0.045800659191095036, 'war': 0.04907194751877139, 'western': 0.027287416762806844, 'avg_rating': 0.13740560847192132}\n", + "{'title_length': 0.04702378569892371, 'movie_year': 0.052440003628289225, '(no genres 
listed)': 0.0, 'action': 0.020439581335728367, 'adventure': 0.015593308332521032, 'animation': 0.004256286923052558, 'children': 0.003520723090188317, 'comedy': 0.018972762464944913, 'crime': 0.028340544273099223, 'documentary': 0.005823989517206729, 'drama': 0.037415345194166824, 'fantasy': 0.013643903080149476, 'film-noir': 0.015390183296279798, 'horror': 0.01926898253629829, 'imax': 0.0014716703456143566, 'musical': 0.0061519348279224124, 'mystery': 0.02847033164163413, 'romance': 0.019827342468818163, 'sci-fi': 0.022573488552024915, 'thriller': 0.03522231545147593, 'war': 0.010339617301415098, 'western': 0.005663885036293055, 'avg_rating': 0.05327750989412312}\n", + "{'title_length': 0.033402138126294736, 'movie_year': 0.03710065977291947, '(no genres listed)': 0.0, 'action': 0.014528522669579273, 'adventure': 0.013963913494241694, 'animation': 0.005764814103226412, 'children': 0.006513197483932152, 'comedy': 0.017763201411495646, 'crime': 0.016002513666599556, 'documentary': 0.004292962983778595, 'drama': 0.027458210593047847, 'fantasy': 0.009302633945770895, 'film-noir': 0.006823368830454359, 'horror': 0.007391689869010394, 'imax': 0.004855154663168369, 'musical': 0.0058909467772061425, 'mystery': 0.012191560732760487, 'romance': 0.01723631022081761, 'sci-fi': 0.010817269433255231, 'thriller': 0.01658593988724716, 'war': 0.010193212979882352, 'western': 0.0052038255339472966, 'avg_rating': 0.03742403427834079}\n", + "{'title_length': 0.20154225634108316, 'movie_year': 0.20848962267389695, '(no genres listed)': 0.0, 'action': 0.04545454544645529, 'adventure': 0.04545454544730129, 'animation': 0.0, 'children': 0.0, 'comedy': 0.07177284969293253, 'crime': 0.1145252645738102, 'documentary': 0.0, 'drama': 0.16778172557550536, 'fantasy': 0.0, 'film-noir': 0.0, 'horror': 0.06315936177961773, 'imax': 0.0, 'musical': 0.0, 'mystery': 0.08510520557533159, 'romance': 0.09754755529442835, 'sci-fi': 0.045454545449454146, 'thriller': 0.12542163704872258, 'war': 
0.08035304331050673, 'western': 0.0, 'avg_rating': 0.21152969571139305}\n", + "{'title_length': 0.021927486954368552, 'movie_year': 0.02488786702116846, '(no genres listed)': 0.0007363092498113207, 'action': 0.013836432470735639, 'adventure': 0.011610617815573265, 'animation': 0.007520799115717832, 'children': 0.006287966766754299, 'comedy': 0.012951125615087338, 'crime': 0.011084119744598393, 'documentary': 0.0018287715645832062, 'drama': 0.015221252640276463, 'fantasy': 0.008631010164284143, 'film-noir': 0.0024629052522566544, 'horror': 0.008816299251739122, 'imax': 0.005347204099216887, 'musical': 0.0038827346462235236, 'mystery': 0.0068652812039576095, 'romance': 0.008086664541950757, 'sci-fi': 0.010304269379559203, 'thriller': 0.013200133984104478, 'war': 0.005127335699821772, 'western': 0.0036215200349232765, 'avg_rating': 0.025470698706944836}\n" + ] + } + ], + "source": [ + "\n", + "# ContentBased\n", + "class ContentBased(AlgoBase):\n", + " def __init__(self, features_method, regressor_method):\n", + " AlgoBase.__init__(self)\n", + " self.regressor_method = regressor_method\n", + " self.features_methods = features_method\n", + " self.content_features = self.create_content_features(features_method)\n", + " self.user_profile = {}\n", + " self.user_profile_explain = {}\n", + "\n", + " def create_content_features(self, features_methods):\n", + " \"\"\"Content Analyzer\"\"\"\n", + " df_items = load_items()\n", + " df_ratings = load_ratings()\n", + " df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)\n", + " df_genome_score = pd.read_csv(\"data/hackathon/content/genome-scores.csv\")\n", + " df_genome_tag = pd.read_csv(\"data/hackathon/content/genome-tags.csv\")\n", + "\n", + " df_features = pd.DataFrame(index=df_items.index)\n", + "\n", + " for method in features_methods:\n", + " if method == \"title_length\":\n", + " df_title_length = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('title_length')\n", + " df_features = pd.concat([df_features, 
df_title_length], axis=1)\n", + " \n", + " elif method == \"movie_year\":\n", + " df_movie_year = df_items['title'].str.extract(r'\\((\\d{4})\\)', expand=False).to_frame('movie_year')\n", + " df_features = pd.concat([df_features, df_movie_year.astype(float).fillna(0)], axis=1)\n", + " \n", + " elif method == \"genre\":\n", + " tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split('|'), token_pattern=None)\n", + " tfidf_matrix = tfidf_vectorizer.fit_transform(df_items['genres'])\n", + " df_tfidf_genres = pd.DataFrame(tfidf_matrix.toarray(), index=df_items.index, columns=tfidf_vectorizer.get_feature_names_out())\n", + " df_features = pd.concat([df_features, df_tfidf_genres], axis=1)\n", + "\n", + " elif method == \"avg_rating\":\n", + " df_avg_rating = df_ratings.groupby('movieId')['rating'].mean().to_frame('avg_rating')\n", + " df_features = df_features.join(df_avg_rating, on='movieId')\n", + "\n", + " else:\n", + " raise NotImplementedError(f'Feature method {method} not yet implemented')\n", + "\n", + " # Handle missing values in df_features\n", + " df_features.fillna(0, inplace=True)\n", + "\n", + " return df_features\n", + "\n", + " def fit(self, trainset):\n", + " \"\"\"Profile Learner\"\"\"\n", + " AlgoBase.fit(self, trainset)\n", + "\n", + " # Preallocate user profiles\n", + " self.user_profile = {u: None for u in trainset.all_users()}\n", + " self.user_profile_explain = {}\n", + "\n", + " epsilon = 1e-10 # Small value to prevent division by zero\n", + "\n", + " for u in trainset.all_users():\n", + " raw_user_id = trainset.to_raw_uid(u)\n", + " self.user_profile_explain[raw_user_id] = {}\n", + "\n", + " user_ratings = np.array([rating for (_, rating) in trainset.ur[u]])\n", + " item_ids = [iid for (iid, _) in trainset.ur[u]]\n", + " raw_item_ids = [trainset.to_raw_iid(iid) for iid in item_ids]\n", + "\n", + " feature_values = self.content_features.loc[raw_item_ids].values\n", + " norms = np.linalg.norm(feature_values, axis=0) + epsilon\n", + " 
weighted_features = feature_values / norms\n", + " feature_importance = weighted_features.T @ user_ratings\n", + " feature_importance /= np.sum(user_ratings)\n", + "\n", + " self.user_profile_explain[raw_user_id] = dict(zip(self.content_features.columns, feature_importance))\n", + "\n", + " if self.regressor_method == 'random_score':\n", + " for u in self.user_profile:\n", + " self.user_profile[u] = rd.uniform(0.5, 5)\n", + "\n", + " elif self.regressor_method == 'random_sample':\n", + " for u in self.user_profile:\n", + " self.user_profile[u] = [rating for (_, rating) in trainset.ur[u]]\n", + "\n", + " else:\n", + " regressor_models = {\n", + " 'linear_regression': LinearRegression(fit_intercept=False),\n", + " 'svr_regression': SVR(kernel='rbf', C=10, epsilon=0.2),\n", + " 'gradient_boosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),\n", + " 'random_forest': RandomForestRegressor(n_estimators=100),\n", + " 'lasso_regression': Lasso(alpha=0.1),\n", + " 'ridge_regression': Ridge(alpha=1.0),\n", + " 'elastic_net': ElasticNet(alpha=1.0, l1_ratio=0.5),\n", + " 'knn_regression': KNeighborsRegressor(n_neighbors=1),\n", + " 'decision_tree': DecisionTreeRegressor(max_depth=5),\n", + " 'adaboost': AdaBoostRegressor(n_estimators=50),\n", + " 'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),\n", + " 'lightgbm': LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)\n", + " }\n", + "\n", + " if self.regressor_method not in regressor_models:\n", + " raise NotImplementedError(f'Regressor method {self.regressor_method} not yet implemented')\n", + "\n", + " for u in self.user_profile:\n", + " user_ratings = [rating for (_, rating) in trainset.ur[u]]\n", + " item_ids = [iid for (iid, _) in trainset.ur[u]]\n", + " raw_item_ids = [trainset.to_raw_iid(iid) for iid in item_ids]\n", + "\n", + " df_user = pd.DataFrame({'item_id': raw_item_ids, 'user_ratings': user_ratings})\n", + " df_user = 
df_user.merge(self.content_features, left_on=\"item_id\", right_index=True, how='left')\n", + "\n", + " X = df_user.drop(columns=['item_id', 'user_ratings'])\n", + " y = df_user['user_ratings']\n", + "\n", + " regressor = regressor_models[self.regressor_method]\n", + " regressor.fit(X, y)\n", + "\n", + " self.user_profile[u] = regressor\n", + "\n", + " def estimate(self, u, i):\n", + " \"\"\"Scoring component used for item filtering\"\"\"\n", + " if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):\n", + " raise PredictionImpossible('User and/or item is unknown.')\n", + "\n", + " if self.regressor_method == 'random_score':\n", + " return rd.uniform(0.5, 5)\n", + "\n", + " elif self.regressor_method == 'random_sample':\n", + " return rd.choice(self.user_profile[u])\n", + "\n", + " else:\n", + " raw_item_id = self.trainset.to_raw_iid(i)\n", + " item_features = self.content_features.loc[raw_item_id, :].values.reshape(1, -1)\n", + " regressor = self.user_profile[u]\n", + " item_features_df = pd.DataFrame(item_features, columns=self.content_features.columns)\n", + " return regressor.predict(item_features_df)[0]\n", + "\n", + " def explain(self, u):\n", + " if u in self.user_profile_explain:\n", + " return self.user_profile_explain[u]\n", + " else:\n", + " return None\n", + "\n", + "\n", + "#Example usage:\n", + "cb = ContentBased([\"title_length\", \"movie_year\",\"genre\",\"avg_rating\"], \"ridge_regression\")\n", + "surprise_data = load_ratings(surprise_format=True)\n", + "trainset = surprise_data.build_full_trainset()\n", + "testset = trainset.build_anti_testset()\n", + "cb.fit(trainset)\n", + "\n", + "\n", + "#print(\"RMSE: \", cb.rmse(testset))\n", + "\n", + "\n", + "#Example explanations for users:\n", + "print(cb.explain(11))\n", + "\n", + "print(cb.explain(13))\n", + "\n", + "print(cb.explain(17))\n", + "\n", + "print(cb.explain(23))\n", + "\n", + "print(cb.explain(27))\n", + "\n", + "print(cb.explain(73))\n" + ] + }, + { + "cell_type": 
"markdown", + "id": "ffd75b7e", + "metadata": {}, + "source": [ + "The following script tests the ContentBased class" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "69d12f7d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "user: 1 item: 10 r_ui = None est = 0.72 {'was_impossible': False}\n" + ] + } + ], + "source": [ + "def test_contentbased_class(feature_method, regressor_method):\n", + " \"\"\"Test the ContentBased class.\n", + " Tries to make a prediction on the first (user,item ) tuple of the anti_test_set\n", + " \"\"\"\n", + " sp_ratings = load_ratings(surprise_format=True)\n", + " train_set = sp_ratings.build_full_trainset()\n", + " content_algo = ContentBased(feature_method, regressor_method)\n", + " content_algo.fit(train_set)\n", + " anti_test_set_first = train_set.build_anti_testset()[0]\n", + " prediction = content_algo.predict(anti_test_set_first[0], anti_test_set_first[1])\n", + " print(prediction)\n", + "\n", + "test_contentbased_class([\"title_length\", \"movie_year\",\"genre\",\"avg_rating\"], \"ridge_regression\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/recommender.py b/recommender.py index 05858236d9769f33fc86ea9decead7c130af20d7..e9295588fdca77379242fa409fe3d00781dd0818 100644 --- a/recommender.py +++ b/recommender.py @@ -683,9 +683,8 @@ def compare_similarity_measures(trainset,testset): results['KNN_Pearson_RMSE'] = rmse_pearson results['KNN_Pearson_MAE'] = mae_pearson - # Train and evaluate KNN model with Jaccard similarity - sim_options_jaccard = {'name': 
'jaccard','user_based': True} + sim_options_jaccard = {'name': '','user_based': True} user_based_jaccard = KNNWithMeans(sim_options=sim_options_jaccard) user_based_jaccard.fit(trainset) predictions_jaccard = user_based_jaccard.test(testset) @@ -771,33 +770,33 @@ def evaluate_inter_user_diversity(user_based_model, ratings_path, other_user_bas inter_user_diversity_scores['UserBased'] = user_based_model.inter_user_diversity(all_top_n_recommendations_ub) - # # #KNN model - # knn_model = RecommenderSystem_KNN(ratings_path) - # knn_model.train_knn_model() - # all_top_n_recommendations_knn = {} - # for user_id in range(knn_model.trainset.n_users): - # try: - # trainset_user_id = knn_model.trainset.to_raw_uid(user_id) - # top_n_recommendations_knn = knn_model.get_top_n_recommendations(trainset_user_id, n=10) - # all_top_n_recommendations_knn[trainset_user_id] = top_n_recommendations_knn - # except ValueError: - # print(f"User {trainset_user_id} is not part of the training set for KNN model. Skipping...") - - # inter_user_diversity_scores['KNN'] = knn_model.inter_user_diversity(all_top_n_recommendations_knn) - - # # Other user-based models - # for other_model in other_user_based_models: - # other_model.load_model() - # all_top_n_recommendations_other = {} + # #KNN model + knn_model = RecommenderSystem_KNN(ratings_path) + knn_model.train_knn_model() + all_top_n_recommendations_knn = {} + for user_id in range(knn_model.trainset.n_users): + try: + trainset_user_id = knn_model.trainset.to_raw_uid(user_id) + top_n_recommendations_knn = knn_model.get_top_n_recommendations(trainset_user_id, n=10) + all_top_n_recommendations_knn[trainset_user_id] = top_n_recommendations_knn + except ValueError: + print(f"User {trainset_user_id} is not part of the training set for KNN model. 
Skipping...") + + inter_user_diversity_scores['KNN'] = knn_model.inter_user_diversity(all_top_n_recommendations_knn) + + # Other user-based models + for other_model in other_user_based_models: + other_model.load_model() + all_top_n_recommendations_other = {} - # # Get predictions for all users in the test set - # all_user_ids = set(user for user, _, _ in testset) - # for user_id in all_user_ids: - # other_model.user_id = user_id # Update the user ID for the model - # top_n_predictions = other_model.get_top_n_predictions_for_user(ratings_path, n=10) - # all_top_n_recommendations_other[user_id] = top_n_predictions + # Get predictions for all users in the test set + all_user_ids = set(user for user, _, _ in testset) + for user_id in all_user_ids: + other_model.user_id = user_id # Update the user ID for the model + top_n_predictions = other_model.get_top_n_predictions_for_user(ratings_path, n=10) + all_top_n_recommendations_other[user_id] = top_n_predictions - # inter_user_diversity_scores[f'Other_{other_model.user_name}'] = other_model.inter_user_diversity(all_top_n_recommendations_other) + inter_user_diversity_scores[f'Other_{other_model.user_name}'] = other_model.inter_user_diversity(all_top_n_recommendations_other) return inter_user_diversity_scores @@ -983,8 +982,7 @@ class ContentBased(AlgoBase): 'knn_regression': KNeighborsRegressor(n_neighbors=1), 'decision_tree': DecisionTreeRegressor(max_depth=5), 'adaboost': AdaBoostRegressor(n_estimators=50), - 'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3), - 'lightgbm': LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3) + 'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3) } if self.regressor_method not in regressor_models: