diff --git a/content_based copy.ipynb b/content_based copy.ipynb deleted file mode 100644 index fffea99ab7e327cbe56f9b267cd486ef0a2fd8c1..0000000000000000000000000000000000000000 --- a/content_based copy.ipynb +++ /dev/null @@ -1,427 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "82d5ca82", - "metadata": {}, - "source": [ - "# Packages" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "277473a3", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "\n", - "\n", - "# third parties imports\n", - "import pandas as pd\n", - "import numpy as np\n", - "import random as rd\n", - "from surprise import AlgoBase, SVD\n", - "from surprise import PredictionImpossible\n", - "\n", - "# import local\n", - "from sklearn.feature_extraction.text import TfidfVectorizer\n", - "from loaders import load_items, load_ratings\n", - "from constants import Constant as C\n", - "from sklearn.linear_model import LinearRegression\n", - "from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor\n", - "from sklearn.svm import SVR\n", - "\n", - "from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet\n", - "from sklearn.svm import SVR\n", - "from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor\n", - "from sklearn.tree import DecisionTreeRegressor\n", - "from sklearn.neighbors import KNeighborsRegressor\n", - "from xgboost import XGBRegressor\n", - "from lightgbm import LGBMRegressor\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "a42c16bf", - "metadata": {}, - "source": [ - "# Explore and select content features" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "e8378976", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>n_character_title</th>\n", - " </tr>\n", - " <tr>\n", - " <th>movieId</th>\n", - " <th></th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>16</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>14</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>23</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>24</td>\n", - " </tr>\n", - " <tr>\n", - " <th>5</th>\n", - " <td>34</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " n_character_title\n", - "movieId \n", - "1 16\n", - "2 14\n", - "3 23\n", - "4 24\n", - "5 34" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "0 sandra 'boring' bullock\n", - "1 dentist\n", - "2 Cambodia\n", - "3 Russian\n", - "4 forgettable\n", - "Name: tag, dtype: object" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# All the dataframes\n", - "df_items = load_items()\n", - "df_ratings = load_ratings()\n", - "df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)\n", - "#df_genome_score = pd.read_csv(\"data/hackathon/content/genome-scores.csv\")\n", - "# df_genome_tag = pd.read_csv(\"data/hackathon/content/genome-tags.csv\")\n", - "\n", - "\n", - "# Example 1 : create title_length features\n", - "df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')\n", - "display(df_features.head())\n", - "\n", - "df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)\n", - "df_features = df_tag[C.TAG]\n", - "display(df_features.head())\n", - "\n", - "# (explore here other features)\n" - ] - }, - { - "cell_type": "markdown", - "id": "a2c9a2b6", - "metadata": {}, - "source": [ - "# Build a content-based model\n", - "When ready, move the following class in the *models.py* script" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "16b0a602", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'title_length': 0.1497645139703848, 'movie_year': 0.16218667420100635, '(no genres listed)': 0.0, 'action': 0.09449072815753193, 'adventure': 0.08778978776313201, 'animation': 0.0, 'children': 0.038431411145366176, 'comedy': 0.07268129109348041, 'crime': 0.09469516433772891, 'documentary': 0.0611428358670058, 'drama': 0.10494783392380302, 'fantasy': 0.025806451608591505, 'film-noir': 0.025806451609512046, 'horror': 0.018342712153336858, 'imax': 0.06947533670577526, 'musical': 0.0, 'mystery': 0.06234903350217154, 'romance': 0.036771716124540825, 'sci-fi': 0.059571001735546115, 'thriller': 0.0993122803165238, 'war': 0.04002978709072218, 'western': 0.04547648227079719, 'avg_rating': 0.16263357553020436}\n", - "{'title_length': 0.12975573389578626, 'movie_year': 0.13738555574364605, '(no genres listed)': 0.0, 'action': 0.0640388318396414, 'adventure': 0.0827515664964472, 'animation': 0.05686854568650957, 'children': 0.06799492283569505, 'comedy': 0.07354182680364503, 'crime': 0.05543740962624167, 'documentary': 0.0, 'drama': 0.09170589087803577, 'fantasy': 0.061481521263689595, 'film-noir': 0.0, 'horror': 0.015113350123518238, 'imax': 0.04592205020685974, 'musical': 0.03201459126079391, 'mystery': 0.03412706135338736, 'romance': 0.05989121250223656, 'sci-fi': 0.04370793816378273, 'thriller': 0.045800659191095036, 'war': 0.04907194751877139, 'western': 0.027287416762806844, 'avg_rating': 0.13740560847192132}\n", - "{'title_length': 0.04702378569892371, 'movie_year': 0.052440003628289225, '(no genres listed)': 0.0, 'action': 0.020439581335728367, 'adventure': 0.015593308332521032, 'animation': 0.004256286923052558, 'children': 0.003520723090188317, 'comedy': 0.018972762464944913, 'crime': 0.028340544273099223, 'documentary': 0.005823989517206729, 'drama': 0.037415345194166824, 'fantasy': 0.013643903080149476, 'film-noir': 0.015390183296279798, 'horror': 0.01926898253629829, 'imax': 0.0014716703456143566, 'musical': 0.0061519348279224124, 'mystery': 0.02847033164163413, 'romance': 0.019827342468818163, 'sci-fi': 0.022573488552024915, 'thriller': 0.03522231545147593, 'war': 0.010339617301415098, 'western': 0.005663885036293055, 'avg_rating': 0.05327750989412312}\n", - "{'title_length': 0.033402138126294736, 'movie_year': 0.03710065977291947, '(no genres listed)': 0.0, 'action': 0.014528522669579273, 'adventure': 0.013963913494241694, 'animation': 0.005764814103226412, 'children': 0.006513197483932152, 'comedy': 0.017763201411495646, 'crime': 0.016002513666599556, 'documentary': 0.004292962983778595, 'drama': 0.027458210593047847, 'fantasy': 0.009302633945770895, 'film-noir': 0.006823368830454359, 'horror': 0.007391689869010394, 'imax': 0.004855154663168369, 'musical': 0.0058909467772061425, 'mystery': 0.012191560732760487, 'romance': 0.01723631022081761, 'sci-fi': 0.010817269433255231, 'thriller': 0.01658593988724716, 'war': 0.010193212979882352, 'western': 0.0052038255339472966, 'avg_rating': 0.03742403427834079}\n", - "{'title_length': 0.20154225634108316, 'movie_year': 0.20848962267389695, '(no genres listed)': 0.0, 'action': 0.04545454544645529, 'adventure': 0.04545454544730129, 'animation': 0.0, 'children': 0.0, 'comedy': 0.07177284969293253, 'crime': 0.1145252645738102, 'documentary': 0.0, 'drama': 0.16778172557550536, 'fantasy': 0.0, 'film-noir': 0.0, 'horror': 0.06315936177961773, 'imax': 0.0, 'musical': 0.0, 'mystery': 0.08510520557533159, 'romance': 0.09754755529442835, 'sci-fi': 0.045454545449454146, 'thriller': 0.12542163704872258, 'war': 0.08035304331050673, 'western': 0.0, 'avg_rating': 0.21152969571139305}\n", - "{'title_length': 0.021927486954368552, 'movie_year': 0.02488786702116846, '(no genres listed)': 0.0007363092498113207, 'action': 0.013836432470735639, 'adventure': 0.011610617815573265, 'animation': 0.007520799115717832, 'children': 0.006287966766754299, 'comedy': 0.012951125615087338, 'crime': 0.011084119744598393, 'documentary': 0.0018287715645832062, 'drama': 0.015221252640276463, 'fantasy': 0.008631010164284143, 'film-noir': 0.0024629052522566544, 'horror': 0.008816299251739122, 'imax': 0.005347204099216887, 'musical': 0.0038827346462235236, 'mystery': 0.0068652812039576095, 'romance': 0.008086664541950757, 'sci-fi': 0.010304269379559203, 'thriller': 0.013200133984104478, 'war': 0.005127335699821772, 'western': 0.0036215200349232765, 'avg_rating': 0.025470698706944836}\n" - ] - } - ], - "source": [ - "\n", - "# ContetnBased\n", - "class ContentBased(AlgoBase):\n", - " def __init__(self, features_method, regressor_method):\n", - " AlgoBase.__init__(self)\n", - " self.regressor_method = regressor_method\n", - " self.features_methods = features_method\n", - " self.content_features = self.create_content_features(features_method)\n", - " self.user_profile = {}\n", - " self.user_profile_explain = {}\n", - "\n", - " def create_content_features(self, features_methods):\n", - " \"\"\"Content Analyzer\"\"\"\n", - " df_items = load_items()\n", - " df_ratings = load_ratings()\n", - " df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)\n", - " df_genome_score = pd.read_csv(\"data/hackathon/content/genome-scores.csv\")\n", - " df_genome_tag = pd.read_csv(\"data/hackathon/content/genome-tags.csv\")\n", - "\n", - " df_features = pd.DataFrame(index=df_items.index)\n", - "\n", - " for method in features_methods:\n", - " if method == \"title_length\":\n", - " df_title_length = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('title_length')\n", - " df_features = pd.concat([df_features, df_title_length], axis=1)\n", - " \n", - " elif method == \"movie_year\":\n", - " df_movie_year = df_items['title'].str.extract(r'\\((\\d{4})\\)', expand=False).to_frame('movie_year')\n", - " df_features = pd.concat([df_features, df_movie_year.astype(float).fillna(0)], axis=1)\n", - " \n", - " elif method == \"genre\":\n", - " tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split('|'), token_pattern=None)\n", - " tfidf_matrix = tfidf_vectorizer.fit_transform(df_items['genres'])\n", - " df_tfidf_genres = pd.DataFrame(tfidf_matrix.toarray(), index=df_items.index, columns=tfidf_vectorizer.get_feature_names_out())\n", - " df_features = pd.concat([df_features, df_tfidf_genres], axis=1)\n", - "\n", - " elif method == \"avg_rating\":\n", - " df_avg_rating = df_ratings.groupby('movieId')['rating'].mean().to_frame('avg_rating')\n", - " df_features = df_features.join(df_avg_rating, on='movieId')\n", - "\n", - " else:\n", - " raise NotImplementedError(f'Feature method {method} not yet implemented')\n", - "\n", - " # Handle missing values in df_features\n", - " df_features.fillna(0, inplace=True)\n", - "\n", - " return df_features\n", - "\n", - " def fit(self, trainset):\n", - " \"\"\"Profile Learner\"\"\"\n", - " AlgoBase.fit(self, trainset)\n", - "\n", - " # Preallocate user profiles\n", - " self.user_profile = {u: None for u in trainset.all_users()}\n", - " self.user_profile_explain = {}\n", - "\n", - " epsilon = 1e-10 # Small value to prevent division by zero\n", - "\n", - " for u in trainset.all_users():\n", - " raw_user_id = trainset.to_raw_uid(u)\n", - " self.user_profile_explain[raw_user_id] = {}\n", - "\n", - " user_ratings = np.array([rating for (_, rating) in trainset.ur[u]])\n", - " item_ids = [iid for (iid, _) in trainset.ur[u]]\n", - " raw_item_ids = [trainset.to_raw_iid(iid) for iid in item_ids]\n", - "\n", - " feature_values = self.content_features.loc[raw_item_ids].values\n", - " norms = np.linalg.norm(feature_values, axis=0) + epsilon\n", - " weighted_features = feature_values / norms\n", - " feature_importance = weighted_features.T @ user_ratings\n", - " feature_importance /= np.sum(user_ratings)\n", - "\n", - " self.user_profile_explain[raw_user_id] = dict(zip(self.content_features.columns, feature_importance))\n", - "\n", - " if self.regressor_method == 'random_score':\n", - " for u in self.user_profile:\n", - " self.user_profile[u] = rd.uniform(0.5, 5)\n", - "\n", - " elif self.regressor_method == 'random_sample':\n", - " for u in self.user_profile:\n", - " self.user_profile[u] = [rating for (_, rating) in trainset.ur[u]]\n", - "\n", - " else:\n", - " regressor_models = {\n", - " 'linear_regression': LinearRegression(fit_intercept=False),\n", - " 'svr_regression': SVR(kernel='rbf', C=10, epsilon=0.2),\n", - " 'gradient_boosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),\n", - " 'random_forest': RandomForestRegressor(n_estimators=100),\n", - " 'lasso_regression': Lasso(alpha=0.1),\n", - " 'ridge_regression': Ridge(alpha=1.0),\n", - " 'elastic_net': ElasticNet(alpha=1.0, l1_ratio=0.5),\n", - " 'knn_regression': KNeighborsRegressor(n_neighbors=1),\n", - " 'decision_tree': DecisionTreeRegressor(max_depth=5),\n", - " 'adaboost': AdaBoostRegressor(n_estimators=50),\n", - " 'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),\n", - " 'lightgbm': LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)\n", - " }\n", - "\n", - " if self.regressor_method not in regressor_models:\n", - " raise NotImplementedError(f'Regressor method {self.regressor_method} not yet implemented')\n", - "\n", - " for u in self.user_profile:\n", - " user_ratings = [rating for (_, rating) in trainset.ur[u]]\n", - " item_ids = [iid for (iid, _) in trainset.ur[u]]\n", - " raw_item_ids = [trainset.to_raw_iid(iid) for iid in item_ids]\n", - "\n", - " df_user = pd.DataFrame({'item_id': raw_item_ids, 'user_ratings': user_ratings})\n", - " df_user = df_user.merge(self.content_features, left_on=\"item_id\", right_index=True, how='left')\n", - "\n", - " X = df_user.drop(columns=['item_id', 'user_ratings'])\n", - " y = df_user['user_ratings']\n", - "\n", - " regressor = regressor_models[self.regressor_method]\n", - " regressor.fit(X, y)\n", - "\n", - " self.user_profile[u] = regressor\n", - "\n", - " def estimate(self, u, i):\n", - " \"\"\"Scoring component used for item filtering\"\"\"\n", - " if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):\n", - " raise PredictionImpossible('User and/or item is unknown.')\n", - "\n", - " if self.regressor_method == 'random_score':\n", - " return rd.uniform(0.5, 5)\n", - "\n", - " elif self.regressor_method == 'random_sample':\n", - " return rd.choice(self.user_profile[u])\n", - "\n", - " else:\n", - " raw_item_id = self.trainset.to_raw_iid(i)\n", - " item_features = self.content_features.loc[raw_item_id, :].values.reshape(1, -1)\n", - " regressor = self.user_profile[u]\n", - " item_features_df = pd.DataFrame(item_features, columns=self.content_features.columns)\n", - " return regressor.predict(item_features_df)[0]\n", - "\n", - " def explain(self, u):\n", - " if u in self.user_profile_explain:\n", - " return self.user_profile_explain[u]\n", - " else:\n", - " return None\n", - "\n", - "\n", - "#Example usage:\n", - "cb = ContentBased([\"title_length\", \"movie_year\",\"genre\",\"avg_rating\"], \"ridge_regression\")\n", - "surprise_data = load_ratings(surprise_format=True)\n", - "trainset = surprise_data.build_full_trainset()\n", - "testset = trainset.build_anti_testset()\n", - "cb.fit(trainset)\n", - "\n", - "\n", - "#print(\"RMSE: \", cb.rmse(testset))\n", - "\n", - "\n", - "#Example explanations for users:\n", - "print(cb.explain(11))\n", - "\n", - "print(cb.explain(13))\n", - "\n", - "print(cb.explain(17))\n", - "\n", - "print(cb.explain(23))\n", - "\n", - "print(cb.explain(27))\n", - "\n", - "print(cb.explain(73))\n" - ] - }, - { - "cell_type": "markdown", - "id": "ffd75b7e", - "metadata": {}, - "source": [ - "The following script test the ContentBased class" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "69d12f7d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "user: 1 item: 10 r_ui = None est = 0.72 {'was_impossible': False}\n" - ] - } - ], - "source": [ - "def test_contentbased_class(feature_method, regressor_method):\n", - " \"\"\"Test the ContentBased class.\n", - " Tries to make a prediction on the first (user,item ) tuple of the anti_test_set\n", - " \"\"\"\n", - " sp_ratings = load_ratings(surprise_format=True)\n", - " train_set = sp_ratings.build_full_trainset()\n", - " content_algo = ContentBased(feature_method, regressor_method)\n", - " content_algo.fit(train_set)\n", - " anti_test_set_first = train_set.build_anti_testset()[0]\n", - " prediction = content_algo.predict(anti_test_set_first[0], anti_test_set_first[1])\n", - " print(prediction)\n", - "\n", - "test_contentbased_class([\"title_length\", \"movie_year\",\"genre\",\"avg_rating\"], \"ridge_regression\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/recommender.py b/recommender.py index e9295588fdca77379242fa409fe3d00781dd0818..b4c16f5a814393c6228366c2d91c08de8b3a816f 100644 --- a/recommender.py +++ b/recommender.py @@ -9,13 +9,15 @@ import numpy as np import pandas as pd import matplotlib.pyplot as plt -from scipy.stats import pearsonr from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge from sklearn.metrics import mean_squared_error from sklearn.metrics import mean_absolute_error -from surprise.similarities import pearson + +from surprise.similarities import Similarity +from surprise.prediction_algorithms.knns import KNNWithMeans +from sklearn.metrics.pairwise import cosine_similarity from sklearn.neighbors import KNeighborsRegressor @@ -153,6 +155,24 @@ class UserBased(AlgoBase): similarity_matrix[j, i] = similarity self.sim = similarity_matrix + + def compute_cosine(self): + """ + Compute the similarity matrix based on user ratings using cosine similarity. + """ + n_users = self.trainset.n_users + similarity_matrix = np.zeros((n_users, n_users)) + + for i in range(n_users): + for j in range(i + 1, n_users): + support = np.sum(~np.isnan(self.ratings_matrix[i]) & ~np.isnan(self.ratings_matrix[j])) + if support >= self.min_k: + # Calculate cosine similarity + similarity = cosine_similarity([self.ratings_matrix[i]], [self.ratings_matrix[j]])[0, 0] + similarity_matrix[i, j] = similarity + similarity_matrix[j, i] = similarity + + self.sim = similarity_matrix def compute_mean_ratings(self): """ @@ -438,6 +458,16 @@ class RecommenderSystem_KNN : average_distance = 0.0 return average_distance + + def train_knn_model(self): + """ + Train the KNN model on the ratings data and evaluate its RMSE. + """ + # Compute similarity matrix using cosine similarity + sim_matrix = cosine_similarity(self.trainset._raw2inner_id_items) + self.model.sim = sim_matrix + + def evaluate_knn_rmse_for_different_k(self): """ @@ -608,6 +638,49 @@ class OtherUserBased: inter_user_diversity_score = np.mean(similarities) return inter_user_diversity_score + +########################################################################################################################### +####################################################### CUSTOM METRICS #################################################### +########################################################################################################################### + + +class CustomKNNWithMeans(KNNWithMeans): + def __init__(self, k=40, min_k=1, sim_options={}, **kwargs): + sim_options['user_based'] = True + sim_options['name'] = 'custom' # Nom de la mesure de similarité personnalisée + super().__init__(k=k, min_k=min_k, sim_options=sim_options, **kwargs) + + def fit(self, trainset): + # Ici, vous devez implémenter votre propre calcul de similarité personnalisée + # Par exemple, vous pouvez utiliser une fonction définie par l'utilisateur pour calculer la similarité + self.sim = Similarity() # Remplacez Similarity par votre propre calcul de similarité + super().fit(trainset) + + + +class CustomUserBased(UserBased): + def __init__(self, k=20, min_k=20, sim_options={}, **kwargs): + sim_options['name'] = 'custom' # Nom de la mesure de similarité personnalisée + super().__init__(k=k, min_k=min_k, sim_options=sim_options, **kwargs) + + def compute_similarity_matrix(self): + """ + Calcule la matrice de similarité basée sur les évaluations des utilisateurs. + """ + n_users = self.trainset.n_users + similarity_matrix = np.eye(n_users) + + for i in range(n_users): + for j in range(i + 1, n_users): + support = np.sum(~np.isnan(self.ratings_matrix[i]) & ~np.isnan(self.ratings_matrix[j])) + if support >= self.min_k: + intersection = np.sum(~np.isnan(self.ratings_matrix[i]) & ~np.isnan(self.ratings_matrix[j])) + union = np.sum(~np.isnan(self.ratings_matrix[i]) | ~np.isnan(self.ratings_matrix[j])) + similarity = intersection / union + similarity_matrix[i, j] = similarity + similarity_matrix[j, i] = similarity + + self.sim = similarity_matrix ########################################################################################################################### ####################################################### COMPARISON MODEL ################################################## ########################################################################################################################### @@ -651,6 +724,7 @@ def compare_models(): #compare_models() + def compare_similarity_measures(trainset,testset): """ Compare the similarity measures MAE and RMSE with Jaccard and MSD for KNN and UserBased models. @@ -683,15 +757,6 @@ def compare_similarity_measures(trainset,testset): results['KNN_Pearson_RMSE'] = rmse_pearson results['KNN_Pearson_MAE'] = mae_pearson - # Train and evaluate KNN model with Jaccard similarity - sim_options_jaccard = {'name': '','user_based': True} - user_based_jaccard = KNNWithMeans(sim_options=sim_options_jaccard) - user_based_jaccard.fit(trainset) - predictions_jaccard = user_based_jaccard.test(testset) - rmse_jaccard = accuracy.rmse(predictions_jaccard) - mae_jaccard = accuracy.mae(predictions_jaccard) - results['KNN_Jaccard_RMSE'] = rmse_jaccard - results['KNN_Jaccard_MAE'] = mae_jaccard # Train and evaluate UserBased model with MSD similarity user_based_msd = UserBased(sim_options={'name': 'msd','user_based': True}) @@ -702,15 +767,6 @@ def compare_similarity_measures(trainset,testset): results['UserBased_MSD_RMSE'] = rmse_user_based_msd results['UserBased_MSD_MAE'] = mae_user_based_msd - # Train and evaluate UserBased model with Jaccard similarity - user_based_jaccard = UserBased(sim_options={'name': 'jaccard','user_based': True}) - user_based_jaccard.fit(trainset) - predictions_user_based_jaccard = user_based_jaccard.test(testset) - rmse_user_based_jaccard = accuracy.rmse(predictions_user_based_jaccard) - mae_user_based_jaccard = accuracy.mae(predictions_user_based_jaccard) - results['UserBased_Jaccard_RMSE'] = rmse_user_based_jaccard - results['UserBased_Jaccard_MAE'] = mae_user_based_jaccard - # Train and evaluate UserBased model with Pearson correlation similarity user_based_pearson = UserBased(sim_options={'name': 'pearson'}) user_based_pearson.fit(trainset) @@ -720,6 +776,7 @@ def compare_similarity_measures(trainset,testset): results['UserBased_Pearson_RMSE'] = rmse_user_based_pearson results['UserBased_Pearson_MAE'] = mae_user_based_pearson + # Train and evaluate OtherUserBased models for user_name, user_id in [('Adrien', -1), ('Audrey', -2), ('Nathanael', -3), ('Charles', -4)]: other_user_based = OtherUserBased(user_name, user_id)