Commit fb8db879 authored by Adrien Payen

update for hackathon

parent 40620d98
@@ -13,9 +13,27 @@ class EvalConfig:
("baseline_2", ModelBaseline2, {}), ("baseline_2", ModelBaseline2, {}),
("baseline_3", ModelBaseline3, {}), ("baseline_3", ModelBaseline3, {}),
("baseline_4", ModelBaseline4, {}), ("baseline_4", ModelBaseline4, {}),
("ContentBased_sample", ContentBased, {"features_method" : "title_length", "regressor_method" : "random_sample"}), ("title_length_ContentBased_sample", ContentBased, {"features_method" : "title_length", "regressor_method" : "random_sample"}),
("ContentBased_score", ContentBased, {"features_method" : "title_length", "regressor_method" : "random_score"}), ("title_length_ContentBased_score", ContentBased, {"features_method" : "title_length", "regressor_method" : "random_score"}),
("ContentBased_Lr", ContentBased, {"features_method" : "title_length", "regressor_method" : "linear_regression"}) ("title_length_ContentBased_Lr", ContentBased, {"features_method" : "title_length", "regressor_method" : "linear_regression"}),
("movie_year_ContentBased_sample", ContentBased, {"features_method" : "movie_year", "regressor_method" : "random_sample"}),
("movie_year_ContentBased_score", ContentBased, {"features_method" : "movie_year", "regressor_method" : "random_score"}),
#("movie_year_ContentBased_Lr", ContentBased, {"features_method" : "movie_year", "regressor_method" : "linear_regression"})
("genres_ContentBased_sample", ContentBased, {"features_method" : "genres", "regressor_method" : "random_sample"}),
("genres_ContentBased_score", ContentBased, {"features_method" : "genres", "regressor_method" : "random_score"}),
#("genres_ContentBased_Lr", ContentBased, {"features_method" : "genres", "regressor_method" : "linear_regression"}),
("rating_ContentBased_sample", ContentBased, {"features_method" : "rating", "regressor_method" : "random_sample"}),
("rating_ContentBased_score", ContentBased, {"features_method" : "rating", "regressor_method" : "random_score"}),
#("rating_ContentBased_Lr", ContentBased, {"features_method" : "rating", "regressor_method" : "linear_regression"}),
("tags_ContentBased_sample", ContentBased, {"features_method" : "tags", "regressor_method" : "random_sample"}),
("tags_ContentBased_score", ContentBased, {"features_method" : "tags", "regressor_method" : "random_score"}),
#("tags_ContentBased_Lr", ContentBased, {"features_method" : "tags", "regressor_method" : "linear_regression"}),
("tags_length_ContentBased_sample", ContentBased, {"features_method" : "tags_length", "regressor_method" : "random_sample"}),
("tags_length_ContentBased_score", ContentBased, {"features_method" : "tags_length", "regressor_method" : "random_score"}),
#("tags_length_ContentBased_Lr", ContentBased, {"features_method" : "tags_length", "regressor_method" : "linear_regression"}),
("timestamp_ContentBased_sample", ContentBased, {"features_method" : "timestamp", "regressor_method" : "random_sample"}),
("timestamp_ContentBased_score", ContentBased, {"features_method" : "timestamp", "regressor_method" : "random_score"}),
#("timestamp_ContentBased_Lr", ContentBased, {"features_method" : "timestamp", "regressor_method" : "linear_regression"})
# model_name, model class, model parameters (dict) # model_name, model class, model parameters (dict)
] ]
@@ -34,3 +52,4 @@ class EvalConfig:
    # Loo parameters
    top_n_value = 10  # -- configure the number of recommendations (> 1) --
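For orientation, each tuple added above follows the `(model_name, model_class, model_parameters)` convention noted in the trailing comment; a minimal sketch of how one such entry is instantiated, mirroring the `algo = model(**arguments)` call in the evaluator notebook below (the import path for `ContentBased` is an assumption about the project layout, not taken from this commit):

``` python
# Minimal sketch (not part of the commit): instantiating one EvalConfig.models entry.
from models import ContentBased  # hypothetical module path

model_name, model_class, arguments = (
    "genres_ContentBased_sample",
    ContentBased,
    {"features_method": "genres", "regressor_method": "random_sample"},
)
algo = model_class(**arguments)  # same call the evaluator makes for every configured model
```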
%% Cell type:markdown id:a665885b tags:
# Evaluator Module
The Evaluator module creates evaluation reports.
Reports contain evaluation metrics depending on models specified in the evaluation config.
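For orientation, the report produced at the end of this notebook is a pandas DataFrame with one row per configured model and one column per metric; a minimal sketch of that shape (the values are taken from the output further down, rounded):

``` python
import pandas as pd

# Sketch of the report layout produced by create_evaluation_report below:
# one row per model in EvalConfig.models, one column per metric.
report_shape = pd.DataFrame.from_dict(
    {"baseline_1": {"mae": 1.31, "rmse": 1.67, "hit_rate": 1.0, "novelty": 4.0}},
    orient="index",
)
print(report_shape)
```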
%% Cell type:code id:6aaf9140 tags:

``` python
# reloads modules automatically before entering the execution of code
%load_ext autoreload
%autoreload 2

# imports
import numpy as np
import pandas as pd

# local imports
from configs import EvalConfig
from constants import Constant as C
from loaders import export_evaluation_report
from loaders import load_ratings

# New imports
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import LeaveOneOut
from collections import Counter
```
%% Output
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
%% Cell type:markdown id:d47c24a4 tags:
# 1. Model validation functions
Validation functions are a way to perform cross-validation on recommender system models.
%% Cell type:code id:d6d82188 tags:

``` python
# -- implement the function generate_split_predictions --
def generate_split_predictions(algo, ratings_dataset, eval_config):
    """Generate predictions on a random test set specified in eval_config"""

    # Splitting the data into train and test sets
    trainset, testset = train_test_split(ratings_dataset, test_size=eval_config.test_size)

    # Training the algorithm on the train data set
    algo.fit(trainset)

    # Predict ratings for the testset
    predictions = algo.test(testset)

    return predictions


# -- implement the function generate_loo_top_n --
def generate_loo_top_n(algo, ratings_dataset, eval_config):
    """Generate top-n recommendations for each user on a random Leave-one-out split (LOO)"""

    # Create a LeaveOneOut split
    loo = LeaveOneOut(n_splits=1)

    for trainset, testset in loo.split(ratings_dataset):
        algo.fit(trainset)  # Train the algorithm on the training set
        anti_testset = trainset.build_anti_testset()  # Build the anti test-set
        predictions = algo.test(anti_testset)  # Get predictions on the anti test-set
        top_n = {}
        for uid, iid, _, est, _ in predictions:
            if uid not in top_n:
                top_n[uid] = []
            top_n[uid].append((iid, est))
        for uid, user_ratings in top_n.items():
            user_ratings.sort(key=lambda x: x[1], reverse=True)
            top_n[uid] = user_ratings[:eval_config.top_n_value]  # Get top-N recommendations
        anti_testset_top_n = top_n
        return anti_testset_top_n, testset


def generate_full_top_n(algo, ratings_dataset, eval_config):
    """Generate top-n recommendations for each user with full training set (LOO)"""

    full_trainset = ratings_dataset.build_full_trainset()  # Build the full training set
    algo.fit(full_trainset)  # Train the algorithm on the full training set
    anti_testset = full_trainset.build_anti_testset()  # Build the anti test-set
    predictions = algo.test(anti_testset)  # Get predictions on the anti test-set
    top_n = {}
    for uid, iid, _, est, _ in predictions:
        if uid not in top_n:
            top_n[uid] = []
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:eval_config.top_n_value]  # Get top-N recommendations
    anti_testset_top_n = top_n
    return anti_testset_top_n


def precomputed_information(movie_data):
    """Returns a dictionary that precomputes relevant information for evaluating in full mode

    Dictionary keys:
    - precomputed_dict["item_to_rank"] : contains a dictionary mapping movie ids to rankings
    - (-- for your project, add other relevant information here -- )
    """

    # Initialize an empty dictionary to store item_id to rank mapping
    item_to_rank = {}

    # Calculate popularity rank for each movie
    ratings_count = movie_data.groupby('movieId').size().sort_values(ascending=False)

    # Assign ranks to movies based on their popularity
    for rank, (movie_id, _) in enumerate(ratings_count.items(), start=1):
        item_to_rank[movie_id] = rank

    # Create the precomputed dictionary
    precomputed_dict = {}
    precomputed_dict["item_to_rank"] = item_to_rank

    return precomputed_dict


def create_evaluation_report(eval_config, sp_ratings, precomputed_dict, available_metrics):
    """Create a DataFrame evaluating various models on metrics specified in an evaluation config."""

    evaluation_dict = {}
    for model_name, model, arguments in eval_config.models:
        print(f'Handling model {model_name}')
        algo = model(**arguments)
        evaluation_dict[model_name] = {}

        # Type 1 : split evaluations
        if len(eval_config.split_metrics) > 0:
            print('Training split predictions')
            predictions = generate_split_predictions(algo, sp_ratings, eval_config)
            for metric in eval_config.split_metrics:
                print(f'- computing metric {metric}')
                assert metric in available_metrics['split']
                evaluation_function, parameters = available_metrics["split"][metric]
                evaluation_dict[model_name][metric] = evaluation_function(predictions, **parameters)

        # Type 2 : loo evaluations
        if len(eval_config.loo_metrics) > 0:
            print('Training loo predictions')
            anti_testset_top_n, testset = generate_loo_top_n(algo, sp_ratings, eval_config)
            for metric in eval_config.loo_metrics:
                assert metric in available_metrics['loo']
                evaluation_function, parameters = available_metrics["loo"][metric]
                evaluation_dict[model_name][metric] = evaluation_function(anti_testset_top_n, testset, **parameters)

        # Type 3 : full evaluations
        if len(eval_config.full_metrics) > 0:
            print('Training full predictions')
            anti_testset_top_n = generate_full_top_n(algo, sp_ratings, eval_config)
            for metric in eval_config.full_metrics:
                assert metric in available_metrics['full']
                evaluation_function, parameters = available_metrics["full"][metric]
                evaluation_dict[model_name][metric] = evaluation_function(
                    anti_testset_top_n,
                    **precomputed_dict,
                    **parameters
                )

    return pd.DataFrame.from_dict(evaluation_dict).T
```
%% Cell type:markdown id:f7e83d1d tags:
# 2. Evaluation metrics
Implement evaluation metrics for either rating predictions (split metrics) or for top-n recommendations (loo metrics, full metrics).
%% Cell type:code id:f1849e55 tags:

``` python
# -- implement the function get_hit_rate --
def get_hit_rate(anti_testset_top_n, testset):
    """Compute the average hit over the users (loo metric)

    A hit (1) happens when the movie in the testset has been picked by the top-n recommender
    A fail (0) happens when the movie in the testset has not been picked by the top-n recommender
    """

    hits = 0
    total_users = len(testset)

    for uid, true_iid, _ in testset:
        if uid in anti_testset_top_n and true_iid in {iid for iid, _ in anti_testset_top_n[uid]}:
            hits += 1

    hit_rate = hits / total_users
    return hit_rate


# -- implement the function get_novelty --
def get_novelty(anti_testset_top_n, item_to_rank):
    """Compute the average novelty of the top-n recommendation over the users (full metric)

    The novelty is defined as the average ranking of the movies recommended
    """

    total_rank_sum = 0
    total_recommendations = 0

    for uid, recommendations in anti_testset_top_n.items():
        for iid, _ in recommendations:
            if iid in item_to_rank:
                total_rank_sum += item_to_rank[iid]
                total_recommendations += 1

    if total_recommendations == 0:
        return 0  # Avoid division by zero

    average_rank_sum = total_rank_sum / total_recommendations
    return average_rank_sum
```
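As a quick sanity check on the two metrics above, here is a toy example; the user ids, movie ids and popularity ranks are made up for illustration only:

``` python
# Toy check of get_hit_rate and get_novelty; ids and ranks are illustrative only.
toy_top_n = {"u1": [("m1", 4.5), ("m2", 4.0)], "u2": [("m3", 3.5)]}
toy_testset = [("u1", "m2", 4.0), ("u2", "m9", 3.0)]  # one left-out rating per user
toy_ranks = {"m1": 1, "m2": 2, "m3": 50}              # popularity ranks (1 = most rated)

print(get_hit_rate(toy_top_n, toy_testset))  # 0.5 -> only u1's left-out movie was recommended
print(get_novelty(toy_top_n, toy_ranks))     # (1 + 2 + 50) / 3 = 17.67
```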
%% Cell type:markdown id:1a9855b3 tags:
# 3. Evaluation workflow
Load data, evaluate models and save the experimental outcomes.
%% Cell type:code id:704f4d2a tags:

``` python
AVAILABLE_METRICS = {
    "split": {
        "mae": (accuracy.mae, {'verbose': False}),
        "rmse": (accuracy.rmse, {'verbose': False})
    },
    "loo": {
        "hit_rate": (get_hit_rate, {}),
    },
    "full": {
        "novelty": (get_novelty, {}),
    }
}

sp_ratings = load_ratings(surprise_format=True)
precomputed_dict = precomputed_information(pd.read_csv("data/tiny/evidence/ratings.csv"))
evaluation_report = create_evaluation_report(EvalConfig, sp_ratings, precomputed_dict, AVAILABLE_METRICS)
export_evaluation_report(evaluation_report)
```
%% Output
Handling model baseline_1
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model baseline_2
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model baseline_3
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model baseline_4
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model title_length_ContentBased_sample
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model title_length_ContentBased_score
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model title_length_ContentBased_Lr
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model movie_year_ContentBased_sample
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model movie_year_ContentBased_score
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model genres_ContentBased_sample
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model genres_ContentBased_score
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model rating_ContentBased_sample
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model rating_ContentBased_score
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model tags_ContentBased_sample
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model tags_ContentBased_score
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model tags_length_ContentBased_sample
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model tags_length_ContentBased_score
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model timestamp_ContentBased_sample
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model timestamp_ContentBased_score
Training split predictions Training split predictions
- computing metric mae - computing metric mae
- computing metric rmse - computing metric rmse
Training loo predictions Training loo predictions
Training full predictions Training full predictions
The data has been exported to the evaluation report
mae rmse hit_rate novelty
baseline_1 1.312500 1.667708 1.0 4.0
baseline_2 1.315250 1.572990 1.0 4.0
baseline_3 1.318182 1.465689 1.0 4.0
baseline_4 1.363953 1.523985 1.0 4.0
title_length_ContentBased_sample 1.375000 1.750000 1.0 4.0
title_length_ContentBased_score 1.556280 2.063469 1.0 4.0
title_length_ContentBased_Lr 1.625729 1.773594 1.0 4.0
movie_year_ContentBased_sample 2.250000 2.610077 1.0 4.0
movie_year_ContentBased_score 1.866274 2.111422 1.0 4.0
genres_ContentBased_sample 1.875000 2.271136 1.0 4.0
genres_ContentBased_score 1.463388 1.793363 1.0 4.0
rating_ContentBased_sample 1.289773 1.715759 1.0 4.0
rating_ContentBased_score 2.482206 2.795490 1.0 4.0
tags_ContentBased_sample 1.937500 2.128673 0.5 4.0
tags_ContentBased_score 1.683499 1.782805 1.0 4.0
tags_length_ContentBased_sample 1.187500 1.704773 1.0 4.0
tags_length_ContentBased_score 1.564917 1.944345 0.5 4.0
timestamp_ContentBased_sample 1.875000 2.277608 1.0 4.0
timestamp_ContentBased_score 1.265317 1.512329 1.0 4.0
%% Cell type:markdown id:6f8b6d19 tags:
Which model performs best?
(Here the different content features of the model were taken into account.)
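One way to answer from the report is to rank the models on the split metrics and read the LOO/full metrics alongside; a minimal sketch, assuming `evaluation_report` is the DataFrame returned by `create_evaluation_report` above:

``` python
# Minimal sketch: rank the evaluated models (lower mae/rmse is better,
# higher hit_rate is better, novelty depends on what the project wants to promote).
ranked = evaluation_report.sort_values(["rmse", "mae"])
print(ranked[["mae", "rmse", "hit_rate", "novelty"]].head())
```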
@@ -97,10 +97,35 @@ class ContentBased(AlgoBase):
    def create_content_features(self, features_method):
        """Content Analyzer"""
        df_items = load_items()
        df_ratings = load_ratings()
        df_tag = pd.read_csv(C.CONTENT_PATH / C.TAGS_FILENAME)
        if features_method is None:
            df_features = None
        elif features_method == "title_length":  # a naive method that creates only 1 feature based on title length
            df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')
        elif features_method == "movie_year":
            # Extract the release year from the title, e.g. "Toy Story (1995)" -> 1995
            df_features = df_items['title'].str.extract(r'\((\d{4})\)', expand=False).astype(float).to_frame('movie_year')
        elif features_method == "genres":
            # One-hot encode each genre as a separate binary column
            genres_list = df_items['genres'].str.split('|').explode().unique()
            df_features = pd.DataFrame(index=df_items.index)
            for genre in genres_list:
                df_features[genre] = df_items['genres'].str.contains(genre, regex=False).astype(int)
        elif features_method == "rating":
            # Average rating per movie
            df_features = df_ratings.groupby('movieId')['rating'].mean().to_frame('avg_rating')
        elif features_method == "tags":
            # Number of tags attached to each entry
            df_features = df_tag['tag'].apply(lambda x: len(str(x).split(','))).to_frame('n_tags')
        elif features_method == "tags_length":
            # Total character length of the tags attached to each entry
            df_features = df_tag['tag'].apply(lambda x: sum(len(tag) for tag in str(x).split(','))).to_frame('tags_length')
        elif features_method == "timestamp":
            # Encode the rating time of day cyclically (86400 seconds per day)
            df_ratings['timestamp_sin'] = np.sin(2 * np.pi * df_ratings['timestamp'] / 86400)
            df_ratings['timestamp_cos'] = np.cos(2 * np.pi * df_ratings['timestamp'] / 86400)
            df_features = df_ratings[['timestamp_sin', 'timestamp_cos']]
        else:  # (implement other feature creations here)
            raise NotImplementedError(f'Feature method {features_method} not yet implemented')
        return df_features
@@ -176,6 +201,4 @@ class ContentBased(AlgoBase):
        # (implement here the regressor prediction)
        return score
\ No newline at end of file
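The hunk above ends with a placeholder for the regressor prediction. As a hedged illustration of what the `linear_regression` regressor_method could do with these content features, here is a sketch that fits one scikit-learn `LinearRegression` per user; the helper name and the data layout are illustrative assumptions, not the repository's actual code:

``` python
import numpy as np
from sklearn.linear_model import LinearRegression

# Hedged sketch only: per-user linear regression on content features.
# fit_user_profiles and its argument layout are illustrative assumptions.
def fit_user_profiles(ratings, content_features):
    """Fit one LinearRegression per user on the features of the movies they rated.

    ratings: iterable of (user_id, item_id, rating) tuples
    content_features: DataFrame indexed by item id (as returned by create_content_features)
    """
    by_user = {}
    for uid, iid, rating in ratings:
        by_user.setdefault(uid, []).append((iid, rating))

    profiles = {}
    for uid, pairs in by_user.items():
        known = [(iid, r) for iid, r in pairs if iid in content_features.index]
        if len(known) < 2:  # not enough points to fit a regression line
            continue
        X = content_features.loc[[iid for iid, _ in known]].to_numpy(dtype=float)
        y = np.array([r for _, r in known], dtype=float)
        profiles[uid] = LinearRegression().fit(X, y)
    return profiles
```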