Commit d24ee4ed by Corentin Vande Kerckhove

initial commit
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# MacOS
.DS_Store
# Images and data
*.pdf
images
data
*.zip
posters
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"
[packages]
pandas = "*"
ipykernel = "*"
jupyter = "*"
matplotlib = "*"
scipy = "*"
seaborn = "*"
scikit-surprise = "*"
python-dotenv = "*"
scikit-learn = "*"
streamlit = "*"
[dev-packages]
[requires]
python_version = "3.9"
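# Note (assumption, not part of the original Pipfile): the environment above can be
# installed with `pipenv install` and the notebooks started with `pipenv run jupyter notebook`.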
# Group XX - Movie Recommender System
Welcome to the README file :)
Write here a few introduction words for your project.
If you want inspiration on how to write an awesome README, check out this GitHub repo: https://github.com/navendu-pottekkat/awesome-readme. But don't spend too much time on it. This is not the topic of this course.
If you need help with the Markdown syntax, you might find some help here: https://www.markdownguide.org/basic-syntax/.
Good luck with your project.
May the force be with you.
%% Cell type:markdown id: tags:
# Analytics Module
The Analytics module provides descriptive statistics on content data, evidence data, and model evaluations.
%% Cell type:code id: tags:
``` python
# reload modules automatically before executing code
%load_ext autoreload
%autoreload 2
# third-party imports
import numpy as np
import pandas as pd
# -- add new imports here --
# local imports
from constants import Constant as C
from loaders import load_ratings
from loaders import load_items
```
%% Cell type:markdown id: tags:
# 1 - Content analytics
Explore and perform descriptive statistics on content data
%% Cell type:code id: tags:
``` python
# -- load the items and display the DataFrame
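# Sketch (assumption): load_items() from loaders.py returns the movies
# DataFrame indexed by movieId
df_items = load_items()
display(df_items)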
```
%% Cell type:code id: tags:
``` python
# -- display relevant information that can be extracted from the dataset
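# Sketch: a few descriptive statistics on the items
# (assumes df_items was loaded in the previous cell and that genres are
# pipe-separated, as in the MovieLens movies.csv)
print(f"Number of items: {df_items.shape[0]}")
print(f"Columns: {list(df_items.columns)}")
print("Number of movies per genre:")
print(df_items[C.GENRES_COL].str.split('|').explode().value_counts())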
```
%% Cell type:markdown id: tags:
# 2 - Evidence analytics
Explore and perform descriptive statistics on evidence data
%% Cell type:code id: tags:
``` python
# -- load the ratings and display the DataFrame
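# Sketch (assumption): load_ratings() from loaders.py returns the raw ratings DataFrame
df_ratings = load_ratings()
display(df_ratings)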
```
%% Cell type:code id: tags:
``` python
# -- display relevant information that can be extracted from the dataset
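# Sketch: basic descriptive statistics on the ratings
# (assumes df_ratings was loaded in the previous cell)
print(f"Number of ratings: {df_ratings.shape[0]}")
print(f"Number of users: {df_ratings[C.USER_ID_COL].nunique()}")
print(f"Number of rated items: {df_ratings[C.ITEM_ID_COL].nunique()}")
print(df_ratings[C.RATING_COL].describe())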
```
# local imports
from models import *
class EvalConfig:

    models = [
        ("baseline_1", ModelBaseline1, {}),  # model_name, model class, model parameters (dict)
    ]

    split_metrics = ["mae"]
    loo_metrics = []
    full_metrics = []

    # Split parameters
    test_size = None  # -- configure the test_size (from 0 to 1) --

    # Loo parameters
    top_n_value = None  # -- configure the number of recommendations (> 1) --
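    # Example (assumption, for illustration only): the other baselines defined in
    # models.py could be registered in the same way, e.g.
    #   ("baseline_2", ModelBaseline2, {}),
    #   ("baseline_3", ModelBaseline3, {}),
    #   ("baseline_4", ModelBaseline4, {}),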
# third-party imports
from pathlib import Path


class Constant:

    DATA_PATH = Path('data/test')  # -- fill in here the dataset size to use

    # Content
    CONTENT_PATH = DATA_PATH / 'content'
    # - item
    ITEMS_FILENAME = 'movies.csv'
    ITEM_ID_COL = 'movieId'
    LABEL_COL = 'title'
    GENRES_COL = 'genres'

    # Evidence
    EVIDENCE_PATH = DATA_PATH / 'evidence'
    # - ratings
    RATINGS_FILENAME = 'ratings.csv'
    USER_ID_COL = 'userId'
    RATING_COL = 'rating'
    TIMESTAMP_COL = 'timestamp'

    USER_ITEM_RATINGS = [USER_ID_COL, ITEM_ID_COL, RATING_COL]

    # Rating scale
    RATINGS_SCALE = None  # -- fill in here the ratings scale as a tuple (min_value, max_value)
# third-party imports
import pandas as pd
from surprise import Dataset, Reader  # assumption: used for the surprise format sketch below

# local imports
from constants import Constant as C


def load_ratings(surprise_format=False):
    df_ratings = pd.read_csv(C.EVIDENCE_PATH / C.RATINGS_FILENAME)
    if surprise_format:
        # Sketch (assumption): build a surprise Dataset from the ratings DataFrame;
        # requires C.RATINGS_SCALE to be filled in constants.py
        reader = Reader(rating_scale=C.RATINGS_SCALE)
        return Dataset.load_from_df(df_ratings[C.USER_ITEM_RATINGS], reader)
    else:
        return df_ratings


def load_items():
    df_items = pd.read_csv(C.CONTENT_PATH / C.ITEMS_FILENAME)
    df_items = df_items.set_index(C.ITEM_ID_COL)
    return df_items


def export_evaluation_report(df):
    """Export the report to the evaluation folder.

    The name of the report is versioned using today's date.
    """
    pass
# standard library imports
from collections import defaultdict

# third-party imports
import numpy as np
import random as rd
from surprise import AlgoBase
from surprise import KNNWithMeans
from surprise import SVD


def get_top_n(predictions, n):
    """Return the top-N recommendation for each user from a set of predictions.

    Source: inspired by https://github.com/NicolasHug/Surprise/blob/master/examples/top_n_recommendations.py
    and modified by cvandekerckh for random tie breaking

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendations to output for each user.

    Returns:
        A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """
    rd.seed(0)

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the n highest ones.
    for uid, user_ratings in top_n.items():
        rd.shuffle(user_ratings)
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


# First algorithm
class ModelBaseline1(AlgoBase):
    def __init__(self):
        AlgoBase.__init__(self)

    def estimate(self, u, i):
        return 2


# Second algorithm
class ModelBaseline2(AlgoBase):
    def __init__(self):
        AlgoBase.__init__(self)

    def fit(self, trainset):
        AlgoBase.fit(self, trainset)
        rd.seed(0)

    def estimate(self, u, i):
        return rd.uniform(self.trainset.rating_scale[0], self.trainset.rating_scale[1])


# Third algorithm
class ModelBaseline3(AlgoBase):
    def __init__(self):
        AlgoBase.__init__(self)

    def fit(self, trainset):
        AlgoBase.fit(self, trainset)
        self.the_mean = np.mean([r for (_, _, r) in self.trainset.all_ratings()])
        return self

    def estimate(self, u, i):
        return self.the_mean


# Fourth Model
class ModelBaseline4(SVD):
    def __init__(self):
        SVD.__init__(self, n_factors=100)
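

# --------------------------------------------------------------------------
# Minimal usage sketch (assumption, not part of the original template): shows
# how a baseline model and get_top_n could be exercised, assuming
# load_ratings(surprise_format=True) returns a surprise Dataset and
# RATINGS_SCALE has been filled in constants.py.
if __name__ == "__main__":
    from loaders import load_ratings

    data = load_ratings(surprise_format=True)
    trainset = data.build_full_trainset()

    model = ModelBaseline3()
    model.fit(trainset)

    # predict on user/item pairs absent from the trainset and keep the top 10 per user
    predictions = model.test(trainset.build_anti_testset())
    top_n = get_top_n(predictions, n=10)
    print(list(top_n.items())[:1])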
import os
import shutil
import zipfile as zf
import errno


def unzip_and_remove(zip_filename):
    if not os.path.exists(zip_filename):
        raise FileNotFoundError(
            errno.ENOENT,
            os.strerror(errno.ENOENT),
            zip_filename,
        )
    with zf.ZipFile(zip_filename, 'r') as files:
        files.extractall('.')
    os.remove(zip_filename)
    # remove macOS metadata, if present, for archives created on a Mac
    shutil.rmtree('__MACOSX', ignore_errors=True)


if __name__ == "__main__":
    unzip_and_remove("data.zip")