"""Loading and first processing step for the MovieLense ratings data."""

import collections

import pandas as pd

# NOTE(review): the original movies.py in this patch also has
# "import tensorflow as tf"; none of the functions below use it, so it is
# not repeated here. Re-add it once code in this module needs it.


def load_movielense(data_dir="MovieLense20M"):
    """Load and return the ratings and movies tables as dataframes.

    Args:
        data_dir: Directory that contains ``rating.csv`` and ``movie.csv``.
            The default is the ``MovieLense20M`` directory the function
            read from before the parameter existed.

    Returns:
        A ``(ratings_df, movies_df)`` tuple of pandas dataframes.
    """
    ratings_df = pd.read_csv(f"{data_dir}/rating.csv")
    movies_df = pd.read_csv(f"{data_dir}/movie.csv")

    return ratings_df, movies_df


def collect_user_context(ratings, min_rating=2.1):
    """Create a per-user list of rated movies, ordered by timestamp.

    Args:
        ratings: Dataframe with the columns ``userId``, ``movieId``,
            ``rating`` and ``timestamp``. The columns are looked up by
            name, so their order does not matter and additional columns
            are ignored.
        min_rating: Rows whose rating is below this value are skipped.

    Returns:
        A ``collections.defaultdict(list)`` mapping each user id to the
        ids of the movies that user rated with at least ``min_rating``,
        in ascending timestamp order. Users without a qualifying rating
        do not appear as keys.
    """
    user_movies = collections.defaultdict(list)  # user id -> movie ids

    # Drop low ratings first so that only the rows that are kept get sorted.
    kept = ratings[ratings["rating"] >= min_rating]
    kept = kept.sort_values(by=["userId", "timestamp"])

    # Read the id columns individually: converting the whole frame to one
    # array would upcast integer ids to floats as soon as the rating column
    # holds floats, and positional unpacking would depend on column order.
    user_ids = kept["userId"].tolist()
    movie_ids = kept["movieId"].tolist()
    for user_id, movie_id in zip(user_ids, movie_ids):
        user_movies[user_id].append(movie_id)

    return user_movies