From 11a2a6489af8fcfd9c81649dd3708b80d5731276 Mon Sep 17 00:00:00 2001
From: Frank Sauerburger <frank@sauerburger.com>
Date: Sat, 27 Aug 2022 10:05:27 +0200
Subject: [PATCH] Add first data processing step and CI

---
 .dvc/config      |  2 ++
 .gitignore       |  3 +++
 .gitlab-ci.yml   | 18 ++++++++++++++++++
 movies.py        | 29 +++++++++++++++++++++++++++++
 movies_test.py   | 28 ++++++++++++++++++++++++++++
 requirements.txt |  2 ++
 6 files changed, 82 insertions(+)
 create mode 100644 .gitlab-ci.yml
 create mode 100644 movies.py
 create mode 100644 movies_test.py
 create mode 100644 requirements.txt

diff --git a/.dvc/config b/.dvc/config
index 4c72621..43096e3 100644
--- a/.dvc/config
+++ b/.dvc/config
@@ -1,3 +1,5 @@
+[core]
+    remote = amazonas
 ['remote "amazonas"']
     url = s3://mlflow/
     endpointurl = https://s3.ds.sit-servers.net
diff --git a/.gitignore b/.gitignore
index f163a26..7bbaed8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,5 @@
 /MovieLenseSmall
 /MovieLense20M
+__pycache__/
+.ipynb_checkpoints/
+*.ipynb
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
new file mode 100644
index 0000000..24388e0
--- /dev/null
+++ b/.gitlab-ci.yml
@@ -0,0 +1,18 @@
+
+stages:
+ - test
+
+
+################################################################################
+# Unittest
+.unittest: &unittest_template
+  stage: test
+  script:
+    - pip install -r requirements.txt
+    - pytest
+
+unittest:py3.10:
+  <<: *unittest_template
+  image: python:3.10
+
+
diff --git a/movies.py b/movies.py
new file mode 100644
index 0000000..3212de3
--- /dev/null
+++ b/movies.py
@@ -0,0 +1,29 @@
+
+import collections
+import pandas as pd
+import tensorflow as tf
+
+def load_movielense(data_dir="MovieLense20M"):
+    """Load and return the (ratings, movies) dataframes read from CSVs in data_dir."""
+    ratings_filename = f"{data_dir}/rating.csv"
+    ratings_df = pd.read_csv(ratings_filename)
+
+    movies_filename = f"{data_dir}/movie.csv"
+    movies_df = pd.read_csv(movies_filename)
+
+    return ratings_df, movies_df
+
+def collect_user_context(ratings, min_rating=2.1):
+    """Map each user id to the movie ids rated >= min_rating, in timestamp order."""
+    user_movies = collections.defaultdict(list)  # user id -> list of movie ids
+    ordered = ratings.sort_values(by=['userId', 'timestamp'])
+    # Pick columns by name: `.values` needed this exact column order and could upcast ids.
+    picked = ordered[['userId', 'movieId', 'rating']].itertuples(index=False, name=None)
+    for user_id, movie_id, rating in picked:
+        if rating >= min_rating:
+            user_movies[user_id].append(movie_id)
+
+    return user_movies
+
+
+
diff --git a/movies_test.py b/movies_test.py
new file mode 100644
index 0000000..a37ae83
--- /dev/null
+++ b/movies_test.py
@@ -0,0 +1,28 @@
+
+import pandas as pd
+import unittest
+import movies
+
+class LoadTests(unittest.TestCase):
+    """Tests for the helpers that load and prepare the ratings data."""
+
+    @staticmethod
+    def toy_ratings():
+        """Build a five-row ratings frame for two users with unsorted timestamps."""
+        column_names = ["userId", "movieId", "rating", "timestamp"]
+        rows = [
+            (1, 1, 3, 1),
+            (1, 2, 1, 2),
+            (2, 1, 4, 7),
+            (2, 3, 3, 5),
+            (1, 4, 5, 4),
+        ]
+        return pd.DataFrame(rows, columns=column_names)
+
+    def test_collect_user_context(self):
+        """Low ratings are dropped and each user's movies follow timestamp order."""
+        per_user = movies.collect_user_context(self.toy_ratings())
+        self.assertEqual(per_user[1], [1, 4])
+        self.assertEqual(per_user[2], [3, 1])
+        self.assertEqual(set(per_user.keys()), {1, 2})
+
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..7c4c279
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+tensorflow-recommenders
+tensorflow-datasets
-- 
GitLab