Skip to content
Snippets Groups Projects
Verified Commit 11a2a648 authored by Frank Sauerburger's avatar Frank Sauerburger
Browse files

Add first data processing step and CI

parent b4dfd877
No related branches found
No related tags found
No related merge requests found
Pipeline #9616 failed
[core]
remote = amazonas
['remote "amazonas"'] ['remote "amazonas"']
url = s3://mlflow/ url = s3://mlflow/
endpointurl = https://s3.ds.sit-servers.net endpointurl = https://s3.ds.sit-servers.net
/MovieLenseSmall /MovieLenseSmall
/MovieLense20M /MovieLense20M
__pycache__/
.ipynb_checkpoints/
*.ipynb
stages:
- test
################################################################################
# Unittest
.unittest: &unittest_template
stage: test
script:
- pip install -r requirements.txt
- pytest
unittest:py3.10:
<<: *unittest_template
image: python:3.10
import collections
import pandas as pd
import tensorflow as tf
def load_movielense(data_dir="MovieLense20M"):
    """Load and return the ratings and movies tables as dataframes.

    Args:
        data_dir: Directory that contains ``rating.csv`` and ``movie.csv``.
            Defaults to ``"MovieLense20M"``, resolved relative to the
            current working directory.

    Returns:
        A ``(ratings_df, movies_df)`` tuple of pandas dataframes, in that
        order.
    """
    # Local import: the module's import block does not provide pathlib.
    from pathlib import Path

    data_path = Path(data_dir)
    ratings_df = pd.read_csv(data_path / "rating.csv")
    movies_df = pd.read_csv(data_path / "movie.csv")
    return ratings_df, movies_df
def collect_user_context(ratings, min_rating=2.1):
    """Create a per-user list of rated movies, ordered by timestamp.

    Args:
        ratings: Dataframe with at least the columns ``userId``,
            ``movieId``, ``rating`` and ``timestamp``. Additional columns
            are ignored.
        min_rating: Ratings below this threshold are skipped.

    Returns:
        A ``collections.defaultdict`` mapping each user id to the list of
        movie ids that user rated with at least ``min_rating``, in
        ascending timestamp order. Users without any qualifying rating get
        no entry.
    """
    user_movies = collections.defaultdict(list)  # dict mapping ids to movies
    ordered = ratings.sort_values(by=["userId", "timestamp"])

    # Iterate the named columns instead of ``ordered.values``: ``.values``
    # upcasts the whole frame to one dtype (ids become floats as soon as the
    # rating column is float) and depends on the exact column order/count.
    rows = zip(ordered["userId"], ordered["movieId"], ordered["rating"])
    for user_id, movie_id, rating in rows:
        if rating >= min_rating:
            user_movies[user_id].append(movie_id)
    return user_movies
import pandas as pd
import unittest
import movies
class LoadTests(unittest.TestCase):
    """Unit tests for the dataset loading and preparation helpers"""

    @staticmethod
    def toy_ratings():
        """Build a small ratings dataframe covering two users"""
        column_order = ["userId", "movieId", "rating", "timestamp"]
        records = {
            "userId": [1, 1, 2, 2, 1],
            "movieId": [1, 2, 1, 3, 4],
            "rating": [3, 1, 4, 3, 5],
            "timestamp": [1, 2, 7, 5, 4],
        }
        return pd.DataFrame(data=records, columns=column_order)

    def test_collect_user_context(self):
        """Verify per-user aggregation, rating filter and timestamp order"""
        toy = self.toy_ratings()
        context = movies.collect_user_context(toy)

        # User 1 loses movie 2 (rating 1 is below the default threshold);
        # user 2's movies come back sorted by timestamp (5 before 7).
        expected = {1: [1, 4], 2: [3, 1]}
        for user_id, movie_ids in expected.items():
            self.assertEqual(context[user_id], movie_ids)
        self.assertEqual(set(context.keys()), set(expected))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment