From 966fed1b06b73363702902ae5afbad309262bef4 Mon Sep 17 00:00:00 2001 From: Frank Sauerburger <frank@sauerburger.com> Date: Sat, 27 Aug 2022 14:52:39 +0200 Subject: [PATCH] Adjust for "Small" dataset --- movies.py | 69 ++++++++++++++++++++++++++++++++++++++++-------- movies_test.py | 8 +++--- requirements.txt | 1 + 3 files changed, 63 insertions(+), 15 deletions(-) diff --git a/movies.py b/movies.py index 3574b99..1d68dd3 100644 --- a/movies.py +++ b/movies.py @@ -9,16 +9,26 @@ import json import os import random import re +import numpy as np import pandas as pd import tensorflow as tf -def load_movielense(): +def load_movielense(basedir): """Load an return movies and ratings as dataframes""" - ratings_filename = "data/MovieLense20M/rating.csv" - ratings_df = pd.read_csv(ratings_filename) - - movies_filename = "data/MovieLense20M/movie.csv" - movies_df = pd.read_csv(movies_filename) + ratings_filename = os.path.join(basedir, "ratings.csv") + ratings_df = pd.read_csv(ratings_filename, dtype={ + "userId": int, + "movieId": int, + "rating": float, + "timestamp": int, + }) + + movies_filename = os.path.join(basedir, "movies.csv") + movies_df = pd.read_csv(movies_filename, dtype={ + "movieId": int, + "title": str, + "genres": str, + }) return ratings_df, movies_df @@ -29,14 +39,18 @@ def collect_user_histories(ratings, min_rating=2.1): ratings = ratings.sort_values(by=['userId', 'timestamp']) - for user_id, movie_id, rating, _ in ratings.values: + for row in ratings.itertuples(): + user_id = row.userId + movie_id = row.movieId + rating = row.rating + if rating >= min_rating: user_histories[user_id].append(movie_id) return user_histories -def create_single_user_examples(user_history, min_len=3, max_len=100): +def create_single_user_examples(user_history, min_len=3, max_len=10): """Create examples by sliding a 3-100 window over the history""" examples = [] for label_idx in range(min_len, len(user_history)): @@ -45,6 +59,9 @@ def create_single_user_examples(user_history, 
min_len=3, max_len=100): if len(context_movie_ids) < min_len: continue + while len(context_movie_ids) < max_len: + context_movie_ids.append(0) + movie_id = user_history[label_idx] feature = { @@ -93,7 +110,10 @@ def parse_title_year(movie_name): def index_titles(movies_df): """Create dict index of titles""" titles = {} - for movie_id, title, _ in movies_df.values: + for row in movies_df.itertuples(): + movie_id = row.movieId + title = row.title + titles[movie_id] = parse_title_year(title) return titles @@ -113,9 +133,36 @@ def write_movie_titles(titles, filename): json.dump(titles, fileobj) +def load_dataset(basedir): + """Load prepared dataset""" + train_filename = os.path.join(basedir, "train.tfrecord") + test_filename = os.path.join(basedir, "test.tfrecord") + + train = tf.data.TFRecordDataset(train_filename) + test = tf.data.TFRecordDataset(test_filename) + + feature_description = { + 'context_movie_id': tf.io.FixedLenFeature([10], tf.int64, default_value=np.zeros(10, dtype=np.int64)), + 'label_movie_id': tf.io.FixedLenFeature([1], tf.int64, default_value=0), + } + + def _record_parser(example_proto): + """Protobuf-decode a single example""" + return tf.io.parse_single_example(example_proto, feature_description) + + train_ds = train.map(_record_parser) + test_ds = test.map(_record_parser) + + titles_filename = os.path.join(basedir, "titles.json") + with open(titles_filename, encoding="utf-8") as fileobj: + titles = json.load(fileobj) + + return train_ds, test_ds, titles + + def prepare_dataset(args): """Read raw CSV files and write TF Record""" - ratings_df, movies_df = load_movielense() + ratings_df, movies_df = load_movielense(args.input) if args.debug: ratings_df = ratings_df[:10000] @@ -136,7 +183,7 @@ def prepare_dataset(args): movie_titles = index_titles(movies_df) titles_file = os.path.join(args.output, "titles.json") write_movie_titles(movie_titles, titles_file) - n_titles = len(titles_file) + n_titles = len(movie_titles) print(f"File {titles_file} with {n_titles:d} 
titles created.") diff --git a/movies_test.py b/movies_test.py index 90315e7..b860f60 100644 --- a/movies_test.py +++ b/movies_test.py @@ -56,15 +56,15 @@ class LoadTests(unittest.TestCase): examples = movies.create_single_user_examples(history) # First - self.assert_feature(examples[0], "context_movie_id", [1, 2, 3]) + self.assert_feature(examples[0], "context_movie_id", [1, 2, 3, 0, 0, 0, 0, 0, 0, 0]) self.assert_feature(examples[0], "label_movie_id", [4]) # Second - self.assert_feature(examples[1], "context_movie_id", [1, 2, 3, 4]) + self.assert_feature(examples[1], "context_movie_id", [1, 2, 3, 4, 0, 0, 0, 0, 0, 0]) self.assert_feature(examples[1], "label_movie_id", [5]) # Third - self.assert_feature(examples[2], "context_movie_id", [1, 2, 3, 4, 5]) + self.assert_feature(examples[2], "context_movie_id", [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]) self.assert_feature(examples[2], "label_movie_id", [6]) self.assertEqual(len(examples), 3) @@ -77,7 +77,7 @@ class LoadTests(unittest.TestCase): examples = movies.create_single_user_examples(history, min_len=2, max_len=3) # First - self.assert_feature(examples[0], "context_movie_id", [1, 2]) + self.assert_feature(examples[0], "context_movie_id", [1, 2, 0]) self.assert_feature(examples[0], "label_movie_id", [3]) # Second diff --git a/requirements.txt b/requirements.txt index a409205..c5864bc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +numpy~=1.23.0 pandas~=1.4.3 tensorflow-datasets~=4.6.0 tensorflow-recommenders~=0.7.0 -- GitLab