From 966fed1b06b73363702902ae5afbad309262bef4 Mon Sep 17 00:00:00 2001
From: Frank Sauerburger <frank@sauerburger.com>
Date: Sat, 27 Aug 2022 14:52:39 +0200
Subject: [PATCH] Adjust for "Small" dataset

---
 movies.py        | 69 ++++++++++++++++++++++++++++++++++++++++--------
 movies_test.py   |  8 +++---
 requirements.txt |  1 +
 3 files changed, 63 insertions(+), 15 deletions(-)

diff --git a/movies.py b/movies.py
index 3574b99..1d68dd3 100644
--- a/movies.py
+++ b/movies.py
@@ -9,16 +9,26 @@ import json
 import os
 import random
 import re
+import numpy as np
 import pandas as pd
 import tensorflow as tf
 
-def load_movielense():
+def load_movielense(basedir):
-    """Load an return movies and ratings as dataframes"""
+    """Load and return ratings and movies as dataframes"""
-    ratings_filename = "data/MovieLense20M/rating.csv"
-    ratings_df = pd.read_csv(ratings_filename)
-
-    movies_filename = "data/MovieLense20M/movie.csv"
-    movies_df = pd.read_csv(movies_filename)
+    ratings_filename = os.path.join(basedir, "ratings.csv")
+    ratings_df = pd.read_csv(ratings_filename, dtype={
+        "userId": int,
+        "movieId": int,
+        "rating": float,
+        "timestamp": int,
+    })
+
+    movies_filename = os.path.join(basedir, "movies.csv")
+    movies_df = pd.read_csv(movies_filename, dtype={
+        "movieId": int,
+        "title": str,
+        "genres": str,
+    })
 
     return ratings_df, movies_df
 
@@ -29,14 +39,18 @@ def collect_user_histories(ratings, min_rating=2.1):
 
     ratings = ratings.sort_values(by=['userId', 'timestamp'])
 
-    for user_id, movie_id, rating, _ in ratings.values:
+    for row in ratings.itertuples():
+        user_id = row.userId
+        movie_id = row.movieId
+        rating = row.rating
+
         if rating >= min_rating:
             user_histories[user_id].append(movie_id)
 
     return user_histories
 
 
-def create_single_user_examples(user_history, min_len=3, max_len=100):
+def create_single_user_examples(user_history, min_len=3, max_len=10):
-    """Create examples by sliding a 3-100 window over the history"""
+    """Create examples by sliding a window of min_len up to max_len (zero-padded) over the history"""
     examples = []
     for label_idx in range(min_len, len(user_history)):
@@ -45,6 +59,9 @@ def create_single_user_examples(user_history, min_len=3, max_len=100):
         if len(context_movie_ids) < min_len:
             continue
 
+        while len(context_movie_ids) < max_len:
+            context_movie_ids.append(0)
+
         movie_id = user_history[label_idx]
 
         feature = {
@@ -93,7 +110,10 @@ def parse_title_year(movie_name):
 def index_titles(movies_df):
     """Create dict index of titles"""
     titles = {}
-    for movie_id, title, _ in movies_df.values:
+    for row in movies_df.itertuples():
+        movie_id = row.movieId
+        title = row.title
+
         titles[movie_id] = parse_title_year(title)
     return titles
 
@@ -113,9 +133,36 @@ def write_movie_titles(titles, filename):
         json.dump(titles, fileobj)
 
 
+def load_dataset(basedir):
+    """Load prepared dataset"""
+    train_filename = os.path.join(basedir, "train.tfrecord")
+    test_filename = os.path.join(basedir, "test.tfrecord")
+
+    train = tf.data.TFRecordDataset(train_filename)
+    test = tf.data.TFRecordDataset(test_filename)
+
+    feature_description = {
+        'context_movie_id': tf.io.FixedLenFeature([10], tf.int64, default_value=np.zeros(10, dtype=np.int64)),
+        'label_movie_id': tf.io.FixedLenFeature([1], tf.int64, default_value=0),
+    }
+
+    def _record_parser(example_proto):
+        """Decode a single serialized example from its protobuf wire format"""
+        return tf.io.parse_single_example(example_proto, feature_description)
+
+    train_ds = train.map(_record_parser)
+    test_ds = test.map(_record_parser)
+
+    titles_filename = os.path.join(basedir, "titles.json")
+    with open(titles_filename, encoding="utf-8") as fileobj:
+        titles = json.load(fileobj)
+
+    return train_ds, test_ds, titles
+
+
 def prepare_dataset(args):
     """Read raw CSV files and write TF Record"""
-    ratings_df, movies_df = load_movielense()
+    ratings_df, movies_df = load_movielense(args.input)
     if args.debug:
         ratings_df = ratings_df[:10000]
 
@@ -136,7 +183,7 @@ def prepare_dataset(args):
     movie_titles = index_titles(movies_df)
     titles_file = os.path.join(args.output, "titles.json")
     write_movie_titles(movie_titles, titles_file)
-    n_titles = len(titles_file)
+    n_titles = len(movie_titles)
     print(f"File {titles_file} with {n_titles:d} titles created.")
 
 
diff --git a/movies_test.py b/movies_test.py
index 90315e7..b860f60 100644
--- a/movies_test.py
+++ b/movies_test.py
@@ -56,15 +56,15 @@ class LoadTests(unittest.TestCase):
         examples = movies.create_single_user_examples(history)
 
         # First
-        self.assert_feature(examples[0], "context_movie_id", [1, 2, 3])
+        self.assert_feature(examples[0], "context_movie_id", [1, 2, 3, 0, 0, 0, 0, 0, 0, 0])
         self.assert_feature(examples[0], "label_movie_id", [4])
 
         # Second
-        self.assert_feature(examples[1], "context_movie_id", [1, 2, 3, 4])
+        self.assert_feature(examples[1], "context_movie_id", [1, 2, 3, 4, 0, 0, 0, 0, 0, 0])
         self.assert_feature(examples[1], "label_movie_id", [5])
 
         # Third
-        self.assert_feature(examples[2], "context_movie_id", [1, 2, 3, 4, 5])
+        self.assert_feature(examples[2], "context_movie_id", [1, 2, 3, 4, 5, 0, 0, 0, 0, 0])
         self.assert_feature(examples[2], "label_movie_id", [6])
 
         self.assertEqual(len(examples), 3)
@@ -77,7 +77,7 @@ class LoadTests(unittest.TestCase):
         examples = movies.create_single_user_examples(history, min_len=2, max_len=3)
 
         # First
-        self.assert_feature(examples[0], "context_movie_id", [1, 2])
+        self.assert_feature(examples[0], "context_movie_id", [1, 2, 0])
         self.assert_feature(examples[0], "label_movie_id", [3])
 
         # Second
diff --git a/requirements.txt b/requirements.txt
index a409205..c5864bc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+numpy~=1.23.2
 pandas~=1.4.3
 tensorflow-datasets~=4.6.0
 tensorflow-recommenders~=0.7.0
-- 
GitLab