From e00661fa8a944ff941acc385adad23ef7d579ac5 Mon Sep 17 00:00:00 2001
From: Frank Sauerburger <frank@sauerburger.com>
Date: Sat, 27 Aug 2022 23:30:20 +0200
Subject: [PATCH] Implement model

---
 movies.py        | 155 +++++++++++++++++++++++++++++++++++++++--------
 requirements.txt |   1 -
 2 files changed, 130 insertions(+), 26 deletions(-)

diff --git a/movies.py b/movies.py
index 1d68dd3..a78639e 100644
--- a/movies.py
+++ b/movies.py
@@ -7,11 +7,14 @@ import argparse
 import collections
 import json
 import os
+from pprint import pprint
 import random
 import re
 import numpy as np
 import pandas as pd
 import tensorflow as tf
+import tensorflow_recommenders as tfrs
+
 
 def load_movielense(basedir):
     """Load an return movies and ratings as dataframes"""
@@ -133,13 +136,40 @@ def write_movie_titles(titles, filename):
         json.dump(titles, fileobj)
 
 
+def prepare_dataset(args):
+    """Read raw CSV files and write TF Record"""
+    ratings_df, movies_df = load_movielense(args.input)
+    if args.debug:
+        ratings_df = ratings_df[:10000]
+
+    user_histories = collect_user_histories(ratings_df)
+    train_examples, test_examples = create_user_examples(user_histories)
+
+    train_file = os.path.join(args.output, "train.tfrecord")
+    write_user_examples(train_examples, train_file)
+    n_train = len(train_examples)
+    print(f"File {train_file} with {n_train:d} records created.")
+
+    test_file = os.path.join(args.output, "test.tfrecord")
+    write_user_examples(test_examples, test_file)
+    n_test = len(test_examples)
+    print(f"File {test_file} with {n_test:d} records created.")
+
+
+    movie_titles = index_titles(movies_df)
+    titles_file = os.path.join(args.output, "titles.json")
+    write_movie_titles(movie_titles, titles_file)
+    n_titles = len(movie_titles)
+    print(f"File {titles_file} with {n_titles:d} titles created.")
+
+
 def load_dataset(basedir):
     """Load prepared dataset"""
     train_filename = os.path.join(basedir, "train.tfrecord")
     test_filename = os.path.join(basedir, "test.tfrecord")
 
-    train = tf.data.TFRecordDataset(train_filename)
-    test = tf.data.TFRecordDataset(test_filename)
+    train_rec = tf.data.TFRecordDataset(train_filename)
+    test_rec = tf.data.TFRecordDataset(test_filename)
 
     feature_description = {
         'context_movie_id': tf.io.FixedLenFeature([10], tf.int64, default_value=np.zeros(10)),
@@ -150,45 +180,120 @@ def load_dataset(basedir):
         """Protobufer decode single example"""
         return tf.io.parse_single_example(example_proto, feature_description)
 
-    train_ds = train.map(_record_parser)
-    test_ds = test.map(_record_parser)
+    train_ds = train_rec.map(_record_parser)
+    test_ds = test_rec.map(_record_parser)
 
     titles_filename = os.path.join(basedir, "titles.json")
     with open(titles_filename, encoding="utf-8") as fileobj:
         titles = json.load(fileobj)
 
-    return train_ds, test_ds, titles
+    titles = {int(k): v for k, v in titles.items()}
 
+    movies_tensor = tf.constant([[x] for x in titles.keys()])
+    movie_ds = tf.data.Dataset.from_tensor_slices(movies_tensor)
 
-def prepare_dataset(args):
-    """Read raw CSV files and write TF Record"""
-    ratings_df, movies_df = load_movielense(args.input)
-    if args.debug:
-        ratings_df = ratings_df[:10000]
+    # Cache and batch
+    train_ds = train_ds.batch(128).cache()
+    test_ds = test_ds.batch(128).cache()
 
-    user_histories = collect_user_histories(ratings_df)
-    train_examples, test_examples = create_user_examples(user_histories)
+    return train_ds, test_ds, titles, movie_ds
 
-    train_file = os.path.join(args.output, "train.tfrecord")
-    write_user_examples(train_examples, train_file)
-    n_train = len(train_examples)
-    print(f"File {train_file} with {n_train:d} records created.")
 
-    test_file = os.path.join(args.output, "test.tfrecord")
-    write_user_examples(test_examples, test_file)
-    n_test = len(test_examples)
-    print(f"File {test_file} with {n_test:d} records created.")
+class RecommenderModel(tfrs.Model):
+    """Recommender model tying query and candidate models together"""
 
+    def __init__(self,  query_model, cand_model, retrieval_task):
+        """New instance from query and candidate models and retrieval task"""
+        super().__init__()
+        self.query_model = query_model
+        self.cand_model = cand_model
 
-    movie_titles = index_titles(movies_df)
-    titles_file = os.path.join(args.output, "titles.json")
-    write_movie_titles(movie_titles, titles_file)
-    n_titles = len(movie_titles)
-    print(f"File {titles_file} with {n_titles:d} titles created.")
+        self.retrieval_task = retrieval_task
+
+
+    def compute_loss(self, inputs, training=False):
+        """Compute and return the loss"""
+        context_movie_id = inputs["context_movie_id"]
+        embedded_query = self.query_model(context_movie_id)
+
+        label_movie_id = inputs["label_movie_id"]
+        embedded_cand = self.cand_model(label_movie_id)
+
+
+        return self.retrieval_task(embedded_query, embedded_cand, compute_metrics=not training)
+
+
+def build_embedding_model(movies, embedding_dimension):
+    """Build and return query and candidate models"""
+    vocab = np.concatenate(list(movies))
+    query_model = tf.keras.Sequential([
+        tf.keras.layers.IntegerLookup(vocabulary=vocab),
+        tf.keras.layers.Embedding(len(movies) + 1, embedding_dimension),
+        tf.keras.layers.GRU(embedding_dimension),
+    ])
+
+    cand_model = tf.keras.Sequential([
+        tf.keras.layers.IntegerLookup(vocabulary=vocab, mask_token=None, input_shape=(1,)),
+        tf.keras.layers.Embedding(len(movies) + 1, embedding_dimension),
+        tf.keras.layers.Reshape((32,), input_shape=(1, 32))
+    ])
+
+    return query_model, cand_model
+
+
+def build_model(movies,
+                learning_rate=0.1,
+                embedding_dimension=32):
+    """Return the full keras/tensorflow model"""
+    query_model, cand_model = build_embedding_model(
+        movies, embedding_dimension=embedding_dimension
+    )
+
+    metrics = tfrs.metrics.FactorizedTopK(
+      candidates=movies.batch(128).map(cand_model)
+    )
+
+    retrieval_task= tfrs.tasks.Retrieval(
+      metrics=metrics
+    )
+
+    model = RecommenderModel(query_model, cand_model, retrieval_task)
+    model.compile(optimizer=tf.keras.optimizers.Adagrad(
+        learning_rate=learning_rate
+    ))
+
+    return model
+
+
+def fit_model(model, training_ds, epochs=3):
+    """Fit the model to the training dataset"""
+    model.fit(training_ds, epochs=epochs)
+
+
+def lookup(titles, movie_ids):
+    """Convert list of movie ids to movie titles, years"""
+    return [titles[id] for id in movie_ids]
+
+
+def eval_model(model, test_ds):
+    """Print the test set performance"""
+    result = model.evaluate(test_ds, return_dict=True)
+    pprint(result)
+
+
+def train(args):
+    """Load the dataset, train and evaluate the model"""
+    train_ds, test_ds, _, movies = load_dataset(args.input)
+
+    with tf.device("cpu:0"):
+        model = build_model(movies)
+        fit_model(model, train_ds)
+        eval_model(model, test_ds)
 
 
 commands = {
     "prepare": prepare_dataset,
+    "train": train,
 }
 
 def get_default_parser():
diff --git a/requirements.txt b/requirements.txt
index 93f00a0..9055648 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,4 @@
 numpy~=1.23.2
 pandas~=1.4.3
-tensorflow-datasets~=4.6.0
 tensorflow-recommenders~=0.7.0
 tensorflow~=2.9.1
-- 
GitLab