From e00661fa8a944ff941acc385adad23ef7d579ac5 Mon Sep 17 00:00:00 2001 From: Frank Sauerburger <frank@sauerburger.com> Date: Sat, 27 Aug 2022 23:30:20 +0200 Subject: [PATCH] Implement model --- movies.py | 155 +++++++++++++++++++++++++++++++++++++++-------- requirements.txt | 1 - 2 files changed, 130 insertions(+), 26 deletions(-) diff --git a/movies.py b/movies.py index 1d68dd3..a78639e 100644 --- a/movies.py +++ b/movies.py @@ -7,11 +7,14 @@ import argparse import collections import json import os +from pprint import pprint import random import re import numpy as np import pandas as pd import tensorflow as tf +import tensorflow_recommenders as tfrs + def load_movielense(basedir): """Load an return movies and ratings as dataframes""" @@ -133,13 +136,40 @@ def write_movie_titles(titles, filename): json.dump(titles, fileobj) +def prepare_dataset(args): + """Read raw CSV files and write TF Record""" + ratings_df, movies_df = load_movielense(args.input) + if args.debug: + ratings_df = ratings_df[:10000] + + user_histories = collect_user_histories(ratings_df) + train_examples, test_examples = create_user_examples(user_histories) + + train_file = os.path.join(args.output, "train.tfrecord") + write_user_examples(train_examples, train_file) + n_train = len(train_examples) + print(f"File {train_file} with {n_train:d} records created.") + + test_file = os.path.join(args.output, "test.tfrecord") + write_user_examples(test_examples, test_file) + n_test = len(test_examples) + print(f"File {test_file} with {n_test:d} records created.") + + + movie_titles = index_titles(movies_df) + titles_file = os.path.join(args.output, "titles.json") + write_movie_titles(movie_titles, titles_file) + n_titles = len(movie_titles) + print(f"File {titles_file} with {n_titles:d} titles created.") + + def load_dataset(basedir): """Load prepared dataset""" train_filename = os.path.join(basedir, "train.tfrecord") test_filename = os.path.join(basedir, "test.tfrecord") - train = tf.data.TFRecordDataset(train_filename) - test = tf.data.TFRecordDataset(test_filename) + train_rec = tf.data.TFRecordDataset(train_filename) + test_rec = tf.data.TFRecordDataset(test_filename) feature_description = { 'context_movie_id': tf.io.FixedLenFeature([10], tf.int64, default_value=np.zeros(10)), @@ -150,45 +180,120 @@ def load_dataset(basedir): """Protobufer decode single example""" return tf.io.parse_single_example(example_proto, feature_description) - train_ds = train.map(_record_parser) - test_ds = test.map(_record_parser) + train_ds = train_rec.map(_record_parser) + test_ds = test_rec.map(_record_parser) titles_filename = os.path.join(basedir, "titles.json") with open(titles_filename, encoding="utf-8") as fileobj: titles = json.load(fileobj) - return train_ds, test_ds, titles + titles = {int(k): v for k, v in titles.items()} + movies_tensor = tf.constant([[x] for x in titles.keys()]) + movie_ds = tf.data.Dataset.from_tensor_slices(movies_tensor) -def prepare_dataset(args): - """Read raw CSV files and write TF Record""" - ratings_df, movies_df = load_movielense(args.input) - if args.debug: - ratings_df = ratings_df[:10000] + # Cache and batch + train_ds = train_ds.batch(128).cache() + test_ds = test_ds.batch(128).cache() - user_histories = collect_user_histories(ratings_df) - train_examples, test_examples = create_user_examples(user_histories) + return train_ds, test_ds, titles, movie_ds - train_file = os.path.join(args.output, "train.tfrecord") - write_user_examples(train_examples, train_file) - n_train = len(train_examples) - print(f"File {train_file} with {n_train:d} records created.") - test_file = os.path.join(args.output, "test.tfrecord") - write_user_examples(test_examples, test_file) - n_test = len(test_examples) - print(f"File {test_file} with {n_test:d} records created.") +class RecommenderModel(tfrs.Model): + """Recommender model tying query and candidate models together""" + def __init__(self, query_model, cand_model, retrieval_task): + """New instance from query and candidate models and retrieval task""" + super().__init__() + self.query_model = query_model + self.cand_model = cand_model - movie_titles = index_titles(movies_df) - titles_file = os.path.join(args.output, "titles.json") - write_movie_titles(movie_titles, titles_file) - n_titles = len(movie_titles) - print(f"File {titles_file} with {n_titles:d} titles created.") + self.retrieval_task = retrieval_task + + + def compute_loss(self, inputs, training=False): + """Compute and return the loss""" + context_movie_id = inputs["context_movie_id"] + embedded_query = self.query_model(context_movie_id) + + label_movie_id = inputs["label_movie_id"] + embedded_cand = self.cand_model(label_movie_id) + + + return self.retrieval_task(embedded_query, embedded_cand, compute_metrics=not training) + + +def build_embedding_model(movies, embedding_dimension): + """Build and return query and candidate models""" + vocab = np.concatenate(list(movies)) + query_model = tf.keras.Sequential([ + tf.keras.layers.IntegerLookup(vocabulary=vocab), + tf.keras.layers.Embedding(len(movies) + 1, embedding_dimension), + tf.keras.layers.GRU(embedding_dimension), + ]) + + cand_model = tf.keras.Sequential([ + tf.keras.layers.IntegerLookup(vocabulary=vocab, mask_token=None, input_shape=(1,)), + tf.keras.layers.Embedding(len(movies) + 1, embedding_dimension), + tf.keras.layers.Reshape((32,), input_shape=(1, 32)) + ]) + + return query_model, cand_model + + +def build_model(movies, + learning_rate=0.1, + embedding_dimension=32): + """Return the full keras/tensorflow model""" + query_model, cand_model = build_embedding_model( + movies, embedding_dimension=embedding_dimension + ) + + metrics = tfrs.metrics.FactorizedTopK( + candidates=movies.batch(128).map(cand_model) + ) + + retrieval_task= tfrs.tasks.Retrieval( + metrics=metrics + ) + + model = RecommenderModel(query_model, cand_model, retrieval_task) + model.compile(optimizer=tf.keras.optimizers.Adagrad( + learning_rate=learning_rate + )) + + return model + + +def fit_model(model, training_ds, epochs=3): + """Fit the model to the training dataset""" + model.fit(training_ds, epochs=epochs) + + +def lookup(titles, movie_ids): + """Convert list of movie ids to movie titles, years""" + return [titles[id] for id in movie_ids] + + +def eval_model(model, test_ds): + """Print the test set performance""" + result = model.evaluate(test_ds, return_dict=True) + pprint(result) + + +def train(args): + """Load the dataset, train and evaluate the model""" + train_ds, test_ds, _, movies = load_dataset(args.input) + + with tf.device("cpu:0"): + model = build_model(movies) + fit_model(model, train_ds) + eval_model(model, test_ds) commands = { "prepare": prepare_dataset, + "train": train, } def get_default_parser(): diff --git a/requirements.txt b/requirements.txt index 93f00a0..9055648 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ numpy~=1.23.2 pandas~=1.4.3 -tensorflow-datasets~=4.6.0 tensorflow-recommenders~=0.7.0 tensorflow~=2.9.1 -- GitLab