Verified Commit e00661fa authored by Frank Sauerburger

Implement model

parent c924bbd9
@@ -7,11 +7,14 @@ import argparse
import collections
import json
import os
from pprint import pprint
import random
import re
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
def load_movielense(basedir):
"""Load an return movies and ratings as dataframes"""
@@ -133,13 +136,40 @@ def write_movie_titles(titles, filename):
json.dump(titles, fileobj)
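The resulting titles.json maps each movie id to its title; JSON forces the keys to strings, which is why load_dataset() below converts them back to integers. A hypothetical excerpt:

    {"1": "Toy Story (1995)", "2": "Jumanji (1995)"}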
def prepare_dataset(args):
"""Read raw CSV files and write TF Record"""
ratings_df, movies_df = load_movielense(args.input)
if args.debug:
ratings_df = ratings_df[:10000]
user_histories = collect_user_histories(ratings_df)
train_examples, test_examples = create_user_examples(user_histories)
train_file = os.path.join(args.output, "train.tfrecord")
write_user_examples(train_examples, train_file)
n_train = len(train_examples)
print(f"File {train_file} with {n_train:d} records created.")
test_file = os.path.join(args.output, "test.tfrecord")
write_user_examples(test_examples, test_file)
n_test = len(test_examples)
print(f"File {test_file} with {n_test:d} records created.")
movie_titles = index_titles(movies_df)
titles_file = os.path.join(args.output, "titles.json")
write_movie_titles(movie_titles, titles_file)
n_titles = len(movie_titles)
print(f"File {titles_file} with {n_titles:d} titles created.")
def load_dataset(basedir):
"""Load prepared dataset"""
train_filename = os.path.join(basedir, "train.tfrecord")
test_filename = os.path.join(basedir, "test.tfrecord")
    train_rec = tf.data.TFRecordDataset(train_filename)
    test_rec = tf.data.TFRecordDataset(test_filename)
feature_description = {
'context_movie_id': tf.io.FixedLenFeature([10], tf.int64, default_value=np.zeros(10)),
@@ -150,45 +180,120 @@ def load_dataset(basedir):
"""Protobufer decode single example"""
return tf.io.parse_single_example(example_proto, feature_description)
    train_ds = train_rec.map(_record_parser)
    test_ds = test_rec.map(_record_parser)
titles_filename = os.path.join(basedir, "titles.json")
with open(titles_filename, encoding="utf-8") as fileobj:
titles = json.load(fileobj)
    titles = {int(k): v for k, v in titles.items()}

    movies_tensor = tf.constant([[x] for x in titles.keys()])
    movie_ds = tf.data.Dataset.from_tensor_slices(movies_tensor)

    # Cache and batch
    train_ds = train_ds.batch(128).cache()
    test_ds = test_ds.batch(128).cache()

    return train_ds, test_ds, titles, movie_ds
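A minimal consumption sketch, assuming the prepared files live under a placeholder directory data/prepared: load_dataset() now returns the batched-and-cached train/test datasets, the id-to-title mapping, and the candidate movie dataset used below to build the retrieval metrics:

    train_ds, test_ds, titles, movie_ds = load_dataset("data/prepared")
    for batch in train_ds.take(1):
        # Context windows are fixed to length 10 by the feature spec above.
        print(batch["context_movie_id"].shape)  # (128, 10) for a full batch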
class RecommenderModel(tfrs.Model):
    """Recommender model tying query and candidate models together"""

    def __init__(self, query_model, cand_model, retrieval_task):
        """New instance from query and candidate models and retrieval task"""
        super().__init__()
        self.query_model = query_model
        self.cand_model = cand_model
        self.retrieval_task = retrieval_task

    def compute_loss(self, inputs, training=False):
        """Compute and return the loss"""
        context_movie_id = inputs["context_movie_id"]
        embedded_query = self.query_model(context_movie_id)

        label_movie_id = inputs["label_movie_id"]
        embedded_cand = self.cand_model(label_movie_id)

        return self.retrieval_task(embedded_query, embedded_cand,
                                   compute_metrics=not training)
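tfrs.tasks.Retrieval scores every query embedding in a batch against every candidate embedding and treats the diagonal pairs as the positives, i.e. an in-batch softmax loss. A standalone sketch with random embeddings, independent of this model:

    import tensorflow as tf
    import tensorflow_recommenders as tfrs

    task = tfrs.tasks.Retrieval()           # no FactorizedTopK metrics attached
    queries = tf.random.normal((8, 32))     # 8 query embeddings
    candidates = tf.random.normal((8, 32))  # the matching positive candidates
    loss = task(queries, candidates)        # scalar in-batch softmax loss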
def build_embedding_model(movies, embedding_dimension):
    """Build and return query and candidate models"""
    vocab = np.concatenate(list(movies))

    # Query tower: embed the context movie ids and summarize them with a GRU.
    query_model = tf.keras.Sequential([
        tf.keras.layers.IntegerLookup(vocabulary=vocab),
        tf.keras.layers.Embedding(len(movies) + 1, embedding_dimension),
        tf.keras.layers.GRU(embedding_dimension),
    ])

    # Candidate tower: embed single movie ids.
    cand_model = tf.keras.Sequential([
        tf.keras.layers.IntegerLookup(vocabulary=vocab, mask_token=None,
                                      input_shape=(1,)),
        tf.keras.layers.Embedding(len(movies) + 1, embedding_dimension),
        # Flatten (batch, 1, dim) to (batch, dim); use the parameter rather
        # than a hardcoded 32 so other embedding sizes work as well.
        tf.keras.layers.Reshape((embedding_dimension,)),
    ])

    return query_model, cand_model
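A shape check makes the two towers easier to follow; assuming movie_ds as returned by load_dataset() and the default dimension of 32, the query tower summarizes a whole context window while the candidate tower embeds single ids:

    query_model, cand_model = build_embedding_model(movie_ds, 32)
    print(query_model(tf.constant([[1, 2, 3]])).shape)  # (1, 32): GRU summary of the sequence
    print(cand_model(tf.constant([[1]])).shape)         # (1, 32): one embedding per movie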
def build_model(movies,
                learning_rate=0.1,
                embedding_dimension=32):
    """Return the full keras/tensorflow model"""
    query_model, cand_model = build_embedding_model(
        movies, embedding_dimension=embedding_dimension
    )

    metrics = tfrs.metrics.FactorizedTopK(
        candidates=movies.batch(128).map(cand_model)
    )
    retrieval_task = tfrs.tasks.Retrieval(
        metrics=metrics
    )

    model = RecommenderModel(query_model, cand_model, retrieval_task)
    model.compile(optimizer=tf.keras.optimizers.Adagrad(
        learning_rate=learning_rate
    ))
    return model
def fit_model(model, training_ds, epochs=3):
    """Fit the model to the training dataset"""
    model.fit(training_ds, epochs=epochs)

def lookup(titles, movie_ids):
    """Convert a list of movie ids to movie titles (with year)"""
    return [titles[movie_id] for movie_id in movie_ids]

def eval_model(model, test_ds):
    """Print the test set performance"""
    result = model.evaluate(test_ds, return_dict=True)
    pprint(result)

def train(args):
    """Load the dataset, train and evaluate the model"""
    train_ds, test_ds, _, movies = load_dataset(args.input)

    with tf.device("cpu:0"):
        model = build_model(movies)
        fit_model(model, train_ds)
        eval_model(model, test_ds)
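The commit stops at offline evaluation. To serve actual recommendations, one would typically wrap the trained towers in a top-k index; a sketch using TFRS's brute-force layer (not part of this commit; titles and movie_ds as returned by load_dataset(), and the ten-movie watch history is made up):

    index = tfrs.layers.factorized_top_k.BruteForce(model.query_model)
    index.index_from_dataset(
        movie_ds.batch(128).map(
            lambda ids: (tf.reshape(ids, (-1,)), model.cand_model(ids))
        )
    )
    history = tf.constant([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
    _, ids = index(history)                         # top-10 candidate movie ids
    print(lookup(titles, ids[0].numpy().tolist()))  # map ids back to titles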
commands = {
    "prepare": prepare_dataset,
    "train": train,
}
def get_default_parser():
......
numpy~=1.23.2
pandas~=1.4.3
tensorflow-datasets~=4.6.0
tensorflow-recommenders~=0.7.0
tensorflow~=2.9.1