% Bibliography record for the paper on a tractable online learning algorithm
% for the multinomial logit (MNL) contextual bandit; the abstract is kept
% verbatim in the non-standard abstractNote field, which styles ignore.
% NOTE(review): author lists surnames only, and no journal, eprint or doi
% field is present -- TODO confirm given names and venue from the publication
% and add them; they are deliberately not guessed here.
@article{agrawal_avadhanula_tulabandhula_2021,
  author       = {Agrawal and Avadhanula and Tulabandhula},
  title        = {A Tractable Online Learning Algorithm for the Multinomial Logit Contextual Bandit},
  year         = {2021},
  month        = mar,
  abstractNote = {In this paper, we consider the contextual variant of the MNL-Bandit problem.
More specifically, we consider a dynamic set optimization problem, where in
every round a decision maker offers a subset (assortment) of products to a
consumer, and observes their response. Consumers purchase products so as to
maximize their utility. We assume that the products are described by a set of
attributes and the mean utility of a product is linear in the values of these
attributes. We model consumer choice behavior by means of the widely used
Multinomial Logit (MNL) model, and consider the decision maker's problem of
dynamically learning the model parameters, while optimizing cumulative revenue
over the selling horizon $T$. Though this problem has attracted considerable
attention in recent times, many existing methods often involve solving an
intractable non-convex optimization problem and their theoretical performance
guarantees depend on a problem dependent parameter which could be prohibitively
large. In particular, existing algorithms for this problem have regret bounded
by $O(\sqrt{\kappa d T})$, where $\kappa$ is a problem dependent constant that
can have exponential dependency on the number of attributes. In this paper, we
propose an optimistic algorithm and show that the regret is bounded by
$O(\sqrt{dT} + \kappa)$, significantly improving the performance over existing
methods. Further, we propose a convex relaxation of the optimization step which
allows for tractable decision-making while retaining the favourable regret
guarantee.},
}