# Source code for econirl.datasets.supermarket

"""Aguirregabiria (1999) supermarket pricing and inventory dataset.

This module loads the real supermarket data from Aguirregabiria (1999)
"The Dynamics of Markups and Inventories in Retailing Firms" (Review
of Economic Studies). The dataset tracks 534 products in a single
Spanish supermarket over 29 months. Each product-month observation
includes inventory levels, sales, orders, retail and wholesale prices,
and promotion indicators.

The data is preprocessed into a DDC panel with discretized states
(inventory bin x lagged promotion status) and actions (promotion
decision x order decision).

State space:
    10 states = 5 inventory quintile bins x 2 lagged promotion status.

Action space:
    4 actions: (no promo, no order), (no promo, order),
    (promo, no order), (promo, order).

Reference:
    Aguirregabiria, V. (1999). "The Dynamics of Markups and Inventories
    in Retailing Firms." Review of Economic Studies, 66(2), 275-308.
"""

from __future__ import annotations

from pathlib import Path
from typing import Union

import pandas as pd

from econirl.core.types import Panel, Trajectory
from econirl.environments.supermarket import (
    N_ACTIONS,
    N_FEATURES,
    N_STATES,
)



# [docs]
def load_supermarket(
    as_panel: bool = False,
    data_path: str | Path | None = None,
) -> Union[pd.DataFrame, Panel]:
    """Read the Aguirregabiria (1999) supermarket data from CSV.

    The bundled dataset follows 534 products over 26 usable periods
    (29 months, less the periods lost to building lagged variables).
    Every row is one product-month carrying discretized state and
    action codes ready for DDC estimation.

    Args:
        as_panel: When True, convert the table into a Panel for the
            econirl estimators instead of returning the raw DataFrame.
        data_path: Location of supermarket_data.csv. When None, the
            copy stored next to this module is read.

    Returns:
        By default, a DataFrame exactly as parsed from the CSV, with
        columns: product_id, period, state, action, next_state,
        inventory_bin, lagged_promotion, promotion, ordered, sales,
        inventory, orders, wholesale_price, retail_price, markup_pct,
        stockout.

        With as_panel=True, a Panel holding one Trajectory per
        product_id. Each trajectory's rows are ordered by period and
        its state, action and next_state columns are int32 arrays.
    """
    # Fall back to the CSV shipped alongside this module.
    csv_file = (
        Path(__file__).parent / "supermarket_data.csv"
        if data_path is None
        else data_path
    )
    frame = pd.read_csv(csv_file)

    if not as_panel:
        return frame

    # numpy is only needed for the Panel conversion, so import it lazily.
    import numpy as np

    def _int32_column(rows, column):
        # Copy one column of the product's rows into an int32 array.
        return np.array(rows[column].values, dtype=np.int32)

    def _build_trajectory(product_id, rows):
        # Order the product's observations chronologically before export.
        ordered_rows = rows.sort_values("period")
        return Trajectory(
            individual_id=int(product_id),
            states=_int32_column(ordered_rows, "state"),
            actions=_int32_column(ordered_rows, "action"),
            next_states=_int32_column(ordered_rows, "next_state"),
        )

    return Panel(
        trajectories=[
            _build_trajectory(product_id, rows)
            for product_id, rows in frame.groupby("product_id")
        ]
    )




# [docs]
def get_supermarket_info() -> dict:
    """Describe the supermarket dataset as a plain metadata dict.

    Returns:
        A newly built dict on every call. The ``n_states``,
        ``n_actions`` and ``n_features`` entries are taken from the
        constants imported from ``econirl.environments.supermarket``;
        every other entry is a fixed literal (name, description,
        source URL, license, state/action descriptions, observation,
        product and period counts, ground-truth flag, use case).
    """
    summary = (
        "Real supermarket data from Aguirregabiria (1999 REStud). "
        "534 products over 29 months in a Spanish supermarket. "
        "10 states (inventory bin x lagged promotion), "
        "4 actions (promotion x order decision)."
    )
    # Action labels carry their integer code in parentheses.
    action_labels = (
        "No promo + no order (0), No promo + order (1), "
        "Promo + no order (2), Promo + order (3)"
    )

    info = {
        "name": "Aguirregabiria (1999) Supermarket Pricing/Inventory",
        "description": summary,
        "source": "http://individual.utoronto.ca/vaguirre/data/data.html",
        "license": "Academic use",
    }
    # Model dimensions come from the environment module's constants.
    info["n_states"] = N_STATES
    info["n_actions"] = N_ACTIONS
    info["n_features"] = N_FEATURES
    info["state_description"] = "Inventory quintile bin x lagged promotion status"
    info["action_description"] = action_labels
    # Sample size figures for the bundled dataset.
    info["n_observations"] = 13884
    info["n_products"] = 534
    info["n_periods"] = 26
    info["ground_truth"] = False
    info["use_case"] = "Retail IO, pricing dynamics, inventory management, promotions"
    return info