Source code for econirl.datasets.rust_bus

"""
Rust (1987) Bus Engine Replacement Dataset.

This module provides the original data from John Rust's seminal 1987 Econometrica
paper "Optimal Replacement of GMC Bus Engines: An Empirical Model of Harold Zurcher."

The data consists of monthly observations of bus mileage and engine replacement
decisions from the Madison Metropolitan Bus Company.

Reference:
    Rust, J. (1987). "Optimal Replacement of GMC Bus Engines: An Empirical Model
    of Harold Zurcher." Econometrica, 55(5), 999-1033.
"""

from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd



[docs]
def load_rust_bus(
    group: Optional[int] = None,
    as_panel: bool = False,
    original: bool = False,
) -> pd.DataFrame:
    """
    Load the Rust (1987) bus engine replacement dataset.

    This dataset contains monthly observations of odometer readings and engine
    replacement decisions for buses from the Madison Metropolitan Bus Company.
    The data was used in Rust's pioneering work on dynamic discrete choice models.

    Args:
        group: If specified, load only buses from this group (1-8 for synthetic,
            1-4 for original). Groups differ by bus type and usage patterns.
            If None, loads all groups.
        as_panel: If True, return data structured as a Panel object
            compatible with econirl estimators. If False (default),
            return as a pandas DataFrame.
        original: If True, load original Rust (1987) data from NFXP files.
            If False (default), load synthetic data with similar characteristics.

    Returns:
        DataFrame with columns:
            - bus_id: Unique bus identifier
            - period: Month number (1-indexed)
            - mileage: Odometer reading (in thousands of miles)
            - mileage_bin: Discretized mileage state (0-89)
            - replaced: 1 if engine was replaced this period, 0 otherwise
            - group: Bus group (1-4 for original, 1-8 for synthetic)

    Example:
        >>> from econirl.datasets import load_rust_bus
        >>> df = load_rust_bus()
        >>> print(f"Observations: {len(df):,}")
        >>> print(f"Buses: {df['bus_id'].nunique()}")
        >>> print(f"Replacement rate: {df['replaced'].mean():.2%}")

        >>> # Load original data for replication
        >>> df_original = load_rust_bus(original=True)
        >>> print(f"Original data: {len(df_original):,} observations")

    Notes:
        The original Rust (1987) paper used groups 1-4:
        - Group 1: Grumman model 870 (15 buses)
        - Group 2: Chance model RT50 (4 buses)
        - Group 3: GMC model T8H203 (48 buses)
        - Group 4: GMC model A5308, 1975 (37 buses)

        For the synthetic data, groups 5-8 represent additional bus types
        with different characteristics.

        Mileage bins follow Rust's discretization: each bin represents 5,000 miles,
        so bin 0 = [0, 5000), bin 1 = [5000, 10000), etc.
    """
    if original:
        data_path = Path(__file__).parent / "rust_bus_original.csv"
        if not data_path.exists():
            raise FileNotFoundError(
                "Original Rust data not found. Run: python scripts/download_rust_data.py"
            )
        df = pd.read_csv(data_path)
        max_group = 4
    else:
        data_path = Path(__file__).parent / "rust_bus_data.csv"
        if not data_path.exists():
            # Generate the dataset if it doesn't exist
            df = _generate_rust_bus_data()
            df.to_csv(data_path, index=False)
        else:
            df = pd.read_csv(data_path)
        max_group = 8

    if group is not None:
        if group not in range(1, max_group + 1):
            raise ValueError(f"group must be between 1 and {max_group}, got {group}")
        df = df[df["group"] == group].copy()

    if as_panel:
        from econirl.core.types import Panel, Trajectory
        import jax.numpy as jnp

        # Convert to Panel format
        bus_ids = df["bus_id"].unique()
        trajectories = []

        for bus_id in bus_ids:
            bus_data = df[df["bus_id"] == bus_id].sort_values("period")
            states = jnp.array(bus_data["mileage_bin"].values, dtype=jnp.int32)
            actions = jnp.array(bus_data["replaced"].values, dtype=jnp.int32)
            # Compute next_states (shift states by 1, use 0 for last period)
            next_states = jnp.concatenate([states[1:], jnp.array([0])])

            traj = Trajectory(
                states=states,
                actions=actions,
                next_states=next_states,
                individual_id=int(bus_id),
            )
            trajectories.append(traj)

        return Panel(trajectories=trajectories)

    return df



def _generate_rust_bus_data() -> pd.DataFrame:
    """
    Generate synthetic data matching the structure of Rust (1987).

    This creates a dataset with realistic characteristics matching the original
    Rust data: similar replacement rates, mileage distributions, and group
    structures.
    """
    np.random.seed(1987)  # Reproducible

    # Parameters roughly matching Rust's estimates
    # Operating cost per 5000 miles (per bin)
    theta_c = 0.001
    # Replacement cost (in utils)
    RC = 3.0
    # Discount factor
    beta = 0.9999
    # Mileage transition probabilities (stay, +1, +2 bins)
    p_mileage = np.array([0.3919, 0.5953, 0.0128])

    records = []

    # 8 groups with different characteristics
    group_configs = [
        {"n_buses": 15, "n_periods": 120, "base_mileage_rate": 1.0},  # Group 1
        {"n_buses": 15, "n_periods": 120, "base_mileage_rate": 1.1},  # Group 2
        {"n_buses": 10, "n_periods": 100, "base_mileage_rate": 0.9},  # Group 3
        {"n_buses": 12, "n_periods": 110, "base_mileage_rate": 1.0},  # Group 4
        {"n_buses": 10, "n_periods": 90, "base_mileage_rate": 1.2},   # Group 5
        {"n_buses": 8, "n_periods": 80, "base_mileage_rate": 0.8},    # Group 6
        {"n_buses": 10, "n_periods": 100, "base_mileage_rate": 1.0},  # Group 7
        {"n_buses": 10, "n_periods": 95, "base_mileage_rate": 1.05},  # Group 8
    ]

    bus_id_counter = 1

    for group_idx, config in enumerate(group_configs):
        group = group_idx + 1

        for _ in range(config["n_buses"]):
            mileage_bin = 0
            cumulative_mileage = 0.0

            for period in range(1, config["n_periods"] + 1):
                # Compute choice probabilities using logit formula
                # V(keep) ~ -theta_c * mileage
                # V(replace) ~ -RC
                v_keep = -theta_c * mileage_bin
                v_replace = -RC

                # Logit choice probability
                prob_replace = 1 / (1 + np.exp(v_keep - v_replace))

                # Add some randomness based on group
                prob_replace *= config["base_mileage_rate"]
                prob_replace = np.clip(prob_replace, 0, 1)

                # Draw replacement decision
                replaced = int(np.random.random() < prob_replace)

                # Record observation
                records.append({
                    "bus_id": bus_id_counter,
                    "period": period,
                    "mileage": cumulative_mileage,
                    "mileage_bin": mileage_bin,
                    "replaced": replaced,
                    "group": group,
                })

                # State transition
                if replaced:
                    mileage_bin = 0
                    cumulative_mileage = 0.0
                else:
                    # Mileage increment
                    increment = np.random.choice([0, 1, 2], p=p_mileage)
                    mileage_bin = min(mileage_bin + increment, 89)
                    cumulative_mileage += increment * 5.0  # 5000 miles per bin

            bus_id_counter += 1

    return pd.DataFrame(records)


def get_rust_bus_info() -> dict:
    """
    Get metadata about the Rust bus dataset.

    Returns:
        Dictionary with dataset information including number of buses,
        observations, groups, and summary statistics.
    """
    df = load_rust_bus()

    return {
        "name": "Rust (1987) Bus Engine Replacement",
        "n_observations": len(df),
        "n_buses": df["bus_id"].nunique(),
        "n_groups": df["group"].nunique(),
        "n_periods_range": (
            df.groupby("bus_id")["period"].count().min(),
            df.groupby("bus_id")["period"].count().max(),
        ),
        "replacement_rate": df["replaced"].mean(),
        "mean_mileage_bin": df["mileage_bin"].mean(),
        "reference": "Rust, J. (1987). Econometrica, 55(5), 999-1033.",
    }