Source code for econirl.datasets.rust_bus

"""
Rust (1987) Bus Engine Replacement Dataset.

This module provides the original data from John Rust's seminal 1987 Econometrica
paper "Optimal Replacement of GMC Bus Engines: An Empirical Model of Harold Zurcher."

The data consists of monthly observations of bus mileage and engine replacement
decisions from the Madison Metropolitan Bus Company.

Reference:
    Rust, J. (1987). "Optimal Replacement of GMC Bus Engines: An Empirical Model
    of Harold Zurcher." Econometrica, 55(5), 999-1033.
"""

from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd


[docs] def load_rust_bus( group: Optional[int] = None, as_panel: bool = False, original: bool = False, ) -> pd.DataFrame: """ Load the Rust (1987) bus engine replacement dataset. This dataset contains monthly observations of odometer readings and engine replacement decisions for buses from the Madison Metropolitan Bus Company. The data was used in Rust's pioneering work on dynamic discrete choice models. Args: group: If specified, load only buses from this group (1-8 for synthetic, 1-4 for original). Groups differ by bus type and usage patterns. If None, loads all groups. as_panel: If True, return data structured as a Panel object compatible with econirl estimators. If False (default), return as a pandas DataFrame. original: If True, load original Rust (1987) data from NFXP files. If False (default), load synthetic data with similar characteristics. Returns: DataFrame with columns: - bus_id: Unique bus identifier - period: Month number (1-indexed) - mileage: Odometer reading (in thousands of miles) - mileage_bin: Discretized mileage state (0-89) - replaced: 1 if engine was replaced this period, 0 otherwise - group: Bus group (1-4 for original, 1-8 for synthetic) Example: >>> from econirl.datasets import load_rust_bus >>> df = load_rust_bus() >>> print(f"Observations: {len(df):,}") >>> print(f"Buses: {df['bus_id'].nunique()}") >>> print(f"Replacement rate: {df['replaced'].mean():.2%}") >>> # Load original data for replication >>> df_original = load_rust_bus(original=True) >>> print(f"Original data: {len(df_original):,} observations") Notes: The original Rust (1987) paper used groups 1-4: - Group 1: Grumman model 870 (15 buses) - Group 2: Chance model RT50 (4 buses) - Group 3: GMC model T8H203 (48 buses) - Group 4: GMC model A5308, 1975 (37 buses) For the synthetic data, groups 5-8 represent additional bus types with different characteristics. Mileage bins follow Rust's discretization: each bin represents 5,000 miles, so bin 0 = [0, 5000), bin 1 = [5000, 10000), etc. """ if original: data_path = Path(__file__).parent / "rust_bus_original.csv" if not data_path.exists(): raise FileNotFoundError( "Original Rust data not found. Run: python scripts/download_rust_data.py" ) df = pd.read_csv(data_path) max_group = 4 else: data_path = Path(__file__).parent / "rust_bus_data.csv" if not data_path.exists(): # Generate the dataset if it doesn't exist df = _generate_rust_bus_data() df.to_csv(data_path, index=False) else: df = pd.read_csv(data_path) max_group = 8 if group is not None: if group not in range(1, max_group + 1): raise ValueError(f"group must be between 1 and {max_group}, got {group}") df = df[df["group"] == group].copy() if as_panel: from econirl.core.types import Panel, Trajectory import jax.numpy as jnp # Convert to Panel format bus_ids = df["bus_id"].unique() trajectories = [] for bus_id in bus_ids: bus_data = df[df["bus_id"] == bus_id].sort_values("period") states = jnp.array(bus_data["mileage_bin"].values, dtype=jnp.int32) actions = jnp.array(bus_data["replaced"].values, dtype=jnp.int32) # Compute next_states (shift states by 1, use 0 for last period) next_states = jnp.concatenate([states[1:], jnp.array([0])]) traj = Trajectory( states=states, actions=actions, next_states=next_states, individual_id=int(bus_id), ) trajectories.append(traj) return Panel(trajectories=trajectories) return df
def _generate_rust_bus_data() -> pd.DataFrame: """ Generate synthetic data matching the structure of Rust (1987). This creates a dataset with realistic characteristics matching the original Rust data: similar replacement rates, mileage distributions, and group structures. """ np.random.seed(1987) # Reproducible # Parameters roughly matching Rust's estimates # Operating cost per 5000 miles (per bin) theta_c = 0.001 # Replacement cost (in utils) RC = 3.0 # Discount factor beta = 0.9999 # Mileage transition probabilities (stay, +1, +2 bins) p_mileage = np.array([0.3919, 0.5953, 0.0128]) records = [] # 8 groups with different characteristics group_configs = [ {"n_buses": 15, "n_periods": 120, "base_mileage_rate": 1.0}, # Group 1 {"n_buses": 15, "n_periods": 120, "base_mileage_rate": 1.1}, # Group 2 {"n_buses": 10, "n_periods": 100, "base_mileage_rate": 0.9}, # Group 3 {"n_buses": 12, "n_periods": 110, "base_mileage_rate": 1.0}, # Group 4 {"n_buses": 10, "n_periods": 90, "base_mileage_rate": 1.2}, # Group 5 {"n_buses": 8, "n_periods": 80, "base_mileage_rate": 0.8}, # Group 6 {"n_buses": 10, "n_periods": 100, "base_mileage_rate": 1.0}, # Group 7 {"n_buses": 10, "n_periods": 95, "base_mileage_rate": 1.05}, # Group 8 ] bus_id_counter = 1 for group_idx, config in enumerate(group_configs): group = group_idx + 1 for _ in range(config["n_buses"]): mileage_bin = 0 cumulative_mileage = 0.0 for period in range(1, config["n_periods"] + 1): # Compute choice probabilities using logit formula # V(keep) ~ -theta_c * mileage # V(replace) ~ -RC v_keep = -theta_c * mileage_bin v_replace = -RC # Logit choice probability prob_replace = 1 / (1 + np.exp(v_keep - v_replace)) # Add some randomness based on group prob_replace *= config["base_mileage_rate"] prob_replace = np.clip(prob_replace, 0, 1) # Draw replacement decision replaced = int(np.random.random() < prob_replace) # Record observation records.append({ "bus_id": bus_id_counter, "period": period, "mileage": cumulative_mileage, "mileage_bin": mileage_bin, "replaced": replaced, "group": group, }) # State transition if replaced: mileage_bin = 0 cumulative_mileage = 0.0 else: # Mileage increment increment = np.random.choice([0, 1, 2], p=p_mileage) mileage_bin = min(mileage_bin + increment, 89) cumulative_mileage += increment * 5.0 # 5000 miles per bin bus_id_counter += 1 return pd.DataFrame(records) def get_rust_bus_info() -> dict: """ Get metadata about the Rust bus dataset. Returns: Dictionary with dataset information including number of buses, observations, groups, and summary statistics. """ df = load_rust_bus() return { "name": "Rust (1987) Bus Engine Replacement", "n_observations": len(df), "n_buses": df["bus_id"].nunique(), "n_groups": df["group"].nunique(), "n_periods_range": ( df.groupby("bus_id")["period"].count().min(), df.groupby("bus_id")["period"].count().max(), ), "replacement_rate": df["replaced"].mean(), "mean_mileage_bin": df["mileage_bin"].mean(), "reference": "Rust, J. (1987). Econometrica, 55(5), 999-1033.", }