# Source code for econirl.datasets.rdw_scrappage

"""
RDW Vehicle Scrappage Dataset.

This module provides data for a vehicle scrappage decision model using
Dutch RDW (Rijksdienst voor het Wegverkeer) open data. The dataset consists
of annual observations of vehicle age and APK inspection defect severity,
paired with scrappage decisions.

When real RDW data is not available, the module generates synthetic data
with realistic scrappage patterns matching Dutch CBS statistics for
passenger vehicles.

Reference:
    RDW Open Data: https://opendata.rdw.nl
    El Boubsi (2023). MSc Thesis, Delft University of Technology.
"""

from pathlib import Path
from typing import Optional, Union

import numpy as np
import pandas as pd


def load_rdw_scrappage(
    data_dir: Optional[str] = None,
    as_panel: bool = False,
    max_vehicles: Optional[int] = None,
) -> Union[pd.DataFrame, "Panel"]:
    """
    Load the RDW vehicle scrappage dataset.

    This dataset contains annual observations of vehicle age, APK inspection
    defect severity, and scrappage decisions for Dutch passenger vehicles.
    When no data directory is given and no bundled CSV ships next to this
    module, synthetic data is generated with matching structure.

    Args:
        data_dir: Path to directory containing real RDW CSV data
            (``rdw_scrappage_data.csv``) produced by
            scripts/download_rdw.py. If None, the CSV bundled next to this
            module is used when present; otherwise synthetic data is
            generated.
        as_panel: If True, return data structured as a Panel object
            compatible with econirl estimators. If False (default),
            return as a pandas DataFrame.
        max_vehicles: If specified, limit the number of vehicles loaded
            (the first ``max_vehicles`` distinct vehicle ids in file order
            are kept). Useful for quick testing.

    Returns:
        DataFrame with columns:
            - vehicle_id: Unique vehicle identifier
            - year: Calendar year (or period index for synthetic data)
            - age_bin: Discretized vehicle age (0-24)
            - defect_level: APK defect severity (0=pass, 1=minor, 2=major)
            - scrapped: 1 if vehicle was scrapped this period, 0 otherwise
            - state: Flattened state index (age_bin * 3 + defect_level)

        If ``as_panel`` is True, the same data converted to a Panel.

    Raises:
        FileNotFoundError: If ``data_dir`` is given but does not contain
            ``rdw_scrappage_data.csv``. There is no silent fallback to
            synthetic data in that case.
        ValueError: If ``max_vehicles`` is negative.

    Example:
        >>> from econirl.datasets import load_rdw_scrappage
        >>> df = load_rdw_scrappage()
        >>> print(f"Observations: {len(df):,}")
        >>> print(f"Vehicles: {df['vehicle_id'].nunique()}")
        >>> print(f"Scrappage rate: {df['scrapped'].mean():.2%}")

        >>> # With real RDW data
        >>> df = load_rdw_scrappage(data_dir="/path/to/rdw_data/")
    """
    # A negative count would otherwise be interpreted as a slice from the end
    # ("all but the last k vehicles"), which is never what the caller meant.
    if max_vehicles is not None and max_vehicles < 0:
        raise ValueError(
            f"max_vehicles must be non-negative, got {max_vehicles}"
        )

    if data_dir is not None:
        data_path = Path(data_dir) / "rdw_scrappage_data.csv"
        if data_path.exists():
            df = pd.read_csv(data_path)
        else:
            # An explicit directory signals the caller wants real data, so a
            # missing file is an error rather than a trigger for synthesis.
            raise FileNotFoundError(
                f"RDW data not found at {data_path}. "
                "Run: python scripts/download_rdw.py"
            )
    else:
        # Check for bundled data
        bundled_path = Path(__file__).parent / "rdw_scrappage_data.csv"
        if bundled_path.exists():
            df = pd.read_csv(bundled_path)
        else:
            df = _generate_synthetic_rdw()

    # Ensure state column exists (three defect levels per age bin).
    if "state" not in df.columns:
        df["state"] = df["age_bin"] * 3 + df["defect_level"]

    if max_vehicles is not None:
        vehicle_ids = df["vehicle_id"].unique()[:max_vehicles]
        df = df[df["vehicle_id"].isin(vehicle_ids)].copy()

    if as_panel:
        return _to_panel(df)

    return df

def _generate_synthetic_rdw(
    n_vehicles: int = 2000,
    max_years: int = 20,
    num_age_bins: int = 25,
    num_defect_levels: int = 3,
) -> pd.DataFrame:
    """
    Generate synthetic data matching RDW scrappage patterns.

    Creates a panel of vehicles with realistic Dutch scrappage behavior:
    annual scrappage rates of 5-8 percent for vehicles aged 5-20 years,
    with defects roughly doubling the scrappage hazard.

    The data generating process uses the same structural model as
    RDWScrapageEnvironment with default parameters.

    Every vehicle starts at age 0 with no defect and is followed until it is
    scrapped or ``max_years`` periods have elapsed. The random generator is
    seeded with a fixed value, so repeated calls with the same arguments
    return identical data.

    Args:
        n_vehicles: Number of vehicles to simulate; ids run from 1 to
            ``n_vehicles``.
        max_years: Maximum number of periods observed per vehicle.
        num_age_bins: Number of age bins; age is capped at
            ``num_age_bins - 1``.
        num_defect_levels: Number of defect levels. Only levels 1 and 2
            carry a utility penalty in the keep value; any further levels
            affect the state index and transitions only.

    Returns:
        DataFrame with one row per vehicle-period and columns
        ``vehicle_id``, ``year``, ``age_bin``, ``defect_level``,
        ``scrapped`` and ``state``
        (``age_bin * num_defect_levels + defect_level``).
    """
    rng = np.random.default_rng(2024)

    # Structural parameters (matching RDWScrapageEnvironment defaults)
    theta_age = 0.15
    theta_minor = 0.5
    theta_major = 1.5
    RC = 3.0
    defect_sensitivity = 0.02

    columns = [
        "vehicle_id",
        "year",
        "age_bin",
        "defect_level",
        "scrapped",
        "state",
    ]
    top_defect = num_defect_levels - 1
    records = []

    for vehicle_id in range(1, n_vehicles + 1):
        age = 0
        defect = 0

        for year in range(max_years):
            # Current state
            age_bin = min(age, num_age_bins - 1)

            # Compute scrappage probability via logit
            v_keep = -theta_age * age_bin
            if defect == 1:
                v_keep -= theta_minor
            elif defect == 2:
                v_keep -= theta_major
            v_scrap = -RC

            prob_scrap = 1.0 / (1.0 + np.exp(v_keep - v_scrap))
            scrapped = int(rng.random() < prob_scrap)

            records.append({
                "vehicle_id": vehicle_id,
                "year": year,
                "age_bin": age_bin,
                "defect_level": defect,
                "scrapped": scrapped,
                "state": age_bin * num_defect_levels + defect,
            })

            if scrapped:
                break

            # Transition: age +1, defect stochastic
            age += 1

            # Defect transition (age-dependent; uses the age bin of the
            # period that just ended).
            p_stay = max(0.4, 0.85 - defect_sensitivity * age_bin)
            p_improve = 0.05 if defect > 0 else 0.0
            p_worsen = 1.0 - p_stay - p_improve

            # Build the transition row generically so that any number of
            # defect levels yields a vector of the right length. For the
            # default of three levels this reproduces the original
            # probabilities exactly.
            probs = [0.0] * num_defect_levels
            if defect > 0:
                probs[defect - 1] = p_improve
            if defect == top_defect:
                # Nowhere worse to go: worsening mass stays in place.
                probs[defect] = p_stay + p_worsen
            else:
                probs[defect] = p_stay
                if defect + 2 < num_defect_levels:
                    probs[defect + 1] = p_worsen * 0.7
                    probs[defect + 2] = p_worsen * 0.3
                else:
                    probs[defect + 1] = p_worsen

            # Normalize and sample
            probs = np.array(probs)
            probs = probs / probs.sum()
            defect = int(rng.choice(num_defect_levels, p=probs))

    # Explicit columns keep the schema intact even when no rows were produced.
    return pd.DataFrame(records, columns=columns)


def _to_panel(df: pd.DataFrame) -> "Panel":
    """
    Convert DataFrame to Panel format for estimators.

    Builds one Trajectory per distinct ``vehicle_id`` (in order of first
    appearance), with rows ordered by ``year``. ``state`` becomes the state
    sequence and ``scrapped`` the action sequence.

    Args:
        df: DataFrame with at least the columns ``vehicle_id``, ``year``,
            ``state`` and ``scrapped``.

    Returns:
        Panel holding one Trajectory per vehicle.
    """
    from econirl.core.types import Panel, Trajectory
    import jax.numpy as jnp
    from tqdm import tqdm

    # Split once instead of re-scanning the whole frame per vehicle;
    # sort=False keeps vehicles in first-appearance order.
    grouped = df.groupby("vehicle_id", sort=False)
    trajectories = []

    for vid, vdata in tqdm(
        grouped, total=grouped.ngroups, desc="Building panel", leave=False
    ):
        vdata = vdata.sort_values("year")

        states = jnp.array(vdata["state"].values, dtype=jnp.int32)
        actions = jnp.array(vdata["scrapped"].values, dtype=jnp.int32)
        # The final observation has no successor in the data; it is padded
        # with state 0 as a placeholder (int32 to match ``states``).
        next_states = jnp.concatenate(
            [states[1:], jnp.array([0], dtype=jnp.int32)]
        )

        traj = Trajectory(
            states=states,
            actions=actions,
            next_states=next_states,
            individual_id=int(vid),
        )
        trajectories.append(traj)

    return Panel(trajectories=trajectories)

def get_rdw_scrappage_info() -> dict:
    """
    Summarize the RDW scrappage dataset.

    Loads the dataset via ``load_rdw_scrappage()`` with default arguments
    and derives a handful of descriptive statistics from it.

    Returns:
        Dictionary with the dataset name, the number of observations and
        distinct vehicles, the mean of the ``scrapped``, ``age_bin`` and
        ``defect_level`` columns, and source/reference strings.
    """
    frame = load_rdw_scrappage()

    info = {"name": "RDW Vehicle Scrappage"}

    # Size of the panel.
    info["n_observations"] = len(frame)
    info["n_vehicles"] = frame["vehicle_id"].nunique()

    # Column means, reported under their public key names.
    for key, column in (
        ("scrappage_rate", "scrapped"),
        ("mean_age_bin", "age_bin"),
        ("mean_defect_level", "defect_level"),
    ):
        info[key] = frame[column].mean()

    # Provenance.
    info["source"] = "RDW Open Data (opendata.rdw.nl) / Synthetic"
    info["reference"] = "El Boubsi (2023). MSc Thesis, TU Delft."

    return info