# Source code for econirl.datasets.citibike_route

"""Citibike route choice dataset for destination IRL.

This module loads preprocessed Citibike trip data for route choice
estimation. Riders choose a destination station cluster given their
origin cluster and time of day. The data must first be downloaded
and preprocessed using scripts/download_citibike.py.

If the processed data is not available, the loader falls back to
generating synthetic route choice trajectories from the
CitibikeRouteEnvironment with default parameters.

State space:
    n_clusters x n_time_buckets discrete states.
    Default: 20 station clusters x 4 time buckets = 80 states.

Action space:
    n_clusters destination choices.
    Default: 20 destination clusters.

Reference:
    Citibike System Data (NYC): https://citibikenyc.com/system-data
"""

from __future__ import annotations

from pathlib import Path
from typing import Union

import numpy as np
import pandas as pd

from econirl.core.types import Panel, Trajectory
from econirl.environments.citibike_route import (
    N_ACTIONS,
    N_FEATURES,
    N_STATES,
)


# Default location of the preprocessed CSV: four directory levels above this
# module file, then data/processed/citibike_route.csv.
# NOTE(review): presumably the four hops land on the repository root - confirm
# against the project layout.
DEFAULT_DATA_PATH = (
    Path(__file__)
    .resolve()
    .parent.parent.parent.parent.joinpath("data", "processed", "citibike_route.csv")
)


def load_citibike_route(
    as_panel: bool = False,
    data_path: str | Path | None = None,
    n_individuals: int = 1000,
    n_periods: int = 50,
    seed: int = 42,
) -> Union[pd.DataFrame, Panel]:
    """Load Citibike route choice data, or synthesize it when unavailable.

    When the CSV produced by scripts/download_citibike.py is present it is
    read as-is. If the file has no ``next_state`` column, one is derived by
    ordering rows on ``trip_idx`` and taking the following row's ``state``;
    rows without a successor are dropped. When the CSV is missing, a notice
    is printed and synthetic trajectories are generated instead.

    Args:
        as_panel: If True, return a Panel object for econirl estimators
            instead of a DataFrame.
        data_path: Path to citibike_route.csv. If None, the default
            location (DEFAULT_DATA_PATH) is used.
        n_individuals: Number of riders for the synthetic fallback.
        n_periods: Number of trips per rider for the synthetic fallback.
        seed: Random seed for the synthetic fallback.

    Returns:
        DataFrame with route choice data, or a Panel when ``as_panel`` is
        True.
    """
    data_path = DEFAULT_DATA_PATH if data_path is None else Path(data_path)

    # Guard clause: no file on disk means we go straight to the synthetic path.
    if not data_path.exists():
        print(
            f"Citibike data not found at {data_path}. "
            "Generating synthetic data from CitibikeRouteEnvironment. "
            "Run 'python scripts/download_citibike.py' to download real data."
        )
        return _generate_synthetic(as_panel, n_individuals, n_periods, seed)

    trips = pd.read_csv(data_path)

    has_next_state = "next_state" in trips.columns
    if not has_next_state:
        # Derive each trip's successor state from the trip that follows it.
        trips = trips.sort_values(["trip_idx"])
        trips["next_state"] = trips["state"].shift(-1)
        # shift() leaves NaN where there is no following row; drop those
        # rows before casting back to integers.
        trips = trips.dropna(subset=["next_state"])
        trips["next_state"] = trips["next_state"].astype(int)

    return _dataframe_to_panel(trips) if as_panel else trips
def _generate_synthetic(
    as_panel: bool, n_individuals: int, n_periods: int, seed: int
) -> Union[pd.DataFrame, Panel]:
    """Generate synthetic route choice data as fallback.

    Builds a CitibikeRouteEnvironment seeded with ``seed`` and passes it to
    ``simulate_panel`` together with the requested panel dimensions.

    Args:
        as_panel: If True, return the simulated Panel directly.
        n_individuals: Forwarded to ``simulate_panel``.
        n_periods: Forwarded to ``simulate_panel``.
        seed: Seed used for both the environment and the simulation.

    Returns:
        The simulated Panel when ``as_panel`` is True; otherwise a DataFrame
        with one row per simulated step and the columns ``trip_idx``,
        ``state``, ``action``, ``next_state``, ``origin_cluster``,
        ``dest_cluster`` and ``time_bucket``.
    """
    # Imported lazily so the simulation stack is only loaded when the
    # fallback path is actually taken.
    from econirl.environments.citibike_route import (
        CitibikeRouteEnvironment,
        state_to_components,
    )
    from econirl.simulation.synthetic import simulate_panel

    env = CitibikeRouteEnvironment(seed=seed)
    panel = simulate_panel(
        env, n_individuals=n_individuals, n_periods=n_periods, seed=seed
    )

    if as_panel:
        return panel

    records = []
    for traj in panel.trajectories:
        for t in range(len(traj.states)):
            s = int(traj.states[t])
            a = int(traj.actions[t])
            ns = int(traj.next_states[t])
            oc, tb = state_to_components(s)
            records.append({
                # Running row counter across all trajectories.
                "trip_idx": len(records),
                "state": s,
                "action": a,
                "next_state": ns,
                "origin_cluster": oc,
                # The action is recorded as the destination cluster.
                "dest_cluster": a,
                "time_bucket": tb,
            })

    return pd.DataFrame(records)


def _dataframe_to_panel(df: pd.DataFrame, chunk_size: int = 50) -> Panel:
    """Convert route choice DataFrame to Panel by chunking trips.

    Consecutive rows (in the DataFrame's current order) are grouped into
    pseudo-individuals of ``chunk_size`` trips each. Trailing rows that do
    not fill a complete chunk are dropped, so a DataFrame shorter than
    ``chunk_size`` yields a Panel with no trajectories.

    Args:
        df: DataFrame with ``state``, ``action`` and ``next_state`` columns.
        chunk_size: Number of trips per pseudo-individual. Defaults to 50.

    Returns:
        Panel holding one Trajectory per complete chunk; ``individual_id``
        is the zero-based chunk index.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    if chunk_size < 1:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size}"
        )

    # Group sequential trips into pseudo-individuals
    trajectories = []
    n_chunks = len(df) // chunk_size

    for i in range(n_chunks):
        chunk = df.iloc[i * chunk_size : (i + 1) * chunk_size]
        trajectories.append(
            Trajectory(
                individual_id=i,
                states=np.array(chunk["state"].values, dtype=np.int32),
                actions=np.array(chunk["action"].values, dtype=np.int32),
                next_states=np.array(chunk["next_state"].values, dtype=np.int32),
            )
        )

    return Panel(trajectories=trajectories)
def get_citibike_route_info() -> dict:
    """Describe the Citibike route choice dataset.

    Returns:
        Dictionary of dataset metadata: name, description, source URL,
        license, the state/action/feature counts taken from the
        citibike_route environment module, textual descriptions of the
        state and action spaces, a ground-truth flag, and intended use.
    """
    summary = (
        "NYC Citibike station-to-station destination choice. "
        "80 states (20 station clusters x 4 time buckets), "
        "20 actions (destination clusters). Real data requires "
        "running scripts/download_citibike.py; falls back to "
        "synthetic generation."
    )

    provenance = {
        "name": "Citibike Route Choice",
        "description": summary,
        "source": "https://citibikenyc.com/system-data",
        "license": "Non-commercial research",
    }
    # Sizes come from the environment module's constants.
    dimensions = {
        "n_states": N_STATES,
        "n_actions": N_ACTIONS,
        "n_features": N_FEATURES,
    }
    semantics = {
        "state_description": "Origin station cluster x time-of-day bucket",
        "action_description": "Destination station cluster (0-19)",
        "ground_truth": False,
        "use_case": "Route choice IRL, urban mobility, transportation planning",
    }

    # Merging in this order keeps the key order of the returned dict stable.
    return {**provenance, **dimensions, **semantics}