# Source code for econirl.datasets.citibike_usage

"""Citibike daily usage frequency dataset for transportation DDC.

This module loads preprocessed Citibike member-day panel data for
usage frequency estimation. Members decide each day whether to take
a bikeshare trip. The data must first be downloaded and preprocessed
using scripts/download_citibike.py.

If the processed data is not available, the loader falls back to
generating synthetic usage panels from the CitibikeUsageEnvironment.

State space:
    n_day_types x n_usage_buckets discrete states.
    Default: 2 day types x 4 usage buckets = 8 states.

Action space:
    2 actions: no ride (0) and ride (1).

Reference:
    Citibike System Data (NYC): https://citibikenyc.com/system-data
"""

from __future__ import annotations

from pathlib import Path
from typing import Union

import numpy as np
import pandas as pd

from econirl.core.types import Panel, Trajectory
from econirl.environments.citibike_usage import (
    N_ACTIONS,
    N_FEATURES,
    N_STATES,
)


# Default location of the preprocessed CSV: four directory levels above this
# module file, then data/processed/citibike_usage.csv.
# NOTE(review): presumably this lands on the repository root in a source
# checkout; confirm the path is still meaningful for an installed package.
DEFAULT_DATA_PATH = Path(__file__).resolve().parent.parent.parent.parent / "data" / "processed" / "citibike_usage.csv"


def load_citibike_usage(
    as_panel: bool = False,
    data_path: str | Path | None = None,
    n_individuals: int = 500,
    n_periods: int = 90,
    seed: int = 42,
) -> Union[pd.DataFrame, Panel]:
    """Load the Citibike daily usage frequency data.

    The preprocessed CSV (produced by scripts/download_citibike.py) is read
    when it exists. When it does not, a notice is printed and a synthetic
    panel is generated instead.

    Args:
        as_panel: If True, return a Panel object for econirl estimators
            instead of a DataFrame.
        data_path: Path to citibike_usage.csv. If None, DEFAULT_DATA_PATH
            is used. A path that does not exist triggers the synthetic
            fallback.
        n_individuals: Number of members for the synthetic fallback.
        n_periods: Number of days per member for the synthetic fallback.
        seed: Random seed for the synthetic fallback.

    Returns:
        DataFrame with daily usage data, or a Panel when as_panel=True.
    """
    data_path = DEFAULT_DATA_PATH if data_path is None else Path(data_path)

    # Guard clause: no file on disk means we go straight to the fallback.
    if not data_path.exists():
        print(
            f"Citibike usage data not found at {data_path}. "
            "Generating synthetic data from CitibikeUsageEnvironment. "
            "Run 'python scripts/download_citibike.py' to download real data."
        )
        return _generate_synthetic(as_panel, n_individuals, n_periods, seed)

    usage = pd.read_csv(data_path)
    if not as_panel:
        return usage
    return _dataframe_to_panel(usage)
def _generate_synthetic(
    as_panel: bool, n_individuals: int, n_periods: int, seed: int
) -> Union[pd.DataFrame, Panel]:
    """Generate synthetic usage frequency data as a fallback.

    Builds a CitibikeUsageEnvironment with the given seed and simulates a
    panel from it with the same seed.

    Args:
        as_panel: If True, return the simulated Panel unchanged.
        n_individuals: Number of simulated members.
        n_periods: Number of simulated days per member.
        seed: Seed passed to both the environment and the simulator.

    Returns:
        The simulated Panel when as_panel=True; otherwise a DataFrame with
        one row per (rider, day) and the columns rider_id, day, state,
        action, next_state, day_type, usage_bucket, day_label, usage_label
        and rode.
    """
    # Function-scope imports: these names are only needed on the synthetic
    # fallback path.
    from econirl.environments.citibike_usage import (
        CitibikeUsageEnvironment,
        DAY_LABELS,
        USAGE_LABELS,
        state_to_components,
    )
    from econirl.simulation.synthetic import simulate_panel

    env = CitibikeUsageEnvironment(seed=seed)
    panel = simulate_panel(
        env, n_individuals=n_individuals, n_periods=n_periods, seed=seed
    )
    if as_panel:
        return panel

    records = []
    for traj in panel.trajectories:
        rider_id = traj.individual_id
        steps = zip(traj.states, traj.actions, traj.next_states)
        for day, (raw_state, raw_action, raw_next) in enumerate(steps):
            # Cast array scalars to plain ints so the frame holds native values.
            state = int(raw_state)
            action = int(raw_action)
            next_state = int(raw_next)
            day_type, usage_bucket = state_to_components(state)
            records.append({
                "rider_id": rider_id,
                "day": day,
                "state": state,
                "action": action,
                "next_state": next_state,
                "day_type": day_type,
                "usage_bucket": usage_bucket,
                "day_label": DAY_LABELS[day_type],
                "usage_label": USAGE_LABELS[usage_bucket],
                "rode": action == 1,
            })
    return pd.DataFrame(records)


def _individual_id_for(rider_key, position: int) -> int:
    """Map a groupby rider key to a stable integer individual id.

    Numeric keys (Python or NumPy scalars) keep their own value. Any other
    key, such as a string rider id, is given its position in the group
    order. Python salts string hashes per process, so a hash-based id would
    differ between runs and could collide after a modulo reduction.
    """
    if isinstance(rider_key, (int, float, np.integer, np.floating)):
        return int(rider_key)
    return position


def _dataframe_to_panel(df: pd.DataFrame) -> Panel:
    """Convert a usage DataFrame to a Panel with one Trajectory per rider.

    Rows are grouped by "rider_id" and ordered within each rider by "date"
    when that column exists, otherwise by "day". The "state", "action" and
    "next_state" columns become int32 arrays on each Trajectory.

    Args:
        df: Usage data with rider_id, state, action, next_state and either
            a date or a day column.

    Returns:
        Panel holding the per-rider trajectories, in groupby key order.

    Raises:
        KeyError: If a required column is missing.
    """
    rider_col = "rider_id"
    # The ordering column is the same for every group, so pick it once.
    order_col = "date" if "date" in df.columns else "day"

    required = (rider_col, order_col, "state", "action", "next_state")
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(
            f"Citibike usage data is missing required column(s): {missing}"
        )

    trajectories = []
    for position, (rid, group) in enumerate(df.groupby(rider_col)):
        ordered = group.sort_values(order_col)
        trajectories.append(
            Trajectory(
                individual_id=_individual_id_for(rid, position),
                states=np.array(ordered["state"].values, dtype=np.int32),
                actions=np.array(ordered["action"].values, dtype=np.int32),
                next_states=np.array(
                    ordered["next_state"].values, dtype=np.int32
                ),
            )
        )
    return Panel(trajectories=trajectories)
def get_citibike_usage_info() -> dict:
    """Return a metadata dictionary for the Citibike usage frequency dataset.

    The n_states, n_actions and n_features entries are taken from the
    module-level N_STATES, N_ACTIONS and N_FEATURES constants; every other
    entry is a fixed descriptive value.
    """
    summary = (
        "NYC Citibike member daily ride/no-ride decisions. "
        "8 states (day type x recent usage bucket), "
        "2 actions (ride/no ride). Real data requires "
        "running scripts/download_citibike.py; falls back to "
        "synthetic generation."
    )
    info = {
        "name": "Citibike Daily Usage Frequency",
        "description": summary,
        "source": "https://citibikenyc.com/system-data",
        "license": "Non-commercial research",
        "n_states": N_STATES,
        "n_actions": N_ACTIONS,
        "n_features": N_FEATURES,
        "state_description": "Day type (weekday/weekend) x recent usage bucket",
        "action_description": "No ride (0) / Ride (1)",
        "ground_truth": False,
        "use_case": "Transportation DDC, labor supply, habitual behavior, usage frequency",
    }
    return info