# Source code for econirl.datasets.tdrive

"""T-Drive Beijing Taxi Trajectory Dataset.

This module provides access to the T-Drive dataset from Microsoft Research,
containing GPS trajectories of 10,357 taxis in Beijing over one week.

The data is suitable for:
- Maximum Entropy IRL (learning route preferences)
- Trajectory prediction
- Urban mobility modeling

Reference:
    Yuan, J., et al. (2010). "T-Drive: Driving Directions Based on Taxi
    Trajectories." ACM SIGSPATIAL GIS.

    Ziebart, B. D., et al. (2008). "Maximum Entropy Inverse Reinforcement
    Learning." AAAI.

Data source:
    https://www.microsoft.com/en-us/research/publication/t-drive-trajectory-data-sample/
"""

from pathlib import Path
from typing import List, Optional, Union

import numpy as np
import pandas as pd



# [docs]
def load_tdrive(
    n_taxis: Optional[int] = None,
    as_trajectories: bool = False,
    discretize: bool = False,
    grid_size: int = 100,
    seed: Optional[int] = 2010,
) -> Union[pd.DataFrame, List[np.ndarray]]:
    """Load T-Drive taxi trajectory data.

    The original T-Drive dataset contains ~15 million GPS points from 10,357
    Beijing taxis. This loader reads the sample file ``tdrive_sample.csv``
    stored next to this module. If that file does not exist, a synthetic
    T-Drive-like sample is generated and, when the directory is writable,
    cached at that path for later calls. Nothing is downloaded.

    Args:
        n_taxis: Limit to first N taxis (None = all available)
        as_trajectories: If True, return list of trajectory arrays
        discretize: If True, convert GPS to discrete grid states
        grid_size: Number of grid cells per dimension if discretizing
        seed: Random seed for sample generation. Only used when the sample
            file is missing and a synthetic sample has to be generated.

    Returns:
        By default, a DataFrame with columns:
            - taxi_id: Taxi identifier
            - timestamp: GPS timestamp (parsed to datetime)
            - longitude: GPS longitude
            - latitude: GPS latitude
            - (if discretize) state: Discrete grid cell index

        If ``as_trajectories`` is True, a list with one array per taxi,
        ordered by timestamp: state indices when ``discretize`` is True,
        otherwise rows of ``[longitude, latitude]``.

    Example:
        >>> from econirl.datasets import load_tdrive
        >>> df = load_tdrive(n_taxis=100)
        >>> print(f"Points: {len(df):,}, Taxis: {df['taxi_id'].nunique()}")

        >>> # For MaxEnt IRL
        >>> trajectories = load_tdrive(as_trajectories=True, discretize=True)
        >>> print(f"Trajectories: {len(trajectories)}")
    """
    # Local import: only needed on the cache-write failure path.
    import warnings

    data_path = Path(__file__).parent / "tdrive_sample.csv"

    if data_path.exists():
        # Parse timestamps so a cached load has the same column dtype as a
        # freshly generated sample (which holds Timestamp objects).
        df = pd.read_csv(data_path, parse_dates=['timestamp'])
    else:
        df = _generate_tdrive_sample(seed=seed)
        try:
            df.to_csv(data_path, index=False)
        except OSError as err:
            # Caching is best-effort: a read-only install directory must not
            # prevent the caller from getting the generated data.
            warnings.warn(
                f"Could not cache T-Drive sample at {data_path}: {err}",
                RuntimeWarning,
                stacklevel=2,
            )

    if n_taxis is not None:
        # "First N" follows the order in which taxi ids appear in the data.
        taxi_ids = df['taxi_id'].unique()[:n_taxis]
        df = df[df['taxi_id'].isin(taxi_ids)]

    if discretize:
        df = _discretize_gps(df, grid_size)

    if as_trajectories:
        return _to_trajectories(df, discretize)

    return df



def _generate_tdrive_sample(
    n_taxis: int = 200,
    n_points_per_taxi: int = 100,
    seed: Optional[int] = 2010,
) -> pd.DataFrame:
    """Generate synthetic T-Drive-like data.

    Each taxi starts at a uniformly random point inside an approximate
    Beijing bounding box and then performs a random walk: once per simulated
    minute it either stays put or takes one axis-aligned step (north, south,
    east or west), clamped to the bounding box.

    Args:
        n_taxis: Number of taxis to simulate (ids run from 1 to n_taxis).
        n_points_per_taxi: Number of GPS points recorded per taxi.
        seed: Seed for the random generator; None draws a fresh seed.

    Returns:
        DataFrame with columns ``taxi_id``, ``timestamp``, ``longitude`` and
        ``latitude``, one row per recorded point. The columns are present
        even when no points are generated.
    """
    # A private legacy generator produces the same stream as
    # ``np.random.seed(seed)`` followed by module-level draws, but does not
    # reseed the process-wide RNG that callers may be relying on.
    rng = np.random.RandomState(seed)

    # Beijing approximate bounds
    lon_min, lon_max = 116.2, 116.6
    lat_min, lat_max = 39.7, 40.1

    columns = ['taxi_id', 'timestamp', 'longitude', 'latitude']
    moves = ['N', 'S', 'E', 'W', 'stay']
    move_probs = [0.2, 0.2, 0.2, 0.2, 0.2]

    records = []
    base_time = pd.Timestamp('2008-02-02')

    for taxi_id in range(1, n_taxis + 1):
        # Random starting point
        lon = rng.uniform(lon_min, lon_max)
        lat = rng.uniform(lat_min, lat_max)

        for t in range(n_points_per_taxi):
            # Record the current position before moving.
            records.append({
                'taxi_id': taxi_id,
                'timestamp': base_time + pd.Timedelta(minutes=t),
                'longitude': lon,
                'latitude': lat,
            })

            # Axis-aligned random walk; the step is drawn even for 'stay'
            # so the random stream does not depend on the chosen direction.
            direction = rng.choice(moves, p=move_probs)
            step = rng.uniform(0.001, 0.005)

            if direction == 'N':
                lat = min(lat + step, lat_max)
            elif direction == 'S':
                lat = max(lat - step, lat_min)
            elif direction == 'E':
                lon = min(lon + step, lon_max)
            elif direction == 'W':
                lon = max(lon - step, lon_min)

    # Passing the columns keeps the schema intact for an empty sample.
    return pd.DataFrame(records, columns=columns)


def _discretize_gps(df: pd.DataFrame, grid_size: int) -> pd.DataFrame:
    """Convert GPS coordinates to discrete grid cells.

    The bounding box of the data is split into ``grid_size`` equal-width
    bins along each axis, and every point is assigned the row-major index
    ``lat_idx * grid_size + lon_idx`` of the cell it falls in.

    Args:
        df: DataFrame with ``longitude`` and ``latitude`` columns.
        grid_size: Number of grid cells per dimension; must be at least 1.

    Returns:
        A copy of ``df`` with an added integer ``state`` column. The input
        frame is not modified.

    Raises:
        ValueError: If ``grid_size`` is smaller than 1.
    """
    if grid_size < 1:
        # Without this check a zero or negative grid would yield negative,
        # meaningless state indices instead of an error.
        raise ValueError(f"grid_size must be at least 1, got {grid_size}")

    df = df.copy()

    def _cell_index(values: pd.Series) -> np.ndarray:
        # Bin edges span the observed range of this coordinate.
        edges = np.linspace(values.min(), values.max(), grid_size + 1)
        # digitize is 1-based, and the maximum value lands one past the last
        # bin; shift to 0-based and clip it back into the final cell.
        return np.clip(np.digitize(values, edges) - 1, 0, grid_size - 1)

    lon_idx = _cell_index(df['longitude'])
    lat_idx = _cell_index(df['latitude'])

    # Encode as single state index
    df['state'] = lat_idx * grid_size + lon_idx

    return df


def _to_trajectories(df: pd.DataFrame, has_states: bool) -> List[np.ndarray]:
    """Convert DataFrame to list of trajectory arrays.

    Args:
        df: DataFrame with ``taxi_id`` and ``timestamp`` columns, plus
            ``state`` when ``has_states`` is True or ``longitude`` and
            ``latitude`` otherwise.
        has_states: If True, each trajectory is the 1-D array of ``state``
            values; if False, it is a 2-column array of
            ``[longitude, latitude]`` rows.

    Returns:
        One array per taxi, in the order taxi ids first appear in ``df``,
        each sorted by timestamp. Rows sharing a timestamp keep their
        original relative order. Rows with a missing ``taxi_id`` are skipped.
    """
    value_columns = 'state' if has_states else ['longitude', 'latitude']

    trajectories = []
    # A single groupby pass replaces one full-frame boolean scan per taxi;
    # sort=False keeps groups in first-appearance order.
    for _, taxi_data in df.groupby('taxi_id', sort=False):
        # mergesort is stable, so tied timestamps have a defined order.
        ordered = taxi_data.sort_values('timestamp', kind='mergesort')
        trajectories.append(ordered[value_columns].to_numpy())

    return trajectories



# [docs]
def get_tdrive_info() -> dict:
    """Return descriptive metadata for the T-Drive dataset.

    Returns:
        A freshly built dictionary holding the dataset's display name, data
        provenance, application domain, size and time span of the full
        dataset, location, typical use cases, citation and download URL.
    """
    source_url = (
        "https://www.microsoft.com/en-us/research/publication/"
        "t-drive-trajectory-data-sample/"
    )
    applications = [
        "Maximum Entropy IRL for route preferences",
        "Trajectory prediction",
        "Traffic pattern learning",
    ]

    # Keys are inserted one by one to keep the established key order.
    info = {}
    info["name"] = "T-Drive Beijing Taxi Trajectories"
    info["type"] = "real (bundled sample) / synthetic fallback"
    info["domain"] = "Urban mobility / Route planning"
    info["n_taxis_full"] = 10357
    info["n_points_full"] = "~15 million"
    info["time_span"] = "One week (Feb 2008)"
    info["location"] = "Beijing, China"
    info["use_cases"] = applications
    info["reference"] = "Yuan et al. (2010). T-Drive. ACM SIGSPATIAL."
    info["download_url"] = source_url
    return info