"""T-Drive Beijing Taxi Trajectory Dataset.
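This module provides access to a sample of the T-Drive dataset from Microsoft
Research; the full dataset contains GPS trajectories of 10,357 taxis in
Beijing over one week. When the bundled sample file is missing, a synthetic
stand-in with the same columns is generated instead.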
The data is suitable for:
- Maximum Entropy IRL (learning route preferences)
- Trajectory prediction
- Urban mobility modeling
Reference:
Yuan, J., et al. (2010). "T-Drive: Driving Directions Based on Taxi
Trajectories." ACM SIGSPATIAL GIS.
Ziebart, B. D., et al. (2008). "Maximum Entropy Inverse Reinforcement
Learning." AAAI.
Data source:
https://www.microsoft.com/en-us/research/publication/t-drive-trajectory-data-sample/
"""
from pathlib import Path
from typing import List, Optional, Union

import numpy as np
import pandas as pd
[docs]
def load_tdrive(
    n_taxis: Optional[int] = None,
    as_trajectories: bool = False,
    discretize: bool = False,
    grid_size: int = 100,
    seed: Optional[int] = 2010,
) -> Union[pd.DataFrame, List[np.ndarray]]:
    """Load T-Drive taxi trajectory data.

    The original T-Drive dataset contains ~15 million GPS points from 10,357
    Beijing taxis. This loader reads the sample file ``tdrive_sample.csv``
    stored next to this module. If that file does not exist, a synthetic
    sample is generated, cached at that path when the directory is writable,
    and used instead. Nothing is downloaded.

    Args:
        n_taxis: Limit to the first N distinct taxi ids, in the order they
            appear in the data (None = all available).
        as_trajectories: If True, return a list of per-taxi arrays instead
            of a DataFrame.
        discretize: If True, convert GPS to discrete grid states.
        grid_size: Number of grid cells per dimension if discretizing.
        seed: Random seed for synthetic sample generation. Only used when
            the sample file is missing; it has no effect once a sample file
            exists on disk.

    Returns:
        If ``as_trajectories`` is False, a DataFrame with columns:
            - taxi_id: Taxi identifier
            - timestamp: GPS timestamp
            - longitude: GPS longitude
            - latitude: GPS latitude
            - (if discretize) state: Discrete grid cell index
        If ``as_trajectories`` is True, a list with one array per taxi,
        ordered by timestamp: state indices when ``discretize`` is True,
        otherwise ``[longitude, latitude]`` pairs.

    Example:
        >>> from econirl.datasets import load_tdrive
        >>> df = load_tdrive(n_taxis=100)
        >>> print(f"Points: {len(df):,}, Taxis: {df['taxi_id'].nunique()}")
        >>> # For MaxEnt IRL
        >>> trajectories = load_tdrive(as_trajectories=True, discretize=True)
        >>> print(f"Trajectories: {len(trajectories)}")
    """
    import warnings

    data_path = Path(__file__).parent / "tdrive_sample.csv"
    if data_path.exists():
        # Parse timestamps so a cached sample has the same dtype as a freshly
        # generated one (plain read_csv would return them as strings).
        df = pd.read_csv(data_path, parse_dates=["timestamp"])
    else:
        df = _generate_tdrive_sample(seed=seed)
        try:
            df.to_csv(data_path, index=False)
        except OSError as exc:
            # Caching is best-effort: the package directory may be read-only
            # (e.g. a system-wide install). The in-memory sample is still valid.
            warnings.warn(
                f"Could not cache T-Drive sample at {data_path}: {exc}",
                RuntimeWarning,
                stacklevel=2,
            )

    if n_taxis is not None:
        taxi_ids = df["taxi_id"].unique()[:n_taxis]
        df = df[df["taxi_id"].isin(taxi_ids)]

    if discretize:
        df = _discretize_gps(df, grid_size)

    if as_trajectories:
        return _to_trajectories(df, discretize)
    return df
def _generate_tdrive_sample(
    n_taxis: int = 200,
    n_points_per_taxi: int = 100,
    seed: Optional[int] = 2010,
) -> pd.DataFrame:
    """Generate synthetic T-Drive-like data.

    Each taxi starts at a uniformly random point inside an approximate
    Beijing bounding box and then performs an axis-aligned random walk
    (north / south / east / west / stay, equally likely), clamped to the
    box. One point is recorded per minute, starting at 2008-02-02 for every
    taxi.

    Args:
        n_taxis: Number of taxis to simulate; ids run from 1 to ``n_taxis``.
        n_points_per_taxi: Number of GPS points recorded per taxi.
        seed: Seed for the random stream. ``None`` draws a fresh seed.

    Returns:
        DataFrame with columns ``taxi_id``, ``timestamp``, ``longitude`` and
        ``latitude``; ``n_taxis * n_points_per_taxi`` rows.
    """
    # A private RandomState yields the same stream as np.random.seed(seed)
    # followed by np.random.* calls, but leaves the caller's global RNG
    # state untouched.
    rng = np.random.RandomState(seed)

    # Beijing approximate bounds
    lon_min, lon_max = 116.2, 116.6
    lat_min, lat_max = 39.7, 40.1

    moves = ['N', 'S', 'E', 'W', 'stay']
    move_probs = [0.2, 0.2, 0.2, 0.2, 0.2]

    records = []
    base_time = pd.Timestamp('2008-02-02')
    for taxi_id in range(1, n_taxis + 1):
        # Random starting point
        lon = rng.uniform(lon_min, lon_max)
        lat = rng.uniform(lat_min, lat_max)
        for t in range(n_points_per_taxi):
            records.append({
                'taxi_id': taxi_id,
                'timestamp': base_time + pd.Timedelta(minutes=t),
                'longitude': lon,
                'latitude': lat,
            })
            # Axis-aligned moves only, mimicking a grid-like road layout.
            # The step is drawn even for 'stay' so the random stream does
            # not depend on the chosen direction.
            direction = rng.choice(moves, p=move_probs)
            step = rng.uniform(0.001, 0.005)
            if direction == 'N':
                lat = min(lat + step, lat_max)
            elif direction == 'S':
                lat = max(lat - step, lat_min)
            elif direction == 'E':
                lon = min(lon + step, lon_max)
            elif direction == 'W':
                lon = max(lon - step, lon_min)

    # Explicit columns keep the schema stable even when no rows are produced.
    return pd.DataFrame(
        records,
        columns=['taxi_id', 'timestamp', 'longitude', 'latitude'],
    )
def _discretize_gps(df: pd.DataFrame, grid_size: int) -> pd.DataFrame:
    """Convert GPS coordinates to discrete grid cells.

    The grid spans the min/max longitude and latitude found in ``df`` and is
    divided into ``grid_size`` equal-width cells along each axis. Because the
    bounds come from the data, state indices are only comparable between
    frames that cover the same extent.

    Args:
        df: Frame with ``longitude`` and ``latitude`` columns.
        grid_size: Number of cells per dimension; must be at least 1.

    Returns:
        A copy of ``df`` with an added ``state`` column equal to
        ``lat_idx * grid_size + lon_idx``. The input frame is not modified.

    Raises:
        ValueError: If ``grid_size`` is smaller than 1.
    """
    if grid_size < 1:
        # Without this guard np.clip below would receive an upper bound of
        # -1 or less and silently produce negative state indices.
        raise ValueError(f"grid_size must be >= 1, got {grid_size}")

    df = df.copy()

    # Compute grid bounds from data
    lon_min, lon_max = df['longitude'].min(), df['longitude'].max()
    lat_min, lat_max = df['latitude'].min(), df['latitude'].max()

    # grid_size cells need grid_size + 1 edges
    lon_bins = np.linspace(lon_min, lon_max, grid_size + 1)
    lat_bins = np.linspace(lat_min, lat_max, grid_size + 1)

    # digitize is 1-based for values at or above the first edge
    lon_idx = np.digitize(df['longitude'], lon_bins) - 1
    lat_idx = np.digitize(df['latitude'], lat_bins) - 1

    # Points sitting exactly on the upper bound index one past the last
    # cell; clip folds them back into it.
    lon_idx = np.clip(lon_idx, 0, grid_size - 1)
    lat_idx = np.clip(lat_idx, 0, grid_size - 1)

    # Encode as single state index (latitude is the row, longitude the column)
    df['state'] = lat_idx * grid_size + lon_idx
    return df
def _to_trajectories(df: pd.DataFrame, has_states: bool) -> List[np.ndarray]:
    """Convert DataFrame to list of trajectory arrays.

    Args:
        df: Frame with ``taxi_id`` and ``timestamp`` columns, plus ``state``
            when ``has_states`` is True or ``longitude`` / ``latitude``
            otherwise.
        has_states: If True, each trajectory is the taxi's ``state`` column;
            otherwise it is a two-column array of ``[longitude, latitude]``.

    Returns:
        One array per taxi, in order of each taxi's first appearance in
        ``df``, with rows sorted by ``timestamp``. Rows whose ``taxi_id`` is
        missing are skipped.
    """
    value_cols = 'state' if has_states else ['longitude', 'latitude']
    # A single groupby pass replaces one full-frame boolean mask per taxi;
    # sort=False keeps groups in first-appearance order, matching unique().
    return [
        taxi_data.sort_values('timestamp')[value_cols].values
        for _, taxi_data in df.groupby('taxi_id', sort=False)
    ]
[docs]
def get_tdrive_info() -> dict:
    """Return descriptive metadata for the T-Drive dataset.

    A new dictionary (with a new ``use_cases`` list) is built on every call,
    so callers may modify the result freely.

    Returns:
        Dictionary with the keys ``name``, ``type``, ``domain``,
        ``n_taxis_full``, ``n_points_full``, ``time_span``, ``location``,
        ``use_cases``, ``reference`` and ``download_url``.
    """
    applications = [
        "Maximum Entropy IRL for route preferences",
        "Trajectory prediction",
        "Traffic pattern learning",
    ]
    source_page = (
        "https://www.microsoft.com/en-us/research/publication/"
        "t-drive-trajectory-data-sample/"
    )

    info = dict(
        name="T-Drive Beijing Taxi Trajectories",
        type="real (bundled sample) / synthetic fallback",
        domain="Urban mobility / Route planning",
        n_taxis_full=10357,
        n_points_full="~15 million",
        time_span="One week (Feb 2008)",
        location="Beijing, China",
    )
    info["use_cases"] = applications
    info["reference"] = "Yuan et al. (2010). T-Drive. ACM SIGSPATIAL."
    info["download_url"] = source_page
    return info