Source code for econirl.datasets.geolife

"""GeoLife GPS Trajectory Dataset.

This module provides access to the GeoLife dataset from Microsoft Research,
containing GPS trajectories from 182 users over 5 years (2007-2012).

The data is suitable for:
- Human mobility pattern learning via IRL
- Transportation mode inference
- Activity recognition

Reference:
    Zheng, Y., et al. (2008-2010). GeoLife GPS Trajectory Dataset.
    Microsoft Research.

Data source:
    https://www.microsoft.com/en-us/download/details.aspx?id=52367
"""

from pathlib import Path
from typing import List, Optional, Union

import numpy as np
import pandas as pd


def load_geolife(
    n_users: Optional[int] = None,
    include_labels: bool = False,
    as_trajectories: bool = False,
    discretize: bool = False,
    grid_size: int = 100,
    seed: Optional[int] = 2008,
) -> Union[pd.DataFrame, List[np.ndarray]]:
    """Load GeoLife GPS trajectory data.

    The original GeoLife dataset contains 17,621 trajectories from 182 users,
    representing 1.2 million kilometers of travel. Some trajectories include
    transportation mode labels.

    The data is read from ``geolife_sample.csv`` next to this module. If that
    file does not exist, a synthetic GeoLife-like sample is generated and
    (best effort) cached at that location for later calls.

    Args:
        n_users: Limit to first N users (None = all available)
        include_labels: Include transportation mode labels where available
        as_trajectories: If True, return list of trajectory arrays
        discretize: If True, convert GPS to discrete grid states
        grid_size: Number of grid cells per dimension if discretizing
        seed: Random seed for sample generation. Only used when the sample
            file has to be generated; ignored when the CSV already exists.

    Returns:
        By default, a DataFrame with columns:
            - user_id: User identifier
            - trajectory_id: Trajectory identifier (trip)
            - timestamp: GPS timestamp
            - latitude: GPS latitude
            - longitude: GPS longitude
            - altitude: Altitude in meters
            - (if include_labels) mode: Transportation mode
            - (if discretize) state: Flattened grid-cell index

        If ``as_trajectories`` is True, a list with one array per trajectory
        instead: a 1-D array of states when ``discretize`` is True, otherwise
        an ``(n_points, 2)`` array of ``(longitude, latitude)`` pairs.

    Raises:
        ValueError: If ``n_users`` is negative.

    Example:
        >>> from econirl.datasets import load_geolife
        >>> df = load_geolife(n_users=50)
        >>> print(f"Users: {df['user_id'].nunique()}, Trips: {df['trajectory_id'].nunique()}")

        >>> # For mobility IRL
        >>> trajectories = load_geolife(as_trajectories=True, discretize=True)
    """
    if n_users is not None and n_users < 0:
        # A negative value would otherwise act as a slice bound and silently
        # drop users from the end instead of limiting to the first N.
        raise ValueError(f"n_users must be non-negative, got {n_users}")

    data_path = Path(__file__).parent / "geolife_sample.csv"

    if data_path.exists():
        df = pd.read_csv(data_path)
        # CSV round-trips timestamps as strings; parse them so the result has
        # the same dtype as a freshly generated sample.
        if 'timestamp' in df.columns:
            df['timestamp'] = pd.to_datetime(df['timestamp'])
    else:
        df = _generate_geolife_sample(seed=seed)
        try:
            df.to_csv(data_path, index=False)
        except OSError as exc:
            # The package directory may be read-only (e.g. a system-wide
            # install). Caching is only an optimisation, so keep going with
            # the in-memory sample.
            import warnings

            warnings.warn(
                f"Could not cache GeoLife sample at {data_path}: {exc}",
                RuntimeWarning,
                stacklevel=2,
            )

    if n_users is not None:
        user_ids = df['user_id'].unique()[:n_users]
        df = df[df['user_id'].isin(user_ids)]

    if not include_labels and 'mode' in df.columns:
        df = df.drop(columns=['mode'])

    if discretize:
        df = _discretize_gps(df, grid_size)

    if as_trajectories:
        return _to_trajectories(df, discretize)

    return df
def _generate_geolife_sample( n_users: int = 50, trajectories_per_user: int = 5, points_per_trajectory: int = 50, seed: int = 2008, ) -> pd.DataFrame: """Generate synthetic GeoLife-like data.""" np.random.seed(seed) # Beijing area (GeoLife was collected there) lon_min, lon_max = 116.2, 116.6 lat_min, lat_max = 39.7, 40.1 # Transportation modes modes = ['walk', 'bike', 'bus', 'car', 'subway'] mode_speeds = {'walk': 0.0005, 'bike': 0.001, 'bus': 0.002, 'car': 0.003, 'subway': 0.004} records = [] traj_id = 0 for user_id in range(1, n_users + 1): # User's home location home_lon = np.random.uniform(lon_min, lon_max) home_lat = np.random.uniform(lat_min, lat_max) for _ in range(trajectories_per_user): traj_id += 1 mode = np.random.choice(modes, p=[0.3, 0.2, 0.2, 0.2, 0.1]) speed = mode_speeds[mode] # Start from home or previous endpoint lon, lat = home_lon, home_lat alt = np.random.uniform(20, 100) base_time = pd.Timestamp('2008-01-01') + pd.Timedelta( days=np.random.randint(0, 365), hours=np.random.randint(6, 22) ) for t in range(points_per_trajectory): timestamp = base_time + pd.Timedelta(seconds=t * 5) records.append({ 'user_id': user_id, 'trajectory_id': traj_id, 'timestamp': timestamp, 'latitude': lat, 'longitude': lon, 'altitude': alt, 'mode': mode, }) # Move based on mode direction = np.random.uniform(0, 2 * np.pi) lon += speed * np.cos(direction) lat += speed * np.sin(direction) # Keep in bounds lon = np.clip(lon, lon_min, lon_max) lat = np.clip(lat, lat_min, lat_max) return pd.DataFrame(records) def _discretize_gps(df: pd.DataFrame, grid_size: int) -> pd.DataFrame: """Convert GPS coordinates to discrete grid cells.""" df = df.copy() lon_min, lon_max = df['longitude'].min(), df['longitude'].max() lat_min, lat_max = df['latitude'].min(), df['latitude'].max() lon_bins = np.linspace(lon_min, lon_max, grid_size + 1) lat_bins = np.linspace(lat_min, lat_max, grid_size + 1) lon_idx = np.clip(np.digitize(df['longitude'], lon_bins) - 1, 0, grid_size - 1) lat_idx = 
np.clip(np.digitize(df['latitude'], lat_bins) - 1, 0, grid_size - 1) df['state'] = lat_idx * grid_size + lon_idx return df def _to_trajectories(df: pd.DataFrame, has_states: bool) -> List[np.ndarray]: """Convert DataFrame to list of trajectory arrays.""" trajectories = [] for traj_id in df['trajectory_id'].unique(): traj_data = df[df['trajectory_id'] == traj_id].sort_values('timestamp') if has_states: traj = traj_data['state'].values else: traj = traj_data[['longitude', 'latitude']].values trajectories.append(traj) return trajectories
def get_geolife_info() -> dict:
    """Return static descriptive metadata for the GeoLife dataset.

    The result is a fresh dictionary on every call, containing the dataset
    name, type, domain, size figures for the full dataset, time span,
    location, labeled portion, typical use cases, a reference string and the
    download URL.
    """
    use_cases = [
        "Human mobility IRL",
        "Transportation mode inference",
        "Activity pattern learning",
    ]
    download_url = "https://www.microsoft.com/en-us/download/details.aspx?id=52367"

    info = dict(
        name="GeoLife GPS Trajectories",
        type="real (bundled sample) / synthetic fallback",
        domain="Human mobility / Activity recognition",
        n_users_full=182,
        n_trajectories_full=17621,
        time_span="5 years (2007-2012)",
        location="Beijing, China (primarily)",
        labeled_portion="~30% have transportation mode labels",
    )
    info["use_cases"] = use_cases
    info["reference"] = "Zheng et al. GeoLife. Microsoft Research."
    info["download_url"] = download_url
    return info