Source code for econirl.datasets.geolife

"""GeoLife GPS Trajectory Dataset.

This module provides access to the GeoLife dataset from Microsoft Research,
containing GPS trajectories from 182 users over 5 years (2007-2012).

The data is suitable for:
- Human mobility pattern learning via IRL
- Transportation mode inference
- Activity recognition

Reference:
    Zheng, Y., et al. (2008-2010). GeoLife GPS Trajectory Dataset.
    Microsoft Research.

Data source:
    https://www.microsoft.com/en-us/download/details.aspx?id=52367
"""

from pathlib import Path
from typing import List, Optional, Union

import numpy as np
import pandas as pd



[docs]
def load_geolife(
    n_users: Optional[int] = None,
    include_labels: bool = False,
    as_trajectories: bool = False,
    discretize: bool = False,
    grid_size: int = 100,
    seed: Optional[int] = 2008,
) -> Union[pd.DataFrame, List[np.ndarray]]:
    """Load GeoLife GPS trajectory data.

    The original GeoLife dataset contains 17,621 trajectories from 182 users,
    representing 1.2 million kilometers of travel. Some trajectories include
    transportation mode labels.

    The data is read from ``geolife_sample.csv`` next to this module. If that
    file does not exist, a synthetic sample is generated and, when the
    directory is writable, cached at that location for later calls.

    Args:
        n_users: Limit to first N users (None = all available)
        include_labels: Include transportation mode labels where available
        as_trajectories: If True, return list of trajectory arrays
        discretize: If True, convert GPS to discrete grid states
        grid_size: Number of grid cells per dimension if discretizing
        seed: Random seed for sample generation. Only used when the sample
            file is missing and has to be generated.

    Returns:
        By default, a DataFrame with columns:
            - user_id: User identifier
            - trajectory_id: Trajectory identifier (trip)
            - timestamp: GPS timestamp (datetime64)
            - latitude: GPS latitude
            - longitude: GPS longitude
            - altitude: Altitude in meters
            - (if include_labels) mode: Transportation mode
            - (if discretize) state: Grid cell index

        If ``as_trajectories`` is True, a list with one array per trajectory
        instead: 1-D state indices when ``discretize`` is True, otherwise
        2-D ``(longitude, latitude)`` pairs.

    Raises:
        ValueError: If ``n_users`` is negative.

    Example:
        >>> from econirl.datasets import load_geolife
        >>> df = load_geolife(n_users=50)
        >>> print(f"Users: {df['user_id'].nunique()}, Trips: {df['trajectory_id'].nunique()}")

        >>> # For mobility IRL
        >>> trajectories = load_geolife(as_trajectories=True, discretize=True)
    """
    if n_users is not None and n_users < 0:
        # A negative slice bound would silently drop users from the end.
        raise ValueError(f"n_users must be non-negative, got {n_users}")

    data_path = Path(__file__).parent / "geolife_sample.csv"

    if data_path.exists():
        df = pd.read_csv(data_path)
    else:
        df = _generate_geolife_sample(seed=seed)
        try:
            df.to_csv(data_path, index=False)
        except OSError:
            # Caching is best-effort: the package directory may be read-only
            # (e.g. a system-wide install). The generated data is still valid.
            pass

    # read_csv yields strings while the generator yields Timestamps; normalise
    # so callers see the same dtype on the first and on every later call.
    if 'timestamp' in df.columns:
        df['timestamp'] = pd.to_datetime(df['timestamp'])

    if n_users is not None:
        user_ids = df['user_id'].unique()[:n_users]
        df = df[df['user_id'].isin(user_ids)]

    if not include_labels and 'mode' in df.columns:
        df = df.drop(columns=['mode'])

    if discretize:
        df = _discretize_gps(df, grid_size)

    if as_trajectories:
        return _to_trajectories(df, discretize)

    return df



def _generate_geolife_sample(
    n_users: int = 50,
    trajectories_per_user: int = 5,
    points_per_trajectory: int = 50,
    seed: Optional[int] = 2008,
) -> pd.DataFrame:
    """Generate synthetic GeoLife-like data.

    Each user gets a random home location inside a fixed Beijing-area
    bounding box. Every trajectory starts at that home and performs a random
    walk whose step length depends on a randomly chosen transportation mode,
    sampled once every 5 seconds and clipped to the bounding box.

    Args:
        n_users: Number of synthetic users (ids start at 1).
        trajectories_per_user: Number of trips generated for each user.
        points_per_trajectory: Number of GPS points recorded per trip.
        seed: Seed for the random generator; None draws a fresh seed.

    Returns:
        DataFrame with one row per GPS point and columns ``user_id``,
        ``trajectory_id``, ``timestamp``, ``latitude``, ``longitude``,
        ``altitude`` and ``mode``.
    """
    # A private legacy generator produces the same stream as
    # np.random.seed(seed) followed by np.random.* calls, but without
    # reseeding the process-wide NumPy RNG behind the caller's back.
    rng = np.random.RandomState(seed)

    # Beijing area (GeoLife was collected there)
    lon_min, lon_max = 116.2, 116.6
    lat_min, lat_max = 39.7, 40.1

    # Transportation modes; "speed" is the step length in degrees per sample
    modes = ['walk', 'bike', 'bus', 'car', 'subway']
    mode_speeds = {'walk': 0.0005, 'bike': 0.001, 'bus': 0.002, 'car': 0.003, 'subway': 0.004}

    records = []
    traj_id = 0

    for user_id in range(1, n_users + 1):
        # User's home location
        home_lon = rng.uniform(lon_min, lon_max)
        home_lat = rng.uniform(lat_min, lat_max)

        for _ in range(trajectories_per_user):
            traj_id += 1
            mode = rng.choice(modes, p=[0.3, 0.2, 0.2, 0.2, 0.1])
            speed = mode_speeds[mode]

            # Every trip starts from the user's home location
            lon, lat = home_lon, home_lat
            alt = rng.uniform(20, 100)

            # Random day in 2008, starting between 06:00 and 21:00
            base_time = pd.Timestamp('2008-01-01') + pd.Timedelta(
                days=rng.randint(0, 365),
                hours=rng.randint(6, 22)
            )

            for t in range(points_per_trajectory):
                timestamp = base_time + pd.Timedelta(seconds=t * 5)

                records.append({
                    'user_id': user_id,
                    'trajectory_id': traj_id,
                    'timestamp': timestamp,
                    'latitude': lat,
                    'longitude': lon,
                    'altitude': alt,
                    'mode': mode,
                })

                # Move one step in a uniformly random direction
                direction = rng.uniform(0, 2 * np.pi)
                lon += speed * np.cos(direction)
                lat += speed * np.sin(direction)

                # Keep in bounds
                lon = np.clip(lon, lon_min, lon_max)
                lat = np.clip(lat, lat_min, lat_max)

    return pd.DataFrame(records)


def _discretize_gps(df: pd.DataFrame, grid_size: int) -> pd.DataFrame:
    """Convert GPS coordinates to discrete grid cells.

    The bounding box of the coordinates present in ``df`` is divided into
    ``grid_size`` equal-width bins per axis, and each row is assigned the
    row-major cell index ``lat_idx * grid_size + lon_idx``. Because the box
    is derived from ``df`` itself, state ids depend on which rows are passed.

    Args:
        df: Frame with ``longitude`` and ``latitude`` columns.
        grid_size: Number of grid cells per dimension; must be at least 1.

    Returns:
        A copy of ``df`` with an added integer ``state`` column. The input
        frame is not modified.

    Raises:
        ValueError: If ``grid_size`` is smaller than 1.
    """
    if grid_size < 1:
        # Without this guard np.clip receives min > max and every row gets a
        # meaningless negative state.
        raise ValueError(f"grid_size must be at least 1, got {grid_size}")

    gridded = df.copy()

    def _cell_index(column: str) -> np.ndarray:
        """Return the 0-based bin index of every value in ``column``."""
        coords = gridded[column]
        edges = np.linspace(coords.min(), coords.max(), grid_size + 1)
        # digitize is 1-based and puts the maximum value one past the last
        # bin; shift and clip so the maximum lands in the last cell.
        return np.clip(np.digitize(coords, edges) - 1, 0, grid_size - 1)

    gridded['state'] = _cell_index('latitude') * grid_size + _cell_index('longitude')

    return gridded


def _to_trajectories(df: pd.DataFrame, has_states: bool) -> List[np.ndarray]:
    """Convert DataFrame to list of trajectory arrays.

    Rows are grouped by ``trajectory_id`` and ordered by ``timestamp`` within
    each group. Trajectories appear in the order their id first occurs in
    ``df``.

    Args:
        df: Frame with ``trajectory_id`` and ``timestamp`` columns, plus
            ``state`` (when ``has_states`` is True) or ``longitude`` and
            ``latitude`` (otherwise).
        has_states: If True, emit the 1-D ``state`` sequence of each
            trajectory; otherwise emit 2-D ``(longitude, latitude)`` rows.

    Returns:
        One NumPy array per trajectory.
    """
    columns = 'state' if has_states else ['longitude', 'latitude']

    trajectories = []
    # A single groupby pass replaces re-filtering the full frame once per
    # trajectory id; sort=False keeps first-appearance order.
    for _, points in df.groupby('trajectory_id', sort=False):
        # Stable sort so rows sharing a timestamp keep their input order.
        ordered = points.sort_values('timestamp', kind='stable')
        trajectories.append(ordered[columns].values)

    return trajectories



[docs]
def get_geolife_info() -> dict:
    """Return a dictionary of descriptive metadata for the GeoLife dataset.

    The values are static: dataset name, data provenance, application domain,
    size of the full dataset, time span, location, labelled share, typical
    use cases, citation and download URL.
    """
    source_url = "https://www.microsoft.com/en-us/download/details.aspx?id=52367"
    applications = [
        "Human mobility IRL",
        "Transportation mode inference",
        "Activity pattern learning",
    ]

    info = dict(
        name="GeoLife GPS Trajectories",
        type="real (bundled sample) / synthetic fallback",
        domain="Human mobility / Activity recognition",
        n_users_full=182,
        n_trajectories_full=17621,
        time_span="5 years (2007-2012)",
        location="Beijing, China (primarily)",
        labeled_portion="~30% have transportation mode labels",
        use_cases=applications,
        reference="Zheng et al. GeoLife. Microsoft Research.",
        download_url=source_url,
    )
    return info