# Source code for econirl.datasets.foursquare

"""
Foursquare NYC Check-in Dataset for Sequential Venue Choice.

This module provides the Foursquare NYC check-in dataset (Yang et al., 2015),
modeling sequential venue/activity choice as a dynamic discrete choice problem.

Each user's check-in sequence is treated as a trajectory where:
- State: (current_venue_category, time_of_day_bin)
- Action: next venue category visited
- Transitions: empirical category-to-category patterns

Reference:
    Yang, D., Zhang, D., Zheng, V.W., & Yu, Z. (2015). "Modeling User Activity
    Preference by Leveraging User Spatial Temporal Characteristics in LBSNs."
    IEEE Trans. Systems, Man, and Cybernetics: Systems, 45(1), 129-142.
"""

from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd

# 10 super-categories mapping the 252 Foursquare categories.
#
# Each key is a super-category name; its value lists the raw ``venueCategory``
# strings assigned to it explicitly. Key order matters: a super-category's
# integer index (used in state/action encoding) is its position in this dict.
# Raw categories not listed here are resolved by keyword matching in
# ``_build_category_map`` and fall back to "Services".
SUPER_CATEGORIES = {
    "Home": ["Home (private)", "Residential Building (Apartment / Condo)"],
    "Work": ["Office", "Coworking Space", "Tech Startup", "Conference Room"],
    "Social": ["Bar", "Nightclub", "Lounge", "Brewery", "Wine Bar", "Pub",
               "Hotel", "Hotel Bar", "Cocktail Bar", "Dive Bar", "Sports Bar",
               "Hookah Bar", "Gay Bar", "Beer Garden", "Whisky Bar"],
    "Transport": ["Subway", "Train Station", "Bus Station", "Bus Stop",
                  "Light Rail Station", "Airport", "Airport Terminal",
                  "Airport Gate", "Taxi", "Boat or Ferry", "Platform"],
    "Food": ["Coffee Shop", "Food & Drink Shop", "Deli / Bodega",
             "American Restaurant", "Italian Restaurant", "Chinese Restaurant",
             "Japanese Restaurant", "Mexican Restaurant", "Pizza Place",
             "Thai Restaurant", "Indian Restaurant", "French Restaurant",
             "Sushi Restaurant", "Burger Joint", "Bakery", "Café",
             "Fast Food Restaurant", "Food Truck", "Sandwich Place",
             "Seafood Restaurant", "Steakhouse", "Vegetarian / Vegan Restaurant",
             "Greek Restaurant", "Korean Restaurant", "Ramen Restaurant",
             "Vietnamese Restaurant", "BBQ Joint", "Taco Place", "Noodle House",
             "Salad Place", "Bagel Shop", "Donut Shop", "Ice Cream Shop",
             "Frozen Yogurt Shop", "Juice Bar", "Smoothie Shop",
             "Restaurant", "Asian Restaurant", "Latin American Restaurant",
             "Mediterranean Restaurant", "Middle Eastern Restaurant",
             "Spanish Restaurant", "Breakfast Spot", "Gastropub", "Diner",
             "Wings Joint", "Falafel Restaurant", "Food Court"],
    "Fitness": ["Gym / Fitness Center", "Gym", "Yoga Studio", "Gym Pool",
                "Martial Arts Dojo", "Boxing Gym", "Pilates Studio",
                "Cycle Studio", "Athletics & Sports", "Recreation Center",
                "Rock Climbing Spot"],
    "Shopping": ["Grocery Store", "Clothing Store", "Pharmacy", "Supermarket",
                 "Department Store", "Shopping Mall", "Bookstore",
                 "Electronics Store", "Hardware Store", "Pet Store",
                 "Convenience Store", "Liquor Store", "Cosmetics Shop",
                 "Boutique", "Thrift / Vintage Store", "Gift Shop",
                 "Shoe Store", "Jewelry Store", "Market", "Farmers Market",
                 "Flea Market", "Wine Shop", "Record Shop", "Toy / Game Store",
                 "Furniture / Home Store", "Sporting Goods Shop",
                 "Mobile Phone Shop", "Music Store", "Art Supply Store"],
    "Entertainment": ["Movie Theater", "Theater", "Music Venue", "Comedy Club",
                      "Art Gallery", "Museum", "Performing Arts Venue",
                      "Concert Hall", "Karaoke Bar", "Arcade", "Casino",
                      "Bowling Alley", "Pool Hall", "Board Shop",
                      "Multiplex", "Indie Movie Theater", "Sculpture Garden",
                      "Stadium", "Basketball Stadium", "Baseball Stadium",
                      "Soccer Stadium", "Hockey Arena", "Tennis Stadium"],
    "Outdoors": ["Park", "Other Great Outdoors", "Neighborhood", "Beach",
                 "Plaza", "Trail", "Garden", "Playground", "Dog Run",
                 "River", "Lake", "Harbor / Marina", "Scenic Lookout",
                 "Bridge", "Pier", "Waterfront", "Campground", "Field",
                 "Mountain", "National Park", "State / Provincial Park",
                 "Roof Deck", "Courtyard"],
    "Services": [],  # Catch-all for everything else
}

# Derived from the dict so the count cannot drift if a super-category is
# added or removed (evaluates to 10 for the table above).
N_SUPER_CATEGORIES = len(SUPER_CATEGORIES)
N_TIME_BINS = 4  # night(0-6), morning(6-12), afternoon(12-18), evening(18-24)
# Position in this list is the super-category's integer index.
CATEGORY_NAMES = list(SUPER_CATEGORIES.keys())


# Keyword fallback rules for raw categories that are not listed explicitly in
# SUPER_CATEGORIES. Rules are tried top to bottom and the first hit wins.
# Each rule is (super-category name, substring keywords, whole-word keywords).
#
# The very short keywords are matched as whole words only: as plain substrings
# they fire inside unrelated words ("eat" in "theater" / "recreation", "bar" in
# "barbershop", "pub" in "public", "bus" in "business"), which sent e.g. every
# unlisted "... Theater" category to Food before the Entertainment rule was
# ever consulted. Longer keywords keep substring semantics so that compounds
# such as "drugstore" still match "store".
_KEYWORD_RULES = (
    ("Food", ("restaurant", "food", "eatery", "cook", "bistro"), ("eat", "eats")),
    ("Social", ("club", "lounge"), ("bar", "bars", "pub", "pubs")),
    ("Shopping", ("shop", "store", "market", "mall"), ()),
    ("Outdoors", ("park", "beach", "outdoor", "garden", "trail"), ()),
    ("Fitness", ("gym", "fitness", "sport", "yoga", "pool"), ()),
    ("Entertainment", ("theater", "museum", "gallery", "cinema", "stadium"), ()),
    ("Transport", ("station", "airport", "train", "subway", "ferry"), ("bus", "buses")),
    ("Work", ("office", "work", "cowork"), ()),
    ("Home", ("home", "residen", "apartment"), ()),
)


def _keyword_super_category(raw_category: str) -> str:
    """Return the super-category name chosen by keyword rules, or "Services"."""
    lowered = raw_category.lower()
    # Split on anything that is not a letter or digit so "Salon / Barbershop"
    # yields the words {"salon", "barbershop"}.
    words = set("".join(ch if ch.isalnum() else " " for ch in lowered).split())
    for name, substrings, whole_words in _KEYWORD_RULES:
        if any(keyword in lowered for keyword in substrings):
            return name
        if not words.isdisjoint(whole_words):
            return name
    return "Services"


def _build_category_map(df: pd.DataFrame) -> dict:
    """Build mapping from raw Foursquare category → super-category index.

    Categories listed in ``SUPER_CATEGORIES`` are mapped directly. Any other
    value found in ``df["venueCategory"]`` is classified by the keyword rules
    in ``_KEYWORD_RULES`` and defaults to the "Services" catch-all.

    Args:
        df: Check-in frame; only its ``venueCategory`` column is read.

    Returns:
        Dict from raw category value to the integer index of its
        super-category (position in ``CATEGORY_NAMES``). Non-string entries
        such as NaN from missing values are assigned to "Services" instead of
        raising.

    Raises:
        KeyError: If ``df`` has no ``venueCategory`` column.
    """
    # Explicit assignments first; they always take precedence over keywords.
    cat_map = {
        member: idx
        for idx, members in enumerate(SUPER_CATEGORIES.values())
        for member in members
    }

    services_idx = CATEGORY_NAMES.index("Services")
    for cat in df["venueCategory"].unique():
        if cat in cat_map:
            continue
        if not isinstance(cat, str):
            # Missing category (NaN/None): there is no text to inspect.
            cat_map[cat] = services_idx
            continue
        cat_map[cat] = CATEGORY_NAMES.index(_keyword_super_category(cat))
    return cat_map


def _build_transitions(df: pd.DataFrame, n_time_bins: int) -> pd.DataFrame:
    """Pair each check-in with the same user's next one; one output row per pair."""
    df = df.sort_values(["userId", "timestamp"]).reset_index(drop=True)
    by_user = df.groupby("userId", sort=False)

    # Position of each check-in within its user's sequence (0, 1, 2, ...).
    period = by_user.cumcount()
    # Values of the following check-in; NaN marks a user's last check-in,
    # which has no successor and therefore produces no transition.
    next_category = by_user["super_category"].shift(-1)
    next_time_bin = by_user["time_bin"].shift(-1)
    has_next = next_category.notna().to_numpy()

    current = df.loc[has_next]
    cur_cat = current["super_category"].to_numpy(dtype=np.int64)
    cur_bin = current["time_bin"].to_numpy(dtype=np.int64)
    nxt_cat = next_category[has_next].to_numpy(dtype=np.int64)
    nxt_bin = next_time_bin[has_next].to_numpy(dtype=np.int64)
    names = np.asarray(CATEGORY_NAMES, dtype=object)

    return pd.DataFrame({
        "user_id": current["userId"].to_numpy(),
        "period": period[has_next].to_numpy(),
        # Flat state index: category-major, time bin minor.
        "state": cur_cat * n_time_bins + cur_bin,
        "action": nxt_cat,  # next category is the "choice"
        "next_state": nxt_cat * n_time_bins + nxt_bin,
        "super_category": names[cur_cat],
        "next_category": names[nxt_cat],
        "time_bin": cur_bin,
        "hour": current["hour"].to_numpy(dtype=np.int64),
        "is_weekend": current["is_weekend"].to_numpy(dtype=np.int64),
    })


def _to_panel(result: pd.DataFrame):
    """Wrap the transition frame in an econirl ``Panel``, one trajectory per user."""
    # Imported lazily so the DataFrame path does not require jax/econirl types.
    from econirl.core.types import Panel, Trajectory
    import jax.numpy as jnp

    trajectories = []
    # sort=False keeps users in order of first appearance in ``result``.
    for user_id, user_data in result.groupby("user_id", sort=False):
        user_data = user_data.sort_values("period")
        trajectories.append(
            Trajectory(
                states=jnp.array(user_data["state"].values, dtype=jnp.int32),
                actions=jnp.array(user_data["action"].values, dtype=jnp.int32),
                next_states=jnp.array(user_data["next_state"].values, dtype=jnp.int32),
                individual_id=int(user_id),
            )
        )
    return Panel(trajectories=trajectories)


def load_foursquare(
    as_panel: bool = False,
    min_checkins: int = 50,
    n_time_bins: int = N_TIME_BINS,
) -> pd.DataFrame:
    """
    Load the Foursquare NYC check-in dataset as a sequential venue choice problem.

    Each user's check-in sequence is converted to (state, action, next_state)
    tuples where state encodes the current venue category and time of day,
    and action is the next venue category visited. The state index is
    ``super_category_index * n_time_bins + time_bin``.

    Time-of-day (``hour``, ``time_bin``) and ``is_weekend`` are computed in
    local time when the CSV has a ``timezoneOffset`` column (minutes relative
    to UTC, per the dataset's column description); without that column they
    are computed from the UTC timestamp.

    Args:
        as_panel: If True, return as Panel object for econirl estimators.
        min_checkins: Minimum check-ins per user to include (default 50).
        n_time_bins: Number of time-of-day bins (default 4). The 24 hours are
            split as evenly as possible, also when the value does not divide 24.

    Returns:
        DataFrame with columns: user_id, period, state, action, next_state,
        super_category, next_category, time_bin, hour, is_weekend.
        If ``as_panel`` is True, a ``Panel`` built from those rows instead.

    Raises:
        ValueError: If ``n_time_bins`` is smaller than 1.
        FileNotFoundError: If the raw CSV is not present on disk.
    """
    if n_time_bins < 1:
        raise ValueError(f"n_time_bins must be a positive integer, got {n_time_bins}")

    data_path = (
        Path(__file__).parent.parent.parent.parent
        / "data" / "raw" / "foursquare" / "dataset_TSMC2014_NYC.csv"
    )
    if not data_path.exists():
        raise FileNotFoundError(
            f"Foursquare data not found at {data_path}. "
            "Download from: https://github.com/ruslansco/Foursquare-Data-Analysis"
        )

    df = pd.read_csv(data_path)

    # Parse timestamps
    df["timestamp"] = pd.to_datetime(df["utcTimestamp"], format="mixed")

    # Shift to the user's wall-clock time before extracting hour/weekday, so
    # that "night", "morning", ... describe local time rather than UTC.
    local_time = df["timestamp"]
    if "timezoneOffset" in df.columns:
        local_time = local_time + pd.to_timedelta(
            df["timezoneOffset"].fillna(0), unit="min"
        )
    df["hour"] = local_time.dt.hour
    df["is_weekend"] = local_time.dt.dayofweek.isin([5, 6]).astype(int)
    # hour * n // 24 always lies in [0, n - 1] and equals hour // (24 // n)
    # whenever n divides 24, so the default 4-bin layout is unchanged.
    df["time_bin"] = (df["hour"] * n_time_bins) // 24

    # Map to super-categories; rows with a missing category go to "Services".
    cat_map = _build_category_map(df.dropna(subset=["venueCategory"]))
    services_idx = CATEGORY_NAMES.index("Services")
    df["super_category"] = (
        df["venueCategory"].map(cat_map).fillna(services_idx).astype(int)
    )

    # Filter users with enough check-ins
    user_counts = df["userId"].value_counts()
    valid_users = user_counts[user_counts >= min_checkins].index
    df = df[df["userId"].isin(valid_users)]

    # Build sequential transitions (sorted by user, then time)
    result = _build_transitions(df, n_time_bins)

    if as_panel:
        return _to_panel(result)
    return result
def get_foursquare_info() -> dict:
    """Return metadata about the Foursquare NYC dataset.

    Returns:
        A freshly built dict describing the dataset. ``n_states`` and
        ``n_actions`` are derived from the module constants; ``categories``
        is a copy of ``CATEGORY_NAMES`` so callers can modify the returned
        metadata without altering the module-level category index mapping.
    """
    # NOTE(review): the user count, time span and transition count below are
    # hard-coded descriptive figures, not computed from the CSV — confirm them
    # against the data. The "time_bins" labels assume the default 4 bins.
    return {
        "name": "Foursquare NYC Check-ins",
        "description": "Sequential venue choice from 1,084 NYC users over 6 months",
        "source": "Yang et al. (2015), IEEE Trans. SMC",
        "url": "https://github.com/ruslansco/Foursquare-Data-Analysis",
        "n_states": N_SUPER_CATEGORIES * N_TIME_BINS,  # 40 with the defaults
        "n_actions": N_SUPER_CATEGORIES,  # 10 with the defaults
        "n_individuals": 1084,
        "n_observations": "~226K transitions",
        "state_description": "(venue_super_category, time_of_day_bin)",
        "action_description": "next venue super-category",
        "categories": list(CATEGORY_NAMES),
        "time_bins": ["night(0-6h)", "morning(6-12h)", "afternoon(12-18h)", "evening(18-24h)"],
    }