# Source code for econirl.datasets.keane_wolpin

"""Keane & Wolpin (1994) Career Decisions Dataset.

This module provides the Keane-Wolpin career choice dataset, which tracks
individuals making choices between schooling, white-collar work, blue-collar
work, and home production.

Reference:
    Keane, M. P., & Wolpin, K. I. (1994). "The Solution and Estimation of
    Discrete Choice Dynamic Programming Models by Simulation and Interpolation:
    Monte Carlo Evidence." The Review of Economics and Statistics, 76(4), 648-672.

    Keane, M. P., & Wolpin, K. I. (1997). "The Career Decisions of Young Men."
    Journal of Political Economy, 105(3), 473-522.
"""

from pathlib import Path
from typing import Literal, Optional

import numpy as np
import pandas as pd



# [docs]
def load_keane_wolpin(
    version: Literal["kw_94", "kw_97"] = "kw_94",
    as_panel: bool = False,
    source: Literal["respy", "bundled"] = "bundled",
) -> pd.DataFrame:
    """Load the Keane & Wolpin career decisions dataset.

    This dataset tracks individuals choosing between:
    - 0: Schooling (accumulate education)
    - 1: White-collar work (accumulate white-collar experience)
    - 2: Blue-collar work (accumulate blue-collar experience)
    - 3: Home production (no state accumulation)

    State variables include:
    - schooling: Years of completed education
    - exp_white_collar: Years of white-collar experience
    - exp_blue_collar: Years of blue-collar experience
    - age: Current age

    Args:
        version: Which version of the KW model to load. Only consulted when
            ``source="respy"``; the bundled sample ignores it.
            - "kw_94": Original 1994 REStat specification
            - "kw_97": Extended 1997 JPE specification
        as_panel: If True, return as Panel object for econirl estimators
        source: Data source
            - "respy": Load from respy package (if installed). When the
              import fails, a warning is emitted and the bundled sample is
              loaded instead.
            - "bundled": Load bundled sample data

    Returns:
        When ``as_panel`` is False, a DataFrame with columns:
            - id: Individual identifier
            - period: Decision period (1-indexed in the bundled sample)
            - age: Current age
            - schooling: Years of education
            - exp_white_collar: White-collar experience
            - exp_blue_collar: Blue-collar experience
            - choice: Chosen action (0-3)
        When ``as_panel`` is True, the Panel object built by ``_to_panel``
        from that DataFrame.

    Raises:
        ValueError: If ``source`` is neither "respy" nor "bundled".

    Example:
        >>> from econirl.datasets import load_keane_wolpin
        >>> df = load_keane_wolpin()
        >>> print(f"Individuals: {df['id'].nunique()}")
        >>> print(f"Choice distribution:\\n{df['choice'].value_counts()}")

    Notes:
        For full replication of KW94/KW97, install respy:
        `pip install respy`

        The bundled sample data is suitable for testing and tutorials.
    """
    if source == "respy":
        try:
            return _load_from_respy(version, as_panel)
        except ImportError:
            import warnings

            # stacklevel=2 attributes the warning to the caller's line rather
            # than to this module, so users can see which call fell back.
            warnings.warn(
                "respy not installed. Falling back to bundled data. "
                "Install with: pip install respy",
                stacklevel=2,
            )
            source = "bundled"

    if source == "bundled":
        return _load_bundled(as_panel)

    raise ValueError(f"Unknown source: {source}")



def _load_from_respy(version: str, as_panel: bool) -> pd.DataFrame:
    """Load example data from the respy package and standardize its columns.

    Args:
        version: "kw_94" or "kw_97". Any other value triggers a warning and
            falls back to the KW94 example model.
        as_panel: If True, pass the standardized frame through ``_to_panel``
            and return its result instead of the DataFrame.

    Returns:
        The standardized DataFrame, or the result of ``_to_panel`` when
        ``as_panel`` is True.

    Raises:
        ImportError: If respy cannot be imported.
    """
    import warnings

    # Map version to respy model name
    model_map = {
        "kw_94": "kw_94_one",
        "kw_97": "kw_97_basic",
    }

    if version in model_map:
        model_name = model_map[version]
    else:
        # Keep the historical fallback, but make it visible: silently loading
        # KW94 for a mistyped version would hand back the wrong dataset.
        # stacklevel=3 targets the caller of load_keane_wolpin().
        warnings.warn(
            f"Unknown version {version!r}; falling back to 'kw_94'. "
            f"Expected one of {sorted(model_map)}.",
            stacklevel=3,
        )
        model_name = "kw_94_one"

    import respy

    # Get example model with data; only the third returned item is used here.
    _, _, df = respy.get_example_model(model_name, with_data=True)

    # respy uses MultiIndex (Identifier, Period)
    df = df.reset_index()

    # Standardize column names.
    # NOTE(review): the keys below are assumed respy column labels; any key
    # that is absent is skipped, so the frame may keep respy's own names.
    # Verify against the respy version in use.
    rename_map = {
        'Identifier': 'id',
        'Period': 'period',
        'Age': 'age',
        'Years_Of_Schooling': 'schooling',
        'Experience_White_Collar': 'exp_white_collar',
        'Experience_Blue_Collar': 'exp_blue_collar',
        'Choice': 'choice',
    }
    present = {old: new for old, new in rename_map.items() if old in df.columns}
    df = df.rename(columns=present)

    # Ensure choice is 0-indexed: shift only when the smallest code is 1.
    if 'choice' in df.columns and df['choice'].min() == 1:
        df['choice'] = df['choice'] - 1

    if as_panel:
        return _to_panel(df)

    return df


def _load_bundled(as_panel: bool) -> pd.DataFrame:
    """Load the bundled sample, generating it when the CSV is missing.

    The sample is read from ``keane_wolpin_sample.csv`` next to this module.
    If that file does not exist, the data is produced by
    ``_generate_kw_sample()`` and a best-effort attempt is made to cache it
    at the same path.

    Args:
        as_panel: If True, pass the frame through ``_to_panel`` and return
            its result instead of the DataFrame.

    Returns:
        The sample DataFrame, or the result of ``_to_panel`` when
        ``as_panel`` is True.
    """
    data_path = Path(__file__).parent / "keane_wolpin_sample.csv"

    try:
        # EAFP: read directly instead of exists()-then-read.
        df = pd.read_csv(data_path)
    except FileNotFoundError:
        # Generate sample data if not present
        df = _generate_kw_sample()
        try:
            df.to_csv(data_path, index=False)
        except OSError:
            # The package directory may be read-only (e.g. a system-wide
            # install). The cache is only an optimisation, so a failed write
            # must not prevent returning the freshly generated data.
            pass

    if as_panel:
        return _to_panel(df)

    return df


def _generate_kw_sample(
    n_individuals: int = 500,
    n_periods: int = 10,
    seed: int = 1994,
) -> pd.DataFrame:
    """Generate synthetic Keane-Wolpin style data.

    Each individual starts with 10-16 years of schooling and zero work
    experience, then makes one choice per period. Rows record the state
    *before* the period's choice is applied.

    Args:
        n_individuals: Number of simulated individuals (ids run from 1).
        n_periods: Number of periods per individual (periods run from 1;
            age is ``16 + period``).
        seed: Seed for the random stream.

    Returns:
        DataFrame with columns id, period, age, schooling, exp_white_collar,
        exp_blue_collar, choice. An empty sample still carries these columns.
    """
    # A private legacy RandomState yields the same draws as
    # np.random.seed(seed) followed by np.random.* calls, but leaves the
    # caller's global NumPy random state untouched.
    rng = np.random.RandomState(seed)

    columns = [
        'id',
        'period',
        'age',
        'schooling',
        'exp_white_collar',
        'exp_blue_collar',
        'choice',
    ]

    records = []
    for i in range(1, n_individuals + 1):
        schooling = 10 + rng.randint(0, 7)
        exp_white = 0
        exp_blue = 0

        for t in range(1, n_periods + 1):
            age = 16 + t

            # Simple choice model
            if age <= 22 and schooling < 16:
                # Schooling probability falls with years already completed,
                # clamped to [0.1, 0.8].
                p_school = 0.6 - 0.05 * (schooling - 10)
                p_school = max(0.1, min(0.8, p_school))
                if rng.random_sample() < p_school:
                    choice = 0
                else:
                    choice = rng.choice([1, 2, 3], p=[0.4, 0.4, 0.2])
            else:
                # Experience-dependent probabilities
                p_white = 0.3 + 0.02 * exp_white
                p_blue = 0.3 + 0.02 * exp_blue
                p_home = 0.15
                p_school = max(0, 1 - p_white - p_blue - p_home)
                probs = np.array([p_school, p_white, p_blue, p_home])
                # Renormalise: the work terms can push the sum above 1.
                probs = probs / probs.sum()
                choice = rng.choice([0, 1, 2, 3], p=probs)

            records.append({
                'id': i,
                'period': t,
                'age': age,
                'schooling': schooling,
                'exp_white_collar': exp_white,
                'exp_blue_collar': exp_blue,
                'choice': choice,
            })

            # State transitions (home production, choice 3, changes nothing)
            if choice == 0:
                schooling = min(schooling + 1, 20)
            elif choice == 1:
                exp_white += 1
            elif choice == 2:
                exp_blue += 1

    return pd.DataFrame(records, columns=columns)


def _to_panel(df: pd.DataFrame):
    """Convert DataFrame to Panel format.

    The state tuple (schooling, exp_white_collar, exp_blue_collar) is packed
    into a single integer using base 31:
    ``schooling * 31 * 31 + exp_white_collar * 31 + exp_blue_collar``.
    The packing is collision-free only while both experience values stay in
    0-30. This is a simplification; a full implementation would use
    multi-dimensional states.

    One Trajectory is built per distinct ``id`` (in order of first
    appearance), with rows ordered by ``period``. ``next_states`` is the
    state sequence shifted by one; because no successor is observed for the
    final row, its next state repeats the final state.

    Args:
        df: Frame with columns id, period, schooling, exp_white_collar,
            exp_blue_collar and choice. It is not modified.

    Returns:
        A Panel holding one Trajectory per individual.
    """
    from econirl.core.types import Panel, Trajectory
    import jax.numpy as jnp

    df = df.copy()
    # Vectorised column arithmetic instead of a row-wise apply: same values,
    # and it also works on an empty frame.
    df['state'] = (
        df['schooling'] * (31 * 31)
        + df['exp_white_collar'] * 31
        + df['exp_blue_collar']
    )

    trajectories = []
    # One pass over the groups instead of re-filtering the frame for each id;
    # sort=False keeps ids in order of first appearance.
    for ind_id, ind_data in df.groupby('id', sort=False):
        ind_data = ind_data.sort_values('period')

        states = jnp.array(ind_data['state'].values, dtype=jnp.int32)
        actions = jnp.array(ind_data['choice'].values, dtype=jnp.int32)
        # Shift by one; the last entry repeats the final state.
        next_states = jnp.concatenate([states[1:], states[-1:]])

        trajectories.append(
            Trajectory(
                states=states,
                actions=actions,
                next_states=next_states,
                individual_id=int(ind_id),
            )
        )

    return Panel(trajectories=trajectories)



# [docs]
def get_keane_wolpin_info() -> dict:
    """Get metadata about the Keane-Wolpin dataset.

    The counts are computed from the frame returned by
    ``load_keane_wolpin()`` with its default arguments; the choice labels,
    state-variable names and reference are fixed.

    Returns:
        Dict with keys name, n_observations, n_individuals, n_periods,
        n_choices, choices, state_variables and reference. All counts are
        builtin ``int`` values so the dict can be serialised (e.g. to JSON).
    """
    df = load_keane_wolpin()

    # Series.max() yields a NumPy scalar (NaN on an empty frame); normalise
    # to a plain int and report 0 periods when there are no rows.
    n_periods = int(df['period'].max()) if len(df) else 0

    return {
        "name": "Keane & Wolpin (1994/1997) Career Decisions",
        "n_observations": len(df),
        "n_individuals": int(df['id'].nunique()),
        "n_periods": n_periods,
        # Number of distinct choices observed in the data.
        "n_choices": int(df['choice'].nunique()),
        "choices": {
            0: "Schooling",
            1: "White-collar work",
            2: "Blue-collar work",
            3: "Home production",
        },
        "state_variables": ["schooling", "exp_white_collar", "exp_blue_collar"],
        "reference": "Keane & Wolpin (1994). REStat, 76(4), 648-672.",
    }