Source code for econirl.datasets.keane_wolpin

"""Keane & Wolpin (1994) Career Decisions Dataset.

This module provides the Keane-Wolpin career choice dataset, which tracks
individuals making choices between schooling, white-collar work, blue-collar
work, and home production.

Reference:
    Keane, M. P., & Wolpin, K. I. (1994). "The Solution and Estimation of
    Discrete Choice Dynamic Programming Models by Simulation and Interpolation:
    Monte Carlo Evidence." The Review of Economics and Statistics, 76(4), 648-672.

    Keane, M. P., & Wolpin, K. I. (1997). "The Career Decisions of Young Men."
    Journal of Political Economy, 105(3), 473-522.
"""

from pathlib import Path
from typing import Literal, Optional

import numpy as np
import pandas as pd


def load_keane_wolpin(
    version: Literal["kw_94", "kw_97"] = "kw_94",
    as_panel: bool = False,
    source: Literal["respy", "bundled"] = "bundled",
) -> pd.DataFrame:
    """Load the Keane & Wolpin career decisions dataset.

    This dataset tracks individuals choosing between:
    - 0: Schooling (accumulate education)
    - 1: White-collar work (accumulate white-collar experience)
    - 2: Blue-collar work (accumulate blue-collar experience)
    - 3: Home production (no state accumulation)

    State variables include:
    - schooling: Years of completed education
    - exp_white_collar: Years of white-collar experience
    - exp_blue_collar: Years of blue-collar experience
    - age: Current age

    Args:
        version: Which version of the KW model to load
            - "kw_94": Original 1994 REStat specification
            - "kw_97": Extended 1997 JPE specification
            Only consulted when ``source="respy"``; the bundled loader
            does not receive it.
        as_panel: If True, return as Panel object for econirl estimators
        source: Data source
            - "respy": Load from respy package (if installed)
            - "bundled": Load bundled sample data

    Returns:
        DataFrame with columns:
        - id: Individual identifier
        - period: Decision period (1-indexed in the bundled sample)
        - age: Current age
        - schooling: Years of education
        - exp_white_collar: White-collar experience
        - exp_blue_collar: Blue-collar experience
        - choice: Chosen action (0-3)

        When ``as_panel`` is True the loaders return the Panel built by
        ``_to_panel`` instead of a DataFrame.

    Raises:
        ValueError: If ``source`` is neither "respy" nor "bundled".

    Example:
        >>> from econirl.datasets import load_keane_wolpin
        >>> df = load_keane_wolpin()
        >>> print(f"Individuals: {df['id'].nunique()}")
        >>> print(f"Choice distribution:\\n{df['choice'].value_counts()}")

    Notes:
        For full replication of KW94/KW97, install respy:
        `pip install respy`

        The bundled sample data is suitable for testing and tutorials.
    """
    if source == "respy":
        try:
            return _load_from_respy(version, as_panel)
        except ImportError:
            import warnings

            # stacklevel=2 attributes the warning to the caller's line
            # rather than to this module, so users can see which call
            # triggered the fallback.
            warnings.warn(
                "respy not installed. Falling back to bundled data. "
                "Install with: pip install respy",
                stacklevel=2,
            )
            # Deliberate best-effort: continue with the bundled sample.
            source = "bundled"

    if source == "bundled":
        return _load_bundled(as_panel)

    raise ValueError(f"Unknown source: {source}")
def _load_from_respy(version: str, as_panel: bool) -> pd.DataFrame: """Load data from respy package.""" import respy # Map version to respy model name model_map = { "kw_94": "kw_94_one", "kw_97": "kw_97_basic", } model_name = model_map.get(version, "kw_94_one") # Get example model with data _, _, df = respy.get_example_model(model_name, with_data=True) # respy uses MultiIndex (Identifier, Period) df = df.reset_index() # Standardize column names rename_map = { 'Identifier': 'id', 'Period': 'period', 'Age': 'age', 'Years_Of_Schooling': 'schooling', 'Experience_White_Collar': 'exp_white_collar', 'Experience_Blue_Collar': 'exp_blue_collar', 'Choice': 'choice', } df = df.rename(columns={k: v for k, v in rename_map.items() if k in df.columns}) # Ensure choice is 0-indexed if 'choice' in df.columns: if df['choice'].min() == 1: df['choice'] = df['choice'] - 1 if as_panel: return _to_panel(df) return df def _load_bundled(as_panel: bool) -> pd.DataFrame: """Load bundled sample data.""" data_path = Path(__file__).parent / "keane_wolpin_sample.csv" if not data_path.exists(): # Generate sample data if not present df = _generate_kw_sample() df.to_csv(data_path, index=False) else: df = pd.read_csv(data_path) if as_panel: return _to_panel(df) return df def _generate_kw_sample( n_individuals: int = 500, n_periods: int = 10, seed: int = 1994, ) -> pd.DataFrame: """Generate synthetic Keane-Wolpin style data.""" np.random.seed(seed) records = [] for i in range(1, n_individuals + 1): schooling = 10 + np.random.randint(0, 7) exp_white = 0 exp_blue = 0 for t in range(1, n_periods + 1): age = 16 + t # Simple choice model if age <= 22 and schooling < 16: p_school = 0.6 - 0.05 * (schooling - 10) p_school = max(0.1, min(0.8, p_school)) if np.random.random() < p_school: choice = 0 else: choice = np.random.choice([1, 2, 3], p=[0.4, 0.4, 0.2]) else: # Experience-dependent probabilities total_exp = exp_white + exp_blue p_white = 0.3 + 0.02 * exp_white p_blue = 0.3 + 0.02 * exp_blue p_home = 0.15 
p_school = max(0, 1 - p_white - p_blue - p_home) probs = np.array([p_school, p_white, p_blue, p_home]) probs = probs / probs.sum() choice = np.random.choice([0, 1, 2, 3], p=probs) records.append({ 'id': i, 'period': t, 'age': age, 'schooling': schooling, 'exp_white_collar': exp_white, 'exp_blue_collar': exp_blue, 'choice': choice, }) # State transitions if choice == 0: schooling = min(schooling + 1, 20) elif choice == 1: exp_white += 1 elif choice == 2: exp_blue += 1 return pd.DataFrame(records) def _to_panel(df: pd.DataFrame): """Convert DataFrame to Panel format.""" from econirl.core.types import Panel, Trajectory import jax.numpy as jnp # Create composite state from individual state variables # For KW, state = (schooling, exp_white, exp_blue) encoded as single int # This is a simplification; full implementation would use multi-dimensional states def encode_state(row): """Encode state tuple as single integer.""" # schooling: 0-20, exp_white: 0-30, exp_blue: 0-30 return (row['schooling'] * 31 * 31 + row['exp_white_collar'] * 31 + row['exp_blue_collar']) df = df.copy() df['state'] = df.apply(encode_state, axis=1) trajectories = [] for ind_id in df['id'].unique(): ind_data = df[df['id'] == ind_id].sort_values('period') states = jnp.array(ind_data['state'].values, dtype=jnp.int32) actions = jnp.array(ind_data['choice'].values, dtype=jnp.int32) next_states = jnp.concatenate([states[1:], states[-1:]]) traj = Trajectory( states=states, actions=actions, next_states=next_states, individual_id=int(ind_id), ) trajectories.append(traj) return Panel(trajectories=trajectories)
def get_keane_wolpin_info() -> dict:
    """Get metadata about the Keane-Wolpin dataset.

    The counts are computed from the frame returned by
    ``load_keane_wolpin()`` called with its default arguments.

    Returns:
        Dictionary with keys:
        - name: Human-readable dataset name
        - n_observations: Number of rows
        - n_individuals: Number of distinct ids
        - n_periods: Largest period value
        - n_choices: Number of distinct choices present in the data
        - choices: Mapping from choice code to label
        - state_variables: Names of the state columns
        - reference: Citation string
    """
    df = load_keane_wolpin()

    return {
        "name": "Keane & Wolpin (1994/1997) Career Decisions",
        # Cast to built-in int: pandas reductions such as .max() return
        # NumPy scalars, which json.dumps cannot serialise.
        "n_observations": int(len(df)),
        "n_individuals": int(df['id'].nunique()),
        "n_periods": int(df['period'].max()),
        "n_choices": int(df['choice'].nunique()),
        "choices": {
            0: "Schooling",
            1: "White-collar work",
            2: "Blue-collar work",
            3: "Home production",
        },
        "state_variables": ["schooling", "exp_white_collar", "exp_blue_collar"],
        "reference": "Keane & Wolpin (1994). REStat, 76(4), 648-672.",
    }