# Source code for econirl.datasets.occupational_choice

"""
Synthetic Occupational Choice Dataset (Keane-Wolpin style).

This module provides a synthetic dataset for occupational choice problems,
inspired by Keane & Wolpin (1997) "The Career Decisions of Young Men."

The data represents individuals making career choices over their working lives:
- State: (education_level, experience, age) discretized
- Actions: 0=school, 1=white_collar, 2=blue_collar, 3=home

Reference:
    Keane, M. P., & Wolpin, K. I. (1997). "The Career Decisions of Young Men."
    Journal of Political Economy, 105(3), 473-522.
"""

import numpy as np
import pandas as pd


def load_occupational_choice(
    n_individuals: int = 500,
    n_periods: int = 40,
    as_panel: bool = False,
    seed: int = 1997,
) -> pd.DataFrame:
    """
    Load synthetic occupational choice data (Keane-Wolpin style).

    This dataset represents individuals making career choices over their
    working lives. The state space combines education level, work experience,
    and age group into 100 discrete states. Individuals choose between
    continuing school, working in white-collar or blue-collar jobs, or
    staying home.

    Args:
        n_individuals: Number of individuals to generate (default: 500)
        n_periods: Number of time periods per individual (default: 40)
        as_panel: If True, return data structured as a Panel object
            compatible with econirl estimators. If False (default),
            return as a pandas DataFrame.
        seed: Random seed for reproducibility (default: 1997)

    Returns:
        When ``as_panel`` is False, a DataFrame with columns:
            - id: Individual identifier
            - period: Time period (0-indexed)
            - state: Discretized state index (0-99)
            - action: Chosen action (0=school, 1=white_collar,
              2=blue_collar, 3=home)
            - education: Education level (0-4)
            - experience: Work experience (0-9, capped)
            - age: Age group (0=young, 1=old)

        When ``as_panel`` is True, a ``Panel`` holding one ``Trajectory``
        per individual (states, actions, next_states, individual_id).

    Example:
        >>> from econirl.datasets import load_occupational_choice
        >>> df = load_occupational_choice()
        >>> print(f"Observations: {len(df):,}")
        >>> print(f"Individuals: {df['id'].nunique()}")
        >>> print(f"States: {df['state'].nunique()}")

        >>> # Get as Panel for estimation
        >>> panel = load_occupational_choice(as_panel=True)
        >>> print(f"Panel with {panel.num_individuals} individuals")

    Notes:
        State encoding:
            state = education * 20 + experience * 2 + age_group
        where age_group = min(age // 20, 1) and age counts elapsed periods.
        This gives 5 * 10 * 2 = 100 discrete states.

        Action interpretation:
            - 0 (school): Continue education, increases education level
            - 1 (white_collar): Work in white-collar job, increases experience
            - 2 (blue_collar): Work in blue-collar job, increases experience
            - 3 (home): Stay home (unemployment, family care, etc.)

        Panel conversion: ``next_states`` is ``states`` shifted by one
        period. The final period has no observed successor, so its entry is
        the placeholder value 0 (not a real transition).
    """
    df = _generate_occupational_choice_data(n_individuals, n_periods, seed)
    if not as_panel:
        return df

    # Imported lazily so DataFrame-only callers do not need JAX or
    # econirl.core to be importable.
    from econirl.core.types import Panel, Trajectory
    import jax.numpy as jnp

    if df.empty:
        # Nothing was generated (zero individuals or zero periods); there is
        # no "id" column to group on in that case.
        # NOTE(review): assumes Panel accepts an empty trajectory list.
        return Panel(trajectories=[])

    trajectories = []
    # One pass over the frame instead of a boolean-mask filter per
    # individual; sort=False keeps ids in order of first appearance, which
    # matches iterating over df["id"].unique().
    for ind_id, ind_data in df.groupby("id", sort=False):
        ind_data = ind_data.sort_values("period")
        states = jnp.asarray(ind_data["state"].to_numpy(), dtype=jnp.int32)
        actions = jnp.asarray(ind_data["action"].to_numpy(), dtype=jnp.int32)

        # Shift states by one period; pad the last period with 0. The pad
        # uses states.dtype so the result stays int32 even if JAX x64 mode
        # is enabled.
        next_states = jnp.concatenate(
            [states[1:], jnp.zeros(1, dtype=states.dtype)]
        )

        trajectories.append(
            Trajectory(
                states=states,
                actions=actions,
                next_states=next_states,
                individual_id=int(ind_id),
            )
        )

    return Panel(trajectories=trajectories)
# State-space layout shared by the generator and the metadata helper.
_N_EDUCATION_LEVELS = 5   # education bins 0-4
_N_EXPERIENCE_BINS = 10   # experience bins 0-9
_N_AGE_GROUPS = 2         # 0 = young, 1 = old
_OLD_AGE_THRESHOLD = 20   # periods lived before an individual counts as "old"
_N_ACTIONS = 4            # school, white_collar, blue_collar, home
_DATA_COLUMNS = (
    "id",
    "period",
    "state",
    "action",
    "education",
    "experience",
    "age",
)


def _generate_occupational_choice_data(
    n_individuals: int,
    n_periods: int,
    seed: int,
) -> pd.DataFrame:
    """
    Generate synthetic occupational choice data.

    Every individual starts with no education, no experience and age 0, then
    draws one action per period from a softmax over state-dependent logits:
        - Higher education increases white-collar job probability
        - Experience increases employment probability
        - Age affects schooling eligibility and choice patterns

    Args:
        n_individuals: Number of individuals to simulate.
        n_periods: Number of periods simulated per individual.
        seed: Seed for the random draws.

    Returns:
        DataFrame with one row per (individual, period) and columns
        id, period, state, action, education, experience, age (age group).
        The columns are present even when no rows are generated.
    """
    # A private RandomState yields the same draws as seeding the global
    # NumPy RNG with `seed`, without clobbering the caller's random state.
    rng = np.random.RandomState(seed)

    max_education = _N_EDUCATION_LEVELS - 1
    max_experience = _N_EXPERIENCE_BINS - 1

    records = []
    for ind_id in range(n_individuals):
        # Initial state: everyone starts young with no education/experience.
        education = 0
        experience = 0
        age = 0  # age measured in elapsed periods

        for period in range(n_periods):
            # Discretize the current state.
            age_group = min(age // _OLD_AGE_THRESHOLD, _N_AGE_GROUPS - 1)
            exp_bin = min(experience, max_experience)
            edu_bin = min(education, max_education)
            # Equivalent to edu_bin * 20 + exp_bin * 2 + age_group.
            state = (
                edu_bin * _N_EXPERIENCE_BINS + exp_bin
            ) * _N_AGE_GROUPS + age_group

            logits = _compute_choice_logits(education, experience, age, rng=rng)

            # Softmax; subtracting the max keeps exp() from overflowing.
            exp_logits = np.exp(logits - np.max(logits))
            probs = exp_logits / exp_logits.sum()

            action = int(rng.choice(_N_ACTIONS, p=probs))

            records.append(
                {
                    "id": ind_id,
                    "period": period,
                    "state": state,
                    "action": action,
                    "education": edu_bin,
                    "experience": exp_bin,
                    "age": age_group,
                }
            )

            # State transition based on the chosen action.
            if action == 0:  # school
                education = min(education + 1, max_education)
            elif action in (1, 2):  # white-collar or blue-collar work
                experience = min(experience + 1, max_experience)
            # action == 3 (home): education and experience are unchanged

            age += 1

    # Explicit columns keep the schema intact when `records` is empty.
    return pd.DataFrame(records, columns=list(_DATA_COLUMNS))


def _compute_choice_logits(
    education: int,
    experience: int,
    age: int,
    rng=None,
) -> np.ndarray:
    """
    Compute choice logits based on current state.

    Returns logits for [school, white_collar, blue_collar, home]. Each logit
    is a linear function of the state plus a small Gaussian perturbation
    (standard deviation 0.1), except that school is fixed at -10.0 once
    ``age`` reaches the old-age threshold:
        - School is attractive when young and less educated
        - White-collar rises most steeply with education
        - Blue-collar depends mostly on experience
        - Home becomes less attractive as human capital grows

    Args:
        education: Current education level.
        experience: Current work experience.
        age: Age in elapsed periods.
        rng: Object providing ``standard_normal()`` (for example a
            ``numpy.random.RandomState``). Defaults to the global
            ``numpy.random`` module state.

    Returns:
        Array of 4 logits in action order.
    """
    if rng is None:
        rng = np.random

    def noise() -> float:
        return 0.1 * rng.standard_normal()

    logits = np.zeros(_N_ACTIONS)

    # School (action 0): only available while young; no noise is drawn for
    # it afterwards, so the draw order below must stay as written.
    if age < _OLD_AGE_THRESHOLD:
        logits[0] = 2.0 - 0.5 * education + noise()
    else:
        logits[0] = -10.0  # effectively impossible after the age threshold

    # White-collar (action 1): more attractive with higher education.
    logits[1] = -1.0 + 0.8 * education + 0.2 * experience + noise()

    # Blue-collar (action 2): less dependent on education, more on experience.
    logits[2] = 0.5 + 0.1 * education + 0.3 * experience + noise()

    # Home (action 3): base option, less attractive with more human capital.
    logits[3] = -0.5 - 0.2 * education - 0.2 * experience + noise()

    return logits


def get_occupational_choice_info() -> dict:
    """
    Get metadata about the occupational choice dataset.

    Returns:
        Dictionary with the dataset name, the number of states and actions,
        descriptions of the state components and actions, and the literature
        reference.
    """
    return {
        "name": "Synthetic Occupational Choice (Keane-Wolpin style)",
        "num_states": _N_EDUCATION_LEVELS * _N_EXPERIENCE_BINS * _N_AGE_GROUPS,
        "num_actions": _N_ACTIONS,
        "state_description": {
            "education": "Education level (0-4)",
            "experience": "Work experience (0-9)",
            "age_group": "Age category (0-1: young/old)",
        },
        "action_description": {
            0: "school",
            1: "white_collar",
            2: "blue_collar",
            3: "home",
        },
        "reference": "Keane & Wolpin (1997). Journal of Political Economy, 105(3), 473-522.",
    }