Source code for econirl.datasets.lsw_synthetic

"""Lee-Sudhir-Wang serialized fiction reading dataset, semi-synthetic.

This module ships a documented semi-synthetic mirror of the empirical
setting in Lee, Sudhir, and Wang (2026), who study sequential reading
decisions on a serialized fiction platform. The original platform data
is not redistributable so the loader generates a panel from a
parametric data-generating process whose state space, action set, and
latent-type mixture mirror the source paper.

State representation (simplified to a finite integer index):
    chapter_index in {0, ..., n_chapters_per_book - 1}
    wait_bin     in {0, 1, 2, 3, 4}      (discretized days since release)
    pricing      in {0, 1}                (0 = discount window, 1 = full price)
    prev_paid    in {0, 1}                (1 if user paid for the previous chapter)

Action set:
    a = 0  pay-and-read
    a = 1  wait-and-read
    a = 2  exit the current book

Latent types (two dominant segments from Lee-Sudhir-Wang 2026):
    z = 0  high-patience, monetization-focused (mixture weight 0.4)
    z = 1  budget-conscious, patient            (mixture weight 0.6)

Identification anchor:
    Action a = 2 (exit) carries utility zero per type. Type-specific
    utilities for pay-and-read and wait-and-read are identified
    relative to exit. This mirrors the anchor assumption in the
    source paper.

Reference:
    Lee, Y.-J., Sudhir, K., and Wang, Y. (2026). "Adversarial Inverse
    Reinforcement Learning with Unobserved Heterogeneity in Sequential
    Content Consumption." Working paper.
"""

from __future__ import annotations

import json
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd


# ---------------------------------------------------------------------------
# Parametric data-generating process
# ---------------------------------------------------------------------------

# Per-type utility coefficients on (pay, wait, exit). Exit is the anchor
# at zero. The remaining two coefficients are intercepts per action per
# type. Calibrated below to yield a marginal purchase rate near six
# percent under the default mixture weight and price draws.
_DEFAULT_THETA = {
    0: {"alpha_pay": -1.7, "alpha_wait": -3.0},  # high-patience, pays
    1: {"alpha_pay": -5.0, "alpha_wait": -2.0},  # budget-conscious, waits
}

# Shared coefficients on observed state covariates.
_BETA_PRICE = 0.5  # additional disutility on pay when pricing == 1
_BETA_WAIT = 0.3   # additional disutility on wait per wait_bin

# Calibration tolerance for the marginal purchase rate. Generation
# raises if the simulated rate falls outside [target - tol, target + tol].
_PURCHASE_RATE_TARGET = 0.06
_PURCHASE_RATE_TOL = 0.03

# State-space dimensions.
_N_WAIT_BINS = 5
_N_PRICING = 2
_N_PREV_PAID = 2

# Action constants.
_A_PAY = 0
_A_WAIT = 1
_A_EXIT = 2


def _encode_state(
    chapter_index: int, wait_bin: int, pricing: int, prev_paid: int,
    n_chapters_per_book: int,
) -> int:
    """Pack the four state components into a single integer index."""
    state = (
        chapter_index * (_N_WAIT_BINS * _N_PRICING * _N_PREV_PAID)
        + wait_bin * (_N_PRICING * _N_PREV_PAID)
        + pricing * _N_PREV_PAID
        + prev_paid
    )
    return state


def _utility(
    action: int,
    pricing: int,
    wait_bin: int,
    type_z: int,
    theta: dict,
) -> float:
    """Per-action utility under type z and the current state covariates."""
    if action == _A_EXIT:
        return 0.0
    if action == _A_PAY:
        return theta[type_z]["alpha_pay"] - _BETA_PRICE * pricing
    if action == _A_WAIT:
        return theta[type_z]["alpha_wait"] - _BETA_WAIT * wait_bin
    raise ValueError(f"Unknown action: {action}")


def _generate_lsw_synthetic(
    n_users: int,
    n_books: int,
    n_chapters_per_book: int,
    pi_pay_segment: float,
    seed: int,
    theta: dict,
) -> tuple[pd.DataFrame, dict]:
    """Generate the bundled CSV and metadata for the dataset.

    Returns the trajectory DataFrame and a metadata dictionary that
    declares the latent type weights, the type-specific utility
    coefficients, and the calibrated marginal purchase rate.
    """
    rng = np.random.default_rng(seed)
    n_states = (
        n_chapters_per_book * _N_WAIT_BINS * _N_PRICING * _N_PREV_PAID
    )

    records = []
    for user_id in range(n_users):
        # Draw latent type once per user.
        type_z = int(rng.random() < (1.0 - pi_pay_segment))
        # type_z = 0 with probability pi_pay_segment (pay segment)
        # type_z = 1 otherwise (wait segment)

        # Draw a content embedding once per user (carried as features
        # but not part of the integer state encoding).
        embedding = rng.standard_normal(size=4).astype(np.float32)

        # Each user reads up to n_books books. They may exit early.
        for book_id in range(n_books):
            prev_paid = 0
            wait_bin = int(rng.integers(0, _N_WAIT_BINS))
            for chapter_index in range(n_chapters_per_book):
                pricing = int(rng.integers(0, _N_PRICING))
                state = _encode_state(
                    chapter_index, wait_bin, pricing, prev_paid,
                    n_chapters_per_book,
                )

                # Compute type-conditional choice probabilities.
                utilities = np.array(
                    [
                        _utility(_A_PAY, pricing, wait_bin, type_z, theta),
                        _utility(_A_WAIT, pricing, wait_bin, type_z, theta),
                        _utility(_A_EXIT, pricing, wait_bin, type_z, theta),
                    ]
                )
                shifted = utilities - np.max(utilities)
                probs = np.exp(shifted) / np.sum(np.exp(shifted))
                action = int(rng.choice(3, p=probs))

                # Update state covariates for the next period.
                if action == _A_EXIT:
                    next_chapter = 0
                    next_wait = 0
                    next_pricing = 0
                    next_prev_paid = 0
                else:
                    next_chapter = min(
                        chapter_index + 1, n_chapters_per_book - 1
                    )
                    next_wait = (
                        max(0, wait_bin - 1) if action == _A_WAIT
                        else int(rng.integers(0, _N_WAIT_BINS))
                    )
                    next_pricing = int(rng.integers(0, _N_PRICING))
                    next_prev_paid = 1 if action == _A_PAY else 0

                next_state = _encode_state(
                    next_chapter, next_wait, next_pricing, next_prev_paid,
                    n_chapters_per_book,
                )

                records.append(
                    {
                        "user_id": user_id,
                        "book_id": book_id,
                        "chapter_index": chapter_index,
                        "wait_bin": wait_bin,
                        "pricing": pricing,
                        "prev_paid": prev_paid,
                        "state": state,
                        "action": action,
                        "next_state": next_state,
                        "latent_type": type_z,
                        "embedding_0": float(embedding[0]),
                        "embedding_1": float(embedding[1]),
                        "embedding_2": float(embedding[2]),
                        "embedding_3": float(embedding[3]),
                    }
                )

                if action == _A_EXIT:
                    break

                wait_bin = next_wait
                pricing = next_pricing
                prev_paid = next_prev_paid

    df = pd.DataFrame(records)
    purchase_rate = float((df["action"] == _A_PAY).mean())

    if abs(purchase_rate - _PURCHASE_RATE_TARGET) > _PURCHASE_RATE_TOL:
        raise RuntimeError(
            f"Calibration check failed: simulated purchase rate "
            f"{purchase_rate:.4f} is outside the target window "
            f"[{_PURCHASE_RATE_TARGET - _PURCHASE_RATE_TOL:.4f}, "
            f"{_PURCHASE_RATE_TARGET + _PURCHASE_RATE_TOL:.4f}]. "
            "Adjust _DEFAULT_THETA in lsw_synthetic.py."
        )

    metadata = {
        "n_users": n_users,
        "n_books": n_books,
        "n_chapters_per_book": n_chapters_per_book,
        "pi_pay_segment": pi_pay_segment,
        "pi_wait_segment": 1.0 - pi_pay_segment,
        "n_states": n_states,
        "n_actions": 3,
        "action_labels": {"0": "pay", "1": "wait", "2": "exit"},
        "anchor_action": _A_EXIT,
        "type_theta": {
            "0": dict(theta[0]),
            "1": dict(theta[1]),
        },
        "beta_price": _BETA_PRICE,
        "beta_wait": _BETA_WAIT,
        "n_wait_bins": _N_WAIT_BINS,
        "n_pricing": _N_PRICING,
        "n_prev_paid": _N_PREV_PAID,
        "simulated_purchase_rate": purchase_rate,
        "purchase_rate_target": _PURCHASE_RATE_TARGET,
        "purchase_rate_tol": _PURCHASE_RATE_TOL,
        "seed": seed,
        "discount_factor": 0.95,
    }
    return df, metadata


# ---------------------------------------------------------------------------
# Public loader
# ---------------------------------------------------------------------------


[docs]
def load_lsw_synthetic(
    n_users: int = 5000,
    n_books: int = 50,
    n_chapters_per_book: int = 30,
    pi_pay_segment: float = 0.4,
    seed: int = 42,
    as_panel: bool = False,
) -> pd.DataFrame:
    """Load or generate the LSW serialized-content semi-synthetic panel.

    Args:
        n_users: Number of simulated users in the panel.
        n_books: Maximum number of books each user encounters before
            running out of content. Users may exit a book early.
        n_chapters_per_book: Number of chapters per book.
        pi_pay_segment: Population mixture weight on the high-patience
            pay-and-read latent type. Default 0.4 matches the source
            paper's reported relative segment shares.
        seed: Random seed for reproducibility.
        as_panel: If True, return a Panel object whose metadata field
            carries the data-generating process parameters. If False,
            return a pandas DataFrame.

    Returns:
        DataFrame with one row per chapter decision, including the
        integer state encoding, the chosen action, the next state,
        the latent type, and a four-dimensional content embedding.
        If `as_panel=True`, a Panel object whose metadata declares
        the type-specific reward coefficients.
    """
    if not 0.0 <= pi_pay_segment <= 1.0:
        raise ValueError(
            f"pi_pay_segment must be in [0, 1], got {pi_pay_segment}"
        )

    csv_path = Path(__file__).parent / "lsw_synthetic_data.csv"
    meta_path = Path(__file__).parent / "lsw_synthetic_metadata.json"

    use_cache = (
        csv_path.exists()
        and meta_path.exists()
        and n_users == 5000
        and n_books == 50
        and n_chapters_per_book == 30
        and pi_pay_segment == 0.4
        and seed == 42
    )

    if use_cache:
        df = pd.read_csv(csv_path)
        with open(meta_path) as f:
            metadata = json.load(f)
    else:
        df, metadata = _generate_lsw_synthetic(
            n_users=n_users,
            n_books=n_books,
            n_chapters_per_book=n_chapters_per_book,
            pi_pay_segment=pi_pay_segment,
            seed=seed,
            theta=_DEFAULT_THETA,
        )

    if as_panel:
        from econirl.core.types import Panel, Trajectory
        import jax.numpy as jnp

        trajectories = []
        # Each (user_id, book_id) pair is one trajectory.
        for (uid, bid), group in df.groupby(["user_id", "book_id"]):
            group = group.sort_values("chapter_index")
            states = jnp.array(group["state"].values, dtype=jnp.int32)
            actions = jnp.array(group["action"].values, dtype=jnp.int32)
            next_states = jnp.array(
                group["next_state"].values, dtype=jnp.int32
            )
            trajectories.append(
                Trajectory(
                    states=states,
                    actions=actions,
                    next_states=next_states,
                    individual_id=int(uid * 10_000 + bid),
                    metadata={
                        "user_id": int(uid),
                        "book_id": int(bid),
                        "latent_type": int(group["latent_type"].iloc[0]),
                    },
                )
            )

        return Panel(trajectories=trajectories, metadata=metadata)

    return df




[docs]
def get_lsw_synthetic_info() -> dict:
    """Return metadata about the LSW serialized-content dataset.

    Returns the type-specific reward coefficients, the population
    mixture weights, the discount factor, and the action labels used
    by the AIRL-Het EM loop to validate recovered parameters.
    """
    meta_path = Path(__file__).parent / "lsw_synthetic_metadata.json"
    static = {
        "name": (
            "Lee-Sudhir-Wang serialized fiction reading "
            "(semi-synthetic mirror)"
        ),
        "n_actions": 3,
        "action_labels": {"0": "pay", "1": "wait", "2": "exit"},
        "anchor_action": _A_EXIT,
        "ground_truth": True,
        "use_case": (
            "AIRL-Het with two latent types under a calibrated "
            "marginal purchase rate"
        ),
        "reference": (
            "Lee, Y.-J., Sudhir, K., and Wang, Y. (2026). Adversarial "
            "Inverse Reinforcement Learning with Unobserved Heterogeneity "
            "in Sequential Content Consumption. Working paper."
        ),
    }
    if meta_path.exists():
        with open(meta_path) as f:
            dynamic = json.load(f)
        static.update(dynamic)
    return static