# Both runs carry the same label for every sample, so the test expects the
# stability score to be exactly 1.
test_that("classification_stability returns 1 for identical runs", {
    labels <- c("A", "B", "C")
    identical_runs <- data.frame(run1 = labels, run2 = labels)

    stability <- classification_stability(identical_runs)

    expect_equal(stability, 1)
})

# Three runs give three pairwise comparisons; the test expects the score to be
# the mean of the per-pair agreement rates.
test_that("classification_stability handles multiple runs", {
    three_runs <- data.frame(
        run1 = c("A", "A", "B"),
        run2 = c("A", "B", "B"),
        run3 = c("A", "A", "B")
    )

    # Agreement per pair of runs, out of three samples each:
    #   run1 vs run2 -> 2 of 3 labels match
    #   run1 vs run3 -> 3 of 3 labels match
    #   run2 vs run3 -> 2 of 3 labels match
    # Mean of the three rates: (2/3 + 3/3 + 2/3) / 3 = 7/9
    expected_stability <- 7 / 9

    expect_equal(classification_stability(three_runs), expected_stability)
})

# A data frame with only one run column must be rejected with an error whose
# message matches "At least two runs".
test_that("classification_stability errors on single run", {
    single_run <- data.frame(run1 = c("A", "B"))

    expect_error(
        classification_stability(single_run),
        regexp = "At least two runs"
    )
})
