## ----setup, message=FALSE-----------------------------------------------------
library(BenchHub)
library(tidyverse)
library(glmnet)

## ----import-data--------------------------------------------------------------
# import the microbiome data into a temporary environment
exampleEnv <- new.env(parent = emptyenv())
data("lubomski_microbiome_data", envir = exampleEnv, package = "BenchHub")

x <- exampleEnv[["x"]]
lubomPD <- exampleEnv[["lubomPD"]]

# check the dimension of the microbiome matrix
dim(x)

# check the length of the patient status
length(lubomPD)

# Add sample IDs so the evidence matches the dataset rows by name.
names(lubomPD) <- rownames(x)

## ----initialise-trio----------------------------------------------------------
trio <- Trio$new(data = x, 
                 evidence = list(
                   Diagnosis = list(
                     evidence = lubomPD, 
                     metrics = "Balanced Accuracy"
                     )
                   ),
                 metrics = list(
                   `Balanced Accuracy` = balAccMetric
                   ),
                 datasetID = "lubomski_microbiome"
                 )

## ----gold-standard------------------------------------------------------------
# get the gold standard from the Trio object
x <- trio$data
y <- trio$getEvidence("Diagnosis")

## ----train-test---------------------------------------------------------------
# get train and test indices
trio$split(y = y, n_fold = 2, n_repeat = 5)
CVindices <- trio$splitIndices

## ----cross-validation---------------------------------------------------------
set.seed(1234)
library(tibble)

cv_plan <- tibble(
  trainIDs = CVindices,
  fold = vapply(
    strsplit(names(CVindices), ".", fixed = TRUE),
    `[`,
    character(1),
    1
  ),
  repeat_id = vapply(
    strsplit(names(CVindices), ".", fixed = TRUE),
    `[`,
    character(1),
    2
  )
)

run_one_cv <- function(trainIDs, fold, repeat_id, trio, x, y) {
  x_train <- x[trainIDs, ]
  x_test  <- x[-trainIDs, ]
  y_train <- y[trainIDs]
  y_test  <- y[-trainIDs]

  cv_lasso <- glmnet::cv.glmnet(
    x = as.matrix(x_train),
    y = y_train,
    alpha = 1,
    family = "binomial"
  )
  lam <- cv_lasso$lambda.1se

  fit <- glmnet::glmnet(
    x = as.matrix(x_train),
    y = y_train,
    alpha = 1,
    lambda = lam,
    family = "binomial"
  )

  pred <- predict(
    fit,
    newx = as.matrix(x_test),
    s = lam,
    type = "class"
  )
  pred <- setNames(as.factor(as.vector(pred)), rownames(x_test))

  eval_res <- trio$evaluate(list(lasso = list(Diagnosis = pred)))

  # attach metadata explicitly
  eval_res$fold <- fold
  eval_res$repeat_id <- repeat_id

  eval_res
}

result_list <- vector("list", nrow(cv_plan))

for (i in seq_len(nrow(cv_plan))) {
  result_list[[i]] <- run_one_cv(
    trainIDs = cv_plan$trainIDs[[i]],
    fold = cv_plan$fold[[i]],
    repeat_id = cv_plan$repeat_id[[i]],
    trio = trio,
    x = x,
    y = y
  )
}

## ----fig.cap = "Fig.1 Mean cross-validation accuracy across repeats."---------
result <- dplyr::bind_rows(result_list)

result_summary <- result %>%
  dplyr::group_by(datasetID, method, evidence, metric, repeat_id) %>%
  dplyr::summarise(result = mean(result), .groups = "drop")


# visualise the result
boxplot(
  result_summary$result,
  ylab = "Accuracy",
  main = "Cross-validation performance"
)

## ----simulate-----------------------------------------------------------------
set.seed(1)

# generate a simulated matrix
sim <- rnbinom(nrow(x) * ncol(x), size = 1, mu = 1)
sim <- Matrix(sim, nrow = nrow(x), ncol = ncol(x))

# function that calculate the sparsity of a matrix
calc_sparsity <- function(data) {
  sparsity <- sum(data == 0) / length(data)
  return(sparsity)
}


# metric that compare the difference of two values
calc_diff <- function(evidence, predicted) {
  return(predicted - evidence)
}

## ----add-metric---------------------------------------------------------------
# add metric that we just defined
trio$addMetric(name = "Difference", metric = calc_diff)

# calculate the sparsity of the data, and then input into the supporting evidence
trio$addEvidence(name = "Sparsity", evidence = calc_sparsity, metrics = "Difference")

# because we used negative binomial to simulate a matrix
# we will name the method as "negative binomial"
# then calculate the sparsity of the simulated matrix 
# and compare with the Sparsity supporting evidence
eval_res <- trio$evaluate(
  list(negative_binomial = list(Sparsity = sim))
)

eval_res

## ----get-result---------------------------------------------------------------
# in the with cross validation result, we need to average the results from multiple repeats to give one value
result <- result %>%
  dplyr::group_by(datasetID, method, evidence, metric) %>%
  dplyr::summarize(result = mean(result))

result <- rbind(result, eval_res)
result

## ----session-info-------------------------------------------------------------
sessionInfo()

