######## Init GRN ########

#' Create and initialize an empty \code{\linkS4class{GRN}} object.
#' 
#' Executing this function is the very first step in the *GRaNIE* workflow. After its execution, data can be added to the object.
#' @export
#' @param objectMetadata List. Default \code{list()}. Optional (named) list with an arbitrary number of elements, all of which 
#' capture metadata for the object. \strong{Only atomic data types are allowed for each list element
#' (see ?is.atomic for more help: logical, integer, numeric, complex, character, raw, and NULL), and this slot is not supposed to store real data}. This is mainly used to distinguish GRN objects from one another by storing object-specific metadata along with the data.
#' @param outputFolder Output folder, either absolute or relative to the current working directory. Default \code{"."}. 
#' Default output folder where all pipeline output will be put unless specified otherwise. We recommend specifying an absolute path. 
#' Note that for Windows-based systems, the path must be correctly specified with "/" as path separator.
#' @param genomeAssembly Character. No default. The genome assembly of all data that is to be used within this object. 
#' Currently, supported genomes are: \code{hg19}, \code{hg38}, \code{mm9}, \code{mm10}, \code{mm39}, \code{rn6}, \code{rn7}, \code{dm6}, \code{rheMac10}. If you need additional genomes, let us know. See function description for further information and notes.
#' @return Empty \code{\linkS4class{GRN}} object
#' @examples 
#' meta.l = list(name = "exampleName", date = "01.03.22")
#' GRN = initializeGRN(objectMetadata = meta.l, outputFolder = "output", genomeAssembly = "hg38")
#' @export
#' @importFrom stats sd median cor cor.test quantile
initializeGRN <- function(objectMetadata = list(),
                          outputFolder = ".",
                          genomeAssembly) {
  
  # Overview: validate the arguments, make sure the output folder (and its
  # "plots" subfolder) exists, create an empty GRN object, populate its config
  # slot with default parameters, start the logger and return the object.
  start = Sys.time()   
    
  checkmate::assert(checkmate::checkNull(objectMetadata), checkmate::checkList(objectMetadata))
  checkmate::assertChoice(genomeAssembly, c("hg19","hg38", "mm9", "mm10", "mm39", "rn6", "rn7", "dm6","rheMac10"))
  
  # Each metadata element must be atomic; actual data objects are rejected
  for (i in seq_along(objectMetadata)) {
      if (!checkmate::testAtomic(objectMetadata[[i]])) {
          message = paste0("For the objectMetadata argument, only atomic types (logical, integer, numeric, complex, character, raw, and NULL; see ?is.atomic for more information) are allowed for each list element. However, this is not the case for the element ", i, ". Real data should not be stored in this slot.")
          .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
      }
  }

  
  # Create the folder first if not yet existing
  checkmate::assertCharacter(outputFolder, min.chars = 1, len = 1)
  if (!dir.exists(outputFolder)) {
    # recursive = TRUE so that nested paths such as "results/run1" can be
    # created even when the parent folder does not exist yet
    res = dir.create(outputFolder, recursive = TRUE)
    if (!res) {
        message = paste0("Could not create the output directory ", outputFolder, ". Check the path and/or access rights.")
        .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
    }
    checkmate::assertDirectoryExists(outputFolder, access = "w")
  }
  
  # Create an absolute path out of the given outputFolder now that it exists
  outputFolder = tools::file_path_as_absolute(outputFolder)
  checkmate::assertDirectory(outputFolder, access = "w")
  
  # All stored directory paths end with the path separator so that file names
  # can simply be appended with paste0
  if (!endsWith(outputFolder, .Platform$file.sep)) {
    outputFolder = paste0(outputFolder, .Platform$file.sep)
  }
  
  
  dir_output_plots = paste0(outputFolder, "plots", .Platform$file.sep)
  if (!dir.exists(dir_output_plots)) {
    dir.create(dir_output_plots)
  }
  
  GRN = .createGRNObject()
  GRN@config$functionParameters = list()
  
  GRN = .addFunctionLogToObject(GRN)
  
  GRN@config$isFiltered = FALSE
  
  par.l = list()
  
  par.l$packageVersion = as.character(utils::packageVersion("GRaNIE"))
  par.l$genomeAssembly = genomeAssembly

  
  # Make an internal subslot
  par.l$internal = list()
  
  # Fixed at 1 currently.
  par.l$internal$nPermutations = 1 
  
  # Step size for the TF-peak FDR calculation
  par.l$internal$stepsFDR = round(seq(from = -1, to = 1, by = 0.05),2)
  
  # Stringencies for AR classification
  par.l$internal$allClassificationThresholds = c(0.1, 0.05, 0.01, 0.001)
  
  # Minimum number of TFBS to include a TF in the heatmap
  par.l$internal$threshold_minNoTFBS_heatmap = 100
  
  # Colors for the different classifications
  par.l$internal$colorCategories = c("activator" = "#4daf4a", "undetermined" = "black", "repressor" = "#e41a1c", "not-expressed" = "Snow3") # diverging, modified
  
  
  GRN@config$parameters = par.l
  GRN@config$metadata = objectMetadata
  
  
  # OUTPUT
  GRN@config$directories$outputRoot         = outputFolder
  GRN@config$directories$output_plots       = dir_output_plots 
  GRN@config$files$output_log               = paste0(outputFolder, "GRN.log")
  
  .testExistanceAndCreateDirectoriesRecursively(c(outputFolder, dir_output_plots))
  
  checkmate::assertDirectory(outputFolder, access = "w")
  
  
  # The log file is appended to rather than replaced (removeOldLog = FALSE)
  .startLogger(GRN@config$files$output_log , "INFO",  removeOldLog = FALSE)
  #.printParametersLog(par.l)
  
  futile.logger::flog.info(paste0("Empty GRN object created successfully. Type the object name (e.g., GRN) to retrieve summary information about it at any time."))
  
  futile.logger::flog.info(paste0(" Default output folder: ", GRN@config$directories$outputRoot))
  futile.logger::flog.info(paste0(" Genome assembly: ", genomeAssembly))
  
  .printExecutionTime(start, prefix = "")
  
  GRN
}

######## Add data and annotation ########

#' Add data to a \code{\linkS4class{GRN}} object.
#' 
#' This function adds both RNA and peak data to a \code{\linkS4class{GRN}} object, along with data normalization.
#' In addition, and highly recommended, sample metadata can be optionally provided.
#' 
#' If the \code{ChIPseeker} package is installed, additional peak annotation is provided in the annotation slot and a peak annotation QC plot is produced as part of peak-gene QC.
#' This is fully optional, however, and has no consequences for downstream functions.
#' Normalizing the data sensibly is very important. When \code{limma_quantile} is chosen, \code{limma::normalizeQuantiles} is used, which in essence does the following: 
#' Each quantile of each column is set to the mean of that quantile across arrays. The intention is to make all the normalized columns have the same empirical distribution. 
#' This will be exactly true if there are no missing values and no ties within the columns: the normalized columns are then simply permutations of one another.
#' 
#' @export
#' @template GRN 
#' @param counts_peaks Data frame. No default. Counts for the peaks, with raw or normalized counts for each peak (rows) across all samples (columns). 
#' In addition to the count data, it must also contain one ID column with a particular format, see the argument \code{idColumn_peaks} below. 
#' Row names are ignored, column names must be set to the sample names and must match those from the RNA counts and the sample metadata table.
#' @param normalization_peaks Character. Default \code{DESeq2_sizeFactors}. Normalization procedure for peak data. 
#' Must be one of \code{limma_cyclicloess}, \code{limma_quantile}, \code{limma_scale}, \code{csaw_cyclicLoess_orig}, \code{csaw_TMM}, 
#' \code{EDASeq_GC_peaks}, \code{gcqn_peaks}, \code{DESeq2_sizeFactors}, \code{none}.
#' @param idColumn_peaks Character. Default \code{peakID}. Name of the column in the counts_peaks data frame that contains peak IDs. 
#' The required format must be \code{chr}:\code{start}-\code{end}, with \code{chr} denoting the abbreviated chromosome name, and \code{start} and \code{end} the begin and end of the peak coordinates, respectively. End must be bigger than start. Examples for valid peak IDs are \code{chr1:400-800} or \code{chrX:20-25}.
#' @param counts_rna Data frame. No default. Counts for the RNA-seq data, with raw or normalized counts for each gene (rows) across all samples (columns). 
#' In addition to the count data, it must also contain one ID column with a particular format, see the argument \code{idColumn_rna} below. 
#' Row names are ignored, column names must be set to the sample names and must match those from the peak counts and the sample metadata table.
#' @param normalization_rna Character. Default \code{limma_quantile}. Normalization procedure for RNA data. 
#' Must be one of \code{limma_cyclicloess}, \code{limma_quantile}, \code{limma_scale}, \code{csaw_cyclicLoess_orig}, \code{csaw_TMM}, \code{DESeq2_sizeFactors}, \code{none}.
#' @param idColumn_RNA Character. Default \code{ENSEMBL}. Name of the column in the \code{counts_rna} data frame that contains Ensembl IDs.
#' @param sampleMetadata Data frame. Default \code{NULL}. Optional, additional metadata for the samples, such as age, sex, gender etc. 
#' If provided, the \code{\link{plotPCA_all}} function can then incorporate and plot it. Sample names must match with those from both peak and RNA-Seq data. The first column is expected to contain the sample IDs, the actual column name is irrelevant.
#' @param additionalParams.l Named list. Default \code{list()}. Additional parameters for the chosen normalization method. 
#' Currently, only the GC-aware normalization methods \code{EDASeq_GC_peaks} and \code{gcqn_peaks} are supported here. 
#' Both support the parameters \code{roundResults} (logical flag, \code{TRUE} or \code{FALSE}) and \code{nBins} (Integer > 0), and \code{EDASeq_GC_peaks} supports the following additional parameters:
#' \code{withinLane_method} (one of: "loess","median","upper","full") and \code{betweenLane_method}  (one of: "median","upper","full"). 
#' For more information, see the EDASeq vignette.
#' @param allowOverlappingPeaks \code{TRUE} or \code{FALSE}. Default \code{FALSE}. Should overlapping peaks be allowed (then only a warning is issued 
#' when overlapping peaks are found) or (the default) should an error be raised?
#' @param keepOriginalReadCounts \code{TRUE} or \code{FALSE}. Default \code{FALSE}. Should the original read counts as provided to the function be kept in addition to
#' storing the read counts after normalization (if any)? This increases the memory footprint of the object because 2 additional count matrices have to be stored.
#' @param EnsemblVersion \code{NULL} or Character(1). Default \code{NULL}. The Ensembl version to use for genome annotation retrieval via \code{biomaRt}, which is only used to populate the gene annotation metadata that is stored in \code{GRN@annotation$genes}. 
#' By default (\code{NULL}), the newest Ensembl version available for the genome assembly is used (see \code{biomaRt::listEnsemblArchives()} for supported versions). This parameter can override this to use a custom (older) version instead.
#' @param genomeAnnotationSource \code{AnnotationHub} or \code{biomaRt}. Default \code{AnnotationHub}. Annotation source to retrieve genome annotation data from.
#' @template forceRerun
#' @return An updated \code{\linkS4class{GRN}} object, with added data from this function(e.g., slots \code{GRN@data$peaks} and \code{GRN@data$RNA})
#' @seealso \code{\link{plotPCA_all}}
#' @examples 
#' # See the Workflow vignette on the GRaNIE website for examples
#' # library(readr)
#' # rna.df   = read_tsv("https://www.embl.de/download/zaugg/GRaNIE/rna.tsv.gz")
#' # peaks.df = read_tsv("https://www.embl.de/download/zaugg/GRaNIE/peaks.tsv.gz")
#' # meta.df  = read_tsv("https://www.embl.de/download/zaugg/GRaNIE/sampleMetadata.tsv.gz")
#' # GRN = loadExampleObject()
#' # We omit sampleMetadata = meta.df in the following line, becomes too long otherwise
#' # GRN = addData(GRN, counts_peaks = peaks.df, counts_rna = rna.df, forceRerun = FALSE)

addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq2_sizeFactors", idColumn_peaks = "peakID", 
                    counts_rna, normalization_rna = "limma_quantile", idColumn_RNA = "ENSEMBL", sampleMetadata = NULL,
                    additionalParams.l = list(),
                    allowOverlappingPeaks= FALSE,
                    keepOriginalReadCounts = FALSE,
                    EnsemblVersion = NULL,
                    genomeAnnotationSource = "AnnotationHub",
                    forceRerun = FALSE) {
  
  # Overview: validate arguments, clean the ID columns of both count tables,
  # normalize the counts, derive the sample metadata and the set of shared
  # samples, store the count matrices in the object and finally add peak and
  # gene annotation. Nothing is recomputed if the data is already present in
  # the object, unless forceRerun = TRUE.
  start = Sys.time()
  
  checkmate::assertClass(GRN, "GRN")
   
  checkmate::assertDataFrame(counts_peaks, min.rows = 1, min.cols = 2)
  checkmate::assertDataFrame(counts_rna, min.rows = 1, min.cols = 2)
  checkmate::assertChoice(idColumn_peaks, colnames(counts_peaks))
  checkmate::assertChoice(idColumn_RNA, colnames(counts_rna))

  checkmate::assertChoice(normalization_peaks, 
                          choices = c("limma_cyclicloess", "limma_quantile", "limma_scale", 
                                      "csaw_cyclicLoess_orig", "csaw_TMM", 
                                      "EDASeq_GC_peaks", "gcqn_peaks",
                                      "DESeq2_sizeFactors", 
                                      "none"))
  checkmate::assertChoice(normalization_rna, 
                          choices = c("limma_cyclicloess", "limma_quantile", "limma_scale", 
                                      "csaw_cyclicLoess_orig", "csaw_TMM", 
                                      "DESeq2_sizeFactors", 
                                      "none"))
  
  checkmate::assertFlag(keepOriginalReadCounts)
  checkmate::assertFlag(allowOverlappingPeaks)
  # checkmate::assert() combines check* functions, which return TRUE or an
  # error string. An assert* function returns its input instead of TRUE, so
  # using one here would reject every valid, non-NULL EnsemblVersion.
  # The Ensembl archive list is only retrieved when EnsemblVersion is not NULL.
  checkmate::assert(checkmate::checkNull(EnsemblVersion), 
                    checkmate::checkSubset(as.character(EnsemblVersion), biomaRt::listEnsemblArchives()$version))
  
  checkmate::assertChoice(genomeAnnotationSource, c("biomaRt", "AnnotationHub"))
  
  checkmate::assertFlag(forceRerun)
  

  if (is.null(GRN@data$peaks$counts) ||
      is.null(GRN@data$peaks$counts_metadata) || 
      is.null(GRN@data$RNA$counts) ||
      is.null(GRN@data$RNA$counts_metadata) ||
      forceRerun) {
      
    GRN = .addFunctionLogToObject(GRN) 
  
    # Normalize ID column names
    if (idColumn_peaks != "peakID") {
      counts_peaks = dplyr::rename(counts_peaks, peakID = !!(idColumn_peaks))
      idColumn_peaks = "peakID"
    }
    if (idColumn_RNA != "ENSEMBL") {
      counts_rna = dplyr::rename(counts_rna, ENSEMBL = !!(idColumn_RNA))
      idColumn_RNA = "ENSEMBL"
    }
      
    # Check existence of correct ID column now
    checkmate::assertSubset(idColumn_peaks, colnames(counts_peaks))
    checkmate::assertSubset(idColumn_RNA, colnames(counts_rna))
    
    # Check number of character columns: Must be 1 exactly
    .checkColumnTypes(counts_rna, "RNA")
    .checkColumnTypes(counts_peaks, "peaks")

    # Check ID columns for missing values and remove
    rna_missing_ID =  which(is.na(counts_rna$ENSEMBL))
    if (length(rna_missing_ID) > 0) {
      message = paste0("addData: Found ", length(rna_missing_ID), " missing IDs in the ID column of the RNA counts. These rows will be removed.")
      .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
      counts_rna = dplyr::slice(counts_rna, -rna_missing_ID)
    }
    
    peaks_missing_ID =  which(is.na(counts_peaks$peakID))
    if (length(peaks_missing_ID) > 0) {
      message = paste0("addData: Found ", length(peaks_missing_ID), " missing IDs in the ID column of the peaks counts. These rows will be removed.")
      .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
      counts_peaks = dplyr::slice(counts_peaks, -peaks_missing_ID)
    }
    
    
    # Remove potential scientific notation from peak IDs.
    # fixed = TRUE: search for the literal "e+" (as in 1e+05). As a regular
    # expression, "e+" would match any ID that merely contains the letter "e".
    peaks_eNotation = which(grepl("e+", counts_peaks$peakID, fixed = TRUE))
    if (length(peaks_eNotation) > 0) {
      message = paste0("addData: Found at least one peak (", paste0(counts_peaks$peakID[peaks_eNotation], collapse = ",") , ") for which the position contains the scientific notation, attempting to fix.")
      .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
      counts_peaks$peakID[peaks_eNotation] = .removeScientificNotation_positions(counts_peaks$peakID[peaks_eNotation])
    }
    
    # Clean Ensembl IDs: drop everything from the first "." on
    counts_rna$ENSEMBL = gsub("\\..+", "", counts_rna$ENSEMBL, perl = TRUE)
    
    # Check uniqueness of IDs
    nDuplicateRows = nrow(counts_rna) - dplyr::n_distinct(counts_rna$ENSEMBL)
    if (nDuplicateRows > 0) {
      message = paste0("addData: Found ", nDuplicateRows, " duplicate rows in RNA-Seq data, consolidating them by summing them up.")
      .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
        
      counts_rna = counts_rna %>%
        dplyr::group_by(.data$ENSEMBL) %>%
        dplyr::summarise_if(is.numeric, sum) 
      # dplyr::summarise_if(is.numeric, sum, .groups = 'drop') # the .drop caused an error with dplyr 1.0.5
    }
    
    ## STORE COUNTS METADATA ##
    
    GRN@data$RNA$counts_metadata = tibble::tibble(ID = counts_rna$ENSEMBL, isFiltered = FALSE)
    
    # Store peak metadata and normalize peak IDS (e.g., replacing second ":" by "-")
    GRN@data$peaks$counts_metadata = .createConsensusPeaksDF(counts_peaks$peakID) 
    stopifnot(c("chr", "start", "end", "peakID", "isFiltered") %in% colnames(GRN@data$peaks$counts_metadata))
    # Make sure peak ID stored here is always the same as in the metadata
    counts_peaks$peakID = GRN@data$peaks$counts_metadata$peakID
    
    #  Calculate GC content of peaks which we need before any normalization
    
    if (normalization_peaks %in% c("EDASeq_GC_peaks", "gcqn_peaks")) {
        
        # Now we need the genome annotation packages to calculate the GC content of the peak regions
        .checkAndLoadPackagesGenomeAssembly(GRN@config$parameters$genomeAssembly)
        
        GC.data.df = .calcGCContentPeaks(GRN)
        additionalParams.l[["GC_data"]] = GC.data.df
    }
   
    
    ## Normalize counts ##
    countsPeaks.norm.df  = .normalizeCountMatrix(GRN, data = counts_peaks %>% tibble::column_to_rownames("peakID") %>% as.matrix(), 
                                                 normalization = normalization_peaks,
                                                 additionalParams = additionalParams.l
                                            ) %>%
                            tibble::as_tibble(rownames = "peakID") %>%
                            dplyr::select("peakID", tidyselect::everything())
    
    countsRNA.norm.df    = .normalizeCountMatrix(GRN, data = counts_rna %>% tibble::column_to_rownames("ENSEMBL") %>% as.matrix(), 
                                                 normalization = normalization_rna,
                                                 additionalParams = additionalParams.l) %>%
                            tibble::as_tibble(rownames = "ENSEMBL") %>%
                            dplyr::select("ENSEMBL", tidyselect::everything())
    

    ## SAMPLE AND GRN METADATA ##
    GRN@config$parameters$normalization_rna = normalization_rna
    GRN@config$parameters$normalization_peaks = normalization_peaks
    
    # Sample names are the column names of the count tables, minus ID/flag columns
    samples_rna   = colnames(countsRNA.norm.df)
    samples_peaks = colnames(countsPeaks.norm.df)
    allSamples    = unique(c(samples_rna, samples_peaks)) %>% setdiff(c("ENSEMBL", "isFiltered", "peakID"))
    
    # Generate metadata first to determine the number of shared samples etc
    if (!is.null(sampleMetadata)) {
      
      futile.logger::flog.info("Parsing provided metadata...")
      GRN@data$metadata = sampleMetadata %>% dplyr::distinct() %>% tibble::tibble(.name_repair = "universal")
      

      # Force the first column to be the ID column
      if ("sampleID" %in% colnames(GRN@data$metadata)) {
        message = paste0("addData: Renaming the first column to sampleID. However, this column already exists, it will be renamed accordingly.")
        .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)  
          
        colnames(GRN@data$metadata)[which(colnames(GRN@data$metadata) == "sampleID")] = "sampleID_original"
        
      } 
      colnames(GRN@data$metadata)[1] = "sampleID"
      
      # Force character here
      GRN@data$metadata$sampleID = as.character(GRN@data$metadata$sampleID)
      
      
      # Assume the ID is in column 1, has to be unique
      if (nrow(GRN@data$metadata) > dplyr::n_distinct(GRN@data$metadata$sampleID)) {
        message = paste0("The first column in the sample metadata table must contain only unique values, as it is used as sample ID. Make sure the values are unique.")
        # tbl_ids = table(GRN@data$metadata$sampleID)
        .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
      }
      
      # Samples present in the count data but absent from the metadata get an
      # otherwise empty metadata row
      missingIDs = which(!allSamples %in% GRN@data$metadata$sampleID)
      if (length(missingIDs) > 0) {
        GRN@data$metadata = tibble::add_row(GRN@data$metadata, sampleID = allSamples[ missingIDs])
      }
    } else {
      GRN@data$metadata = tibble::tibble(sampleID = allSamples)
    }
    
    GRN@data$metadata =  GRN@data$metadata %>%
      dplyr::mutate(has_RNA = .data$sampleID  %in% samples_rna,
                    has_peaks = .data$sampleID %in% samples_peaks,
                    has_both = .data$has_RNA & .data$has_peaks
      )
    
    GRN@config$sharedSamples = dplyr::filter(GRN@data$metadata, .data$has_both) %>% dplyr::pull(.data$sampleID) %>% as.character()
    # We have our first connection type, the default one; more may be added later
    GRN@config$TF_peak_connectionTypes = "expression"
    
    ## STORE FINAL COUNT MATRICES ##
    
    # Subset data to retain only samples that appear in both RNA and peaks
    data.l = .intersectData(countsRNA.norm.df, countsPeaks.norm.df)
    
    # Store the matrices either as normal or sparse matrix
    GRN@data$peaks$counts = .storeAsMatrixOrSparseMatrix(GRN, df = data.l[["peaks"]], ID_column = "peakID", slotName = "GRN@data$peaks$counts")
    
    if (keepOriginalReadCounts) {
        GRN@data$peaks$counts_raw = .storeAsMatrixOrSparseMatrix(GRN, df = counts_peaks %>% dplyr::select("peakID", tidyselect::one_of(GRN@config$sharedSamples)), 
                                                                 ID_column = "peakID", slotName = "GRN@data$peaks$counts_raw")
    }

    GRN@data$RNA$counts   = .storeAsMatrixOrSparseMatrix(GRN, df =  data.l[["RNA"]], ID_column = "ENSEMBL", slotName = "GRN@data$RNA$counts")
    
    if (keepOriginalReadCounts) {
        GRN@data$RNA$counts_raw = .storeAsMatrixOrSparseMatrix(GRN, df = counts_rna %>% dplyr::select("ENSEMBL", tidyselect::one_of(GRN@config$sharedSamples)), 
                                                                 ID_column = "ENSEMBL", slotName = "GRN@data$RNA$counts_raw")
    }
    
    # Random permutation of the RNA sample (column) indices
    GRN@data$RNA$counts_permuted_index = sample.int(ncol(GRN@data$RNA$counts), ncol(GRN@data$RNA$counts))
    
    futile.logger::flog.info(paste0( "Final dimensions of data:"))
    futile.logger::flog.info(paste0( " RNA  : ", nrow(countsRNA.norm.df)  , " x ", ncol(countsRNA.norm.df)   - 1, " (rows x columns)"))
    futile.logger::flog.info(paste0( " peaks: ", nrow(countsPeaks.norm.df), " x ", ncol(countsPeaks.norm.df) - 1, " (rows x columns)"))
    # Create permutations for RNA
    futile.logger::flog.info(paste0( "Generate ", .getMaxPermutation(GRN), " permutations of RNA-counts"))
    
    futile.logger::flog.info(paste0("Check for overlapping peaks..."))
    
    .checkOverlappingPeaks(GRN, allowOverlappingPeaks = allowOverlappingPeaks)

    ## PEAK AND GENE ANNOTATION ##
    futile.logger::flog.info(paste0("Adding peak and gene annotation..."))
    
    GRN = .populatePeakAnnotation(GRN)
    
    if (normalization_peaks %in% c("EDASeq_GC_peaks", "gcqn_peaks")) {
        GRN@annotation$peaks = dplyr::left_join(GRN@annotation$peaks, GC.data.df, by = "peak.ID") 
        
        # Additional GC statistics, not used at the moment currently
        GRN = .calcAdditionalGCStatistics(GRN, GC.data.df)
    }
    


    GRN = .populateGeneAnnotation(GRN, EnsemblVersion = EnsemblVersion, genomeAnnotationSource = genomeAnnotationSource)
    
  } else {
      .printDataAlreadyExistsMessage()
  }
  
  
  .printExecutionTime(start, prefix = "")
  
  GRN
  
}

# Validate that a count table contains exactly one character column (the ID column).
#
# counts_df: data frame with the counts and the ID column.
# name:      label for the data type (e.g., "RNA" or "peaks"), used in the error message only.
#
# Reports an error via .checkAndLogWarningsAndErrors if the number of character
# columns differs from 1; otherwise returns NULL invisibly.
.checkColumnTypes <- function(counts_df, name) {
    # Count the character columns directly. Looking up "character" in a table
    # of column classes yields NA when there is no such column (e.g., the ID
    # column is a factor), which makes the comparison below fail with an
    # unrelated "missing value where TRUE/FALSE needed" error.
    nCharacterColumns = sum(vapply(counts_df, is.character, logical(1)))
    if (nCharacterColumns != 1) {
        message = paste0(" The input ", name, " data is invalid: Exactly one column must be of type character (the ID column), but ", nCharacterColumns, " columns are instead")
        .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
    }
    
}



# Check whether any of the peaks stored in the GRN object overlap each other.
#
# GRN:                   GRN object; reads GRN@data$peaks$counts_metadata and the genome assembly.
# allowOverlappingPeaks: if TRUE, overlaps are reported as a warning, otherwise as an error.
.checkOverlappingPeaks <- function(GRN, allowOverlappingPeaks) {
    
    genomeAssembly = GRN@config$parameters$genomeAssembly
    
    # Peak coordinates are taken to be 0-based exclusive, see
    # https://arnaudceol.wordpress.com/2014/09/18/chromosome-coordinate-systems-0-based-1-based/
    # and http://genome.ucsc.edu/FAQ/FAQformat.html#format1 for details
    peaks.gr = .constructGRanges(GRN@data$peaks$counts_metadata, 
                                 seqlengths = .getChrLengths(genomeAssembly), 
                                 genomeAssembly, zeroBased = TRUE)
    
    # Flag peaks whose overlap count against the full peak set exceeds 1
    nOverlapsPerPeak = GenomicRanges::countOverlaps(peaks.gr, peaks.gr)
    index.overlapping = which(nOverlapsPerPeak > 1)
    
    # Nothing to report
    if (length(index.overlapping) == 0) {
        return(invisible(NULL))
    }
    
    peakIDs.overlapping = as.data.frame(peaks.gr[index.overlapping])$peakID
    nReported = min(10, length(peakIDs.overlapping))
    
    messagePrefix = paste0("addData: ", length(index.overlapping), 
                           " overlapping peaks have been identified. The first ten are: ", 
                           paste0(peakIDs.overlapping[seq_len(nReported)], collapse = ","),
                           ". This may not be what you want, since overlapping peaks may have a heigher weight in the network. ")
    
    # Same finding, but reported with different severity depending on the user's choice
    if (allowOverlappingPeaks) {
        messageSuffix = "As allowOverlappingPeaks has been set to TRUE, this is only a warning and not an error."
        reportAsWarning = TRUE
    } else {
        messageSuffix = "As allowOverlappingPeaks = FALSE (the default), this is an error and not a warning. You may want to regenerate the peak file, eliminate peak overlaps, and rerun this function"
        reportAsWarning = FALSE
    }
    
    .checkAndLogWarningsAndErrors(NULL, paste0(messagePrefix, messageSuffix), isWarning = reportAsWarning)
}


# Convert a count table to a matrix restricted to the shared samples, switching
# to a sparse representation when the fraction of zeros exceeds a threshold.
#
# GRN:       GRN object; GRN@config$sharedSamples defines the columns to keep.
# df:        data frame with the ID column first, followed by the sample columns.
# ID_column: name of the ID column; its values become the row names.
# slotName:  name of the target slot, used for the log message only.
# threshold: fraction of zeros above which a sparse matrix is created.
.storeAsMatrixOrSparseMatrix <- function(GRN, df, ID_column, slotName, threshold = 0.1) {
    
    # All shared samples must be present among the non-ID columns, and there
    # has to be at least one shared sample
    checkmate::assertSubset(GRN@config$sharedSamples, colnames(df)[-1])
    checkmate::assertIntegerish(length(GRN@config$sharedSamples), lower = 1)
    
    sharedSamples = GRN@config$sharedSamples

    counts.subset.df = dplyr::select(df, tidyselect::one_of(ID_column, sharedSamples))
    counts.m = as.matrix(tibble::column_to_rownames(counts.subset.df, ID_column))
    
    # Sparsity: fraction of matrix cells that are zero
    nCells = length(counts.m)
    fractionZero = (nCells - Matrix::nnzero(counts.m)) / nCells
    
    if (fractionZero > threshold) {
        futile.logger::flog.info(paste0("Storing ", slotName, " matrix as sparse matrix because fraction of 0s is > ", threshold, " (", round(fractionZero,2), ")"))
        rowIDs = unlist(df[, ID_column], use.names = FALSE)
        counts.m = .asSparseMatrix(counts.m, convertNA_to_zero = FALSE, 
                                   dimnames = list(rowIDs, sharedSamples))
    } 
    
    counts.m
    
}

# Parse peak IDs into a peak metadata table.
#
# peakIDs: vector of peak IDs. Each ID must split into exactly three parts
#          (chromosome, start, end) at runs of ":" and/or "-", so that both
#          "chr:start-end" and "chr:start:end" are accepted.
#
# Returns a tibble with the columns chr (factor), start and end (numeric),
# peakID (normalized to "chr:start-end") and isFiltered (always FALSE).
.createConsensusPeaksDF <- function(peakIDs) {
  
  # as.character so that IDs provided as a factor are handled as well
  ids.split = strsplit(as.character(peakIDs), split = "[:-]+")
  ids.split.length = lengths(ids.split)
  if (!all(ids.split.length == 3)) {
    message = paste0(" At least one of the IDs in the peaks data has an unsupported format. Make sure all peakIDs are in the format \"chr:start-end\"")
    .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
  }
  
  chr   = as.factor(vapply(ids.split, function(parts) parts[[1]], character(1)))
  start = as.numeric(vapply(ids.split, function(parts) parts[[2]], character(1)))
  end   = as.numeric(vapply(ids.split, function(parts) parts[[3]], character(1)))
  
  # Format the coordinates explicitly: pasting the numeric values directly
  # would write round positions in scientific notation (100000 -> "1e+05")
  # and thereby corrupt the peak ID. trim = TRUE avoids padding to a common width.
  peakID = paste0(chr, ":", 
                  format(start, scientific = FALSE, trim = TRUE), "-", 
                  format(end, scientific = FALSE, trim = TRUE))
  
  consensus.df = tibble::tibble(chr   = chr,
                                start = start, 
                                end   = end,
                                peakID = peakID,
                                isFiltered = FALSE)
  consensus.df
}

# Rewrite peak IDs so that start and end positions are given as plain integers
# instead of scientific notation (e.g., "chr1:1e+05-100500" -> "chr1:100000-100500").
#
# peakIDs.vec: character vector of peak IDs in the format "chr:start-end".
#
# Returns a character vector of the same length with the reformatted IDs.
.removeScientificNotation_positions <- function(peakIDs.vec) {
  ids = strsplit(peakIDs.vec, split = ":", fixed = TRUE)
  ids_chr = vapply(ids, function(parts) parts[[1]], character(1))
  ids_pos = vapply(ids, function(parts) parts[[2]], character(1))
  ids_pos = strsplit(ids_pos, split = "-", fixed = TRUE)
  start = vapply(ids_pos, function(parts) parts[[1]], character(1))
  end   = vapply(ids_pos, function(parts) parts[[2]], character(1))
  
  # trim = TRUE is essential: format() otherwise pads all elements of a vector
  # to a common width, which would insert spaces into the IDs of shorter positions
  paste0(ids_chr, ":", 
         format(as.integer(start), scientific = FALSE, trim = TRUE), "-", 
         format(as.integer(end), scientific = FALSE, trim = TRUE))
}

#' @importFrom biomaRt useEnsembl getBM
#' @importFrom ensembldb genes
#' @importFrom tools R_user_dir
#' @importFrom methods is
.retrieveAnnotationData <- function(genomeAssembly, EnsemblVersion = NULL, source = "AnnotationHub") {
    
    # Retrieve gene annotation for the given genome assembly, either from
    # biomaRt or from AnnotationHub. Both branches produce a tibble with the
    # columns gene.chr, gene.start, gene.end, gene.strand, gene.ENSEMBL,
    # gene.type and gene.name. EnsemblVersion is only used by the biomaRt
    # branch; the AnnotationHub branch takes the last EnsDb entry of the query.
    checkmate::assertChoice(source, c("biomaRt", "AnnotationHub"))
    
    if (source == "biomaRt") {
        
        futile.logger::flog.info(paste0("Retrieving genome annotation data from biomaRt for ", genomeAssembly, "... This may take a while"))
        
        params.l = .getBiomartParameters(genomeAssembly, suffix = "_gene_ensembl")
        
        
        columnsToRetrieve = c("chromosome_name", "start_position", "end_position",
                              "strand", "ensembl_gene_id", "gene_biotype", "external_gene_name")
        
        ensembl = .biomart_getEnsembl(biomart = "genes", version = EnsemblVersion, host = params.l[["host"]],  dataset = params.l[["dataset"]])
        
        results.df = .callBiomart(mart =  ensembl, attributes = columnsToRetrieve) 
  
        genes.df = results.df %>%
            tibble::as_tibble() %>%
            # Keep only entries with short chromosome names (at most 5 characters)
            dplyr::filter(stringr::str_length(.data$chromosome_name) <= 5) %>%
            dplyr::mutate(chromosome_name = paste0("chr", .data$chromosome_name)) %>%
            dplyr::rename(gene.chr = "chromosome_name", gene.start = "start_position", gene.end = "end_position", 
                          gene.strand = "strand", gene.ENSEMBL = "ensembl_gene_id", gene.type = "gene_biotype", gene.name = "external_gene_name") %>%
            tidyr::replace_na(list(gene.type = "unknown")) %>%
            dplyr::mutate_if(is.character, as.factor) %>%
            dplyr::mutate(gene.type = dplyr::recode_factor(.data$gene.type, lncRNA = "lincRNA")) %>%  # there seems to be a name change from lincRNA -> lncRNA, lets change it here 
            dplyr::mutate(gene.strand = factor(.data$gene.strand, levels = c(1,-1), labels = c("+", "-")))
        
        
    } else if (source == "AnnotationHub") {

        # Fail early for the unsupported assembly, before the potentially slow
        # AnnotationHub connection is attempted
        if (genomeAssembly == "hg19") {
            message = "AnnotationHub genome retrieval with hg19 as genome assembly is not yet supported / implemented. Please use biomaRt instead as annotation source for the parameter genomeAnnotationSource in addData."
            .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
        }
        
        futile.logger::flog.info(paste0("Retrieving genome annotation data from AnnotationHub for ", genomeAssembly, "... This may take a while"))
        
        AnnotationHub::setAnnotationHubOption("ASK", FALSE)
        
        maxAttempts = 10
        ah = .getAnnotationHub(maxAttempts = maxAttempts)
        
        if (!is(ah, "AnnotationHub")) {
            message = paste0("AnnotationHub genome failed despite attempting ", maxAttempts, " times")
            .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
        }
    
        
        # TODO: Save metadata somewhere in object
        snapshotDate = AnnotationHub::snapshotDate(ah)
        
        params.l = .getBiomartParameters(genomeAssembly, suffix = "")
        
        results = AnnotationHub::query(ah, c("EnsDb", params.l$dataset))
        annotationDatasets <- as.data.frame(mcols(results))
        
        # Without any hit, the lookup of the newest entry below would fail
        # with an uninformative error
        if (nrow(annotationDatasets) == 0) {
            message = paste0("No EnsDb annotation could be found in AnnotationHub for the genome assembly ", genomeAssembly, ". Please use biomaRt instead as annotation source for the parameter genomeAnnotationSource in addData.")
            .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
        }
        
        # Find newest Ensembl ID automatically: the last entry of the query
        # result is used (presumably the most recent one; verify the ordering
        # returned by AnnotationHub::query)
        newestAnno.ID = utils::tail(rownames(annotationDatasets), 1)
        
        
        ens.newest <- ah[[newestAnno.ID]]
        genes.df = as.data.frame(suppressWarnings(ensembldb::genes(ens.newest))) %>%
            tibble::as_tibble() %>%
            dplyr::mutate(gene.chr = paste0("chr", .data$seqnames)) %>%
            dplyr::select(-"seqnames") %>%
            dplyr::rename(gene.ENSEMBL = "gene_id", gene.start = "start", gene.end = "end",
                          gene.strand = "strand", gene.name = "gene_name", gene.type = "gene_biotype") %>%
            dplyr::select("gene.chr", "gene.start", "gene.end", "gene.strand", "gene.ENSEMBL", "gene.type", "gene.name") %>%
            tidyr::replace_na(list(gene.type = "unknown")) %>%
            #  dplyr::mutate(gene.strand = factor(.data$gene.strand, levels = c(1,-1,0), labels = c("+", "-", "*"))) %>%
            dplyr::mutate_if(is.character, as.factor) %>%
            dplyr::mutate(gene.type = dplyr::recode_factor(.data$gene.type, lncRNA = "lincRNA")) 
        
    }
    
    genes.df
   
}



.populatePeakAnnotation <- function(GRN) {

  # Fills GRN@annotation$peaks with per-peak summary statistics (mean, median, CV)
  # computed from the peak count matrix and, when ChIPseeker plus the genome
  # annotation packages are usable, joins nearest-gene annotation onto that table.
  # Returns the updated GRN object.
  peakCounts.m <- getCounts(GRN, type = "peaks", permuted = FALSE, asMatrix = TRUE, includeFiltered = TRUE)

  futile.logger::flog.info(paste0(" Calculate statistics for each peak (mean and CV)"))

  peakMeans   <- rowMeans(peakCounts.m)
  peakMedians <- matrixStats::rowMedians(peakCounts.m)
  peakCVs     <- matrixStats::rowSds(peakCounts.m) / peakMeans

  GRN@annotation$peaks <- tibble::tibble(peak.ID = rownames(peakCounts.m),
                                         peak.mean = peakMeans,
                                         peak.median = peakMedians,
                                         peak.CV = peakCVs)

  genomeAssembly <- GRN@config$parameters$genomeAssembly

  # All three conditions are evaluated, in this order, before the decision is made
  # (no short-circuiting), so the package check for the genome assembly always runs.
  chipseekerMissing         <- !is.installed("ChIPseeker")
  annotationPackagesMissing <- !.checkAndLoadPackagesGenomeAssembly(genomeAssembly, returnLogical = TRUE)
  isDm6                     <- genomeAssembly %in% c("dm6")

  if (chipseekerMissing | annotationPackagesMissing | isDm6) {

      if (chipseekerMissing) {
          packageMessage <- paste0("addData: The package ChIPseeker is currently not installed, which is needed for additional peak annotation that can be useful for further downstream analyses. ", 
                                   " You may want to install it and re-run this function. However, this is optional and except for some missing additional annotation columns, there is no limitation.")
          .checkPackageInstallation("ChIPseeker", packageMessage, isWarning = TRUE)
      } else if (isDm6) {
          packageMessage <- paste0("addData: For the genome dm6, ChiIPseeker cannot be used due to differences in the ID columns that are returned (Entrez ID and not Ensembl ID).")
          .checkPackageInstallation("ChIPseeker", packageMessage, isWarning = TRUE)
      }
      # Remaining case: annotation packages are missing; nothing further is reported here.

      # Only the basic statistics are available in this case
      return(GRN)
  }

  futile.logger::flog.info(paste0(" Retrieve peak annotation using ChipSeeker. This may take a while"))

  peaks.gr <- .constructGRanges(GRN@data$peaks$counts_metadata, seqlengths = .getChrLengths(genomeAssembly), genomeAssembly)

  # Annotate every peak at the gene level; the TSS region spans 5 kb on either side.
  # The genome objects are deliberately requested inside the call so that they are
  # evaluated within suppressMessages().
  annotatedPeaks <- suppressMessages(ChIPseeker::annotatePeak(
    peaks.gr,
    tssRegion = c(-5000, 5000),
    TxDb = .getGenomeObject(genomeAssembly, type = "txbd"),
    level = "gene",
    assignGenomicAnnotation = TRUE,
    genomicAnnotationPriority = c("Promoter", "5UTR", "3UTR", "Exon", "Intron",
                                  "Downstream", "Intergenic"),
    annoDb = .getGenomeObject(genomeAssembly, type = "packageName"), # adds extra columns such as SYMBOL, GENENAME, ENSEMBL
    sameStrand = FALSE,
    ignoreOverlap = FALSE,
    ignoreUpstream = FALSE,
    ignoreDownstream = FALSE,
    overlap = "TSS",
    verbose = TRUE
  ))

  GRN@annotation$peaks_obj <- annotatedPeaks

  # Collapse the detailed exon / intron descriptions into the two plain categories
  annotatedPeaks.df <- as.data.frame(annotatedPeaks)
  annotatedPeaks.df$annotation[grepl("Exon", annotatedPeaks.df$annotation)] <- "Exon"
  annotatedPeaks.df$annotation[grepl("Intron", annotatedPeaks.df$annotation)] <- "Intron"

  nearestGene.df <- annotatedPeaks.df %>%
    dplyr::select("peakID", "annotation", tidyselect::starts_with("gene"), "distanceToTSS", "ENSEMBL", "SYMBOL", "GENENAME") %>%
    dplyr::mutate(annotation = as.factor(.data$annotation),
                  ENSEMBL = as.factor(.data$ENSEMBL),
                  GENENAME = as.factor(.data$GENENAME),
                  SYMBOL = as.factor(.data$SYMBOL))

  GRN@annotation$peaks <- GRN@annotation$peaks %>%
    dplyr::left_join(nearestGene.df, by = c("peak.ID" = "peakID")) %>%
    dplyr::rename(peak.nearestGene.chr = "geneChr",
                  peak.nearestGene.start = "geneStart",
                  peak.nearestGene.end = "geneEnd",
                  peak.nearestGene.length = "geneLength",
                  peak.nearestGene.strand = "geneStrand",
                  peak.nearestGene.name = "GENENAME",
                  peak.nearestGene.distanceToTSS = "distanceToTSS",
                  peak.nearestGene.ENSEMBL = "ENSEMBL",
                  peak.nearestGene.symbol = "SYMBOL",
                  peak.annotation = "annotation")

  GRN

}

.populateGeneAnnotation <- function(GRN, EnsemblVersion = NULL, genomeAnnotationSource = "AnnotationHub") {

  # Computes per-gene summary statistics (mean, median, CV) from the RNA count
  # matrix, joins the retrieved genome annotation by Ensembl ID and stores the
  # result in GRN@annotation$genes. Returns the updated GRN object.
  rnaCounts.m <- getCounts(GRN, type = "rna", permuted = FALSE, asMatrix = TRUE, includeFiltered = TRUE)

  futile.logger::flog.info(paste0(" Calculate statistics for each of the ", nrow(rnaCounts.m), " genes that were provided with the RNA-seq data (mean and CV)"))

  geneMeans   <- rowMeans(rnaCounts.m)
  geneMedians <- matrixStats::rowMedians(rnaCounts.m)
  geneCVs     <- matrixStats::rowSds(rnaCounts.m) / geneMeans

  annotation.df <- .retrieveAnnotationData(GRN@config$parameters$genomeAssembly,
                                           EnsemblVersion = EnsemblVersion,
                                           source = genomeAnnotationSource)

  geneStats.tbl <- tibble::tibble(gene.ENSEMBL = rownames(rnaCounts.m),
                                  gene.mean = geneMeans,
                                  gene.median = geneMedians,
                                  gene.CV = geneCVs)

  # Genes without a match in the annotation keep their statistics; their annotation columns are NA
  GRN@annotation$genes <- dplyr::left_join(geneStats.tbl, annotation.df, by = c("gene.ENSEMBL"))

  GRN

}

.populateGOAnnotation <- function(GRN, results.tbl, ontology) {

  # Stores the "ID" and "Term" columns of results.tbl under the given ontology
  # name in GRN@annotation$GO and returns the updated GRN object.
  keptColumns <- c("ID", "Term")
  GRN@annotation$GO[[ontology]] <- results.tbl[, keptColumns]

  GRN

}

#' @importFrom rlang .data `:=`
.calcGCContentPeaks <- function(GRN, nBins = 10) {

  # Computes, for every peak, the GC fraction of its sequence (G+C divided by
  # A+C+G+T), the peak width and a GC class obtained by cutting [0, 1] into
  # nBins equally wide bins. Returns a tibble with peak.ID as first column.
  futile.logger::flog.info(paste0("Calculate GC-content for peaks. This may take a while"))
  timeStart <- Sys.time()

  genomeAssembly <- GRN@config$parameters$genomeAssembly
  genome <- .getGenomeObject(genomeAssembly, type = "BSgenome")

  # Peaks as genomic ranges, then their sequences
  peaks.gr <- .constructGRanges(GRN@data$peaks$counts_metadata,
                                seqlengths = .getChrLengths(genomeAssembly),
                                genomeAssembly)
  peakSequences <- Biostrings::getSeq(genome, peaks.gr)

  # The resulting one-column matrix carries the column name "G|C"
  gcFraction.m <- Biostrings::letterFrequency(peakSequences, "GC") /
    Biostrings::letterFrequency(peakSequences, "ACGT")

  gcBreaks <- seq(0, 1, 1/nBins)

  GC_content.df <- tibble::as_tibble(gcFraction.m) %>%
    dplyr::mutate(peak.width = GenomicRanges::width(peaks.gr),
                  peak.ID = peaks.gr$peakID,
                  peak.GC.class = cut(.data$`G|C`, breaks = gcBreaks, include.lowest = TRUE, ordered_result = TRUE)) %>%
    dplyr::rename(peak.GC.perc = "G|C") %>%
    dplyr::select("peak.ID", tidyselect::everything())

  .printExecutionTime(timeStart)

  GC_content.df
}

.calcAdditionalGCStatistics <- function(GRN, GC.data) {

    # Summarises the peaks per GC class (count, mean and sd of the peak width,
    # relative frequency) and (re)initialises GRN@stats$GC with that summary plus
    # two empty placeholders for the TF GC correction. Returns the updated GRN object.
    nPeaksTotal <- nrow(GC.data)

    perClass.df <- GC.data %>%
        dplyr::group_by(.data$peak.GC.class) %>%
        dplyr::summarise(n = dplyr::n(),
                         peak_width_mean = mean(.data$peak.width),
                         peak_width_sd = sd(.data$peak.width)) %>%
        dplyr::ungroup()

    # Classes without any peak get an explicit row with n = 0
    perClass.df <- perClass.df %>%
        tidyr::complete(.data$peak.GC.class, fill = list(n = 0)) %>%
        dplyr::mutate(n_rel = .data$n / nPeaksTotal)

    # Any previous content of the GC statistics is discarded
    GRN@stats$GC <- list(TFs_GC_correction = list(),
                         TFs_GC_correction_plots = list(),
                         peaks = perClass.df)

    GRN
}


####### FILTER AND NORMALIZE DATA ############

# Normalize a count matrix with one of the supported normalization methods.
#
# Arguments:
#   GRN               GRN object. Part of the signature but not referenced in the function body.
#   data              Count matrix (checked with checkmate::assertMatrix); column names are used as sample IDs.
#   normalization     One of "limma_cyclicloess", "limma_quantile", "limma_scale", "csaw_cyclicLoess_orig",
#                     "csaw_TMM", "EDASeq_GC_peaks", "gcqn_peaks", "DESeq2_sizeFactors", "none".
#   additionalParams  Named list with optional method-specific settings:
#                       roundResults        flag, default FALSE (EDASeq_GC_peaks and gcqn_peaks)
#                       GC_data             required for EDASeq_GC_peaks and gcqn_peaks; its column peak.GC.perc is used
#                       nBins               default 10 for EDASeq_GC_peaks, 50 for gcqn_peaks
#                       withinLane_method   default "full" (EDASeq_GC_peaks)
#                       betweenLane_method  default "full" (EDASeq_GC_peaks)
#
# Returns the normalized matrix; for "none", the input matrix is returned unchanged.
.normalizeCountMatrix <- function(GRN, data, normalization, additionalParams =list()) {
    
    checkmate::assertMatrix(data)
    checkmate::assertChoice(normalization, 
                            choices = c("limma_cyclicloess", "limma_quantile", "limma_scale", 
                                        "csaw_cyclicLoess_orig", "csaw_TMM", 
                                        "EDASeq_GC_peaks", "gcqn_peaks",
                                        "DESeq2_sizeFactors",
                                        "none"))
    
    # Create a DESeq2 object for all methods that return DESeq2-normalized counts
    if (normalization %in% c("DESeq2_sizeFactors", "csaw_cyclicLoess_orig", "csaw_TMM")) {

        dd <- suppressMessages(DESeq2::DESeqDataSetFromMatrix(countData = data,
                                             colData = data.frame( sampleID = colnames(data)),
                                             design = stats::as.formula(" ~ 1")))
    }
    
    # Common parameters for the GC-aware methods
    if (normalization %in% c("EDASeq_GC_peaks", "gcqn_peaks")) {
        
        # After either within-lane or between-lane normalization, the expression values are not counts anymore. 
        # However, their distribution still shows some typical features of counts distribution (e.g., the variance depends on the mean). 
        # Hence, for most applications, it is useful to round the normalized values to recover count-like values, which we refer to as "pseudo-counts".
        # By default, both withinLaneNormalization and betweenLaneNormalization round the normalized values to the closest integer. 
        # This behavior can be changed by specifying round= FALSE. This gives the user more flexibility and assures that rounding approximations do not affect subsequent computations (e.g., recovering the offset from the normalized counts).
        if ("roundResults" %in% names(additionalParams)) {
            roundResults = additionalParams$roundResults
        } else {
            roundResults = FALSE
        }
        
        checkmate::assertFlag(roundResults)
        
        if ("GC_data" %in% names(additionalParams)) {
            GC_data.df = additionalParams$GC_data
        } else {
            message = "GC_data is missing in additionalParams list."
            .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
        }

    }
    
    if (normalization %in% c("limma_cyclicloess", "limma_quantile", "limma_scale")) {
        
        futile.logger::flog.info(paste0(" Normalizing data using the package limma with the following method: ", normalization))

        # The limma method name is the part after the "limma_" prefix
        dataNorm = limma::normalizeBetweenArrays(data, method = sub("limma_", replacement = "", normalization))  
        
    } else if (normalization %in% c("csaw_cyclicLoess_orig", "csaw_TMM")) {
        
        futile.logger::flog.info(paste0("addData: Normalizing data using the package csaw with the following method: ", normalization))
        packageMessage = paste0("addData: The package csaw is currently not installed, but however needed for the normalization methods \"csaw_cyclicLoess_orig\" and \"csaw_TMM\"")
        .checkPackageInstallation("csaw", packageMessage, isWarning = TRUE)
        
        # NOTE(review): the comparison also rejects version 1.14.1 itself, while the message asks for
        # "at least version 1.14.1" - confirm which of the two is intended.
        if (packageVersion("csaw") <= "1.14.1") {
            message = "The version of the csaw package is too old, install at least version 1.14.1 or change the normalization method"
            .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
        }

        object = SummarizedExperiment::SummarizedExperiment(list(counts = data))
        object$totals = colSums(data)
        
        if (normalization == "csaw_cyclicLoess_orig") {
            
            # Perform a cyclic loess normalization
            # We use a slightly more complicated setup to derive size factors for library normalization
            # Instead of just determining the size factors in DESeq2 via virtual samples, we use 
            # a normalization from the csaw package (see https://www.rdocumentation.org/packages/csaw/versions/1.6.1/topics/normOffsets)
            # and apply a non-linear normalization. 
            # For each sample, a lowess curve is fitted to the log-counts against the log-average count. 
            # The fitted value for each bin pair is used as the generalized linear model offset for that sample. 
            # The use of the average count provides more stability than the average log-count when low counts are present for differentially bound regions.
            
            # since counts returns, by default, non-normalized counts, the following code should be fine and there is no need to also run estimateSizeFactors beforehand
 
            normFacs  = csaw::normOffsets(object, se.out = FALSE)
            
            # the normalization factors matrix should not have 0's in it
            # it should have geometric mean near 1 for each row
            # exp(mean(log(normFacs[i,]))) for each row i
            normFacs <- normFacs / exp(rowMeans(log(normFacs)))
            
            rownames(normFacs) = rownames(data)
            colnames(normFacs) = colnames(data)
            
            futile.logger::flog.info(paste0("  Using the csaw-derived feature-specific normalization factors for DESeq, which will preempt sizeFactors"))
            
            
            DESeq2::normalizationFactors(dd) <- normFacs
            
        
        } else { # TMM
            
            # This function uses the trimmed mean of M-values (TMM) method to remove composition biases, typically in background regions of the genome. 
            # The key difference from standard TMM is that precision weighting is turned off by default so as to avoid upweighting high-abundance regions. 
            # These are more likely to be bound and thus more likely to be differentially bound. 
            # Assigning excessive weight to such regions will defeat the purpose of trimming when normalizing the coverage of background regions.
            sizeFactors  = csaw::normFactors(object, se.out = FALSE)
            
            futile.logger::flog.info(paste0("  Using the csaw-derived TMM-derived normalization factors as size factors, overriding the DESeq-default size factors."))
            
            DESeq2::sizeFactors(dd) <- sizeFactors
        }
        
        dataNorm = DESeq2::counts(dd, normalized =  TRUE)
        
    } else if (normalization == "EDASeq_GC_peaks") {
        
        packageMessage = paste0("addData: The package EDASeq is currently not installed, but however needed for the normalization methods \"EDASeq_GC_peaks\"")
        .checkPackageInstallation("EDASeq", packageMessage, isWarning = TRUE)
        
        futile.logger::flog.info(paste0(" Normalizing data using the package EDASeq with the following method: ", normalization, " with the GC content as covariate."))
        # https://bioconductor.org/packages/release/bioc/vignettes/EDASeq/inst/doc/EDASeq.html#normalization

        if ("nBins" %in% names(additionalParams)) {
            nBins = additionalParams$nBins
        } else {
            nBins = 10 
        }

        # We implemented four within-lane normalization methods, namely: 
        # 1. loess robust local regression of read counts (log) on a gene feature such as GC-content (loess),
        # 2. global-scaling between feature strata using the median (median), 
        # 3. global-scaling between feature strata using the upper-quartile (upper), 
        # 4. and full-quantile normalization between feature strata (full). 
        # For a discussion of these methods in context of GC-content normalization see (Risso et al. 2011).
        if ("withinLane_method" %in% names(additionalParams)) {
            withinLane_method = additionalParams$withinLane_method
        } else {
            withinLane_method = "full"
        }
        
        # Regarding between-lane normalization, the package implements three of the methods introduced in (Bullard et al. 2010): 
        # global-scaling using the median (median), global-scaling using the upper-quartile (upper), and full-quantile normalization (full).
        if ("betweenLane_method" %in% names(additionalParams)) {
            # Read the between-lane setting itself (previously the within-lane value was used here by mistake)
            betweenLane_method = additionalParams$betweenLane_method
        } else {
            betweenLane_method = "full"
        }
        
        peaks_GC_fraction = GC_data.df$peak.GC.perc
 
        
        futile.logger::flog.info(paste0(" Using the following additional parameters for EDASeq: nBins = ", nBins, 
                                        ", withinLane_method = ", withinLane_method, 
                                        ", betweenLane_method = ", betweenLane_method,
                                        ", roundResults = ", roundResults,
                                        " as well as the automatically calculated peak GC content as covariate for withinLaneNormalization"))
        
        checkmate::assertNumeric(peaks_GC_fraction, lower = 0, upper = 1)
        checkmate::assertIntegerish(nBins, lower = 1, upper = 100)
        checkmate::assertChoice(withinLane_method, choices = c("loess","median","upper","full"))
        checkmate::assertChoice(betweenLane_method, choices = c("median","upper","full"))
        
        # Following (Risso et al. 2011), we consider two main types of effects on gene-level counts: 
        # (1) within-lane gene-specific (and possibly lane-specific) effects, e.g., related to gene length or GC-content, and 
        # (2) effects related to between-lane distributional differences, e.g., sequencing depth. 
        # Accordingly, withinLaneNormalization and betweenLaneNormalization adjust for the first and second type of effects, respectively. 
        # We recommend to normalize for within-lane effects prior to between-lane normalization.
        dataWithin <- EDASeq::withinLaneNormalization(data, y = peaks_GC_fraction, which = withinLane_method, num.bins = nBins, round = roundResults)
        dataNorm <- EDASeq::betweenLaneNormalization(dataWithin, which = betweenLane_method, round = roundResults)
        
        
    } else if (normalization == "gcqn_peaks") {
       
        if ("nBins" %in% names(additionalParams)) {
            nBins = additionalParams$nBins
        } else {
            # Use the recommended 50 bins as described in the paper as default
            nBins = 50
        }
        
        # A non-integer or non-positive number of bins would yield breaks that do not cover [0, 1]
        checkmate::assertIntegerish(nBins, lower = 1)

        # Report the number of bins that is actually used (identical to the previous message for the default of 50)
        futile.logger::flog.info(paste0(" Normalizing data using the GC-full-quantile (GC-FQ) normalization approach as described in Van den Berge et al. 2021 (https://doi.org/10.1101/2021.01.26.428252) using ", nBins, " bins."))

        peaks_GC_class = cut(GC_data.df$peak.GC.perc, breaks = seq(0,1,1/nBins), include.lowest = TRUE, ordered_result = TRUE)

        dataNorm = .gcqn(data = data, 
                         GC_class = peaks_GC_class,
                         summary = 'mean', round = roundResults)
        
        
    } else if (normalization == "DESeq2_sizeFactors") {
        
        futile.logger::flog.info(paste0(" Normalizing data using the package DESeq2 with a standard size factor normalization."))
        
        
        # Check whether enough genes have non-zero counts to actually estimate size factors reasonably
        .checkSizeFactorEligibility(dd)
        
        
        dd = DESeq2::estimateSizeFactors(dd)
        dataNorm = DESeq2::counts(dd, normalized = TRUE)
        
    } else if (normalization == "none") {
        # Nothing to do, leave the counts as they are
        dataNorm = data
        futile.logger::flog.info(paste0(" Skip normalization."))
    }
    
    dataNorm
}

# Checks whether enough features have non-zero counts in every sample for a
# DESeq2 size factor estimation.
#
# Arguments:
#   dd       Object from which raw counts are extracted via DESeq2::counts(dd, normalized = FALSE).
#   minRows  Number of zero-free features below which a warning is reported. Default 10.
#
# Reports an error (via .checkAndLogWarningsAndErrors) if no feature is free of zeros
# and a warning if fewer than minRows features are.
.checkSizeFactorEligibility <- function(dd, minRows = 10) {
    
    counts_raw = DESeq2::counts(dd, normalized = FALSE)
    
    # Mark zeros as missing so that complete.cases identifies the rows without any zero
    counts_raw[counts_raw == 0] <- NA
    
    # Count the rows directly: subsetting the matrix and calling nrow() on the result
    # returns NULL when only a single row (or a single column) remains, because the
    # subset is then dropped to a plain vector.
    nRows_nonZero = sum(stats::complete.cases(counts_raw))
    
    if (nRows_nonZero == 0) {
        message = paste0(" Every feature contains at least one zero, cannot compute log geometric means for the DESeq2 size factor normalization. This happens when the input data contain too many zeroes and happens particularly often for single-cell derived data. Decrease the number of 0s by either decreasing the number of samples or by increasing the sequencing depth.")
        .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
    }
    
    if (nRows_nonZero < minRows) {
        message = paste0("addData: Almost every feature (except ", nRows_nonZero, ") contain at least one zero. DESeq2 size factor normalization based on so few features may become unreliable. This happens when the input data contain too many zeroes and happens particularly often for single-cell derived data. Decrease the number of 0s by either decreasing the number of samples or by increasing the sequencing depth.")
        .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
    }
    
}


#The following functions are taken from https://github.com/koenvandenberge/bulkATACGC/blob/master/methods/gcqn_validated.R

### GCQN, first implementation
FQnorm <- function(counts, type= "mean") {
    # Full-quantile normalization across the columns of a matrix: every value is
    # replaced by the reference-distribution value at its within-column rank
    # (ties share the minimum rank). The reference distribution is the row-wise
    # mean (type = "mean") or median (type = "median") of the column-wise sorted values.
    ranksPerColumn  <- apply(counts, 2, rank, ties.method = "min")
    sortedPerColumn <- apply(counts, 2, sort)

    if (type == "mean") {
        referenceDistribution <- base::rowMeans(sortedPerColumn)
    } else if (type == "median") {
        referenceDistribution <- matrixStats::rowMedians(sortedPerColumn)
    }

    normalized <- apply(ranksPerColumn, 2, function(columnRanks) referenceDistribution[columnRanks])
    rownames(normalized) <- rownames(counts)
    normalized
}


# GC-full-quantile (GC-FQ) normalization. GC-FQ is similar to FQ-FQ, but relies on the observation that, in
# ATAC-seq, read count distributions are often more comparable between samples within a GC-content bin, than
# between GC-content bins within a sample (Figure 2). It therefore applies between-sample FQ normalization for
# each GC-content bin separately
.gcqn <- function(data, GC_class, summary= 'mean', round= FALSE) {
    
    # Arguments:
    #   data      Matrix; rows are normalized in groups, one group per GC class.
    #   GC_class  Factor with one entry per row of data, assigning each row to a GC class.
    #   summary   "mean" or "median"; passed to FQnorm as the reference distribution type.
    #   round     If TRUE, the normalized values are rounded.
    #
    # Returns a matrix with the dimensions and dimnames of data. Rows that belong to no
    # class (e.g., NA in GC_class) remain NA.
    
    # Validate up front: an unsupported value would otherwise only surface inside the
    # loop, or silently reuse the values of a previously processed single-row class.
    summary <- match.arg(summary, choices = c("mean", "median"))
    
    gcBinNormCounts <- matrix(NA, nrow = nrow(data), ncol = ncol(data), dimnames = list(rownames(data),colnames(data)))
    
    # Iterating over the levels directly is also safe when there are no levels at all
    for (currentClass in levels(GC_class)) {
  
        id <- which(GC_class == currentClass)
        if (length(id) == 0) next
        
        if (length(id) == 1) {
            # A single row cannot be quantile-normalized within its class; its values are
            # passed through (only rounded if requested)
            normCountBin <- data[id,]
            if (round) normCountBin <- round(normCountBin)
            gcBinNormCounts[id,] <- normCountBin
            next
        }
        
        countBin <- data[id,,drop = FALSE]
        normCountBin <- FQnorm(countBin, type = summary)
        
        if (round) normCountBin <- round(normCountBin)
        normCountBin[normCountBin < 0] <- 0
        gcBinNormCounts[id,] <- normCountBin
    }
    return(gcBinNormCounts)
}


# peaksAnnotation = GRN@annotation$peaks
# counts = getCounts(GRN, type = "peaks", asMatrix = TRUE, includeFiltered = TRUE)
# Currently not applicable as we do not have different groups, qsmooth does not run with only one group
# 
# .gcqn_qsmooth_mod <- function(counts, peaksAnnotation) {
#     
#     packageMessage = paste0("The package qsmooth is not installed, which is however needed for the chosen normalization method. Please install it and re-run this function or change the normalization method.")
#     .checkPackageInstallation("qsmooth", packageMessage)
#     
#     groups_factors = factor(rep("A", length(GRN@config$sharedSamples)))
#     
#     gcBinNormCounts <- matrix(NA, nrow=nrow(counts), ncol=ncol(counts), dimnames=list(rownames(counts),colnames(counts)))
#     for(ii in 1:nlevels(peaksAnnotation$peak.GC.class)) {
#         id <- which(peaksAnnotation$peak.GC.class==levels(peaksAnnotation$peak.GC.class)[ii])
#         countBin <- counts[id,]
#         qs <- qsmooth::qsmooth(countBin, group_factor=groups_factors)
#         normCountBin <- qs@qsmoothData
#         normCountBin <- round(normCountBin)
#         normCountBin[normCountBin< 0] <- 0
#         gcBinNormCounts[id,] <- normCountBin
#     }
#     return(gcBinNormCounts)
# }






# 
# # Needed
# # Add DESeq2 normalization factors maybe? csaw stuff
# .normalizeCounts <- function(rawCounts, method = "quantile", ) {
#     
#     checkmate::assertChoice(idColumn, colnames(rawCounts))
#     start = Sys.time()
#     
#     futile.logger::flog.info(paste0("Normalize counts. Method: ", method, ", ID column: ", idColumn))
#     
#     
#     if (method == "quantile") {
#         
#         if (length(rmCols) > 0) {
#             input = as.matrix(rawCounts[,-rmCols])
#         } else {
#             input = as.matrix(rawCounts)
#         }
#         
#         # We use limma for normalizing quantiles and NOT preprocessCore as before due to regression bugs for version >1.50
#         counts.norm = limma::normalizeQuantiles(input)
#         
#     } else if (method == "DESeq_sizeFactor") {
#         
#         if (length(rmCols) > 0) {
#             sampleData.df = data.frame( sampleID = colnames(rawCounts)[-rmCols], stringsAsFactors = FALSE)
#             countDataNew = as.data.frame(rawCounts[, -rmCols])
#         } else {
#             sampleData.df = data.frame( sampleID = colnames(rawCounts))
#             countDataNew = as.data.frame(rawCounts)
#         }
#         
#         rownames(countDataNew) = ids
#         
#         stopifnot(identical(sampleData.df$sampleID, colnames(countDataNew)))
#         
#         dd <- DESeq2::DESeqDataSetFromMatrix(countData = countDataNew,
#                                              colData = sampleData.df,
#                                              design = stats::as.formula(" ~ 1"))
#         
#         dd = DESeq2::estimateSizeFactors(dd)
#         counts.norm = DESeq2::counts(dd, normalized = TRUE)
#         
#         if (returnDESeqObj) {
#             return(dd)
#         }
#         
#         
#     } else if (method == "none") {
#         
#         if (length(rmCols) > 0) {
#             counts.norm = rawCounts[,-rmCols]
#         } else {
#             counts.norm = rawCounts
#         }
#         
#     } else  {
#         stop("Not implemented yet")
#     }
#     
#     .printExecutionTime(start)
#     
#     counts.norm = counts.norm %>% 
#         as.data.frame()  %>% 
#         tibble::as_tibble() %>% 
#         dplyr::mutate({{idColumn}} := ids) %>%
#         dplyr::select({{idColumn}}, tidyselect::everything()) 
#     
#     colnames(counts.norm) = c(idColumn, colnames_samples)
#     
#     counts.norm
#     
# }



#' Filter RNA-seq and/or peak data from a \code{\linkS4class{GRN}} object
#' 
#' This function marks genes and/or peaks as \code{filtered} depending on the chosen filtering criteria and is based on the count data AFTER
#' potential normalization as chosen when using the \code{\link{addData}} function. Most of the filters may not be meaningful and useful anymore to apply
#' after using particular normalization schemes that can give rise to, for example, negative values such as cyclic loess normalization. If normalized counts do
#' not represent counts anymore but rather a deviation from a mean or something alike, the filtering criteria usually do not make sense anymore.
#' Filtered genes / peaks will then be disregarded when adding connections in subsequent steps via \code{\link{addConnections_TF_peak}} and  \code{\link{addConnections_peak_gene}}. \strong{This function does NOT (re)filter existing connections when the \code{\linkS4class{GRN}} object already contains connections. Thus, upon re-execution of this function with different filtering criteria, all downstream steps have to be re-run.}
#' 
#' All this function does is setting (or modifying) the filtering flag in \code{GRN@data$peaks$counts_metadata} and \code{GRN@data$RNA$counts_metadata}, respectively.
#' 
#' @template GRN 
#' @param minNormalizedMean_peaks Numeric[0,] or \code{NULL}. Default \code{NULL}. Minimum mean across all samples for a peak to be retained for the normalized counts table. Set to \code{NULL} for not applying the filter.
#' Be aware that depending on the chosen normalization, this filter may not make sense and should NOT be applied. See the notes for this function.
#' @param maxNormalizedMean_peaks Numeric[0,] or \code{NULL}. Default \code{NULL}. Maximum mean across all samples for a peak to be retained for the normalized counts table. Set to \code{NULL} for not applying the filter.
#' Be aware that depending on the chosen normalization, this filter may not make sense and should NOT be applied. See the notes for this function.
#' @param minNormalizedMeanRNA Numeric[0,] or \code{NULL}. Default \code{NULL}. Minimum mean across all samples for a gene to be retained for the normalized counts table. Set to \code{NULL} for not applying the filter.
#' Be aware that depending on the chosen normalization, this filter may not make sense and should NOT be applied. See the notes for this function.
#' @param maxNormalizedMeanRNA Numeric[0,] or \code{NULL}. Default \code{NULL}. Maximum mean across all samples for a gene to be retained for the normalized counts table. Set to \code{NULL} for not applying the filter.
#' Be aware that depending on the chosen normalization, this filter may not make sense and should NOT be applied. See the notes for this function.
#' @param chrToKeep_peaks Character vector or \code{NULL}. Default \code{NULL}. Vector of chromosomes that peaks are allowed to come from. This filter can be used to filter sex chromosomes from the peaks, for example (e.g, \code{c(paste0("chr", 1:22), "chrX", "chrY")})
#' @param minSize_peaks Integer[1,] or \code{NULL}. Default 20. Minimum peak size (width, end - start) for a peak to be retained. Set to \code{NULL} for not applying the filter.
#' @param maxSize_peaks Integer[1,] or \code{NULL}. Default 10000. Maximum peak size (width, end - start) for a peak to be retained. Set to \code{NULL} for not applying the filter.
#' @param minCV_peaks Numeric[0,] or \code{NULL}. Default \code{NULL}. Minimum CV (coefficient of variation, a unitless measure of variation) for a peak to be retained. Set to \code{NULL} for not applying the filter.
#' Be aware that depending on the chosen normalization, this filter may not make sense and should NOT be applied. See the notes for this function.
#' @param maxCV_peaks Numeric[0,] or \code{NULL}. Default \code{NULL}. Maximum CV (coefficient of variation, a unitless measure of variation) for a peak to be retained. Set to \code{NULL} for not applying the filter.
#' Be aware that depending on the chosen normalization, this filter may not make sense and should NOT be applied. See the notes for this function.
#' @param minCV_genes Numeric[0,] or \code{NULL}. Default \code{NULL}. Minimum CV (coefficient of variation, a unitless measure of variation) for a gene to be retained. Set to \code{NULL} for not applying the filter.
#' Be aware that depending on the chosen normalization, this filter may not make sense and should NOT be applied. See the notes for this function.
#' @param maxCV_genes Numeric[0,] or \code{NULL}. Default \code{NULL}. Maximum CV (coefficient of variation, a unitless measure of variation) for a gene to be retained. Set to \code{NULL} for not applying the filter.
#' Be aware that depending on the chosen normalization, this filter may not make sense and should NOT be applied. See the notes for this function.
#' @template forceRerun
#' @return An updated \code{\linkS4class{GRN}} object, with added data from this function.
#' @examples 
#' # See the Workflow vignette on the GRaNIE website for examples
#' GRN = loadExampleObject()
#' GRN = filterData(GRN, forceRerun = FALSE)
#' @export
filterData <- function(GRN, 
                        minNormalizedMean_peaks = NULL, maxNormalizedMean_peaks = NULL, 
                        minNormalizedMeanRNA = NULL,  maxNormalizedMeanRNA = NULL,
                        chrToKeep_peaks = NULL,
                        minSize_peaks = 20, maxSize_peaks = 10000,
                        minCV_peaks = NULL, maxCV_peaks = NULL,
                        minCV_genes = NULL, maxCV_genes = NULL,
                        forceRerun = FALSE) {
  
  # Flags (but does not remove) peaks and genes that fail the mean / CV / chromosome / size filters.
  # The result is stored in the isFiltered columns of the peak and RNA count metadata and, 
  # if present, in the isFiltered column of the TF-peak overlap matrix.
  
  start = Sys.time()
    
  checkmate::assertClass(GRN, "GRN")
  GRN = .addFunctionLogToObject(GRN) 
  
  GRN = .makeObjectCompatible(GRN)

  # Lower bound for a "max" argument: the corresponding "min" argument if given, the fallback otherwise.
  # A plain if/else is used because all values are scalars at this point.
  .lowerBound = function(minValue, fallback) {
    if (is.null(minValue)) fallback else minValue
  }
  
  checkmate::assertNumber(minNormalizedMean_peaks, lower = 0, null.ok = TRUE)
  checkmate::assertNumber(minNormalizedMeanRNA, lower = 0, null.ok = TRUE)
  checkmate::assertNumber(maxNormalizedMean_peaks, lower = .lowerBound(minNormalizedMean_peaks, -.Machine$double.xmax), null.ok = TRUE)
  checkmate::assertNumber(maxNormalizedMeanRNA, lower = .lowerBound(minNormalizedMeanRNA, -.Machine$double.xmax), null.ok = TRUE)
  checkmate::assertCharacter(chrToKeep_peaks, min.len = 1, any.missing = FALSE, null.ok = TRUE)
  # checkmate::assertSubset(chrToKeep_peaks, GRN@data$peaks$counts_metadata %>% dplyr::pull(.data$chr) %>% unique() %>% as.character())
  
  # The size limits are single integers: assertInt rejects vectors and NA, which would otherwise 
  # be recycled silently in the size comparison later on
  checkmate::assertInt(minSize_peaks, lower = 1, null.ok = TRUE)
  checkmate::assertInt(maxSize_peaks, lower = .lowerBound(minSize_peaks, 1), null.ok = TRUE)
  checkmate::assertNumber(minCV_peaks, lower = 0, null.ok = TRUE)
  checkmate::assertNumber(maxCV_peaks, lower = .lowerBound(minCV_peaks, 0), null.ok = TRUE)
  checkmate::assertNumber(minCV_genes, lower = 0, null.ok = TRUE)
  checkmate::assertNumber(maxCV_genes, lower = .lowerBound(minCV_genes, 0), null.ok = TRUE)
  # NOTE(review): forceRerun is validated here but not used anywhere else in this function
  checkmate::assertFlag(forceRerun)
  
  # Reset any filter flags from a previous run before the filters are evaluated
  GRN@data$peaks$counts_metadata$isFiltered = FALSE
  
  if (!is.null(GRN@data$TFs$TF_peak_overlap)) {
      # TODO here
      GRN@data$TFs$TF_peak_overlap[, "isFiltered"] = 0
  }
  
  
  # Filter peaks
  futile.logger::flog.info("FILTER PEAKS")
  peakIDs.CV = .filterPeaksByMeanCV(GRN, 
                                    minMean = minNormalizedMean_peaks, maxMean = maxNormalizedMean_peaks, 
                                    minCV = minCV_peaks, maxCV = maxCV_peaks) 
  
  # Clean peaks from alternative contigs etc 
  GRN@config$parameters$chrToKeep =  chrToKeep_peaks
  peakIDs.chr = .filterPeaksByChromosomeAndSize(GRN, 
                                                chrToKeep_peaks, 
                                                minSize_peaks = minSize_peaks, maxSize_peaks = maxSize_peaks)
  
  # A peak is kept only if it passes both the mean/CV filter and the chromosome/size filter
  nPeaksBefore = nrow(GRN@data$peaks$counts_metadata)
  peaks_toKeep = intersect(peakIDs.chr, peakIDs.CV)
  futile.logger::flog.info(paste0("Collectively, filter ", nPeaksBefore - length(peaks_toKeep), " out of ", nPeaksBefore, " peaks."))
  futile.logger::flog.info(paste0("Number of remaining peaks: ", length(peaks_toKeep)))
  
  if (length(peaks_toKeep) < 1000) {
      message = paste0("filterData: Too few peaks (", length(peaks_toKeep), ") remain after filtering. At least 1000 peaks should remain. We strongly advise to adjust the filtering settings.")
      .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
     
  }
  
  # Raw counts are left untouched, peaks are only flagged in the metadata
  GRN@data$peaks$counts_metadata$isFiltered = !GRN@data$peaks$counts_metadata$peakID %in% peaks_toKeep
  
  
  if (!is.null(GRN@data$TFs$TF_peak_overlap)) {

      GRN@data$TFs$TF_peak_overlap[, "isFiltered"] = as.integer(!rownames(GRN@data$TFs$TF_peak_overlap) %in% peaks_toKeep)
  }
  
  
  # Filter genes
  # Remove genes with small rowMeans
  #Only for real data, not for background (rowmeans is equal anyway)
  futile.logger::flog.info("FILTER RNA-seq")
  genes.CV = .filterGenesByMeanCV(GRN, 
                                  minMean = minNormalizedMeanRNA, maxMean = maxNormalizedMeanRNA, 
                                  minCV = minCV_genes, maxCV = maxCV_genes) 
  
  
  # In contrast to the peaks, too few remaining genes are reported with isWarning = FALSE
  if (length(genes.CV) < 100) {
      message = paste0("Too few genes (", length(genes.CV), ") remain after filtering. At least 100 genes must remain. Adjust the filtering settings.")
      .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
  }
  
  
  GRN@data$RNA$counts_metadata$isFiltered = !GRN@data$RNA$counts_metadata$ID %in% genes.CV
  
  nRowsFlagged = length(which(GRN@data$RNA$counts_metadata$isFiltered))
  
  # Raw counts are left untouched and filtered where needed only
  futile.logger::flog.info(paste0(" Flagged ", nRowsFlagged, " rows due to filtering criteria"))
  
  GRN@config$isFiltered = TRUE
  
  .printExecutionTime(start, prefix = "")
  
  GRN
}


# Returns the IDs (column idColumn of the peak count metadata) of all peaks that are located on 
# one of the chromosomes in chrToKeep and whose size (end - start) lies within 
# [minSize_peaks, maxSize_peaks]. NULL disables the respective filter.
.filterPeaksByChromosomeAndSize <- function(GRN, chrToKeep, minSize_peaks = NULL, maxSize_peaks = NULL, idColumn = "peakID") {
  
  startTime = Sys.time()
  
  if (is.null(minSize_peaks)) {
    minSize_peaks = 1
  } 
  if (is.null(maxSize_peaks)) {
      maxSize_peaks = .Machine$double.xmax
      futile.logger::flog.info(paste0("Filter and sort peaks by size and remain only those bigger than ", minSize_peaks))
  } else {
      futile.logger::flog.info(paste0("Filter and sort peaks by size and remain only those bigger than ", minSize_peaks, " and smaller than ", maxSize_peaks))
  }
 
  
  if (is.null(chrToKeep)) {
    # No chromosome filter requested: keep every chromosome that occurs in the data
    chrToKeep = GRN@data$peaks$counts_metadata %>% dplyr::pull(.data$chr) %>% unique()
  } else {
    futile.logger::flog.info(paste0("Filter and sort peaks and remain only those on the following chromosomes: ", paste0(chrToKeep, collapse = ",")))
  }
  
  futile.logger::flog.info(paste0(" Number of peaks before filtering: ", nrow(GRN@data$peaks$counts_metadata)))
  
  # Use the .data pronoun for start / end: bare names would silently resolve to the functions
  # stats::start and stats::end if one of the columns were missing
  countsPeaks.clean = GRN@data$peaks$counts_metadata %>%
    dplyr::mutate(size = .data$end - .data$start) %>%
    dplyr::filter(.data$chr %in% chrToKeep, .data$size <= maxSize_peaks, .data$size >= minSize_peaks)
  
  futile.logger::flog.info(paste0(" Number of peaks after filtering : ", nrow(countsPeaks.clean)))
  
  .printExecutionTime(startTime)
  
  # Only the IDs are needed by the caller, so no further column reshuffling is necessary
  countsPeaks.clean[[idColumn]]
}


# Returns the peak IDs (column peak.ID of GRN@annotation$peaks) of all peaks whose peak.CV lies 
# within [minCV, maxCV] and whose peak.mean lies within [minMean, maxMean]. 
# A NULL bound switches the respective limit off.
.filterPeaksByMeanCV <- function(GRN, minMean = 0, maxMean = NULL, minCV = 0, maxCV = NULL) {
  
  timeStart = Sys.time()
  
  futile.logger::flog.info(paste0(" Number of peaks before filtering : ", nrow(GRN@annotation$peaks)))
  
  # CV limits: a missing minimum becomes 0, a missing maximum a very large number
  lowerCV = if (is.null(minCV)) 0 else minCV
  
  if (!is.null(maxCV)) {
    upperCV = maxCV
    futile.logger::flog.info(paste0("  Filter peaks by CV: Min = ", lowerCV, ", Max = ", upperCV))
  } else {
    upperCV = 9e+99
    futile.logger::flog.info(paste0("  Filter peaks by CV: Min = ", lowerCV))
  }
  
  # Mean limits: only those limits that have been set by the user are reported in the log
  hasMinMean = !is.null(minMean)
  hasMaxMean = !is.null(maxMean)
  
  if (hasMinMean && hasMaxMean) {
    futile.logger::flog.info(paste0("  Filter peaks by mean: Min = ", round(minMean, 2), ", Max = ", maxMean))  
  } else if (hasMinMean) {
    futile.logger::flog.info(paste0("  Filter peaks by mean: Min = ", round(minMean, 2)))
  } else if (hasMaxMean) {
    futile.logger::flog.info(paste0("  Filter peaks by mean: Max = ", maxMean))  
  }
  
  # As data can be pre-normalized, an unset minimum is replaced by a very small value 
  # so the filter is effectively off
  lowerMean = if (hasMinMean) minMean else -9e+99
  upperMean = if (hasMaxMean) maxMean else 9e+99
  
  peaksKept = dplyr::filter(GRN@annotation$peaks, 
                            .data$peak.CV >= lowerCV, .data$peak.CV <= upperCV, 
                            .data$peak.mean >= lowerMean, .data$peak.mean <= upperMean)
  
  futile.logger::flog.info(paste0(" Number of peaks after filtering : ", nrow(peaksKept)))
  
  .printExecutionTime(timeStart)
  
  peaksKept$peak.ID
}

# Returns the Ensembl IDs (column gene.ENSEMBL of GRN@annotation$genes) of all genes whose gene.CV 
# lies within [minCV, maxCV] and whose gene.mean lies within [minMean, maxMean]. 
# A NULL bound switches the respective limit off.
.filterGenesByMeanCV <- function(GRN, minMean = 0, maxMean = NULL, minCV = 0, maxCV = NULL) {
  
  timeStart = Sys.time()
  
  futile.logger::flog.info(paste0(" Number of genes before filtering : ", nrow(GRN@annotation$genes)))
  
  # CV limits: a missing minimum becomes 0, a missing maximum a very large number
  lowerCV = if (is.null(minCV)) 0 else minCV
  
  if (!is.null(maxCV)) {
    upperCV = maxCV
    futile.logger::flog.info(paste0("  Filter genes by CV: Min = ", lowerCV, ", Max = ", upperCV))
  } else {
    upperCV = 9e+99
    futile.logger::flog.info(paste0("  Filter genes by CV: Min = ", lowerCV))
  }
  
  # Mean limits: the log line is always written and lists only the limits that have been set
  logMean = paste0("  Filter genes by mean:")
  
  if (!is.null(minMean)) {
    lowerMean = minMean
    logMean = paste0(logMean, " Min = ", lowerMean)
  } else {
    lowerMean = -9e+99
  }
  
  if (!is.null(maxMean)) {
    upperMean = maxMean
    logMean = paste0(logMean, " Max = ", upperMean)
  } else {
    upperMean = 9e+99
  }
  
  futile.logger::flog.info(logMean)
  
  genesKept = dplyr::filter(GRN@annotation$genes, 
                            .data$gene.CV >= lowerCV, .data$gene.CV <= upperCV, 
                            .data$gene.mean >= lowerMean, .data$gene.mean <= upperMean)
  
  futile.logger::flog.info(paste0(" Number of genes after filtering : ", nrow(genesKept)))
  
  .printExecutionTime(timeStart)
  
  genesKept$gene.ENSEMBL 
}


######## TFBS ########

#' Add TFBS to a \code{\linkS4class{GRN}} object. 
#' 
#' For this, a folder that contains one TFBS file per TF in bed or bed.gz format must be given (see details). The folder must also contain a so-called translation table, see the argument \code{translationTable} for details. 
#' We provide example files for selected supported genome assemblies (hg19, hg38 and mm10, mm39) that are fully compatible with GRaNIE as separate downloads. For more information, check \url{https://difftf.readthedocs.io/en/latest/chapter2.html#dir-tfbs}.
#' 
#' @template GRN 
#' @param source Character. One of \code{custom}, \code{JASPAR2022} or \code{JASPAR2024}. Default \code{custom}. If a custom source is being used, further details about the motif folder and files will be provided (see the other function arguments). 
#' If set to \code{JASPAR2022}, the \href{https://bioconductor.org/packages/release/data/annotation/html/JASPAR2022.html}{JASPAR2022} database is used.
#' If set to \code{JASPAR2024}, the \href{https://bioconductor.org/packages/release/data/annotation/html/JASPAR2024.html}{JASPAR2024} database is used.
#' @param motifFolder Character. No default. Only relevant if \code{source = "custom"}. Path to the folder that contains the TFBS predictions. The files must be in BED format, 6 columns, one file per TF. See the other parameters for more details. The folder must also contain a so-called translation table, see the argument \code{translationTable} for details.
#' @param TFs Character vector. Default \code{all}. Only relevant if \code{source = "custom"}. Vector of TF names to include. The special keyword \code{all} can be used to include all TF found in the folder as specified by \code{motifFolder}. If \code{all} is specified anywhere, all TFs will be included. TF names must otherwise match the file names that are found in the folder, without the file suffix.
#' @param translationTable Character. Default \code{translationTable.csv}. Only relevant if \code{source = "custom"}. Name of the translation table file that is also located in the folder along with the TFBS files. This file must have the following structure: at least 2 columns, called \code{ENSEMBL} and \code{ID}. \code{ID} denotes the ID for the TF that is used throughout the pipeline (e.g., AHR) and the prefix of how the corresponding file is called (e.g., \code{AHR.0.B} if the file for AHR is called \code{AHR.0.B_TFBS.bed.gz}), while \code{ENSEMBL} denotes the ENSEMBL ID (dot suffix; e.g., ENSG00000106546, are removed automatically if present). 
#' @param translationTable_sep Character. Default \code{" "} (white space character). Only relevant if \code{source = "custom"}. The column separator for the \code{translationTable} file.
#' @param filesTFBSPattern Character. Default \code{"_TFBS"}. Only relevant if \code{source = "custom"}. Suffix for the file names in the TFBS folder that is not part of the TF name. Can be empty. For example, for the TF CTCF, if the file is called \code{CTCF.all.TFBS.bed}, set this parameter to \code{".all.TFBS"}.
#' @param fileEnding Character. Default \code{".bed"}. Only relevant if \code{source = "custom"}. File ending for the files from the motif folder.
#' @param nTFMax \code{NULL} or Integer[1,]. Default \code{NULL}. Maximal number of TFs to import. Can be used for testing purposes, e.g., setting to 5 only imports 5 TFs even though the whole \code{motifFolder} has many more TFs defined.
#' @param EnsemblVersion \code{NULL} or Character(1). Default \code{NULL}. Only relevant if \code{source} is not set to \code{custom}, ignored otherwise. The Ensembl version to use for the retrieval of gene IDs from their provided database names (e.g., JASPAR) via \code{biomaRt}.
#' By default (\code{NULL}), the newest Ensembl version available for the most recent genome assembly versions is used (see \code{biomaRt::listEnsemblArchives()} for supported versions). This parameter can override this to use a custom (older) version instead.
#' @param JASPAR_useSpecificTaxGroup \code{NULL} or Character(1). Default \code{NULL}. Should a tax group instead of the specific genome assembly be used for retrieving the TF list? This is useful for genomes other than human or mouse, for which JASPAR otherwise returns too few TFs.
#' If set to \code{NULL}, the specific genome version as provided in the object is used within \code{TFBSTools::getMatrixSet} in the \code{opts} list for \code{species}, 
#' while \code{tax_group} will be used instead if this argument is not set to \code{NULL}. For example, it can be set to \code{vertebrates} to use the vertebrates TF collection.
#' For more details, see \code{?TFBSTools::getMatrixSet}.
#' @param JASPAR_removeAmbiguousTFs \code{TRUE} or \code{FALSE}.  Default \code{TRUE}. Remove TFs for which the name as provided by JASPAR cannot be mapped uniquely to one and only one Ensembl ID? 
#' If set to \code{FALSE}, such TFs are not removed from the mapping table between JASPAR names and Ensembl IDs.
#' @param ... Additional named elements for the \code{opts} function argument from \code{?TFBSTools::getMatrixSet} that is used to query the JASPAR database.
#' @template forceRerun
#' @return An updated \code{\linkS4class{GRN}} object, with additional information added from this function(\code{GRN@annotation$TFs} in particular)
#' @examples 
#' # See the Workflow vignette on the GRaNIE website for examples
#' @export
addTFBS <- function(GRN, source = "custom", motifFolder = NULL, TFs = "all", 
                    translationTable = "translationTable.csv",  translationTable_sep = " ", filesTFBSPattern = "_TFBS", fileEnding = ".bed", 
                    nTFMax = NULL, EnsemblVersion = NULL,
                    JASPAR_useSpecificTaxGroup = NULL, JASPAR_removeAmbiguousTFs = TRUE,
                    forceRerun = FALSE, ...) {
    
    start = Sys.time()
    checkmate::assertClass(GRN, "GRN")
    
    GRN = .makeObjectCompatible(GRN)
    
    # source must be exactly one of the supported values: assertChoice (in contrast to assertSubset) 
    # rejects vectors and empty input, both of which would break the scalar comparisons below
    checkmate::assertChoice(source, c("custom", "JASPAR2022", "JASPAR2024"))
    
    if (source == "custom") {
        
        if (is.null(motifFolder)) {
            message = "When using a custom source for the TFBS, please specify a valid path to a motif folder using the motifFolder parameter"
            .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
        }
        
        # Check the folder before the file inside it so the more fundamental problem is reported first.
        # These assertions also fail for a missing (NULL) motifFolder
        checkmate::assertDirectoryExists(motifFolder)
        checkmate::assertFileExists(paste0(motifFolder, .Platform$file.sep, translationTable))
        checkmate::assertCharacter(filesTFBSPattern, len = 1, min.chars = 0)
        checkmate::assertCharacter(fileEnding, len = 1, min.chars = 1)
        checkmate::assertCharacter(translationTable_sep, len = 1, min.chars = 1)
        
    } else { # source is JASPAR2022 or JASPAR2024
        
        if (!is.null(motifFolder)) {
            message = paste0("addTFBS: When using the JASPAR database, the motifFolder, along with other function parameters, is ignored")
            .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
        }

        packageMessage = paste0("At least one of the packages ", source, ", TFBSTools, motifmatchr, rbioapi are not installed, but needed here due to source = \"", source, "\". Please install and re-run this function.")
        .checkPackageInstallation(c(source, "TFBSTools", "motifmatchr", "rbioapi"), packageMessage)  
        
        checkmate::assert(checkmate::checkNull(JASPAR_useSpecificTaxGroup), 
                          checkmate::checkSubset(JASPAR_useSpecificTaxGroup, c("plants", "vertebrates", "insects", "urochordat", "nematodes", "fungi")))
        checkmate::assertFlag(JASPAR_removeAmbiguousTFs)

    }
    
    checkmate::assertCharacter(TFs, min.len = 1)
    # nTFMax is either NULL or a single positive integer
    checkmate::assertInt(nTFMax, lower = 1, null.ok = TRUE)
    checkmate::assertFlag(forceRerun)
    
    # NOTE(review): motifFolder is reset to NULL for the JASPAR sources below, so this condition 
    # is always TRUE for them and the TF list is re-created even if forceRerun = FALSE
    if (is.null(GRN@annotation$TFs) || is.null(GRN@config$allTF) || is.null(GRN@config$directories$motifFolder) || forceRerun) {
        
        GRN = .addFunctionLogToObject(GRN)
        
        # The file-related settings are only stored for the custom source and removed otherwise
        # (ifelse cannot be used here because it cannot return NULL)
        GRN@config$TFBS_fileEnding = if (source == "custom") fileEnding else NULL
        GRN@config$TFBS_filePattern = if (source == "custom") filesTFBSPattern else NULL
        GRN@config$directories$motifFolder = if (source == "custom") motifFolder else NULL
        
       
        GRN = .getFinalListOfTFs(GRN, source, motifFolder, translationTable, translationTable_sep, 
                                                filesTFBSPattern, fileEnding, TFs, nTFMax, EnsemblVersion,
                                                JASPAR_useSpecificTaxGroup, JASPAR_removeAmbiguousTFs, ...)
        
        GRN@annotation$TFs = GRN@annotation$TFs %>%
            dplyr::select("ENSEMBL", "ID", "SYMBOL") %>% # in case the table contains already another column called TF.name as currently the case for JASPAR
            dplyr::rename(TF.ENSEMBL = "ENSEMBL", TF.ID = "ID", TF.name = "SYMBOL")  %>% 
            dplyr::select("TF.ID", "TF.name", "TF.ENSEMBL")
        
        GRN@config$allTF = GRN@annotation$TFs$TF.ID
        
    } else {
        .printDataAlreadyExistsMessage()
    }
    
    .printExecutionTime(start, prefix = "")
    
    GRN
    
}

# Determines the final list of TFs for the GRN object and stores the corresponding mapping table 
# (columns ID, ENSEMBL, ...) in GRN@annotation$TFs.
# For the JASPAR sources, the TF list is queried from the JASPAR database and the Ensembl IDs are 
# retrieved via biomaRt; for the custom source, the TFs are derived from the file names in 
# folder_input_TFBS and the mapping is read from the translation table.
# In both cases, the list is then restricted to (1) the user-specified TFs (unless "all" is given), 
# (2) TFs whose Ensembl ID is present in the RNA counts and (3) at most nTFMax TFs.
.getFinalListOfTFs <- function(GRN, source, folder_input_TFBS, translationTable, translationTable_sep, 
                               filesTFBSPattern, fileEnding, TFs, nTFMax, EnsemblVersion,
                               JASPAR_useSpecificTaxGroup = NULL, JASPAR_removeAmbiguousTFs = TRUE, ...) {
    
    isJASPAR = source %in% c("JASPAR2022", "JASPAR2024")
    
    if (isJASPAR) {
        
        futile.logger::flog.info(paste0("Querying ", source, " database. This may take a while."))
        # get TF gene names from JASPAR
        
        if (!is.null(JASPAR_useSpecificTaxGroup)) {
            # such as "vertebrates"
            options_JASPAR = list(tax_group = JASPAR_useSpecificTaxGroup, ...)
        } else {
            options_JASPAR = list(species = .getGenomeObject(GRN@config$parameters$genomeAssembly, "txID"), ...)
        }
        
        if (source == "JASPAR2022") {
            PFMatrixList <- TFBSTools::getMatrixSet(JASPAR2022::JASPAR2022, opts = options_JASPAR) 
        } else if (source == "JASPAR2024") {
            sq24 <- RSQLite::dbConnect(RSQLite::SQLite(), JASPAR2024::db(JASPAR2024::JASPAR2024()))
            # Close the database connection also if the query fails
            PFMatrixList <- tryCatch(
                TFBSTools::getMatrixSet(sq24, opts = options_JASPAR),
                finally = RSQLite::dbDisconnect(sq24)
            )
        }
        
        
        
        GRN@config$parameters$internal$PFMatrixList = PFMatrixList
        
        # TODO: What to do with TFs with no name or IDs?
        
        TFsWithTFBSPredictions <- unlist(lapply(PFMatrixList, function(x) {return(TFBSTools::name(x))}), use.names = FALSE)
        IDsWithTFBSPredictions <- unlist(lapply(PFMatrixList, function(x) {return(TFBSTools::ID(x))}), use.names = FALSE)
        # create translation table from biomart
        TFs.df = tibble::tibble(ID = IDsWithTFBSPredictions, TF.name = TFsWithTFBSPredictions) %>%
            dplyr::mutate(TF.name.lowercase = tolower(.data$TF.name))

        params.l <- .getBiomartParameters(GRN@config$parameters$genomeAssembly, suffix = "_gene_ensembl")
        
        futile.logger::flog.info(paste0("Retrieving gene annotation for JASPAR TFs. This may take a while."))
        
        
        ensembl = .biomart_getEnsembl(biomart = "genes", version = EnsemblVersion, host = params.l[["host"]],  dataset = params.l[["dataset"]])
        
        results.df = .callBiomart(mart =  ensembl, 
                                  attributes = c("external_gene_name", "ensembl_gene_id"),
                                  filters = "external_gene_name",
                                  values = TFsWithTFBSPredictions) 
        
        # Join gene symbols and JASPAR names case-insensitively
        mapping.df = results.df %>%
            dplyr::rename(SYMBOL = "external_gene_name",
                          ENSEMBL = "ensembl_gene_id") %>%
            dplyr::mutate(SYMBOL.lowercase = tolower(.data$SYMBOL)) %>%
            dplyr::full_join(TFs.df, by = c("SYMBOL.lowercase" = "TF.name.lowercase"), multiple = "all")
        
        
        # Number of rows in the mapping table per JASPAR ID; more than one row means an ambiguous mapping
        table_distinct_id = mapping.df %>%
            dplyr::pull(.data$ID) %>%
            table()
        
        # JASPAR IDs with exactly one (non-missing) Ensembl ID
        TFs_unique = mapping.df %>%
            dplyr::group_by(.data$ID) %>%
            dplyr::filter(!is.na(.data$ENSEMBL)) %>%
            dplyr::summarise(n = dplyr::n()) %>%
            dplyr::filter(.data$n == 1) %>%
            dplyr::pull(.data$ID)
            
           
        ambiguousTFs = names(which(table_distinct_id > 1))
        
        futile.logger::flog.info(paste0(" TF statistics:"))
        futile.logger::flog.info(paste0("  Number of TFs as returned by JASPAR: ", nrow(TFs.df)))
        futile.logger::flog.info(paste0("  Number of TFs with a unique mapping to an Ensembl ID: ", dplyr::n_distinct(TFs_unique)))
        futile.logger::flog.info(paste0("  Number of TFs with a non-unique mapping to an Ensembl ID: ", 
                                        length(ambiguousTFs), " (", 
                                        paste0(ambiguousTFs, collapse = ", "), ")"))
        

        if (JASPAR_removeAmbiguousTFs) {
            # Remove ambiguousTFs from list
            # SYMBOL            ENSEMBL       ID
            # 1  Foxq1 ENSRNOG00000021752 MA0040.1
            # 2  Foxq1 ENSRNOG00000062314 MA0040.1
            futile.logger::flog.info(paste0("  Removing ambiguously mapping TFs"))
            mapping.df = dplyr::filter(mapping.df, !.data$ID %in% ambiguousTFs)
        }
        
        TFsWithTFBSPredictions = TFs_unique
        
    } else { # if source == "custom"
        
        futile.logger::flog.info(paste0("Checking database folder for matching files: ", folder_input_TFBS))
        
        files = .createFileList(folder_input_TFBS, "*.bed*", recursive = FALSE, ignoreCase = FALSE, verbose = FALSE)
        TFsWithTFBSPredictions = tools::file_path_sans_ext(basename(files), compression = TRUE)
        
        # filesTFBSPattern and fileEnding are literal strings (such as ".all.TFBS" or ".bed") and 
        # not regular expressions, so they are removed with fixed = TRUE. Otherwise, "." would match 
        # any character and could remove parts of the TF name.
        # Empty strings are skipped because gsub does not accept a zero-length fixed pattern.
        for (suffixCur in c(filesTFBSPattern, fileEnding)) {
            if (nzchar(suffixCur)) {
                TFsWithTFBSPredictions = gsub(pattern = suffixCur, replacement = "", TFsWithTFBSPredictions, fixed = TRUE)
            }
        }
        
        file_input_translationTable = paste0(folder_input_TFBS, .Platform$file.sep, translationTable)
        mapping.df = .readTranslationTable(file_input_translationTable, delim = translationTable_sep)
    }
    
    futile.logger::flog.info(paste0("Found ", length(TFsWithTFBSPredictions), " matching TFs: ", paste0(TFsWithTFBSPredictions, collapse = ", ")))
    
    # Filter TFs. As documented for addTFBS, the keyword "all" selects all TFs if it is specified anywhere in TFs
    if ("all" %in% TFs) {
        if (source == "custom") {
            futile.logger::flog.info(paste0("Use all TF from the database folder ", folder_input_TFBS))
        }else{ 
            futile.logger::flog.info(paste0("Use all TF from the ", source, " database"))
        }
        
        
    } else {
        
        futile.logger::flog.info(paste0("Subset TFs to user-specified list: ", paste0(TFs, collapse = ", ")))
        TFsWithTFBSPredictions = TFsWithTFBSPredictions[TFsWithTFBSPredictions %in% TFs]
        
        if (length(TFsWithTFBSPredictions) == 0) {
            message = paste0("No TFs are left after subsetting. Make sure the TF names are identical to the names in the database folder.")
            .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
        }
        # Report the TFs that remain after subsetting (the user-specified list has been logged above already)
        futile.logger::flog.info(paste0("List of TFs: ", paste0(TFsWithTFBSPredictions, collapse = ", ")))
        
    }
    
    countsRNA = getCounts(GRN, type = "rna", permuted = FALSE)
    
    # TFs for which the Ensembl ID is not part of the RNA counts are excluded
    TF_missingRNA = mapping.df %>%
        dplyr::filter(!.data$ENSEMBL %in% countsRNA$ENSEMBL, .data$ID %in% TFsWithTFBSPredictions) %>% 
        dplyr::pull(.data$ID) %>%
        sort()
    
    if (length(TF_missingRNA) > 0) {
        futile.logger::flog.info(paste0("Filtering the following ", length(TF_missingRNA), " TFs as they are not present in the RNA-Seq data: ", paste0(TF_missingRNA, collapse = ", ")))
    }
    
    allTF = mapping.df %>%
        dplyr::filter(.data$ENSEMBL %in% countsRNA$ENSEMBL, .data$ID %in% TFsWithTFBSPredictions) %>% 
        dplyr::pull(.data$ID) %>%
        sort()
    
    nTF = length(allTF)
    if (nTF == 0) {
        message = paste0("No shared Tfs.")
        .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
    }
    
    if (!is.null(nTFMax) && nTFMax < nTF) {
        
        futile.logger::flog.info(paste0("Use only the first ", nTFMax, " TFs because nTFMax has been set."))
        allTF = allTF[seq_len(nTFMax)]
        futile.logger::flog.info(paste0("Updated list of TFs: ", paste0(allTF, collapse = ", ")))
        nTF = length(allTF)
        
        # Adjust also the JASPAR object. The source is either JASPAR2022 or JASPAR2024, never just "JASPAR".
        # NOTE(review): assumes the elements of PFMatrixList are named by their JASPAR matrix ID - confirm
        if (isJASPAR) {
            GRN@config$parameters$internal$PFMatrixList = GRN@config$parameters$internal$PFMatrixList[allTF]
        }
        
    }
    
    futile.logger::flog.info(paste0("Running the pipeline for ", nTF, " TF in total."))
    
    mapping.df.exp = dplyr::filter(mapping.df, .data$ID %in% allTF)
    if (nrow(mapping.df.exp) == 0) {
        message = paste0("Number of rows of mapping.df.exp is 0. Something is wrong with the mapping table or the filtering")
        .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
    }
    
    GRN@annotation$TFs = mapping.df.exp
    
    GRN
}

#' Overlap peaks and TFBS for a \code{\linkS4class{GRN}} object
#' 
#' If the source was set to \code{JASPAR2022} or \code{JASPAR2024} in \code{\link{addTFBS}}, the argument \code{nCores} is ignored.
#' 
#' @template GRN
#' @template nCores
#' @template forceRerun
#' @param ... No default. Only relevant if \code{source = "JASPAR2022"} or \code{source = "JASPAR2024"} has been selected in \code{addTFBS}, ignored otherwise. Additional arguments for \code{motifmatchr::matchMotifs} such as custom background nucleotide frequencies or p-value cutoffs. For more information, type \code{?motifmatchr::matchMotifs}
#' @return An updated \code{\linkS4class{GRN}} object, with added data from this function (\code{GRN@data$TFs$TF_peak_overlap} in particular)
#' @examples 
#' # See the Workflow vignette on the GRaNIE website for examples
#' GRN = loadExampleObject()
#' GRN = overlapPeaksAndTFBS(GRN, nCores = 2, forceRerun = FALSE)
#' @export
overlapPeaksAndTFBS <- function(GRN,  nCores = 2, forceRerun = FALSE, ...) {
    
    start = Sys.time()
    
    checkmate::assertClass(GRN, "GRN")
    
    GRN = .makeObjectCompatible(GRN)
    
    checkmate::assertIntegerish(nCores, lower = 1)
    checkmate::assertFlag(forceRerun)
    
    if (is.null(GRN@data$TFs$TF_peak_overlap) | forceRerun) {
        
        GRN = .addFunctionLogToObject(GRN)   
        
        futile.logger::flog.info(paste0("Overlap peaks and TFBS using ", nCores, " cores. This may take a while, particularly if the number of samples is large..."))
        
        genomeAssembly = GRN@config$parameters$genomeAssembly
        seqlengths = .getChrLengths(genomeAssembly)
        
        # Check whether we have peaks on chromosomes not part of the sequence length reference. If yes, discard them
        annotation_discared = dplyr::filter(GRN@data$peaks$counts_metadata, !.data$chr %in% names(seqlengths))
        
        if (nrow(annotation_discared) > 0) {
            
            tbl_discarded = table(annotation_discared$chr)
            tbl_discarded = tbl_discarded[which(tbl_discarded > 0)]
            
            message = paste0("overlapPeaksAndTFBS: Found ", sum(tbl_discarded), " regions from chromosomes without a reference length. ", 
                             "Typically, these are random fragments from known or unknown chromosomes. The following regions will be discarded: \n",
                             paste0(names(tbl_discarded), " (", tbl_discarded, ")", collapse = ","))
            .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)  
            
            GRN@data$peaks$counts_metadata = dplyr::filter(GRN@data$peaks$counts_metadata, .data$chr %in% names(seqlengths))
        }
        
        
        if (!is.null(GRN@config$directories$motifFolder)) {
            if (!is.null(GRN@config$TFBS_filePattern)) {
                filesTFBSPattern = GRN@config$TFBS_filePattern
            } else {
                message = "Could not retrieve value from GRN@config$TFBS_filePattern. Please rerun the function addTFBS, as this was added in a recent version of the package."
                .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
            }
        }
        
        
        
        # Construct GRanges
        consensus.gr   = .constructGRanges(GRN@data$peaks$counts_metadata, seqlengths = seqlengths, genomeAssembly)
        
        if (is.null(GRN@config$directories$motifFolder)) { # if source == "JASPAR"
            
            if (nCores > 1) {
                message = paste0("overlapPeaksAndTFBS: For TFs from JASPAR, only 1 core can currently be utilized for this function instead of the user-provided value of ", nCores)
                .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
            }
            
            TFBS_bindingMatrix.df = .intersectTFBSPeaks_JASPAR(GRN, consensus.gr, verbose = FALSE, ...)
            
            # Add the isFiltered column to make it compatible with the custom TFBS source.
            # TODO: This can be changed at some point, seems redundant
            TFBS_bindingMatrix.df = cbind(TFBS_bindingMatrix.df, rep(FALSE, nrow(TFBS_bindingMatrix.df)))
            colnames(TFBS_bindingMatrix.df)[ncol(TFBS_bindingMatrix.df)] = "isFiltered"
            GRN@data$TFs$TF_peak_overlap = TFBS_bindingMatrix.df %>% methods::as("dMatrix")
            
            
        }else{# if source == "custom"
            
            res.l = .execInParallelGen(nCores, returnAsList = TRUE, listNames = GRN@config$allTF, 
                                       iteration = seq_len(length(GRN@config$allTF)), 
                                       verbose = FALSE, 
                                       functionName = .intersectTFBSPeaks_custom, GRN = GRN, consensusPeaks = consensus.gr, filesTFBSPattern = filesTFBSPattern)
            
            
            # Sanity check
            
            TFBS_bindingMatrix.df = tibble::as_tibble(res.l)
            
            if (!all(colnames(TFBS_bindingMatrix.df) %in% GRN@config$allTF)) {
                
                message = paste0("Internal mismatch detected between the TF names and the TF names derived from the translation file (see log, column ID).", 
                                 "This may happen if the genome assembly version has been changed, but intermediate files have not been properly recreated. ",
                                 "Set the parameter forceRerun to TRUE and rerun the script.")
                
                .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
            }
            
            # new 
            filteredPeaks = dplyr::filter(GRN@data$peaks$counts_metadata, .data$isFiltered) %>% dplyr::pull(.data$peakID)
            # Collect binary 0/1 binding matrix from all TF and concatenate
            GRN@data$TFs$TF_peak_overlap = TFBS_bindingMatrix.df %>%
                dplyr::mutate(peakID = GRN@data$peaks$counts_metadata$peakID,
                              isFiltered = .data$peakID %in% filteredPeaks) %>% # TODO remove here?
                dplyr::mutate_if(is.logical, as.numeric) %>%
                dplyr::select(tidyselect::all_of(sort(GRN@config$allTF)), "isFiltered")
            
            GRN@data$TFs$TF_peak_overlap = .asSparseMatrix(as.matrix(GRN@data$TFs$TF_peak_overlap), 
                                                           convertNA_to_zero = FALSE, 
                                                           dimnames = list(GRN@data$peaks$counts_metadata$peakID, colnames(GRN@data$TFs$TF_peak_overlap)))
            
            # The order of rows is here the sorted version as it originates from the sorted consensus peak file
            # We resort it to match the countsPeaks.norm
            # TODO: Here could be an error due to differences in sorting. Also, whether or not all or the filtered version shall be used
            stopifnot(identical(rownames(GRN@data$TFs$TF_peak_overlap), GRN@data$peaks$counts_metadata$peakID))
            
        } }else {
            .printDataAlreadyExistsMessage()
        }
    
    .printExecutionTime(start, prefix = "")
    
    GRN
}


# Determine, for a single TF, which consensus peaks overlap at least one entry
# of that TF's user-provided (custom) TFBS file.
#
# Arguments:
#   GRN              GRN object. Read here: GRN@config (allTF, motif folder,
#                    TFBS file ending, genome assembly) and the peak IDs in
#                    GRN@data$peaks$counts_metadata.
#   TFIndex          Index into GRN@config$allTF that selects the TF.
#   consensusPeaks   Genomic ranges of the consensus peaks (used as query in
#                    GenomicRanges::findOverlaps); needs a "peakID" metadata column.
#   filesTFBSPattern Text placed between the TF name and the file ending when
#                    the TFBS file name is assembled.
#   verbose          Currently not used inside this function.
#
# Returns a logical vector with one element per row of
# GRN@data$peaks$counts_metadata: TRUE if the peak overlaps a TFBS of the TF.
#' @import GenomicRanges
.intersectTFBSPeaks_custom <- function(GRN, TFIndex, consensusPeaks, filesTFBSPattern, verbose = FALSE) {
  
  TFCur = GRN@config$allTF[TFIndex]
  
  file_tfbs_in  = paste0(GRN@config$directories$motifFolder, .Platform$file.sep, TFCur, filesTFBSPattern, GRN@config$TFBS_fileEnding)
  
  # Read the TFBS of the current TF
  TFBS.df = .readTFBSFile(file_tfbs_in) 
  
  # make sure we do not have any sequence names that are not in our assembly
  seqLengths = .getChrLengths(GRN@config$parameters$genomeAssembly)
  missingSeq = which(!TFBS.df$chr %in% names(seqLengths))
  if (length(missingSeq) > 0) {
      message = paste0("overlapPeaksAndTFBS: A total of ", length(missingSeq), " out of ", nrow(TFBS.df), " entries in the file ", file_tfbs_in, " contain sequence names that cannot be found in the public databases to retrieve chromosome lengths. The following sequence names are unsupported and will be removed: ", paste0(unique(TFBS.df$chr[missingSeq]), collapse = ","))
      .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
      # Safe only because missingSeq is non-empty here (negative indexing with integer(0) would drop all rows)
      TFBS.df = TFBS.df[-missingSeq,]
  }
  
  subject.gr  = .constructGRanges(TFBS.df, seqlengths = seqLengths, GRN@config$parameters$genomeAssembly)
  
  query.gr = consensusPeaks
  
  # All peak - TFBS pairs that share at least 1 bp, irrespective of strand
  overlapsAll = GenomicRanges::findOverlaps(query.gr, subject.gr, 
                                            minoverlap = 1,
                                            type = "any",
                                            select = "all",
                                            ignore.strand = TRUE)
  
  query_row_ids  = S4Vectors::queryHits(overlapsAll)
  subject_rowids = S4Vectors::subjectHits(overlapsAll)
  
  # drop = FALSE keeps the table shape also when the TFBS file provides only a single extra column
  subject_overlap_df = as.data.frame(S4Vectors::elementMetadata(subject.gr)[subject_rowids, , drop = FALSE])
  subject_overlap_df$tfbs_chr = as.character(GenomeInfoDb::seqnames(subject.gr))[subject_rowids]
  subject_overlap_df$tfbs_start = start(subject.gr)[subject_rowids]
  subject_overlap_df$tfbs_end   = end(subject.gr)  [subject_rowids]
  
  query_overlap_df = as.data.frame(S4Vectors::elementMetadata(query.gr)  [query_row_ids, "peakID", drop = FALSE])
  query_overlap_df$peak_chr    = as.character(GenomeInfoDb::seqnames(query.gr))[query_row_ids]
  query_overlap_df$peak_start  = start(query.gr)[query_row_ids]
  query_overlap_df$peak_end    = end(query.gr)  [query_row_ids]
  
  # Per peak, keep only the TFBS whose center is closest to the peak center.
  # "score" and "annotation" are optional columns of the TFBS file (files with 3 to 5 columns
  # lack one or both), so they are dropped with any_of(), which tolerates absent columns.
  final.df = cbind.data.frame(query_overlap_df, subject_overlap_df) %>%
    dplyr::select(-tidyselect::any_of(c("score", "annotation"))) %>%
    dplyr::mutate(tfbsID = paste0(.data$tfbs_chr, ":", .data$tfbs_start, "-", .data$tfbs_end),
                  coordCentTfbs = round((.data$tfbs_start + .data$tfbs_end)/2, 0),
                  coordSummit   = round((.data$peak_start + .data$peak_end)/2, 0),
                  distance = abs(.data$coordSummit - .data$coordCentTfbs))  %>%
    dplyr::group_by(.data$peakID) %>%
    dplyr::slice(which.min(.data$distance)) %>%
    dplyr::ungroup()
  
  futile.logger::flog.info(paste0(" Calculating intersection for TF ", TFCur, " finished. Number of overlapping TFBS after filtering: ", nrow(final.df)))
  
  # One logical per peak, in the row order of the peak metadata
  return(GRN@data$peaks$counts_metadata$peakID %in% final.df$peakID)
  
}



# Scan the consensus peaks for motif matches of all TFs stored in the object
# (TFBS source JASPAR).
#
# Arguments:
#   GRN            GRN object. Read here: the motif list in
#                  GRN@config$parameters$internal$PFMatrixList, the genome
#                  assembly and the peak IDs in GRN@data$peaks$counts_metadata.
#   consensusPeaks Peak ranges that are handed to motifmatchr::matchMotifs.
#   verbose        Currently not used inside this function.
#   ...            Forwarded to motifmatchr::matchMotifs.
#
# Returns the peak x motif match matrix as produced by motifmatchr::motifMatches,
# with the peak IDs set as row names.
.intersectTFBSPeaks_JASPAR <- function(GRN, consensusPeaks, verbose = FALSE, ...) {
    
    PFMatrixList = GRN@config$parameters$internal$PFMatrixList
    
    if (is.null(PFMatrixList)) {
        message = paste0(" GRN@config$parameters$internal$PFMatrixList is NULL but shouldnt be. Rerun the function addTFBS.")
        .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)  
    }
    
    overlapsAll <- motifmatchr::matchMotifs(PFMatrixList, consensusPeaks, out = 'matches', 
                                            genome = .getGenomeObject(GRN@config$parameters$genomeAssembly, type = "BSgenome"),
                                            ...)
    
    overlapsAll_mtx <- motifmatchr::motifMatches(overlapsAll)
    
    # NOTE(review): assumes consensusPeaks has the same number and order of peaks
    # as GRN@data$peaks$counts_metadata - confirm against the caller
    rownames(overlapsAll_mtx) <- GRN@data$peaks$counts_metadata$peakID
    
    # TFs without any peak x motif match are currently kept:
    # is this needed or??
    #overlapsAll_mtx <- overlapsAll_mtx[,Matrix::colSums(motifmatchr::motifMatches(overlapsAll_mtx))!=0]
    overlapsAll_mtx
    
}



# Read a tab-separated TFBS file without header and assign column names
# according to how many columns it has.
#
# Arguments:
#   file_tfbs_in  Path of the TFBS file.
#
# Returns the table read from the file. Files with more than 6 columns are cut
# down to the first 6 (a warning is logged). Column names are assigned for
# tables with 3, 4, 5 or 6 columns.
.readTFBSFile <- function(file_tfbs_in) {
  
  # Supported column layouts, keyed by the number of columns
  layouts.l = list(
    "3" = c("chr", "start", "end"),
    "4" = c("chr", "start", "end", "annotation"),
    "5" = c("chr", "start", "end", "annotation", "strand"),
    "6" = c("chr", "start", "end", "annotation", "score", "strand")
  )
  
  tfbsTable = suppressMessages(.read_tidyverse_wrapper(file_tfbs_in, type = "tsv", col_names = FALSE, ncolExpected = 3:11, verbose = FALSE))
  
  # Anything beyond the sixth column is ignored
  if (ncol(tfbsTable) > 6) {
    warnText = paste0("overlapPeaksAndTFBS: File ", file_tfbs_in, " had more than 6 columns, only the first 6 will be used.")
    .checkAndLogWarningsAndErrors(NULL, warnText, isWarning = TRUE)
    
    tfbsTable = tfbsTable[, seq_len(6)]
  }
  
  # NULL for an unsupported number of columns, in which case the names stay untouched
  layoutCur = layouts.l[[as.character(ncol(tfbsTable))]]
  if (!is.null(layoutCur)) {
    colnames(tfbsTable) = layoutCur
  }
  
  tfbsTable
}



# TODO: Add columns for TF availability here also
# GRN@config$TF_list[["all_TFBS"]] =GRN@config$allTF
# Correlate TF-level data (e.g., expression or TF activity) with peak counts
# across samples.
#
# Arguments:
#   matrix1          Data frame with an "ENSEMBL" column; all other columns are
#                    summed and correlated as per-sample values.
#   matrix_peaks     Data frame with a "peakID" column; all other columns are
#                    used as per-sample values.
#   mapping          TF translation table with the columns TF.ENSEMBL and TF.ID.
#   corMethod        Correlation method, passed on to .correlateData.
#   whitespacePrefix Prefix for the log messages.
#
# Returns a matrix with one row per peak (row names: peak IDs) and one column
# per TF of the translation table for which a correlation is available
# (column names: TF.ID). Always a matrix, also for a single TF or a single peak.
.correlateMatrices <- function(matrix1, matrix_peaks, mapping, corMethod = "pearson", whitespacePrefix = " ") {
  
  start = Sys.time()
  
  # Set the column name to just ENSEMBL to avoid column mismatch issues
  mapping$ENSEMBL = mapping$TF.ENSEMBL
  
  # Filter to only the TFs
  # Also filter 0 count genes because they otherwise cause errors downstream
  geneSums = rowSums(dplyr::select(matrix1, -"ENSEMBL"))
  isTFGene = matrix1$ENSEMBL %in% mapping$TF.ENSEMBL
  
  # Keep only Ensembl IDs from TFs we have data from
  matrix1.norm.TFs.df = dplyr::filter(matrix1, .data$ENSEMBL %in% mapping$TF.ENSEMBL, geneSums != 0)
  
  # Numbers for the log message: non-TF genes, and TF genes without any counts
  nFiltered1 = sum(!isTFGene)
  nFiltered2 = sum(isTFGene & geneSums == 0, na.rm = TRUE)
  
  diff = nrow(matrix1) - nrow(matrix1.norm.TFs.df)
  if (diff > 0) {
    message = paste0(whitespacePrefix, "Retain ", nrow(matrix1.norm.TFs.df), " unique genes from TF/gene data out of ", nrow(matrix1), 
                     " (filter ",  nFiltered1, " non-TF genes and ", nFiltered2, 
                     " TF genes with 0 counts throughout).")
    futile.logger::flog.info(message)
  }
  
  if (nrow(matrix1.norm.TFs.df) == 0) {
    message = " No rows remaining from TF/gene data after filtering against ENSEMBL IDs from HOCOMOCO. Check your ENSEMBL IDs for overlap with the HOCOMOCO translation table."
    .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
  }
  
  mapping.exp = dplyr::filter(mapping, .data$TF.ENSEMBL %in% matrix1.norm.TFs.df$ENSEMBL)
  futile.logger::flog.info(paste0(whitespacePrefix, "Correlate TF/gene data for ", nrow(matrix1.norm.TFs.df), " unique Ensembl IDs (TFs) and peak counts for ", nrow(matrix_peaks), " peaks."))
  futile.logger::flog.info(paste0(whitespacePrefix, "Note: For subsequent steps, the same gene may be associated with multiple TF, depending on the translation table."))
  
  # Correlate TF gene counts with peak counts 
  # matrix1:  rows: all TF genes, columns: all samples
  # matrix_peaks: rows: peak IDs, columns: samples
  # Transpose both so that samples are in rows for the correlation
  dataX = t(dplyr::select(matrix1.norm.TFs.df, -"ENSEMBL"))
  dataY = t(dplyr::select(matrix_peaks, -"peakID"))
  
  # Transposed result: peaks in rows, TF genes in columns
  cor.m = t(.correlateData(dataX, dataY, corMethod))
  
  colnames(cor.m) = matrix1.norm.TFs.df$ENSEMBL
  rownames(cor.m) = matrix_peaks$peakID
  
  # Some entries in the HOCOMOCO mapping can be repeated (i.e., the same ID for two different TFs, such as ZBTB4.S and ZBTB4.D)
  # Originally, we deleted these rows from the mapping and took the first entry only
  # However, since TFs with the same ENSEMBL ID can still be different with respect to their TFBS, we now duplicate such genes also in the correlation table
  
  # Order the columns by their mean correlation. sort() discards NA/NaN means, so
  # columns without any non-NA correlation are removed here.
  # drop = FALSE: keep a matrix also if only one column or one row remains
  sort.cor.m = cor.m[, names(sort(colMeans(cor.m, na.rm = TRUE))), drop = FALSE] 
  
  # Change the column names from ENSEMBL ID to TF names. 
  # Reorder to make sure the order is the same. Due to the duplication ID issue, the number of columns may increase after the column selection
  mapping.exp.filt = mapping.exp %>% dplyr::filter(.data$TF.ENSEMBL %in% colnames(sort.cor.m))
  
  sort.cor.m = sort.cor.m[, as.character(mapping.exp.filt$ENSEMBL), drop = FALSE] 
  colnames(sort.cor.m) = as.character(mapping.exp.filt$TF.ID)
  
  .printExecutionTime(start, prefix = whitespacePrefix)
  sort.cor.m
}

# Return the peak x TF overlap table of the object without filtered peaks and,
# for permuted data, with shuffled rows.
#
# Arguments:
#   GRN          GRN object; GRN@data$TFs$TF_peak_overlap is read.
#   perm         Permutation number; shuffling is only done for values > 0.
#   TF_peak_cor  Currently not used inside this function.
#   shuffle      If TRUE (and perm > 0), the table is passed through .shuffleRowsPerColumn.
#
# Returns a tibble with the rows for which isFiltered is 0, without the isFiltered column.
.filterSortAndShuffle_peakTF_overlapTable <- function(GRN, perm, TF_peak_cor = NULL, shuffle = TRUE) {
  
  peak_TF_overlapCur.df = .asMatrixFromSparse(GRN@data$TFs$TF_peak_overlap, convertZero_to_NA = FALSE) %>% 
    tibble::as_tibble() %>%
    dplyr::filter(!.data$isFiltered) %>%  # isFiltered is stored as 1 / 0; negating it yields a logical
    dplyr::select(-"isFiltered") 
  
  # TODO: Filter differently
  
  # Scalar condition, therefore the short-circuit operator
  if (perm > 0 && shuffle) {
    peak_TF_overlapCur.df = .shuffleRowsPerColumn(peak_TF_overlapCur.df)
  }
  
  peak_TF_overlapCur.df
  
}



###### TF Activity functions and other data types ######

#' Add TF activity data to GRN object using a simplified procedure for estimating it. EXPERIMENTAL.
#' 
#' We do not yet provide full support for this function. It is currently being tested. Use at your own risk.
#' 
#' @template GRN
#' @template normalization_TFActivity
#' @template name_TFActivity
#' @template forceRerun
#' @return An updated \code{\linkS4class{GRN}} object, with added data from this function
#' (\code{GRN@data$TFs[[name]]} in particular, with \code{name} referring to the value of the \code{name} parameter) 

addData_TFActivity <- function(GRN, normalization = "cyclicLoess", name = "TF_activity", forceRerun = FALSE) {
  
  checkmate::assertClass(GRN, "GRN")
  
  start = Sys.time()
  
  
  checkmate::assertChoice(normalization, c("cyclicLoess", "sizeFactors", "quantile", "none"))
  checkmate::assertCharacter(name, min.chars = 1, len = 1)
  checkmate::assertFlag(forceRerun)
  
  # Logged as an error (isWarning = FALSE) for as long as the function is under development
  message = paste0("This function is currently under development and testing and not fully functional yet.")
  .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
  
  forbiddenNames = "expression"
  .checkForbiddenNames(name, forbiddenNames)
  
  if (is.null(GRN@data$TFs[[name]]) || forceRerun) {
    
    GRN = .addFunctionLogToObject(GRN)
    futile.logger::flog.info(paste0("Calculate sample-specific TF activity from peaks data. This may take a while."))
    
    
    # Input: Raw peak counts per TF and TF-binding matrix
    # TODO: How to add raw counts here
    # TODO replace by getter
    countsPeaks = .normalizeCountMatrix(GRN, data = GRN@data$peaks$counts_orig, normalization = normalization)
    
    # Rows of the counts are matched to the rows of the overlap matrix by position below
    stopifnot(identical(nrow(countsPeaks), nrow(GRN@data$TFs$TF_peak_overlap)))
    
    #Select a maximum set of TFs to run this for
    allTF = GRN@annotation$TFs$TF.ID
    
    # Calculating TF activity is done for all TF that are available
    TF.activity.m = matrix(NA, nrow = length(allTF), ncol = length(GRN@config$sharedSamples), 
                           dimnames = list(allTF, GRN@config$sharedSamples))
    
    
    pb <- progress::progress_bar$new(total = length(allTF))
    for (TFCur in allTF) {
      
      pb$tick()
      
      # Filter count matrix to those peaks with TFBS
      # Remove names from vector
      rows1 = as.vector(which(GRN@data$TFs$TF_peak_overlap[, TFCur] == 1))
      
      # Derive normalized counts for all peaks from the foreground (i.e., peaks with a predicted TFBS)
      # drop = FALSE: a TF with exactly one bound peak must still yield a peak x sample table
      Peaks_raw.cur.fg = countsPeaks[rows1, , drop = FALSE]
      
      # Derive z-scores per peak (scale works column-wise, hence the double transpose)
      scaled = t(scale(t(Peaks_raw.cur.fg)))
      
      # TF activity per sample: mean z-score across all foreground peaks
      colmeansCur = colMeans(scaled)
      stopifnot(identical(names(colmeansCur), GRN@config$sharedSamples))
      TF.activity.m[TFCur, ] = colmeansCur
    }
    
    # Store as data frame with both TF names and Ensembl IDs, in analogy to the other types of TF data that can be imported
    GRN@data$TFs[[name]] = TF.activity.m %>%
      tibble::as_tibble(rownames = "TF.ID") %>%
      dplyr::left_join(GRN@annotation$TFs, by = "TF.ID") %>%
      dplyr::select("ENSEMBL", "TF.ID", tidyselect::all_of(GRN@config$sharedSamples))
    
    # Update available connection types
    GRN@config$TF_peak_connectionTypes = unique(c(GRN@config$TF_peak_connectionTypes, name))
    futile.logger::flog.info(paste0("TF activity successfully calculated. Data has been stored under the name ", name))
    
  } else {
      .printDataAlreadyExistsMessage(slotName = paste0("data$TFs$", name))
  }
  
  
  .printExecutionTime(start)
  
  GRN
  
}


#' Import externally derived TF Activity data. EXPERIMENTAL.
#' 
#' We do not yet provide full support for this function. It is currently being tested. Use at your own risk.
#' 
#' @template GRN
#' @param data Data frame. No default. Data with TF data. Must contain the columns given by \code{idColumn} and \code{nameColumn} as well as one column for each of the shared samples of the object.
#' @template name_TFActivity
#' @param idColumn Character. Default \code{ENSEMBL}. Name of the ID column. Does not need to be unique as some TFs may correspond to the same ID.
#' @param nameColumn Character. Default \code{TF.name}. Must be unique for each TF / row.
#' @template normalization_TFActivity
#' @template forceRerun
#' @return An updated \code{\linkS4class{GRN}} object, with added data from this function.  
importTFData <- function(GRN, data, name, idColumn = "ENSEMBL", nameColumn = "TF.name", normalization = "none", forceRerun = FALSE) {
  
  checkmate::assertClass(GRN, "GRN")
 
  
  start = Sys.time()
  
  
  checkmate::assertDataFrame(data, min.cols = 2, min.rows = 1)
  checkmate::assertChoice(idColumn, colnames(data))
  checkmate::assertChoice(nameColumn, colnames(data))
  checkmate::assertSubset(GRN@config$sharedSamples, colnames(data), empty.ok = FALSE)
  checkmate::assertCharacter(name, min.chars = 1, any.missing = FALSE, len = 1)
  checkmate::assertChoice(normalization, c("cyclicLoess", "sizeFactors", "quantile", "none"))
  
  # Replace all whitespace characters. This is done before the check for existing data
  # because the data is stored under the modified name
  name = stringr::str_replace_all(name, "\\s", "_")
  
  if (is.null(GRN@data$TFs[[name]]) || forceRerun) {
    GRN = .addFunctionLogToObject(GRN)
    futile.logger::flog.info(paste0("Importing external TF data under the name ", name)) 
    
    # Check whether TF have been added already
    if (is.null(GRN@annotation$TFs)) {
      message = paste0("No TFBS afound in the object. Make sure to run the function addTFBS first.")
      .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
    }
    
    # Rename idColumn and name column to "default"
    data = dplyr::rename(data, ENSEMBL = !!(idColumn))
    idColumn = "ENSEMBL"
    
    forbiddenNames = c("TF_activity", "expression")
    .checkForbiddenNames(name, forbiddenNames)
    
    idColumns = idColumn
    data = dplyr::rename(data, TF.name = !!(nameColumn))
    idColumns = c(idColumns, "TF.name")
    
    # Check uniqueness of TF names
    if (dplyr::n_distinct(data$TF.name) < nrow(data)) {
      message = "TF names must be unique, but at least 2 TFs have the same TF name or TF names are missing."
      .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
    }
    
    # Make sure the column is reset
    # NOTE(review): the join below is keyed on TF.name, so a suffixed column could only arise
    # for ENSEMBL, not for TF.name - verify that this reset targets the intended column
    GRN@annotation$TFs[[paste0("TF.name.", name)]] = NULL
    
    # TODO: Repeated execution results in more and more rows
    # NOTE(review): assumes GRN@annotation$TFs has a TF.name column - confirm
    GRN@annotation$TFs = dplyr::left_join(GRN@annotation$TFs, data[, idColumns], 
                                                     by = "TF.name", suffix = c("", paste0(".", name)))
    
    # Only TF.names are unique
    # TODO fix
    countsNorm = .normalizeCountMatrix(GRN, data = data %>% dplyr::select(-"ENSEMBL"), normalization = normalization)
    
    # Check overlap of ENSEMBL IDs
    countsNorm$ENSEMBL = data$ENSEMBL
    
    nRowBefore = nrow(countsNorm)
    countsNorm.subset = dplyr::filter(countsNorm, .data$ENSEMBL %in% GRN@annotation$TFs$TF.ENSEMBL)
    nRowAfter = nrow(countsNorm.subset)
    if (nRowAfter == 0) {
      message = "No rows overlapping with translation table, check ENSEMBL IDs."
      .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
    } else if (nRowAfter < nRowBefore) {
      message = paste0("importTFData: Retain ", nRowAfter, " from ", nRowBefore, " rows after filtering for overlapping ENSEMBL IDs from the translation table. This will raise a warning but this is usually expected that some rows are filtered")
      .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
    }
    
    
    # Check overlap of sample names: keep the ID columns and the shared samples only
    nColBefore = ncol(countsNorm.subset)
    countsNorm.subset = dplyr::select(countsNorm.subset, tidyselect::all_of(idColumns), tidyselect::all_of(GRN@config$sharedSamples))
    nColAfter = ncol(countsNorm.subset)
    if (nColBefore > nColAfter) {
      
      if (nColAfter == length(idColumns)) {
        message = "No samples overlapping."
        .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
      } else {
        message = "importTFData: Not all samples overlap"
        .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
      }
    }
    
    # Ensembl IDs may not be unique as different TFs can have the same Ensembl ID. 
    # Therefore, keep the TF names along with the Ensembl IDs, same as with the TF Activity matrix
    
    GRN@data$TFs[[name]] = countsNorm.subset %>%
      dplyr::select("ENSEMBL", "TF.name", tidyselect::all_of(GRN@config$sharedSamples)) %>%
      tibble::as_tibble()
    
    # Update available connection types
    GRN@config$TF_peak_connectionTypes = unique(c(GRN@config$TF_peak_connectionTypes, name))
    
    
  } else {
    
    futile.logger::flog.info(paste0("Data already exists in object, nothing to do. Set forceRerun = TRUE to regenerate and overwrite."))
    
  }
  
  .printExecutionTime(start)
  
  GRN
}

######## AR classification ########

#' Run the activator-repressor classification for the TFs for a \code{\linkS4class{GRN}} object
#' 
#' @template GRN
#' @param significanceThreshold_Wilcoxon Numeric[0,1]. Default 0.05. Significance threshold for Wilcoxon test that is run in the end for the final classification. See the Vignette and *diffTF* paper for details.
#' @param plot_minNoTFBS_heatmap Integer[1,]. Default 100. Minimum number of TFBS for a TF to be included in the heatmap that is part of the output of this function.
#' @param deleteIntermediateData \code{TRUE} or \code{FALSE}.  Default \code{TRUE}. Should intermediate data be deleted before returning the object after a successful run? Due to the size of the produced intermediate data, we recommend setting this to \code{TRUE}, but if memory or object size are not an issue, the information can also be kept.
#' @template plotDiagnosticPlots
#' @template outputFolder
#' @template corMethod
#' @template forceRerun
#' @return An updated \code{\linkS4class{GRN}} object, with additional information added from this function. 
#' @examples 
#' # See the Workflow vignette on the GRaNIE website for examples
#' # GRN = loadExampleObject()
#' # GRN = AR_classification_wrapper(GRN, outputFolder = ".", forceRerun = FALSE)
#' @export
AR_classification_wrapper <- function(GRN, significanceThreshold_Wilcoxon = 0.05, 
                                      plot_minNoTFBS_heatmap = 100, deleteIntermediateData = TRUE,
                                      plotDiagnosticPlots = TRUE, outputFolder= NULL,
                                      corMethod = "pearson",
                                      forceRerun = FALSE) {
  
  start = Sys.time()
    
  checkmate::assertClass(GRN, "GRN")
  GRN = .addFunctionLogToObject(GRN)
  
  GRN = .makeObjectCompatible(GRN)
  
  checkmate::assertNumber(significanceThreshold_Wilcoxon, lower = 0, upper = 1)
  checkmate::assertNumber(plot_minNoTFBS_heatmap, lower = 1)
  checkmate::assertFlag(deleteIntermediateData)
  checkmate::assertFlag(plotDiagnosticPlots)
  checkmate::assertChoice(corMethod, c("pearson", "bicor", "spearman"))
  checkmate::assertFlag(forceRerun)
  
  outputFolder = .checkOutputFolder(GRN, outputFolder)
  
  GRN@data$TFs$classification$TF.translation.orig = GRN@annotation$TFs
  
  if (is.null(GRN@data$TFs$TF_peak_overlap)) {
    message = paste0("Could not find peak - TF matrix. Run the function overlapPeaksAndTFBS first / again")
    .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
  }
  
  GRN@config$parameters$internal$plot_minNoTFBS_heatmap = plot_minNoTFBS_heatmap
  
  allPermutations = 0:.getMaxPermutation(GRN)
  
  connectionTypes = as.character(unique(GRN@connections$TF_peaks[["0"]]$main$TF_peak.connectionType))
  
  for (connectionTypeCur in connectionTypes) {
    
    futile.logger::flog.info(paste0(" Connection type ", connectionTypeCur, "\n"))
    
    for (permutationCur in allPermutations) {
      
      futile.logger::flog.info(paste0(" ", .getPermStr(permutationCur), "\n"))
      permIndex = as.character(permutationCur)
      
      if (is.null(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]])) {
        if (is.null(GRN@data$TFs$classification[[permIndex]])) {
          GRN@data$TFs$classification[[permIndex]] = list()
        }
        GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]] = list()
      }
      
      if (is.null(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_cor_median_foreground) |
          is.null(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_cor_median_background) |
          is.null(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_foreground) |
          is.null(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_background) |
          is.null(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor) |
          forceRerun
      ) {
        
        GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]] = list()
        
        if (connectionTypeCur == "expression") {
          counts1 = getCounts(GRN, type = "rna", permuted = as.logical(permutationCur))
          
        } else {
          
          # TF activity data
          counts1 = GRN@data$TFs[[connectionTypeCur]] %>% 
            dplyr::select(-"TF.ID")
          
        } 
        
        futile.logger::flog.info(paste0(" Correlate ", connectionTypeCur, " and peak counts"))
        
        counts_peaks = getCounts(GRN, type = "peaks", permuted = FALSE)
        
        TF_peak_cor = .correlateMatrices(matrix1      = counts1, 
                                         matrix_peaks = counts_peaks, 
                                         GRN@annotation$TFs, corMethod)
        
        peak_TF_overlapCur.df = .filterSortAndShuffle_peakTF_overlapTable(GRN, permutationCur, TF_peak_cor)
        res.l = .computeForegroundAndBackgroundMatrices(peak_TF_overlapCur.df, TF_peak_cor)
        GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_cor_median_foreground = res.l[["median_foreground"]]
        GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_cor_median_background = res.l[["median_background"]]
        GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_foreground   = res.l[["foreground"]]
        GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_background   = res.l[["background"]]
        
        GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor = TF_peak_cor
      }
      
      # Final classification: Calculate thresholds by calculating the quantiles of the background and compare the real values to the background
      # TODO: Clarify whether the default convertZero_to_NA= TRUE is really needed here
      if (is.null(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$act.rep.thres.l) | forceRerun) {
        GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$act.rep.thres.l = 
          .calculate_classificationThresholds(.asMatrixFromSparse(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_background), 
                                              GRN@config$parameters)
      }
      
      if (is.null(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF.classification) | forceRerun) {
        
        GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF.classification = 
          .finalizeClassificationAndAppend(
            output.global.TFs = GRN@data$TFs$classification$TF.translation.orig %>% dplyr::mutate(TF = .data$TF.ID), 
            median.cor.tfs = GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_cor_median_foreground, 
            act.rep.thres.l = GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$act.rep.thres.l, 
            par.l = GRN@config$parameters, 
            t.cor.sel.matrix = .asMatrixFromSparse(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_foreground), 
            t.cor.sel.matrix.non = .asMatrixFromSparse(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_background), 
            significanceThreshold_Wilcoxon = significanceThreshold_Wilcoxon)
      }
      
      
      # PLOTS FOR THE RNA-SEQ CLASSIFICATION
      
      if (plotDiagnosticPlots) {
        
        outputFolder = .checkOutputFolder(GRN, outputFolder)
        
        suffixFile = .getPermutationSuffixStr(permutationCur)
        
        
        fileCur = paste0(outputFolder, .getOutputFileName("plot_class_density"), "_", connectionTypeCur, suffixFile, ".pdf")
        if (!file.exists(fileCur) | forceRerun) {
          .plot_density(.asMatrixFromSparse(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_foreground),
                        .asMatrixFromSparse(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_background), 
                        corMethod,
                        fileCur, width = 5, height = 5)
        } else {
          futile.logger::flog.info(paste0("  File ", fileCur, " already exists, not overwriting since forceRerun = FALSE"))
        }
        
        fileCur = paste0(outputFolder, .getOutputFileName("plot_class_medianClass"), "_", connectionTypeCur, suffixFile, ".pdf")
        if (!file.exists(fileCur) | forceRerun) {
          .plot_AR_thresholds(
            median.cor.tfs = .asMatrixFromSparse(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_cor_median_foreground), 
            median.cor.tfs.non = .asMatrixFromSparse(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_cor_median_background), 
            par.l = GRN@config$parameters, 
            act.rep.thres.l = GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$act.rep.thres.l, 
            corMethod = corMethod,
            file = fileCur,  width = 4, height = 8)
        } else {
          futile.logger::flog.info(paste0("  File ", fileCur, " already exists, not overwriting since forceRerun = FALSE"))
        }
        
        fileCur = paste0(outputFolder, .getOutputFileName("plot_class_densityClass"), "_", connectionTypeCur, suffixFile, ".pdf")
        if (!file.exists(fileCur) | forceRerun) {
          
          TF_peak_cor = GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor
          peak_TF_overlapCur.df = .filterSortAndShuffle_peakTF_overlapTable(GRN, permutationCur, TF_peak_cor)
          .plot_heatmapAR(TF.peakMatrix.df = peak_TF_overlapCur.df, 
                          TF_mapping.df.exp = GRN@annotation$TFs %>% dplyr::mutate(TF = .data$TF.ID), 
                          sort.cor.m = TF_peak_cor, 
                          par.l = GRN@config$parameters, 
                          corMethod = corMethod,
                          median.cor.tfs = .asMatrixFromSparse(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_cor_median_foreground), 
                          median.cor.tfs.non = .asMatrixFromSparse(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_cor_median_background), 
                          act.rep.thres.l = GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$act.rep.thres.l, 
                          finalClassification = GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF.classification,
                          file = fileCur, width = 8, height = 15)
        } else {
          futile.logger::flog.info(paste0("  File ", fileCur, " already exists, not overwriting since forceRerun = FALSE"))
        }
      }
      
      
      if (deleteIntermediateData) {
        
        GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_cor_median_foreground = NULL
        GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_cor_median_background = NULL
        GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_foreground = NULL
        GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_background = NULL
        GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$act.rep.thres.l = NULL
        
      } else {
        # Save as sparse matrices
        GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_foreground = 
          .asSparseMatrix(as.matrix(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_foreground), convertNA_to_zero = TRUE)
        GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_background = 
          .asSparseMatrix(as.matrix(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_background), convertNA_to_zero = TRUE)
      }
      
    } # end for all permutations
    
  } # end of for each connection type
  
  .printExecutionTime(start, prefix = "")
  
  GRN
  
}

# Make sure the object lists the available TF-peak connection types (compatibility helper).
# If GRN@config$TF_peak_connectionTypes is already set, the object is returned untouched. Otherwise, 
# "expression" is registered, plus "TF_activity" if GRN@data$TFs$TF_activity is present.
# Returns the (possibly updated) GRN object.
.checkAndUpdateConnectionTypes <- function(GRN) {
  
  # Nothing to do if the connection types have been recorded before
  if (!is.null(GRN@config$TF_peak_connectionTypes)) {
    return(GRN)
  }
  
  typesAvailable = "expression"
  
  if (!is.null(GRN@data$TFs$TF_activity)) {
    typesAvailable = c(typesAvailable, "TF_activity")
  }
  
  GRN@config$TF_peak_connectionTypes = typesAvailable
  
  GRN
}


######## Connections ########

#' Add TF-peak connections to a \code{\linkS4class{GRN}} object
#' 
#' After the execution of this function, QC plots can be plotted with the function \code{\link{plotDiagnosticPlots_TFPeaks}} unless this has already been done by default due to \code{plotDiagnosticPlots = TRUE}
#' 
#' @template GRN 
#' @template plotDiagnosticPlots
#' @template plotDetails
#' @template outputFolder
#' @template corMethod
#' @param connectionTypes Character vector. Default \code{expression}. Vector of connection types to include for the TF-peak connections. If an additional connection type is specified here, it has to be available already within the object (EXPERIMENTAL). See the function \code{\link{addData_TFActivity}} for details.
#' @param removeNegativeCorrelation  Vector of \code{TRUE} or \code{FALSE}. Default \code{FALSE}. EXPERIMENTAL. Must be a logical vector of the same length as the parameter \code{connectionTypes}. Should negatively correlated TF-peak connections be removed for the specific connection type? For connection type expression, the default is \code{FALSE}, while for any TF Activity related connection type, we recommend setting this to \code{TRUE}.  
#' @param maxFDRToStore Numeric[0,1]. Default 0.3. Maximum TF-peak FDR value for a TF-peak connection to be permanently stored in the object. This parameter has a large influence on the overall memory size of the object, and we recommend not storing connections with a high FDR due to their sheer number.
#' @param addForBackground \code{TRUE} or \code{FALSE}.  Default \code{TRUE}. Add connections also for background data. Leave at \code{TRUE} unless you know what you are doing.
#' @param useGCCorrection \code{TRUE} or \code{FALSE}.  Default \code{FALSE}. EXPERIMENTAL. Should a GC-matched background be used when calculating FDRs? For more details, see the Package Details vignette.
#' @param percBackground_size Numeric[0,100]. Default 75. EXPERIMENTAL. Percentage of the background to use as basis for sampling. If set to 0, an automatic iterative procedure will identify the maximum percentage so that all relevant GC bins with a rel. frequency above 5\% from the foreground can be matched. For more details, see the Package Details vignette. Only relevant if \code{useGCCorrection} is set to \code{TRUE}, ignored otherwise.
#' @param percBackground_resample \code{TRUE} or \code{FALSE}.  Default \code{TRUE}. EXPERIMENTAL. Should resampling be enabled for those GC bins for which not enough background peaks are available? For more details, see the Package Details vignette. Only relevant if \code{useGCCorrection} is set to \code{TRUE}, ignored otherwise.
#' @template forceRerun
#' @seealso \code{\link{plotDiagnosticPlots_TFPeaks}}
#' @return An updated \code{\linkS4class{GRN}} object, with additional information added from this function. 
#' @examples 
#' # See the Workflow vignette on the GRaNIE website for examples
#' GRN = loadExampleObject()
#' GRN = addConnections_TF_peak(GRN, plotDiagnosticPlots = FALSE, forceRerun = FALSE)
#' @export
addConnections_TF_peak <- function(GRN, plotDiagnosticPlots = TRUE, plotDetails = FALSE, outputFolder = NULL, 
                                    corMethod = "pearson", 
                                    connectionTypes = c("expression"),
                                    removeNegativeCorrelation = c(FALSE),
                                    maxFDRToStore = 0.3, 
                                    addForBackground = TRUE,
                                    useGCCorrection = FALSE, percBackground_size = 75, percBackground_resample = TRUE,
                                    forceRerun = FALSE) {
  
  start = Sys.time()

  checkmate::assertClass(GRN, "GRN")
  
  GRN = .makeObjectCompatible(GRN)

  checkmate::assertFlag(plotDiagnosticPlots)
  checkmate::assertFlag(plotDetails)
  checkmate::assertChoice(corMethod, c("pearson", "bicor", "spearman"))
  checkmate::assertFlag(addForBackground)
  
  GRN = .checkAndUpdateConnectionTypes(GRN) # For compatibility with older versions
  checkmate::assertSubset(connectionTypes, GRN@config$TF_peak_connectionTypes, empty.ok = FALSE)
  
  # One flag per connection type is required
  checkmate::assertLogical(removeNegativeCorrelation, any.missing = FALSE, len = length(connectionTypes))
  
  checkmate::assertNumber(maxFDRToStore, lower = 0, upper = 1)
  checkmate::assertFlag(useGCCorrection)
  checkmate::assertNumber(percBackground_size, lower = 0, upper = 100)
  checkmate::assertFlag(percBackground_resample)
  checkmate::assertFlag(forceRerun)
  
  # Both operands are scalars (forceRerun is an asserted flag), so use the short-circuit operator
  if (is.null(GRN@connections$TF_peaks) || forceRerun) {
    
    GRN = .addFunctionLogToObject(GRN)
    GRN@connections$TF_peaks = list()
    
    GRN@config$parameters$corMethod_TF_Peak = corMethod
    GRN@config$parameters$useGCCorrection = useGCCorrection 
    
    if (is.null(GRN@data$TFs$TF_peak_overlap)) {
      msg = paste0("Could not find peak - TF matrix. Run the function overlapPeaksAndTFBS first / again")
      .checkAndLogWarningsAndErrors(NULL, msg, isWarning = FALSE)
    }
    
    if (useGCCorrection) {
      
      # Now we need the genome annotation packages to calculate the GC content of the peak regions
      .checkAndLoadPackagesGenomeAssembly(GRN@config$parameters$genomeAssembly)
      
      # Only add the GC annotation if it is not part of the peak annotation yet
      if (!"peak.GC.class" %in% colnames(GRN@annotation$peaks)) {
        GC.data.df = .calcGCContentPeaks(GRN)
        GRN@annotation$peaks = dplyr::left_join(GRN@annotation$peaks, GC.data.df, by = "peak.ID") 
      }
      
    }
    
    for (permutationCur in 0:.getMaxPermutation(GRN)) {
      
      # Permutation 0 is always processed, all others only if background connections are requested
      if (!addForBackground && permutationCur != 0) {
        next
      }
      
      futile.logger::flog.info(paste0("\n", .getPermStr(permutationCur), "\n"))
      permIndex = as.character(permutationCur)
      
      resFDR.l  = .computeTF_peak.fdr(GRN, perm = permutationCur, connectionTypes = connectionTypes, corMethod = corMethod, 
                                      removeNegativeCorrelation = removeNegativeCorrelation, 
                                      maxFDRToStore = maxFDRToStore, useGCCorrection = useGCCorrection,
                                      percBackground_size = percBackground_size, threshold_percentage = 0.05,
                                      percBackground_resample = percBackground_resample, plotDetails = plotDetails
                                      )
      
      # TODO: remove extra columns again
      GRN@connections$TF_peaks[[permIndex]]$main            = .optimizeSpaceGRN(stats::na.omit(resFDR.l[["main"]]))
      GRN@connections$TF_peaks[[permIndex]]$connectionStats = resFDR.l[["connectionStats"]] 
      
      futile.logger::flog.info(paste0("Finished. Stored ", nrow(GRN@connections$TF_peaks[[permIndex]]$main), " connections with an FDR <= ", maxFDRToStore))
      
      # GC plots, empty when no GC correction should be done
      GRN@stats$GC$TFs_GC_correction_plots[[permIndex]] = resFDR.l[["plots_GC"]]
      GRN@stats$GC$TFs_GC_correction[[permIndex]]       = resFDR.l[["GC_details"]]
      rm(resFDR.l)
      
    } #end for each permutation
    
    
    if (plotDiagnosticPlots) {
      
      plotDiagnosticPlots_TFPeaks(GRN, outputFolder = outputFolder, plotDetails = FALSE, forceRerun = forceRerun)
      
    }
    
  } else {
    .printDataAlreadyExistsMessage()
  }
  
  .printExecutionTime(start, prefix = "")
  
  GRN
}



# Compute TF-peak correlations and empirical FDRs for one permutation (internal helper).
#
# For every connection type in connectionTypes and every TF that is present in both the peak-TF overlap 
# table and the correlation matrix, peaks with an overlap value of 1 form the foreground and peaks with 
# a value of 0 the background. Foreground and background correlations are counted along the thresholds in 
# GRN@config$parameters$internal$stepsFDR, separately for the positive and negative direction, and the 
# background counts are rescaled to the foreground size before the FDR per correlation bin is derived.
#
# Arguments:
#   GRN                       GRN object
#   perm                      Permutation index; forwarded to the overlap table helper and, for "expression", 
#                             via as.logical() to getCounts
#   connectionTypes           Character vector of connection types ("expression" uses the RNA counts, any other 
#                             value is looked up in GRN@data$TFs)
#   corMethod                 Correlation method, passed on to .correlateMatrices
#   useGCCorrection           If TRUE, the background is sampled per GC bin to mimic the foreground GC distribution
#   removeNegativeCorrelation Logical vector parallel to connectionTypes; connections with r < 0 are dropped 
#                             for the selected types
#   maxFDRToStore             Connections are kept only if TF_peak.fdr or TF_peak.fdr_orig is <= this value
#   percBackground_size       Percentage of the background used for the GC-matched sampling; 0 triggers 
#                             .findMaxBackgroundSize
#   threshold_percentage      Passed on to .findMaxBackgroundSize
#   percBackground_resample   If TRUE, GC bins with too few background peaks are sampled with replacement
#   plotDetails               Must be FALSE, TRUE is reported as an error
#   backgroundSize_min        Minimum number of background peaks to aim for when GC correction is used
#
# Returns a list with the elements main (stored connections), connectionStats (per TF and correlation bin), 
# plots_GC (GC diagnostic plots per connection type and TF) and GC_details (per connection type, NULL 
# without GC correction).
.computeTF_peak.fdr <- function(GRN, perm, connectionTypes, corMethod = "pearson", useGCCorrection = FALSE, 
                                removeNegativeCorrelation, maxFDRToStore = 0.3 , percBackground_size = 75, threshold_percentage = 0.05,
                                percBackground_resample = TRUE, plotDetails = FALSE, backgroundSize_min = 1000) {
  
  start = Sys.time()
  checkmate::assertIntegerish(backgroundSize_min, lower = 100)
  
  if (plotDetails) {
    message = "Plotting details is not supported currently. Set plotDetails = FALSE."
    .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
  }
  
  peak_TF_overlap.df = .filterSortAndShuffle_peakTF_overlapTable(GRN, perm)
  
  plots_GC.l = list()
  
  # Lists that contain all the data
  connections_all.l = list()
  connectionStats_all.l = list()
  
  # GC details, one element per connection type. Has to be initialized outside of the loop over the
  # connection types, otherwise only the details for the last connection type would be returned
  GC_classes_all.l = NULL
  if (useGCCorrection) {
    GC_classes_all.l = list()
  }
  
  # List of connection types for which r < 0 should be filtered
  connectionTypes_removeNegCor = connectionTypes[removeNegativeCorrelation]
  
  for (connectionTypeCur in connectionTypes) {
    
    plots_GC.l[[connectionTypeCur]] = list()
    
    futile.logger::flog.info(paste0("Calculate TF-peak links for connection type ", connectionTypeCur))
    start2 = Sys.time()
    
    if (connectionTypeCur == "expression") {
      
      counts_connectionTypeCur = getCounts(GRN, type = "rna", permuted = as.logical(perm))
      
    } else {
      
      # Keep only Ensembl ID here
      counts_connectionTypeCur = GRN@data$TFs[[connectionTypeCur]] %>% 
        dplyr::select(-"TF.ID")
      
    } 
    
    futile.logger::flog.info(paste0(" Correlate ", connectionTypeCur, " and peak counts"))
    
    counts_peaks = getCounts(GRN, type = "peaks", permuted = FALSE)
    
    if (nrow(counts_peaks) == 0) {
      message = "There were no unfiltered peaks left. Rerun filterData and make sure that not all peaks are filtered."
      .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
    }
    
    # Filtering of the matrices happens automatically within the next function
    peaksCor.m = .correlateMatrices( matrix1 = counts_connectionTypeCur, 
                                     matrix_peaks = counts_peaks, 
                                     GRN@annotation$TFs, 
                                     corMethod,
                                     whitespacePrefix = "  ")
    
    allTF = intersect(colnames(peak_TF_overlap.df), colnames(peaksCor.m))
    checkmate::assertIntegerish(length(allTF), lower = 1)
    
    futile.logger::flog.info(paste0(" Run FDR calculations for ", length(allTF), " TFs for which TFBS predictions and ",
                                    connectionTypeCur, " data for the corresponding gene are available."))
    if (length(allTF) < ncol(peak_TF_overlap.df) || length(allTF) < ncol(peaksCor.m)) {
      
      TF_missing = setdiff(colnames(peak_TF_overlap.df), colnames(peaksCor.m))
      if (length(TF_missing) > 0) futile.logger::flog.info(paste0("  Skip the following ", length(TF_missing), " TF due to missing data or because they are marked as filtered: ", paste0(TF_missing, collapse = ",")))
    }
    
    stopifnot(nrow(peaksCor.m) == nrow(peak_TF_overlap.df))
    
    # Subset into a copy that is specific for the current connection type so that the full overlap table 
    # stays available for the remaining connection types. drop = FALSE keeps the two-dimensional 
    # structure also when only a single TF is left
    peak_TF_overlapCur.df = peak_TF_overlap.df[, allTF, drop = FALSE]
    
    sort.cor.m.sort = peaksCor.m[, colnames(peak_TF_overlapCur.df), drop = FALSE]
    
    stopifnot(identical(colnames(sort.cor.m.sort), colnames(peak_TF_overlapCur.df)))
    
    if (useGCCorrection) {
      futile.logger::flog.info(paste0("  Compute FDR for each TF (GC-aware). This may take a while..."))
      cols_select = c("peak.ID", "peak.GC.class", "peak.GC.perc", "peak.width")
    } else {
      futile.logger::flog.info(paste0("  Compute FDR for each TF. This may take a while...")) 
      cols_select = c("peak.ID", "peak.width")
    }
    
    peaksFiltered = GRN@data$peaks$counts_metadata %>% 
      dplyr::filter(!.data$isFiltered)  %>%
      dplyr::left_join(GRN@annotation$peaks %>% dplyr::select(tidyselect::any_of(cols_select)), by = c("peakID" = "peak.ID"))
    
    if (!useGCCorrection) {
      minPerc = 100
      # Since we do not control for this, we set it to NA
      background_match_success = NA
    }
    
    # Stores GC-specific extra data
    GC_classes_perTF.l = list()
    pb <- progress::progress_bar$new(total = length(allTF))
    
    # The automatic search is only performed (and therefore only announced) with GC correction
    if (useGCCorrection && percBackground_size == 0) {
      futile.logger::flog.info(paste0("For each TF: Trying to automatically find the highest minimum percentage so that mimicking all GC bins with a relative frequency of at least ", 
                                      threshold_percentage, " in the background works"))
    }
    
    for (TFCur in allTF) {
      
      futile.logger::flog.info(paste0("TF ", TFCur))
      
      pb$tick()
      
      # Foreground: peaks that overlap with a TFBS of the TF. Background: all other peaks
      overlapYes = which(peak_TF_overlapCur.df[,TFCur] == 1)
      overlapNo  = which(peak_TF_overlapCur.df[,TFCur] == 0)
      
      tp <- sort.cor.m.sort[overlapYes, TFCur]
      n_tp       = length(tp)
      
      peaksForeground   = peaksFiltered %>% dplyr::slice(overlapYes)
      peaksBackground   = peaksFiltered %>% dplyr::slice(overlapNo)
      # Replaced by the GC-matched sample below if GC correction is used
      peaksBackgroundGC = peaksBackground
      
      nPeaksForeground = nrow(peaksForeground)
      nPeaksBackground = nrow(peaksBackground)
      
      # The foreground can be empty: No peak actually overlaps with the TFBS from the TF
      if (nPeaksForeground == 0) {
        futile.logger::flog.info(paste0(" No peaks in foreground, skipping TF"))
        next
      }
      
      # Correlations for the full, unadjusted background. Identical with and without GC correction
      fp_orig    = sort.cor.m.sort[overlapNo, TFCur]
      n_fp_orig  = length(fp_orig)
      
      # GC-adjust background and select background regions according to foreground
      if (useGCCorrection) {
        
        # Without any background peak, nothing can be sampled and the minimum background size below 
        # could never be reached
        if (nPeaksBackground == 0) {
          futile.logger::flog.info(paste0(" No peaks in background, skipping TF"))
          next
        }
        
        # Get GC info from those peaks from the foreground
        GC_classes_foreground.df = peaksForeground %>%
          dplyr::group_by(.data$peak.GC.class) %>%
          dplyr::summarise(n = dplyr::n(), peak_width_mean = mean(.data$peak.width), peak_width_sd = sd(.data$peak.width)) %>%
          dplyr::ungroup() %>% 
          tidyr::complete(.data$peak.GC.class, fill = list(n = 0)) %>%
          dplyr::mutate(n_rel = .data$n / nPeaksForeground, type = "foreground") %>%
          dplyr::arrange(dplyr::desc(.data$n_rel))
        
        GC_classes_background.df = peaksBackground %>%
          dplyr::group_by(.data$peak.GC.class) %>%
          dplyr::summarise(n = dplyr::n(), peak_width_mean = mean(.data$peak.width), peak_width_sd = sd(.data$peak.width)) %>%
          dplyr::ungroup() %>% 
          tidyr::complete(.data$peak.GC.class, fill = list(n = 0)) %>%
          dplyr::mutate(n_rel = .data$n / nPeaksBackground, type = "background_orig")
        
        background_match_success = TRUE
        
        if (percBackground_size > 0) {
          minPerc = percBackground_size
        } else {
          
          minPerc = .findMaxBackgroundSize(GC_classes_foreground.df, GC_classes_background.df, peaksBackground, 
                                           threshold_percentage =  threshold_percentage)
          
          if (minPerc == 0) {
            # Mimicking the foreground failed, approximate the background as good as possible with 5% of the peaks
            minPerc = 5
            background_match_success = FALSE
          }
        }
        
        targetNoPeaks = minPerc/100 * nPeaksBackground
        
        # Ensure a minimum no of points in background, even if this sacrifices the mimicking of the distributions
        if (targetNoPeaks < backgroundSize_min) {
          
          # With a user-defined percentage, a too small background is only a consequence of the chosen value.
          # In automatic mode, enlarging the background means that the match found before is given up
          if (percBackground_size <= 0) {
            background_match_success = FALSE
          }
          
          # Increase in steps of 5% until the minimum size is reached. The enlarged target is used for the
          # sampling below so that it is in line with the percentage that is reported as percBackgroundUsed
          while (targetNoPeaks < backgroundSize_min) {
            minPerc = minPerc + 5
            targetNoPeaks = minPerc/100 * nPeaksBackground
          }
          
        }
        
        # Add sel. minPerc to table and calculate required frequencies
        GC_classes_all.df = dplyr::full_join(GC_classes_foreground.df, GC_classes_background.df, suffix = c(".fg",".bg"), by = "peak.GC.class") %>%
          dplyr::mutate(maxSizeBackground = .data$n.bg / .data$n_rel.fg,
                        n.bg.needed = floor(.data$n_rel.fg * targetNoPeaks), 
                        n.bg.needed.ratio = .data$n.bg / .data$n.bg.needed )
        
        GC_classes_perTF.l[[TFCur]] = GC_classes_all.df
        
        # Now we know the percentage, lets select an appropriate background
        # Sample peaks from background for each GC-bin specifically
        peakIDsSel = c()
        for (i in seq_len(nrow(GC_classes_foreground.df))) {
          
          peaksBackgroundGCCur =  peaksBackground %>% dplyr::filter(.data$peak.GC.class == GC_classes_foreground.df$peak.GC.class[i])
          nAvailableCur = nrow(peaksBackgroundGCCur)
          
          if (nAvailableCur == 0) {
            next
          }
          
          nNeededCur = GC_classes_all.df$n.bg.needed[i]
          
          if (!is.finite(nNeededCur)) {
            stop("This should not happen. Please report to the devs.")
          }
          
          # Select the minimum, which for low % GC classes is smaller than the required number to mimic the foreground 100%
          # This works either perfectly when resampling is wanted or as best as possible if not
          if (percBackground_resample) {
            nPeaksCur = nNeededCur
          } else {
            nPeaksCur = min(nNeededCur, nAvailableCur)
          }
          
          # Sampling with replacement is only necessary (and only done) if the bin does not contain enough peaks
          replaceCur = percBackground_resample && nNeededCur > nAvailableCur
          
          peakIDsSel = c(peakIDsSel, 
                         peaksBackgroundGCCur %>% dplyr::sample_n(nPeaksCur, replace = replaceCur) %>% dplyr::pull(.data$peakID))
          
        }
        
        # We cannot simply filter for the peak IDs, as some peaks may be present multiple times
        peaksBackgroundGC = peaksBackground[match(peakIDsSel, peaksBackground$peakID),] 
        
        if (is.null(plots_GC.l[[connectionTypeCur]][[TFCur]])) {
          plots_GC.l[[connectionTypeCur]][[TFCur]] = .generateTF_GC_diagnosticPlots(TFCur, GC_classes_foreground.df, GC_classes_background.df, GC_classes_all.df, peaksForeground, peaksBackground, peaksBackgroundGC)
        }
        
        # Select the rows by their peak IDs
        fp = sort.cor.m.sort[peakIDsSel, TFCur]
        
      } else {
        # TODO: Redundant so far in this case
        fp = fp_orig
      }
      
      n_fp = length(fp)
      
      cor.peak.tf = tibble::tibble(peak.ID    = rownames(sort.cor.m.sort)[overlapYes], 
                                   TF_peak.r  = sort.cor.m.sort[overlapYes, TFCur],
                                   TF.ID    = as.factor(TFCur),
                                   TF_peak.connectionType = as.factor(connectionTypeCur))
      
      for (directionCur in c("pos","neg")) {
        
        indexStr = paste0(connectionTypeCur, "_", TFCur, "_", directionCur)
        
        n_tp2.vec = n_fp2.vec =  n_fp2_orig.vec = rep(NA_real_, length(GRN@config$parameters$internal$stepsFDR)) 
        
        if (directionCur == "pos") {
          
          stepsCur = GRN@config$parameters$internal$stepsFDR
          rightOpen = FALSE
        } else { 
          
          stepsCur = rev(GRN@config$parameters$internal$stepsFDR)  
          rightOpen = TRUE
        }
        
        # Unique necessary to eliminate a duplication for one bin [-1,-0.95]
        levelsBins = unique(as.character(cut(stepsCur, breaks = stepsCur, right = rightOpen, include.lowest = TRUE)))
        
        cor.peak.tf$TF_peak.r_bin <- as.character(cut(cor.peak.tf$TF_peak.r, breaks = stepsCur, right = rightOpen, include.lowest = TRUE))
        
        # Count the foreground and background correlations beyond each threshold
        for (i in seq_along(stepsCur)) {
          
          thres = stepsCur[i]
          
          # na.rm = TRUE for all sums here to make sure NAs will not cause a problem
          if (directionCur == "pos") {
            
            n_tp2.vec[i]      = sum(tp >= thres, na.rm = TRUE)
            n_fp2.vec[i]      = sum(fp >= thres, na.rm = TRUE)
            n_fp2_orig.vec[i] = sum(fp_orig >= thres, na.rm = TRUE)
            
          } else {
            
            n_tp2.vec[i]      = sum(tp < thres, na.rm = TRUE)
            n_fp2.vec[i]      = sum(fp < thres, na.rm = TRUE)
            n_fp2_orig.vec[i] = sum(fp_orig < thres, na.rm = TRUE)
          }
          
        }
        
        # Normalize the false positives to make them comparable to the true positives by dividing by the ratio
        # The maximum number is then identical to the maximum for the true positives
        n_fp2_norm.vec   = (n_fp2.vec/(n_fp/n_tp))
        n_fp2_orig_norm.vec = (n_fp2_orig.vec/(n_fp_orig/n_tp))
        
        #TODO: Decide for a variant.  +1 for raw or unnormalized values?
        
        fdr.curve = tibble::tibble( 
          TF_peak.r_bin2  = stepsCur,
          tpvalue = n_tp2.vec, 
          
          fpvalue = n_fp2.vec,
          fpvalue_orig = n_fp2_orig.vec,
          
          fpvalue_norm = n_fp2_norm.vec,
          fpvalue_norm_orig = n_fp2_orig_norm.vec,
          
          TF_peak.fdr_orig   = (n_fp2_orig_norm.vec) / (n_fp2_orig_norm.vec + n_tp2.vec),
          TF_peak.fdr     = (n_fp2_norm.vec) / (n_fp2_norm.vec + n_tp2.vec),
          # TF_peak.fdr_orig   = (n_fp2_orig_norm.vec + 1) / (n_fp2_orig_norm.vec + n_tp2.vec + 1),
          # TF_peak.fdr     = (n_fp2_norm.vec +1) / (n_fp2_norm.vec + n_tp2.vec + 1),
          # 
          TF_peak.fdr_direction     = directionCur,
          TF_peak.r_bin      = 
            as.character(cut(.data$TF_peak.r_bin2, breaks = stepsCur, right = rightOpen, include.lowest = TRUE))
        ) 
        
        # Derive connection summaries for all TF for both directions
        # Get no. of connections per bin, here make sure to also include that have n = 0
        
        # Remove negatively correlated connections for the specific connection type for which this was asked for
        if (connectionTypeCur %in% connectionTypes_removeNegCor ) {
          
          cor.peak.tf = dplyr::filter(cor.peak.tf, .data$TF_peak.r >= 0)
          
        }
        
        # TODO: add number of connections in this correlation bin in non-corrected and GC-corrected data as additional column
        connectionStats_all.l[[indexStr]] =  cor.peak.tf %>%
          dplyr::group_by(.data$TF_peak.r_bin) %>%
          dplyr::summarise(n = dplyr::n()) %>%
          dplyr::ungroup() %>%
          dplyr::right_join(fdr.curve, by = "TF_peak.r_bin", multiple = "all") %>%
          dplyr::mutate(n = tidyr::replace_na(.data$n, replace = 0), 
                        TF.ID = as.factor(TFCur), 
                        TF_peak.connectionType = factor(connectionTypeCur, levels = connectionTypes),
                        TF_peak.fdr_direction  = factor(directionCur, levels = c("pos", "neg")),
                        TF_peak.r_bin = factor(.data$TF_peak.r_bin, levels = levelsBins),
                        
                        # Collect extra information, currently however a bit repetitively stored
                        nForeground              = nPeaksForeground,
                        nBackground              = nrow(peaksBackgroundGC),
                        nBackground_orig         = nPeaksBackground,
                        percBackgroundUsed       = minPerc,
                        background_match_success = background_match_success) %>%
          dplyr::select("TF.ID", "TF_peak.r_bin", 
                        "n", "tpvalue", "fpvalue", "fpvalue_norm", 
                        "TF_peak.fdr", 
                        "TF_peak.fdr_orig", "TF_peak.fdr_direction", 
                        "TF_peak.connectionType",
                        tidyselect::contains("ground")) %>%
          dplyr::rename(n_tp = "tpvalue", n_fp = "fpvalue", n_fp_norm = "fpvalue_norm") %>%
          dplyr::distinct() %>%
          dplyr::arrange(.data$TF_peak.r_bin)
        
        
        # Collect data for additional QC plots before they are filtered
        
        # Filter now high FDR connections to save space and time
        # DISCARD other rows altogether
        # Left join here is what we want, as we need this df only for "real" data
        tblFilt.df = dplyr::left_join(cor.peak.tf, fdr.curve, by = "TF_peak.r_bin", multiple = "all") %>%
          dplyr::filter(.data$TF_peak.fdr <= maxFDRToStore | .data$TF_peak.fdr_orig <= maxFDRToStore) %>%
          dplyr::select("TF.ID", "TF_peak.r_bin", "TF_peak.r", "TF_peak.fdr", 
                        "TF_peak.fdr_orig", "peak.ID", "TF_peak.fdr_direction", 
                        "TF_peak.connectionType", tidyselect::contains("value"))
        
        
        if (!plotDetails) {
          tblFilt.df = dplyr::select(tblFilt.df, -tidyselect::contains("value"))
        }
        
        connections_all.l[[indexStr]] = tblFilt.df
        
      } # end for directionCur in c("pos", "neg")
      
    } # end for each TF
    
    # Add additional elements
    if (useGCCorrection) {
      
      GC_classes_all.l[[connectionTypeCur]] =  GC_classes_perTF.l %>%
        data.table::rbindlist(idcol = "TF.ID") %>% 
        tibble::as_tibble() %>%
        dplyr::select(-tidyselect::starts_with("type")) %>%
        dplyr::mutate_if(is.character, as.factor)
      
    }
    
    .printExecutionTime(start2, prefix = "  ")
    
  } # end for each connectionType
  
  .printExecutionTime(start, prefix = "")
  list(main            = data.table::rbindlist(connections_all.l), 
       connectionStats = data.table::rbindlist(connectionStats_all.l), 
       plots_GC        = plots_GC.l,
       GC_details      = GC_classes_all.l
  )
}

# Find the largest percentage of the background that still allows mimicking the foreground GC distribution.
#
# The percentages 100, 95, ..., 10, 5 are tried in decreasing order. For each percentage and each GC bin 
# of the foreground, the number of required background peaks (relative foreground frequency times the 
# downscaled background size, rounded) is compared with the number of available background peaks in 
# that bin. A percentage fails as soon as one bin with a relative foreground frequency above 
# threshold_percentage cannot be served.
#
# Arguments:
#   GC_classes_foreground.df  Table with one row per GC bin and the columns peak.GC.class and n_rel
#   GC_classes_background.df  Table with one row per GC bin and the columns peak.GC.class and n
#   peaksBackground           Table with all background peaks, only the number of rows is used
#   threshold_percentage      Minimum relative frequency of a GC bin in the foreground so that it can cause 
#                             the procedure to fail. Very small bins that have no weight in the foreground 
#                             will therefore not cause a failure of the mimicking
#
# Returns the first (i.e., highest) percentage for which no bin failed, or 0 if none was found.
.findMaxBackgroundSize <- function(GC_classes_foreground.df, GC_classes_background.df, peaksBackground, threshold_percentage = 0.05) {
  
  nPeaksBackground = nrow(peaksBackground)
  nClasses         = nrow(GC_classes_foreground.df)
  
  # Without any GC bin in the foreground, there is nothing that could be mimicked
  if (nClasses == 0) {
    return(0)
  }
  
  # Compare the GC classes as character so that factors and character vectors are handled alike
  classesBackground = as.character(GC_classes_background.df$peak.GC.class)
  
  # Iterate over different background sizes
  for (percCur in c(seq(100,10,-5),5)) {
    
    targetNoPeaks = percCur/100 * nPeaksBackground
    
    futile.logger::flog.debug(paste0("  Downscaling background to ", percCur, "%"))
    
    # Check for each GC bin in the foreground whether we have enough background peaks to sample from.
    # An explicit flag is used so that a skipped bin at the end of the table cannot hide a success
    allBinsMatched = TRUE
    
    for (i in seq_len(nClasses)) {
      
      n_rel        = GC_classes_foreground.df$n_rel[i]
      GC.class.cur = GC_classes_foreground.df$peak.GC.class[i]
      
      if (!is.finite(n_rel)) {
        futile.logger::flog.debug(paste0("  GC.class ", GC.class.cur, ": n_rel is not finite for i = ", i, ", skipping"))
        next
      }
      
      requiredNoPeaks = round(n_rel * targetNoPeaks, 0)
      
      # Check in background. A bin that is missing in the background table provides no peaks
      availableNoPeaks = sum(GC_classes_background.df$n[which(classesBackground == as.character(GC.class.cur))], na.rm = TRUE)
      
      ignoredStr = ""
      if (n_rel <= threshold_percentage) {
        ignoredStr = paste0(" (ignored because relative frequency < ", threshold_percentage, ")")
      }
      
      futile.logger::flog.debug(paste0("   GC.class ", GC.class.cur, ": Required: ", requiredNoPeaks, ", available: ", availableNoPeaks, ignoredStr))
      
      if (availableNoPeaks < requiredNoPeaks && n_rel > threshold_percentage) {
        futile.logger::flog.debug(paste0("   Mimicking distribution FAILED (GC class ", GC.class.cur, " could not be mimicked. Aborting current downscaling value."))
        allBinsMatched = FALSE
        break
      }
      
    }  # end for all GC bins of the foreground
    
    if (allBinsMatched) {
      futile.logger::flog.info(paste0(" Found max. percentage of background that is able to mimick the foreground: ", percCur))
      # Percentages are tried in decreasing order, so the first success is the maximum
      return(percCur)
    }
    
  } # end for all percentages
  
  0
}




#' Add peak-gene connections to a \code{\linkS4class{GRN}} object
#' 
#' After the execution of this function, QC plots can be plotted with the function \code{\link{plotDiagnosticPlots_peakGene}} unless this has already been done by default due to \code{plotDiagnosticPlots = TRUE}
#' 
#' @export
#' @template GRN
#' @param  overlapTypeGene Character. \code{"TSS"} or \code{"full"}. Default \code{"TSS"}. If set to \code{"TSS"}, only the TSS of the gene is used as reference for finding genes in the neighborhood of a peak. If set to \code{"full"}, the whole annotated gene (including all exons and introns) is used instead. 
#' @template corMethod
#' @param  promoterRange Integer >=0. Default 250000. The size of the neighborhood in bp to correlate peaks and genes in vicinity. Only peak-gene pairs will be correlated if they are within the specified range. Increasing this value leads to higher running times and more peak-gene pairs to be associated, while decreasing results in the opposite.
#' @param TADs Data frame with TAD domains. Default \code{NULL}. If provided, the neighborhood of a peak is defined by the TAD domain the peak is in rather than a fixed-sized neighborhood. The expected format is a BED-like data frame with at least 3 columns in this particular order: chromosome, start, end, the 4th column is optional and will be taken as ID column. All additional columns as well as column names are ignored. For the first 3 columns, the type is checked as part of a data integrity check.
#' @param TADs_mergeOverlapping \code{TRUE} or \code{FALSE}. Default \code{FALSE}. Should overlapping TADs be merged? Only relevant if TADs are provided.
#' @param knownLinks \code{NULL} or a data frame with exactly two columns. Both columns must be of type character and they must both contain 
#' genomic coordinates in the usual format: \code{chr:start-end}, while the 2 separators between the three elements can be chosen by the user.
#' The first column denotes the **bait**, the promoter coordinates that are overlapped with the genes (usually their TSS, unless specified 
#' otherwise via the parameter \code{overlapTypeGene}), while the second column denotes the **other end (OE)** coordinates, which is overlapped with the
#' peaks/enhancers from the GRN object. 
#' **NOTE: The provided column names are ignored and column 1 is interpreted as bait column and column 2 as OE column unless column names are exactly `bait` and `OE`.**
#' For more details, see the Workflow vignette.
#' @param knownLinks_separator Character vector of length 1 or 2. Default \code{c(":", "-")}. Separator(s) for the character columns that specify the genomic locations.
#' The first entry splits the chromosome from the position, while the second entry splits the start and end coordinates. If only one separator is given, the same will be used for both. 
#' @param knownLinks_useExclusively \code{TRUE} or \code{FALSE}. Default \code{FALSE}. If kept at \code{FALSE} (the default),
#' specified \code{knownLinks} will be used in addition to the regular peak-gene links that are identified via the default method. 
#' If set to \code{TRUE}, only the \code{knownLinks} will be used.
#' @param shuffleRNACounts \code{TRUE} or \code{FALSE}. Default \code{TRUE}. Should the RNA sample labels be shuffled in addition to 
#' testing random peak-gene pairs for the background? When set to \code{FALSE}, only peak-gene pairs are shuffled, but
#' for each pair, the counts from peak and RNA that are correlated are matched (i.e., sample 1 counts from peak data are compared to sample 1 counts from RNA).
#' If set to \code{TRUE}, however, the RNA sample labels are in addition shuffled so that sample 1 counts from peak data are compared to sample 4 data from RNA, for example.
#' Shuffling truly randomizes the resulting background eGRN. Note that this parameter and its influence is still being investigated. Until version 1.0.7, this parameter (although not existent explicitly)
#' was implicitly set to \code{TRUE}.
#' @template nCores
#' @template plotDiagnosticPlots
#' @param plotGeneTypes List of character vectors. Default \code{list(c("all"), c("protein_coding"))}. Each list element may consist of one or multiple gene types that are plotted collectively in one PDF. The special keyword \code{"all"} denotes all gene types that are found (be aware: this typically contains 20+ gene types, see \url{https://www.gencodegenes.org/pages/biotypes.html} for details).
#' @template outputFolder
#' @template forceRerun
#' @seealso \code{\link{plotDiagnosticPlots_peakGene}}
#' @return An updated \code{\linkS4class{GRN}} object, with additional information added from this function. 
#' @examples 
#' # See the Workflow vignette on the GRaNIE website for examples
#' GRN = loadExampleObject()
#' GRN = addConnections_peak_gene(GRN, promoterRange=10000, plotDiagnosticPlots = FALSE)
addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = "pearson",
                                     promoterRange = 250000, 
                                     TADs = NULL, TADs_mergeOverlapping = FALSE,
                                     knownLinks = NULL, knownLinks_separator = c(":", "-"), knownLinks_useExclusively = FALSE,
                                     shuffleRNACounts = TRUE,
                                     nCores = 4, 
                                     plotDiagnosticPlots = TRUE, 
                                     plotGeneTypes = list(c("all"), c("protein_coding")), 
                                     outputFolder = NULL,
                                     forceRerun = FALSE) {
  
  start = Sys.time() 
    
  checkmate::assertClass(GRN, "GRN")
  
  GRN = .makeObjectCompatible(GRN)
  
  # Argument validation. checkmate::assert() combines check* functions, which return an informative
  # message on failure, so that the resulting error explains what was wrong with the argument
  checkmate::assertChoice(overlapTypeGene, c("TSS", "full"))
  checkmate::assertChoice(corMethod, c("pearson", "bicor", "spearman"))
  checkmate::assertIntegerish(promoterRange, lower = 0)
  checkmate::assert(checkmate::checkNull(TADs), checkmate::checkDataFrame(TADs))
  checkmate::assertFlag(TADs_mergeOverlapping)
  checkmate::assertDataFrame(knownLinks, types = c("character", "character"), min.rows = 1, ncols = 2, null.ok = TRUE)
  checkmate::assertCharacter(knownLinks_separator, min.len = 1, max.len = 2)
  checkmate::assertFlag(knownLinks_useExclusively)
  checkmate::assertIntegerish(nCores, lower = 1)
  checkmate::assertFlag(plotDiagnosticPlots) 
  checkmate::assertFlag(shuffleRNACounts)
  
  # Each element of plotGeneTypes must only name gene types that occur in the annotation (or "all")
  for (elemCur in plotGeneTypes) {
      checkmate::assertSubset(elemCur, c("all", unique(as.character(GRN@annotation$genes$gene.type))) %>% stats::na.omit(), empty.ok = FALSE)
  }
  
  checkmate::assert(checkmate::checkNull(outputFolder), checkmate::checkDirectoryExists(outputFolder))
  checkmate::assertFlag(forceRerun)
  
  .checkPackageRobust(corMethod)
  
  # As this is independent of the underlying GRN, it has to be done only once
  
  if (is.null(GRN@connections$peak_genes[["0"]]) || forceRerun) {
    
    GRN = .addFunctionLogToObject(GRN)
    GRN@config$parameters$promoterRange = promoterRange
    GRN@config$parameters$corMethod_peak_gene = corMethod
    
    # Discard any previously stored peak-gene connections before recalculating them
    GRN@connections$peak_genes = list()
    
    # Check which gene types are available for the particular genome annotation
    # Use all of them to collect statistics. Filtering can be done later. Just remove NA
    gene.types = unique(GRN@annotation$genes$gene.type) %>% stats::na.omit()

    # All arguments are passed by name so that the call does not depend on the order of the formals
    GRN = .calculatePeakGeneCorrelations(GRN,
                                       TADs = TADs,
                                       mergeOverlappingTADs = TADs_mergeOverlapping,
                                       knownLinks = knownLinks, 
                                       knownLinks_separator = knownLinks_separator, 
                                       knownLinks_useExclusively = knownLinks_useExclusively,
                                       neighborhoodSize = promoterRange,
                                       gene.types = as.character(gene.types),
                                       corMethod = corMethod,
                                       shuffleRNA = shuffleRNACounts,
                                       overlapTypeGene = overlapTypeGene,
                                       nCores = nCores)
    
  } else {
      .printDataAlreadyExistsMessage()
  } 
  
  if (plotDiagnosticPlots) {
    
    GRN = plotDiagnosticPlots_peakGene(GRN, outputFolder, gene.types = plotGeneTypes, useFiltered = FALSE, forceRerun = forceRerun)
    
  }
  
  .printExecutionTime(start, prefix = "")
  
  GRN

}

# Internal workhorse of addConnections_peak_gene().
#
# Steps, in this order:
#  1. Optionally overlap the non-filtered peaks with user-provided TADs.
#  2. Optionally parse user-provided bait-OE links and overlap baits with genes and OEs with peaks.
#  3. Collect the peak-gene pairs to test (neighborhood- or TAD-based, and/or the known links).
#  4. For the real data and for each background permutation, correlate peak and RNA counts for all
#     pairs and store the resulting table in GRN@connections$peak_genes under "0", "1", ...
#
# Arguments:
#   GRN                        GRN object.
#   TADs                       NULL or BED-like data frame (chr, start, end, optional ID as 4th column).
#   mergeOverlappingTADs       Flag. Merge overlapping TADs into one domain?
#   knownLinks                 NULL or data frame with two character columns (bait, OE).
#   knownLinks_separator       Character vector of length 1 or 2 with the coordinate separators.
#   knownLinks_useExclusively  Flag. Use only the known links instead of adding them to the other links?
#   neighborhoodSize           Neighborhood size in bp, passed on to .calculatePeakGeneOverlaps().
#   gene.types                 Character vector of gene types to include.
#   overlapTypeGene            "TSS" or "full".
#   corMethod                  Correlation method, passed on to .correlateDataWrapper.
#   shuffleRNA                 Flag. Use permuted RNA counts for the background iteration?
#   nCores                     Number of cores for the parallel correlation step.
#   chunksize                  Number of peak-gene pairs per parallel work package.
#
# Returns: the GRN object with GRN@connections$peak_genes populated.
#' @import tidyr
.calculatePeakGeneCorrelations <- function(GRN,
                                           TADs = NULL, 
                                           mergeOverlappingTADs = FALSE, 
                                           knownLinks, knownLinks_separator, knownLinks_useExclusively,
                                           neighborhoodSize = 250000,
                                           gene.types = c("protein_coding"),
                                           overlapTypeGene = "TSS",
                                           corMethod = "pearson",
                                           shuffleRNA = FALSE,
                                           nCores = 1,
                                           chunksize = 50000) {
  
  start.all = Sys.time()
  futile.logger::flog.info(paste0("\nPreparing data\n"))

  genomeAssembly = GRN@config$parameters$genomeAssembly
  
  consensusPeaks = GRN@data$peaks$counts_metadata %>% dplyr::filter(!.data$isFiltered)
  
  # Preprocess TAD boundaries
  if (!is.null(TADs)) {
    
    futile.logger::flog.info(paste0("Integrate Hi-C data and overlap peaks and Hi-C domains"))  
    
    # Check format
    checkmate::assertCharacter(unlist(TADs[,1]))
    checkmate::assertIntegerish(unlist(TADs[,2]), lower = 1)
    checkmate::assertIntegerish(unlist(TADs[,3]), lower = 1)
    
    colnames(TADs)[seq_len(3)] = c("chr", "start", "end")
    
    # Use the 4th column as ID if present, otherwise construct an ID from the coordinates
    if (ncol(TADs) < 4) {
      TADs = dplyr::mutate(TADs, ID = paste0(.data$chr, ":", .data$start, "-", .data$end)) 
    } else {
      colnames(TADs)[4] = "ID"
    }
    
    # Construct GRanges
    query   = .constructGRanges(consensusPeaks, seqlengths = .getChrLengths(genomeAssembly), genomeAssembly)
    subject = .constructGRanges(TADs, seqlengths = .getChrLengths(genomeAssembly), genomeAssembly)
    
    # Every TAD overlaps itself, so a count > 1 means it overlaps at least one other TAD
    TADOverlaps = GenomicRanges::countOverlaps(subject, subject)
    TADOverlaps_min2 = length(which(TADOverlaps > 1))
    
    futile.logger::flog.info(paste0(TADOverlaps_min2, " TADs overlap each other"))
    
    # Merge overlapping TADs. min.gapwidth is set to 0 to prevent that directly adjacent TADs are merged
    if (mergeOverlappingTADs && TADOverlaps_min2 > 0) {
      futile.logger::flog.info(paste0("Merge overlapping TAD domains to one domain"))  
      subject = GenomicRanges::reduce(subject, min.gapwidth = 0L)
      # Metadata has been lost, redefine it with the new boundaries
      subject$ID = paste0(as.character(GenomeInfoDb::seqnames(subject)), ":", start(subject), "-", end(subject))
    } else {
      futile.logger::flog.info(paste0("Overlapping TADs will not be merged"))  
    }
    
    # Check whether TAD boundaries overlap and print a warning if so
    nMultipleOverlaps = .checkSelfOverlap(subject)
    if (nMultipleOverlaps > 0) {
        message = paste0("addConnections_peak_gene:", nMultipleOverlaps, " out of ", length(subject), " TADs overlap with at least one other TAD. Please verify whether this is intended or a mistake. Particularly 1bp overlaps may not resembl the truth.")
        .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
    }
    
    # Finally, do the actual overlaps
    overlapsAll = suppressWarnings(GenomicRanges::findOverlaps(query, subject, 
                                              minoverlap = 1,
                                              type = "any",
                                              select = "all",
                                              ignore.strand = TRUE))
    
    query_row_ids  = S4Vectors::queryHits(overlapsAll)
    subject_rowids = S4Vectors::subjectHits(overlapsAll)
    
    subject_overlap_df = as.data.frame(S4Vectors::elementMetadata(subject)[subject_rowids, c("ID")]) %>%
      dplyr::mutate(tadChr = as.character(GenomeInfoDb::seqnames(subject))[subject_rowids],
                    tadStart = start(subject)[subject_rowids],
                    tadEnd = end(subject)[subject_rowids])
    # Some entries in here will have only NAs
    
    query_overlap_df   = as.data.frame(S4Vectors::elementMetadata(query)  [query_row_ids, "peakID"])
    
    overlaps.df = cbind.data.frame(query_overlap_df,subject_overlap_df)
    colnames(overlaps.df)[seq_len(2)] = c("peakID","tad.ID")
    
    # Left join: peaks without any TAD are kept with NA in the TAD columns
    peak.TADs.df = suppressWarnings(dplyr::left_join(consensusPeaks, overlaps.df, by = "peakID") )
    
    nPeaks = nrow(consensusPeaks)
    nPeaksWithOutTAD = length(which(is.na(peak.TADs.df$tad.ID)))
    futile.logger::flog.info(paste0(" Out of the ", nPeaks, " peaks, ", nPeaksWithOutTAD, " peaks are not within a TAD domain. These will be ignored for subsequent overlaps"))   
    
    nPeaksWithMultipleTADs = peak.TADs.df %>% dplyr::group_by(.data$peakID) %>% dplyr::summarize(n = dplyr::n()) %>% dplyr::filter(.data$n > 1) %>% nrow()
    
    if (nPeaksWithMultipleTADs > 0) {
      futile.logger::flog.info(paste0(" Out of the ", nPeaks, " peaks, ", nPeaksWithMultipleTADs, " overlap with more than one TAD. This usually means they are crossing TAD borders.")) 
    }
    
  } else {
    
    peak.TADs.df = NULL
  }
  
  if (!is.null(knownLinks)) {
      
      knownLinksUsageStr = if (knownLinks_useExclusively) "as a replacement for the neigborhood-based approach" else "in addition to previously defined links."
      futile.logger::flog.info(paste0(" Known links have been provided by the user. They will be used ", knownLinksUsageStr))
      
      futile.logger::flog.info(paste0(" Parsing known links and identifying peak-gene pairs from the data to add. If you receive an error in the following, make sure you used the right column separators.."))
      
      # Column names are only kept if they are exactly "bait" and "OE" (in any order),
      # otherwise column 1 is taken as bait and column 2 as OE
      if (!setequal(colnames(knownLinks), c("bait", "OE"))) {
          colnames(knownLinks) = c("bait", "OE")
      }
      
      # The separators are combined into one alternation that is used to split both columns
      regEx = stringr::regex(paste0(knownLinks_separator, collapse = "|"))
      
      # bait and OE (other end)
      knownLinks.all = knownLinks %>% tidyr::separate_wider_delim("bait", regEx, names = c("bait.chr", "bait.start", "bait.end")) %>% 
          tidyr::separate_wider_delim("OE", regEx, names = c("OE.chr", "OE.start", "OE.end")) %>%
          dplyr::mutate(bait.start = as.numeric(.data$bait.start), bait.end = as.numeric(.data$bait.end),
                        OE.start = as.numeric(.data$OE.start), OE.end = as.numeric(.data$OE.end),
                        bait.ID = paste0(.data$bait.chr, ":", .data$bait.start, "-", .data$bait.end),
                        OE.ID   = paste0(.data$OE.chr, ":", .data$OE.start, "-", .data$OE.end),
                        bait_OE.ID = paste0(.data$bait.ID, "_", .data$OE.ID))
      
      futile.logger::flog.info(paste0(" Found ", nrow(knownLinks.all), " bait-OE links that were user-provided."))
      n_knownLinks = length(unique(knownLinks.all$bait_OE.ID))
      
      bait.df = knownLinks.all %>% dplyr::select(dplyr::contains("bait")) %>%
          dplyr::rename(chr = "bait.chr", start = "bait.start", end = "bait.end") %>%
          dplyr::distinct()
      
      OE.df = knownLinks.all %>% dplyr::select(dplyr::contains("OE")) %>%
          dplyr::rename(chr = "OE.chr", start = "OE.start", end = "OE.end") %>%
          dplyr::mutate(ext_peak.chr = .data$chr, ext_peak.start = .data$start, ext_peak.end = .data$end) %>%
          dplyr::distinct()
      
      # Genomic ranges out of it
      bait.gr = .constructGRanges(bait.df, seqlengths = .getChrLengths(genomeAssembly), genomeAssembly)
      OE.gr   = .constructGRanges(OE.df, seqlengths = .getChrLengths(genomeAssembly), genomeAssembly)
      
      # Overlap with peaks and genes
      # Bait = promoter(s), other end = enhancer(s).
      futile.logger::flog.info(paste0(" Overlapping manually provided links with peaks and genes as defined in the object"))
      
      consensusPeaks.gr  = .constructGRanges(consensusPeaks, seqlengths = .getChrLengths(genomeAssembly), genomeAssembly)
      
      # 1) OE coordinates versus peaks
      overlapsAll = GenomicRanges::findOverlaps(OE.gr, consensusPeaks.gr, minoverlap = 1, type = "any", select = "all", ignore.strand = TRUE)
      
      query_row_ids   = S4Vectors::queryHits(overlapsAll)
      subject_row_ids = S4Vectors::subjectHits(overlapsAll)
      
      query_overlap_df     = as.data.frame(S4Vectors::elementMetadata(OE.gr)[query_row_ids,])
      subject_overlap_df   = as.data.frame(consensusPeaks.gr)[subject_row_ids, c("seqnames", "start", "end", "peakID")]
      
      overlaps.OE.df = cbind.data.frame(query_overlap_df,subject_overlap_df) %>% dplyr::mutate(seqnames = as.character(.data$seqnames)) %>% tibble::as_tibble()
      colnames(overlaps.OE.df) = c("OE.ID", "bait_OE.ID", "ext_peak.chr", "ext_peak.start", "ext_peak.end", "orig_peak.chr", "orig_peak.start", "orig_peak.end", "peak.ID")
      
      # 2) Bait coordinates versus genes
      genes.gr = .makeGenomicRangeGenes(GRN@annotation$genes, gene.types, overlapTypeGene)
      
      overlapsAll = suppressWarnings(GenomicRanges::findOverlaps(bait.gr, genes.gr, minoverlap = 1, type = "any", select = "all", ignore.strand = TRUE))
      
      query_row_ids   = S4Vectors::queryHits(overlapsAll)
      subject_row_ids = S4Vectors::subjectHits(overlapsAll)
      
      query_overlap_df     = as.data.frame(S4Vectors::elementMetadata(bait.gr)[query_row_ids,])
      subject_overlap_df   = as.data.frame(genes.gr)[subject_row_ids, c("seqnames", "start", "end", "gene.ENSEMBL", "gene.type", "gene.name")]
      
      overlaps.bait.df = cbind.data.frame(query_overlap_df,subject_overlap_df) %>% dplyr::mutate(seqnames = as.character(.data$seqnames)) %>% tibble::as_tibble()
      colnames(overlaps.bait.df)[1:5] = c("bait.ID", "bait_OE.ID", "gene.chr", "gene.start", "gene.end")
      
      # Merge by bait_OE_ID
      knownLinks.df = suppressWarnings(dplyr::full_join(overlaps.bait.df, overlaps.OE.df, by = "bait_OE.ID", multiple = "all"))
      
      futile.logger::flog.info(paste0("  From the ", n_knownLinks, " originally defined distinct bait-OE pairs, found ", length(unique(knownLinks.df$bait_OE.ID)), " distinct bait-OE pairs that overlap with either peaks or genes."))
      
      futile.logger::flog.info(paste0("  Found ", nrow(knownLinks.df), " distinct peak-gene links based on the provided bait-OE information that overlap with either peaks or genes."))
      
      # Keep only links for which both sides were matched (the full join leaves NA for the unmatched side)
      knownLinks.filt.df = knownLinks.df %>% dplyr::filter(!is.na(.data$bait.ID), !is.na(.data$OE.ID))
      
      futile.logger::flog.info(paste0("  Found ", nrow(knownLinks.filt.df), " distinct peak-gene links based on the provided bait-OE information that overlap with both peaks and genes. Filtered ", 
                                      nrow(knownLinks.df) - nrow(knownLinks.filt.df), " pairs because either the bait did not overlap any genes or because the OE coordinates did not overlap any peaks as defined in the object."))
      
      # Summary statistics are only meaningful (and min/max only defined) if at least one link is left
      stats_all = table(knownLinks.filt.df$bait_OE.ID)
      if (length(stats_all) > 0) {
          futile.logger::flog.info(paste0("   Statistics for the number of distinct peak-gene pairs that were added per bait-OE link:"))
          futile.logger::flog.info(paste0("   Min: ", min(stats_all), ", median: ", median(stats_all), ", max: ", max(stats_all), ". Exact distribution:"))
          stats_all2 = table(stats_all)
          for (i in seq_along(stats_all2)) {
              futile.logger::flog.info(paste0("    ", names(stats_all2)[i], ": ", stats_all2[i], " bait-OE pairs"))
          }
      }
      
  } else {
      
      knownLinks.filt.df = NULL
  }
  
  # Renaming necessary because consensusPeaks (and therefore peak.TADs.df) names the column peakID,
  # whereas the peak-gene tables below use peak.ID
  # NOTE(review): .calculatePeakGeneOverlaps() appears to group this table by `peakID` - confirm that
  # the rename at this position is compatible with it
  if (!is.null(TADs)) {
      peak.TADs.df = dplyr::rename(peak.TADs.df, peak.ID = "peakID")
  }
  
  # OVERLAP OF PEAKS AND EXTENDED GENES
  if (!is.null(knownLinks) && nrow(knownLinks.filt.df) > 0 && knownLinks_useExclusively) {
      
      # Only known links are requested: start from an empty table that just defines the columns
      overlaps.sub.filt.df = tibble::tribble(~peak.ID, ~ext_peak.chr , ~ext_peak.start, ~ext_peak.end , ~orig_peak.start , ~orig_peak.end, 
                            ~gene.ENSEMBL, ~gene.type, ~gene.name, ~gene.chr, ~gene.start, ~gene.end, ~peak_gene.distance, ~source)
      
  } else {

      overlaps.sub.df = .calculatePeakGeneOverlaps(GRN, allPeaks = consensusPeaks, peak.TADs.df, 
                                                   neighborhoodSize = neighborhoodSize, 
                                                   genomeAssembly = genomeAssembly, 
                                                   gene.types = gene.types, overlapTypeGene = overlapTypeGene) 
      
      sourceLabel = if (is.null(TADs)) "neighborhood" else "TADs"
      
      overlaps.sub.filt.df = overlaps.sub.df %>%
          dplyr::mutate(gene.ENSEMBL = gsub("\\..+", "", .data$gene.ENSEMBL, perl = TRUE), # Clean ENSEMBL IDs
                        source = as.factor(sourceLabel)) %>%
          tibble::as_tibble() 
  }
  
  # Links that do not originate from the known links have no bait-OE ID
  overlaps.sub.filt.df$bait_OE.ID = NA

  # Add manually defined links
  if (!is.null(knownLinks)) {

      # orig: coordinates from original peak from object 
      # ext_: new peak coordinates (OE)

      # Calculate all other attributes and source column
      knownLinks.filt.df = knownLinks.filt.df %>%
          dplyr::mutate(source = "knownLinks") %>% # Now correct the wrong gene end and put the original gene end
          dplyr::select(-"gene.end") %>%
          dplyr::left_join(GRN@annotation$genes %>% dplyr::select("gene.ENSEMBL", "gene.end"), by = "gene.ENSEMBL") %>%
          dplyr::mutate(peak_gene.distance = dplyr::case_when(  orig_peak.chr != gene.chr ~ NA,
                                                                gene.start >= orig_peak.start & gene.start <= orig_peak.end ~ 0L,
                                                                TRUE ~ pmin(abs(orig_peak.start - gene.start), abs(orig_peak.end - gene.start)))) %>%
          dplyr::select(colnames(overlaps.sub.filt.df), "bait_OE.ID") %>% # Enforce same order and column names + new ID for bait-OE
          dplyr::group_by(.data$peak.ID, .data$gene.ENSEMBL) %>%
          dplyr::filter(dplyr::row_number() == 1) %>% # Use only the first example of each unique combination
          dplyr::ungroup()
 
      overlaps.sub.filt.df = rbind(overlaps.sub.filt.df, knownLinks.filt.df)
      
      # Eliminate duplicates again. As the known links were appended last, the previously
      # defined link is the one that is kept for a duplicated peak-gene pair
      nRowsBefore = nrow(overlaps.sub.filt.df)
      overlaps.sub.filt.df = overlaps.sub.filt.df %>%
          dplyr::group_by(.data$peak.ID, .data$gene.ENSEMBL) %>%
          dplyr::filter(dplyr::row_number() == 1) %>% # Use only the first example of each unique combination
          dplyr::ungroup()
      
      nRowsAfter = nrow(overlaps.sub.filt.df)
      if (nRowsAfter != nRowsBefore) {
          futile.logger::flog.info(paste0("Eliminated ", nRowsBefore - nRowsAfter, " duplicate peak - gene entries due to the addition of the known links..."))

      }
  }

  
  # Set to empty df to simplify the code below
  if (is.null(peak.TADs.df)) {
    peak.TADs.df = tibble::tibble(peak.ID = "", tad.ID = "")
  }
  
  futile.logger::flog.info(paste0("Source distribution of peak-gene links:"))
  stats_src = table(overlaps.sub.filt.df$source)
  for (i in seq_along(stats_src)) {
      futile.logger::flog.info(paste0(" ", names(stats_src)[i], ": ", stats_src[i]))
  }

  futile.logger::flog.info(paste0("\nFinished preparing data\n"))
  

  # FALSE = real data, TRUE = background (if any permutation is configured for the object)
  for (permCur in as.logical(0:.getMaxPermutation(GRN))) {
      
      futile.logger::flog.info(paste0("\nCalculate peak-gene correlations for ", .getPermStr(permCur), "\n"))
      
      # Only now we shuffle to make sure the background of possible connections is the same as in the foreground, as opposed to completely random
      # which would also include peak-gene connections that are not in the foreground at all
      if (permCur) {
          futile.logger::flog.info(paste0(" Randomize gene-peak links by shuffling the peak IDs."))
          overlaps.sub.filt.df$peak.ID = sample(overlaps.sub.filt.df$peak.ID, replace = FALSE)
          overlaps.sub.filt.df$peak_gene.distance = NA
          
      }
      
      # Should RNA data be shuffled? Only for the background and only if requested
      shuffleRNA_cur = shuffleRNA && permCur

      countsPeaks.clean = getCounts(GRN, type = "peaks",  permuted = FALSE, includeIDColumn = FALSE)
      countsRNA.clean   = getCounts(GRN, type = "rna", permuted = shuffleRNA_cur, includeIDColumn = FALSE)
      
      # Retrieve the ID vectors once. The gene IDs are taken from the very same RNA table that the
      # row mapping below refers to, so that mapping and IDs can never get out of sync
      peakIDs = getCounts(GRN, type = "peaks", permuted = FALSE)$peakID
      geneIDs = getCounts(GRN, type = "rna", permuted = shuffleRNA_cur)$ENSEMBL
      
      # Cleverly construct the count matrices so we do the correlations in one go
      map_peaks = match(overlaps.sub.filt.df$peak.ID,  peakIDs)
      map_rna  = match(overlaps.sub.filt.df$gene.ENSEMBL, geneIDs) # may contain NA values because the gene is not actually in the RNA-seq counts
      
      # There should not be any NA because it is about the peaks
      stopifnot(all(!is.na(map_peaks)))
      # Some NAs might be expected, given our annotation contains all known genes
      stopifnot(!all(is.na(map_rna)))
      
      futile.logger::flog.info(paste0(" Iterate through ", nrow(overlaps.sub.filt.df), " peak-gene combinations and calculate correlations using ", nCores, " cores. This may take a few minutes."))
      
      # parallel version of computing peak-gene correlations
      maxRow = nrow(overlaps.sub.filt.df)
      startIndexMax = ceiling(maxRow / chunksize) - 1 # -1 because we count from 0 onwards
      
      
      res.l = .execInParallelGen(nCores, returnAsList = TRUE, listNames = NULL, iteration = 0:startIndexMax, verbose = FALSE, 
                                 functionName = .correlateDataWrapper, 
                                 chunksize = chunksize, maxRow = maxRow, 
                                 counts1 = countsPeaks.clean, counts2 = countsRNA.clean, map1 = map_peaks, map2 = map_rna, 
                                 corMethod = corMethod)
      
      res.m  = do.call(rbind, res.l)
      
      futile.logger::flog.info(paste0(" Finished with calculating correlations, creating final data frame and filter NA rows due to missing RNA-seq data"))
      
      selectColumns = c("peak.ID", "gene.ENSEMBL", "source", "peak_gene.distance", "tad.ID", "bait_OE.ID", "r", "p.raw")
      
      # Make the final data frame
      res.df = suppressMessages(tibble::as_tibble(res.m) %>%
                                    dplyr::mutate(peak.ID = peakIDs[map_peaks],
                                                  gene.ENSEMBL = geneIDs[map_rna]) %>%
                                    dplyr::filter(!is.na(.data$gene.ENSEMBL)) %>%  # For some peak-gene combinations, no RNA-Seq data was available, these NAs are filtered
                                    # Add gene annotation and distance
                                    dplyr::left_join(overlaps.sub.filt.df, by = c("gene.ENSEMBL", "peak.ID"), multiple = "all") %>%
                                    # Integrate TAD IDs also
                                    dplyr::left_join(dplyr::select(peak.TADs.df, "peak.ID", "tad.ID"), by = c("peak.ID")) %>%
                                    
                                    dplyr::select(tidyselect::all_of(selectColumns))) %>%
          dplyr::mutate(peak.ID = as.factor(.data$peak.ID),
                        gene.ENSEMBL = as.factor(.data$gene.ENSEMBL), 
                        tad.ID = as.factor(.data$tad.ID)) %>%
          dplyr::rename(peak_gene.r = "r", 
                        peak_gene.p_raw = "p.raw",
                        peak_gene.tad_ID = "tad.ID",
                        peak_gene.source = "source",
                        peak_gene.bait_OE_ID = "bait_OE.ID")

      # Drop the columns that carry no information if the corresponding input was not provided
      if (is.null(TADs)) {
          res.df = dplyr::select(res.df, -"peak_gene.tad_ID")
      }
      if (is.null(knownLinks)) {
          res.df = dplyr::select(res.df, -"peak_gene.bait_OE_ID")
      }
      
      futile.logger::flog.info(paste0(" Finished. Final number of rows after filtering: ", nrow(res.df)))
      
      
      .printExecutionTime(start.all)

      # Stored under "0" for the real data and "1" for the background
      GRN@connections$peak_genes[[as.character(as.integer(permCur))]] = res.df
  }
  
  GRN
  
 
}


# Convert the gene annotation table into a GRanges object that can be overlapped with peaks.
#
# Arguments:
#   genes             Gene annotation data frame. The columns gene.mean, gene.median and gene.CV are
#                     dropped, and rows with a missing gene.start or gene.end are removed.
#   gene.types        NULL or character vector. Unless NULL or containing "all", only genes whose
#                     gene.type is listed are kept.
#   overlapTypeGene   If "TSS", each range is collapsed to its start coordinate, otherwise the
#                     full range is kept.
#   requiredColnames  Columns that must be present after filtering (checked via checkmate).
#
# Returns: GRanges object with all remaining annotation columns as metadata.
.makeGenomicRangeGenes <- function(genes, gene.types, overlapTypeGene = "TSS", requiredColnames = c("gene.ENSEMBL","gene.type", "gene.name")) {
    
    # Remove the summary statistics columns and genes without complete coordinates
    genes.filt.df = dplyr::select(genes, -"gene.mean", -"gene.median", -"gene.CV")
    genes.filt.df = dplyr::filter(genes.filt.df, !is.na(.data$gene.start), !is.na(.data$gene.end))
    
    # Restrict to the requested gene types unless everything is requested
    restrictGeneTypes = !is.null(gene.types) && !("all" %in% gene.types)
    if (restrictGeneTypes) {
        genes.filt.df = dplyr::filter(genes.filt.df, .data$gene.type %in% gene.types)
    }
    
    checkmate::assertSubset(requiredColnames, colnames(genes.filt.df), empty.ok = FALSE)
    
    genes.gr = GenomicRanges::makeGRangesFromDataFrame(genes.filt.df, keep.extra.columns = TRUE)
    
    if (overlapTypeGene == "TSS") {
        # Use only the start coordinate of each gene and NOT the full gene length
        # NOTE(review): start() is the leftmost coordinate irrespective of strand - confirm that the
        # annotation already accounts for genes on the minus strand
        end(genes.gr) = start(genes.gr)
    }
    
    genes.gr
}

# Find, for every peak, all genes that are located within the neighborhood of the peak.
#
# The neighborhood of a peak is defined in one of two ways:
#  - if peak_TAD_mapping is provided: the union of all TADs the peak overlaps with
#    (peaks without an associated TAD are dropped and not tested),
#  - otherwise: the peak itself, extended by neighborhoodSize bp up- and downstream.
# The extended ranges are trimmed and then overlapped (ignoring the strand) with the gene
# ranges returned by .makeGenomicRangeGenes.
#
# Arguments:
#  GRN               GRN object; only GRN@annotation$genes is read here.
#  allPeaks          Peak table handed to .constructGRanges; only used when peak_TAD_mapping is NULL.
#  peak_TAD_mapping  NULL or a peak-TAD table with the columns peakID, tadStart, tadEnd and tad.ID
#                    (and whatever else .constructGRanges needs -- presumably the peak coordinates).
#  neighborhoodSize  Extension in bp in each direction; only used when peak_TAD_mapping is NULL.
#  genomeAssembly    Genome assembly, passed on to .getChrLengths and .constructGRanges.
#  gene.types        Gene types, passed on to .makeGenomicRangeGenes.
#  overlapTypeGene   Passed on to .makeGenomicRangeGenes (e.g., "TSS").
#  removeEnsemblNA   If TRUE, rows without an Ensembl gene ID are removed from the result.
#
# Returns a data frame with one row per unique peak-gene pair and the columns peak.ID,
# ext_peak.chr/start/end (extended peak), orig_peak.start/end (original peak),
# gene.ENSEMBL, gene.type, gene.name, gene.chr, gene.start, gene.end (coordinates of the
# gene ranges that were used for the overlap) and peak_gene.distance.
.calculatePeakGeneOverlaps <- function(GRN, allPeaks, peak_TAD_mapping = NULL, neighborhoodSize = 250000, genomeAssembly, 
                                       gene.types, overlapTypeGene = "TSS", removeEnsemblNA = TRUE) {
    
    startTime = Sys.time()
    futile.logger::flog.info("Calculate peak gene overlaps based on either a fixed neighborhood size or defined TADs...")
    
    # EXTEND PEAKS #
    
    if (!is.null(peak_TAD_mapping)) {
        
        # With Hi-C domain data, the TAD(s) a peak is located in define its neighborhood
        futile.logger::flog.info("Integrate Hi-C data to extend peaks")
        futile.logger::flog.info(" For peaks overlapping multiple TADs, use the union of all to define the neighborhood")
        
        # Collapse to one row per peak: tadStart2/tadEnd2 span all TADs of the peak,
        # tad.ID_all lists all of their IDs
        peak_TAD_mapping = peak_TAD_mapping %>%
            dplyr::group_by(.data$peakID) %>%
            dplyr::mutate(tadStart2  = min(.data$tadStart),
                          tadEnd2    = max(.data$tadEnd),
                          tad.ID_all = paste0(.data$tad.ID, collapse = "|")) %>%
            dplyr::slice(1) %>%
            dplyr::ungroup()
        
        query = .constructGRanges(peak_TAD_mapping, seqlengths = .getChrLengths(genomeAssembly), genomeAssembly)
        
        # Peaks without an associated TAD have no neighborhood and cannot be tested
        hasNoTAD = is.na(query$tad.ID)
        if (any(hasNoTAD)) {
            futile.logger::flog.info(paste0(" ", sum(hasNoTAD), " out of ", length(query), " peaks will not be tested for gene associations because they had no associated TAD"))
            query = query[!hasNoTAD]
        }
        
        # Store original start and end positions before modifying them
        query$orig_start = start(query)
        query$orig_end   = end(query)
        
        # Replace the peak coordinates by the (merged) TAD coordinates
        start(query) = query$tadStart2
        end(query)   = query$tadEnd2
        
    } else {
        
        # Without Hi-C data, we simply extend the ranges by a user-defined amount of bp, 250 kb being the default
        futile.logger::flog.info(paste0("Extend peaks based on user-defined extension size of ", neighborhoodSize, " up- and downstream."))
        
        query = .constructGRanges(allPeaks, seqlengths = .getChrLengths(genomeAssembly), genomeAssembly)
        
        # Store original start and end positions before modifying them
        query$orig_start = start(query)
        query$orig_end   = end(query)
        
        # Extended ranges may temporarily lie outside of the chromosome; the resulting
        # warnings are suppressed because the ranges are trimmed right below
        suppressWarnings({start(query) = start(query) - neighborhoodSize})
        suppressWarnings({end(query)   = end(query) + neighborhoodSize})
        
    }
    
    # Trim ranges that extend beyond the chromosome starts and ends
    query = GenomicRanges::trim(query)
    
    subject.gr = .makeGenomicRangeGenes(GRN@annotation$genes, gene.types, overlapTypeGene)
    
    overlapsAll = suppressWarnings(GenomicRanges::findOverlaps(query, subject.gr, 
                                                               minoverlap = 1,
                                                               type = "any",
                                                               select = "all",
                                                               ignore.strand = TRUE))
    
    query_row_ids  = S4Vectors::queryHits(overlapsAll)
    subject_rowids = S4Vectors::subjectHits(overlapsAll)
    
    # Gene side of each overlap
    requiredColnames = c("gene.ENSEMBL","gene.type", "gene.name")
    
    subject_overlap_df = as.data.frame(S4Vectors::elementMetadata(subject.gr)[subject_rowids, requiredColnames])
    subject_overlap_df$gene.chr   = as.character(GenomeInfoDb::seqnames(subject.gr))[subject_rowids]
    subject_overlap_df$gene.start = start(subject.gr)[subject_rowids]
    subject_overlap_df$gene.end   = end(subject.gr)  [subject_rowids]
    
    # Some entries in here will have only NAs
    
    # Peak side of each overlap. The first (and so far only) column holds the peak ID,
    # it is renamed to peak.ID after combining both sides
    query_overlap_df = as.data.frame(S4Vectors::elementMetadata(query)[query_row_ids, "peakID"])
    
    query_overlap_df$ext_peak.chr    = as.character(GenomeInfoDb::seqnames(query))[query_row_ids]
    query_overlap_df$ext_peak.start  = start(query)[query_row_ids]
    query_overlap_df$ext_peak.end    = end(query)  [query_row_ids]
    query_overlap_df$orig_peak.start = query$orig_start[query_row_ids]
    query_overlap_df$orig_peak.end   = query$orig_end  [query_row_ids]
    
    overlaps.df = cbind.data.frame(query_overlap_df, subject_overlap_df)
    colnames(overlaps.df)[1] = c("peak.ID")
    
    # The distance is always computed relative to gene.start and the ORIGINAL (not extended) peak:
    # 0 if gene.start lies within the peak, the distance to the closer peak border otherwise
    overlaps.sub.df = overlaps.df %>%
        dplyr::distinct() %>%
        dplyr::mutate(peak_gene.distance = dplyr::case_when(
            .data$gene.start >= .data$orig_peak.start & .data$gene.start <= .data$orig_peak.end ~ 0L,
            TRUE ~ pmin(abs(.data$orig_peak.start - .data$gene.start), 
                        abs(.data$orig_peak.end   - .data$gene.start))))
    
    if (removeEnsemblNA) {
        overlaps.sub.df = dplyr::filter(overlaps.sub.df, !is.na(.data$gene.ENSEMBL))
    }
    
    .printExecutionTime(startTime)
    
    overlaps.sub.df
    
}



# Correlate one chunk of row pairs from two count tables.
#
# The chunk covers the pair indices chunksize * startIndex + 1 up to chunksize * (startIndex + 1),
# capped at maxRow. For pair i, row map1[i] of counts1 is correlated with row map2[i] of counts2.
# The values from counts2 are reordered by the names of the values from counts1 so that
# identical samples are compared.
#
# Arguments:
#  startIndex  Zero-based index of the chunk (chunk 0 starts with pair 1).
#  chunksize   Number of pairs per chunk.
#  maxRow      Largest pair index that may be processed.
#  counts1     Table that is indexed by row via counts1[row, ] (assumed to have named sample columns).
#  counts2     As counts1; must provide values for all names found in counts1.
#  map1, map2  Vectors that give, for pair i, the row to use in counts1 and counts2, respectively.
#              Pairs for which either entry is NA are skipped.
#  corMethod   "pearson" or "spearman" (stats::cor.test) or "bicor" (WGCNA::bicorAndPvalue).
#              For any other value, no correlation is computed and the results stay NA.
#
# Returns a matrix with the columns "p.raw" and "r" and one row per pair of the chunk, in
# the order of the pair indices. Rows of skipped pairs are NA. A chunk that starts beyond
# maxRow gives a matrix with 0 rows.
#' @import ggplot2
.correlateDataWrapper <- function(startIndex, chunksize, maxRow, counts1, counts2, map1, map2, corMethod) {
  
  start = chunksize * startIndex + 1
  end = min(start +  chunksize - 1, maxRow)
  
  # Guard against chunks that start beyond maxRow: start:end would count downwards in this case
  nRows = max(end - start + 1, 0)
  
  res.m = matrix(NA, ncol = 2, nrow = nRows, dimnames = list(NULL, c("p.raw", "r")))
  
  for (rowCur in seq_len(nRows)) {
    
    # Index of the current pair in map1 / map2
    i = start + rowCur - 1
    
    if (is.na(map1[i]) || is.na(map2[i])) {
      next
    }
    
    data1   = unlist(counts1[map1[i],])
    
    # Changed in version 1.3.9: Make sure the other data compares the correct pairs.
    # This restores the previous functionality of proper shuffling for the background RNA-seq data
    data2   = unlist(counts2[map2[i],])[names(data1)]
    
    if (corMethod %in% c("pearson", "spearman")) {
        
        # Warnings (e.g., because of ties for spearman) are deliberately suppressed
        res =  suppressWarnings(stats::cor.test(data1, data2, method = corMethod))
        
        res.m[rowCur, "p.raw"] = res$p.value
        res.m[rowCur, "r"]     = res$estimate
    
    } else if (corMethod == "bicor") {
        
        res =  WGCNA::bicorAndPvalue(data1, data2, robustX = TRUE, robustY = TRUE)
        res.m[rowCur, "p.raw"]       = res$p
        res.m[rowCur, "r"]           = res$bicor
    }
    
  }
  
  res.m
  
} # end function



#' Filter TF-peaks and peak-gene connections and combine them to TF-peak-gene connections to construct an eGRN.
#' 
#' This is one of the main integrative functions of the \code{GRaNIE} package. It has two main functions: 
#' First, filtering both TF-peak and peak-gene connections according to different criteria such as FDR and other properties.
#' Second, joining the three major elements that an eGRN consists of (TFs, peaks, genes) into one data frame, with one row per unique TF-peak-gene connection.
#' \strong{After successful execution, the connections (along with additional feature metadata) can be retrieved with the function \code{\link{getGRNConnections}}.}
#' \strong{Note that a previously stored eGRN graph is reset upon successful execution of this function along with printing a descriptive warning,
#'  and re-running the function \code{\link{build_eGRN_graph}} is necessary when any of the network functions of the package shall be executed. 
#' If the filtered connections changed, all network-related enrichment functions also have to be rerun.}
#' 
#' Internally, before joining them, both TF-peak links and peak-gene connections are filtered separately for reasons of memory and computational efficiency:
#' First filtering out unwanted links dramatically reduces the memory needed for the full eGRN. Peak-gene p-value adjustment is only done after all filtering steps on the remaining set of
#' connections to lower the statistical burden of multiple-testing adjustment; therefore, this may lead to initially counter-intuitive effects such as a particular connection not being included anymore as compared to a 
#' filtering based on different thresholds, or the FDR being different for the same reason.
#' @template GRN
#' @param TF_peak.fdr.threshold Numeric[0,1]. Default 0.2. Maximum FDR for the TF-peak links. Set to 1 or NULL to disable this filter.
#' @template TF_peak.connectionTypes
#' @param peak.SNP_filter Named list. Default \code{list(min_nSNPs = 0, filterType = "orthogonal")}. Filters related to SNP data if they have 
#' been added with the function \code{\link{addSNPData}}, ignored otherwise. The named list must contain at least two elements:
#' First, \code{min_nSNPs}, an integer >= 0 that denotes how many SNPs a peak has to overlap with at least to pass the filter or be considered for inclusion.
#' Second, \code{filterType}, a character that must either be \code{orthogonal} or \code{extra} and denotes whether the SNP filter is orthogonal to the other filters (i.e, an alternative way of when a peak is considered for being kept) or whether the SNP filter is in addition to all other filters.
#' For more help, see the Vignettes.
#' @param peak_gene.p_raw.threshold Numeric[0,1]. Default NULL. Threshold for the peak-gene connections, based on the raw p-value. All peak-gene connections with a larger raw p-value will be filtered out.
#' @param peak_gene.fdr.threshold Numeric[0,1]. Default 0.2. Threshold for the peak-gene connections, based on the FDR. All peak-gene connections with a larger FDR will be filtered out.
#' @param peak_gene.fdr.method Character. Default "BH". One of: "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none", "IHW". 
#' Method for adjusting p-values for multiple testing. 
#' If set to "IHW", the package \code{IHW} is required (as it is listed under \code{Suggests}, it may not be installed), 
#' and independent hypothesis weighting will be performed, and a suitable covariate has to be specified for the parameter \code{peak_gene.IHW.covariate}.
#' @param peak_gene.IHW.covariate Character. Default \code{NULL}. Name of the covariate to use for IHW (column name from the table that is returned with the function \code{getGRNConnections}). Only relevant if \code{peak_gene.fdr.method} is set to "IHW". You have to make sure the specified covariate is suitable for IHW, see the diagnostic plots that are generated in this function for this. For many datasets, the peak-gene distance (called \code{peak_gene.distance} in the object) seems suitable.
#' @param peak_gene.IHW.nbins Integer or "auto". Default "auto". Number of bins for IHW. Only relevant if \code{peak_gene.fdr.method} is set to "IHW".
#' @template outputFolder
#' @template gene.types
#' @param allowMissingTFs \code{TRUE} or \code{FALSE}.  Default \code{FALSE}. Should connections be returned for which the TF is NA (i.e., connections consisting only of peak-gene links?). If set to \code{TRUE}, this generally greatly increases the number of connections but it may not be what you aim for.
#' @param allowMissingGenes \code{TRUE} or \code{FALSE}.  Default \code{TRUE}. Should connections be returned for which the gene is NA (i.e., connections consisting only of TF-peak links?). If set to \code{TRUE}, this generally increases the number of connections.
#' @param peak_gene.r_range Numeric(2). Default \code{c(0,1)}. Filter for lower and upper limit for the peak-gene links. Only links will be retained if the correlation coefficient is within the specified interval. This filter is usually used to filter out negatively correlated peak-gene links.
#' @param peak_gene.selection \code{"all"} or \code{"closest"}. Default \code{"all"}. Filter for the selection of genes for each peak. If set to \code{"all"}, all previously identified peak-gene are used, while \code{"closest"} only retains the closest gene for each peak that is retained until the point the filter is applied.
#' @param peak_gene.maxDistance Integer >0. Default \code{NULL}. Maximum peak-gene distance to retain a peak-gene connection.
#' @param filterTFs Character vector. Default \code{NULL}. Vector of TFs (as named in the GRN object) to retain. All TFs not listed will be filtered out.
#' @param filterGenes Character vector. Default \code{NULL}. Vector of gene IDs (as named in the GRN object) to retain. All genes not listed will be filtered out.
#' @param filterPeaks Character vector. Default \code{NULL}. Vector of peak IDs (as named in the GRN object) to retain. All peaks not listed will be filtered out.
#' @param TF_peak_FDR_selectViaCorBins \code{TRUE} or \code{FALSE}.  Default \code{FALSE}. Use a modified procedure for selecting TF-peak links. Instead of selecting solely based on the user-specified FDR, this procedure first identifies the correlation bin closest to 0 that contains at least one significant TF-peak link according to the chosen TF_peak.fdr.threshold. This is done separately for both FDR directions.  It then retains all TF-peak links that have a correlation bin at least as extreme as the identified pair. For example, if the correlation bin [0.35,0.40] contains a significant TF-peak link while [0,0.05], [0.05,0.10], ..., [0.30,0.35] do not, all TF-peak links with a correlation of at least 0.35 or above are selected (i.e., bins [0.35,0.40], [0.40,0.45], ..., [0.95,1.00]).  Thus, for the final selection, also links with a higher FDR but a more extreme correlation may be selected.
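#' @param silent \code{TRUE} or \code{FALSE}.  Default \code{FALSE}. If set to \code{TRUE}, progress messages and filter statistics are not printed (the logging threshold is raised so that only warnings and errors are shown). If set to \code{FALSE}, they are printed.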
#' @param resetGraphAndStoreInternally \code{TRUE} or \code{FALSE}.  Default \code{TRUE}. If set to \code{TRUE}, the stored eGRN graph (slot \code{graph}) is reset due to the potentially changed connections that
#' would otherwise cause conflicts in the information stored in the object. Also, a GRN object is returned. If set to \code{FALSE}, only the new filtered connections are returned and the object is not altered.
#' @param filterLoops  \code{TRUE} or \code{FALSE}. Default \code{TRUE}. If a TF regulates itself (i.e., the TF and the gene are the same entity), should such loops be filtered from the GRN?
#' @template forceRerun
#' @return An updated \code{\linkS4class{GRN}} object, with additional information added from this function. 
#' The filtered and merged TF-peak and peak-gene connections are stored in the slot \code{GRN@connections$all.filtered} and can be retrieved (along with other feature metadata) using the function \code{\link{getGRNConnections}}.
#' @examples 
#' # See the Workflow vignette on the GRaNIE website for examples
#' GRN = loadExampleObject()
#' GRN = filterGRNAndConnectGenes(GRN)
#' @seealso \code{\link{visualizeGRN}}
#' @seealso \code{\link{addConnections_TF_peak}} 
#' @seealso \code{\link{addConnections_peak_gene}} 
#' @seealso \code{\link{build_eGRN_graph}} 
#' @seealso \code{\link{getGRNConnections}} 
#' @importFrom rlang .data
#' @importFrom magrittr `%>%`
#' @importFrom ggplot2 `%+%`
#' @export
#' 

# TODO: Find a new compatible way to return also, or specifically peaks that overlap a SNP when SNP information is present
# SNP_min, only keep if TF + SNPs, or only SNPs, 
# peak_keep_min_nSNPs
# peak.SNP_filter = list(min_nSNPs = 0, ignoreTF = TRUE)
filterGRNAndConnectGenes <- function(GRN,
                                     TF_peak.fdr.threshold = 0.2, 
                                     TF_peak.connectionTypes = "all",
                                     peak.SNP_filter = list(min_nSNPs = 0, filterType = "orthogonal"),
                                     peak_gene.p_raw.threshold = NULL, 
                                     peak_gene.fdr.threshold= 0.2,
                                     peak_gene.fdr.method = "BH",
                                     peak_gene.IHW.covariate = NULL,
                                     peak_gene.IHW.nbins = "auto",
                                     outputFolder = NULL,
                                     gene.types = c("all"), 
                                     allowMissingTFs = FALSE, allowMissingGenes = TRUE,
                                     peak_gene.r_range = c(0,1), 
                                     peak_gene.selection = "all",
                                     peak_gene.maxDistance = NULL,
                                     filterTFs = NULL, filterGenes = NULL, filterPeaks = NULL, 
                                     TF_peak_FDR_selectViaCorBins = FALSE,
                                     filterLoops = TRUE,
                                     resetGraphAndStoreInternally = TRUE,
                                     silent = FALSE,
                                     forceRerun = FALSE) {
  
  start = Sys.time()  

  checkmate::assertClass(GRN, "GRN")
  GRN = .addFunctionLogToObject(GRN)
  
  GRN = .makeObjectCompatible(GRN)
  
  checkmate::assertNumber(TF_peak.fdr.threshold, lower = 0, upper = 1)
  checkmate::assertSubset(TF_peak.connectionTypes, c("all", GRN@config$TF_peak_connectionTypes), empty.ok = FALSE)
  checkmate::assert(checkmate::checkNull(peak_gene.p_raw.threshold), checkmate::checkNumber(peak_gene.p_raw.threshold, lower = 0, upper = 1))
  checkmate::assert(checkmate::checkNull(peak_gene.fdr.threshold), checkmate::checkNumber(peak_gene.fdr.threshold, lower = 0, upper = 1))
  checkmate::assertChoice(peak_gene.fdr.method, c("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none", "IHW"))
  
  checkmate::assert(checkmate::checkNull(peak_gene.IHW.covariate), checkmate::checkCharacter(peak_gene.IHW.covariate, min.chars = 1, len = 1))
  if (peak_gene.fdr.method == "IHW") {
      checkmate::assertCharacter(peak_gene.IHW.covariate, min.chars = 1, len = 1)
  }
  checkmate::assert(checkmate::checkIntegerish(peak_gene.IHW.nbins, lower = 1), checkmate::checkSubset(peak_gene.IHW.nbins, "auto"))
  
  checkmate::assert(checkmate::checkNull(peak.SNP_filter), checkmate::checkList(peak.SNP_filter))
  if (!is.null(peak.SNP_filter)) {
      checkmate::assertSubset(c("min_nSNPs", "filterType"), names(peak.SNP_filter))
      checkmate::assertIntegerish(peak.SNP_filter$min_nSNPs, lower = 0)
      checkmate::assertSubset(peak.SNP_filter$filterType, c("orthogonal", "extra"))
      
      if (peak.SNP_filter$min_nSNPs > 0) {
          if (!all(c("SNP_n", "SNP_rsid", "SNP_start") %in% colnames(GRN@data$peaks$counts_metadata))) {
              message = "Could not find SNP information in GRN@data$peaks$counts_metadata. Either run addSNPData beforehand or set peak.SNP_filter to NULL."
              .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
          }   
      }
    
  }
  
  checkmate::assertSubset(gene.types, c("all", unique(as.character(GRN@annotation$genes$gene.type))) %>% stats::na.omit(), empty.ok = FALSE)
  
  checkmate::assertFlag(allowMissingTFs)
  checkmate::assertFlag(allowMissingGenes)
  
  checkmate::assertNumeric(peak_gene.r_range, lower = -1, upper = 1, len = 2)
  
  checkmate::assertChoice(peak_gene.selection, c("all", "closest"))
  
  checkmate::assert(checkmate::checkNull(peak_gene.maxDistance), checkmate::checkIntegerish(peak_gene.maxDistance, lower = 0, len = 1))
  
  checkmate::assert(checkmate::checkNull(filterTFs), checkmate::checkCharacter(filterTFs, min.chars = 1, any.missing = FALSE))
  checkmate::assert(checkmate::checkNull(filterPeaks), checkmate::checkCharacter(filterPeaks, min.chars = 1, any.missing = FALSE))
  checkmate::assert(checkmate::checkNull(filterGenes), checkmate::checkCharacter(filterGenes, min.chars = 1, any.missing = FALSE))
  
  
  checkmate::assertFlag(TF_peak_FDR_selectViaCorBins)
  checkmate::assertFlag(filterLoops)
  checkmate::assertFlag(resetGraphAndStoreInternally)
  checkmate::assertFlag(silent)
  checkmate::assertFlag(forceRerun)
  
  if (peak_gene.fdr.method == "IHW" & !is.installed("IHW")) {
    packageMessage = "IHW has been selected for p-value adjustment, but IHW is currently not installed. Please install it and re-run the function or choose a different method."
    .checkPackageInstallation("IHW", packageMessage)  

  }
  
  if (!is.null(peak_gene.p_raw.threshold) & !is.null(peak_gene.fdr.threshold)) {
    message = "Both parameters peak_gene.p_raw.threshold and peak_gene.fdr.threshold have been specified, choose only either of them."
    .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
  }
  
  if (!is.null(peak_gene.IHW.covariate) && peak_gene.fdr.method != "IHW") {
      message = "filterGRNAndConnectGenes: peak_gene.fdr.threshold is not set to \"IHW\" while peak_gene.IHW.covariate has been specified. Therefore, peak_gene.IHW.covariate will be ignored. Set peak_gene.fdr.threshold to \"IHW\" to use IHW."
      .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
  }
  
  if (silent) {
    futile.logger::flog.threshold(futile.logger::WARN)
  }
  
  futile.logger::flog.info(paste0("Filter GRN network"))
  
  if (!.checkExistanceFilteredConnections(GRN, returnLogical = TRUE) | forceRerun) {
      
      
      # Reset TF-gene links as these are recomputed with the filtered set and this cannot be done beforehand
      GRN@connections$TF_genes.filtered = NULL
      GRN@connections$all.filtered = list()
      
      for (permutationCur in 0:.getMaxPermutation(GRN)) {
          
          futile.logger::flog.info(paste0("\n\n", .getPermStr(permutationCur)))
          permIndex = as.character(permutationCur)
          
          if (is.null(GRN@connections$peak_genes[[permIndex]])) {
              message = "No peak-gene connections found. Run the function addConnections_peak_gene first"
              .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
          }
          
          # Only select the absolute necessary here, no additional metadata
          ann.gene.red = GRN@annotation$genes %>%
              dplyr::mutate(gene.ENSEMBL = as.character(.data$gene.ENSEMBL)) %>%
              dplyr::select("gene.ENSEMBL", "gene.name", "gene.type")
          
          peakGeneCorrelations = GRN@connections$peak_genes[[permIndex]] %>%
              dplyr::mutate(gene.ENSEMBL = as.character(.data$gene.ENSEMBL)) %>%
              dplyr::left_join(ann.gene.red, by = "gene.ENSEMBL", multiple = "all")
          
          
          
          # Add TF Ensembl IDs
          grn.filt = GRN@connections$TF_peaks[[permIndex]]$main  %>% 
              tibble::as_tibble() %>%
              dplyr::left_join(GRN@annotation$TFs %>% dplyr::select("TF.ID", "TF.name", "TF.ENSEMBL"), by = c("TF.ID")) %>%
              dplyr::select(-"TF_peak.fdr_orig") %>%
              dplyr::mutate(TF.ENSEMBL = as.factor(.data$TF.ENSEMBL))
          
          if (is.null(grn.filt)) {
              message = "No TF-peak connections found. Run the function addConnections_TF_peak first"
              .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
          }
          
          # Filter network #
          futile.logger::flog.info(paste0("Inital number of rows left before all filtering steps: ", nrow(grn.filt)))
          
          if (!"all" %in% TF_peak.connectionTypes) {
              checkmate::assertSubset(TF_peak.connectionTypes, GRN@config$TF_peak_connectionTypes, empty.ok = FALSE)
              futile.logger::flog.info(paste0(" Filter network and retain only rows with one of the following TF-peak connection types: ", paste0(TF_peak.connectionTypes, collapse = ", ")))
              futile.logger::flog.info(paste0("  Number of TF-peak rows before filtering connection types: ", nrow(grn.filt)))
              grn.filt = dplyr::filter(grn.filt, .data$TF_peak.connectionType %in% TF_peak.connectionTypes)
              futile.logger::flog.info(paste0("  Number of TF-peak rows after filtering connection types: ", nrow(grn.filt)))
          }
          
          if (!is.null(TF_peak.fdr.threshold)) {
              futile.logger::flog.info(paste0(" Filter network and retain only rows with TF-peak connections with an FDR < ", TF_peak.fdr.threshold))
              futile.logger::flog.info(paste0("  Number of TF-peak rows before filtering TFs: ", nrow(grn.filt)))
              
              if (!TF_peak_FDR_selectViaCorBins) {
                  grn.filt = dplyr::filter(grn.filt, .data$TF_peak.fdr < TF_peak.fdr.threshold)
                  futile.logger::flog.info(paste0("  Number of TF-peak rows after filtering TFs: ", nrow(grn.filt)))
              } else {
                  
                  # Add a new ID column
                  grn.filt$row.ID = seq_len(nrow(grn.filt))
                  
                  # For each TF, identify those TF-peak correlation bins that are more extreme than the first correlation bin that is beyond the user-specified bin
                  # Add one additional column to the table, and filter later by this column
                  idsRowsKeep = c()
                  for (TFCur in unique( grn.filt$TF.ID)) {
                      
                      for (connectionTypeCur in TF_peak.connectionTypes) {
                          
                          grn.filt.TF = dplyr::filter(grn.filt, .data$TF.ID == TFCur, .data$TF_peak.connectionType == connectionTypeCur)
                          
                          for (dirCur in c("pos", "neg")) {
                              
                              if (dirCur == "pos") {
                                  
                                  grn.filt.TF.dir = dplyr::filter(grn.filt.TF, .data$TF_peak.fdr_direction == "pos")
                                  stepsCur = GRN@config$parameters$internal$stepsFDR
                                  rightOpen = FALSE
                                  grn.filt.TF.dir$TF_peak.r_bin = cut(grn.filt.TF.dir$TF_peak.r, breaks = stepsCur, 
                                                                      right = rightOpen, include.lowest = TRUE, ordered_result = TRUE)
                                  
                                  binThresholdNeg = grn.filt.TF.dir  %>% 
                                      dplyr::select("TF_peak.r_bin", "TF_peak.fdr") %>% 
                                      dplyr::distinct() %>% 
                                      dplyr::arrange(.data$TF_peak.r_bin)
                                  
                                  relBins = which(binThresholdNeg$TF_peak.fdr < TF_peak.fdr.threshold)
                                  if (length(relBins) > 0) {
                                      relBins = binThresholdNeg$TF_peak.r_bin[min(relBins):nrow(binThresholdNeg)]
                                      idsRowsKeep = c(idsRowsKeep, grn.filt.TF.dir %>% 
                                                          dplyr::filter(.data$TF_peak.r_bin %in% relBins) %>% 
                                                          dplyr::pull(.data$row.ID))
                                  }
                                  
                                  
                              } else {
                                  
                                  grn.filt.TF.dir = dplyr::filter(grn.filt.TF, .data$TF_peak.fdr_direction == "neg")
                                  
                                  stepsCur = rev(GRN@config$parameters$internal$stepsFDR)  
                                  rightOpen = TRUE
                                  
                                  grn.filt.TF.dir$TF_peak.r_bin = cut(grn.filt.TF.dir$TF_peak.r, breaks = stepsCur, 
                                                                      right = rightOpen, include.lowest = TRUE, ordered_result = TRUE)
                                  
                                  binThresholdNeg = grn.filt.TF.dir  %>% 
                                      dplyr::select("TF_peak.r_bin", "TF_peak.fdr") %>% 
                                      dplyr::distinct() %>% 
                                      dplyr::arrange(.data$TF_peak.r_bin)
                                  
                                  relBins = which(binThresholdNeg$TF_peak.fdr < TF_peak.fdr.threshold)
                                  if (length(relBins) > 0) {
                                      relBins = binThresholdNeg$TF_peak.r_bin[seq_len(max(relBins))]
                                      idsRowsKeep = c(idsRowsKeep, grn.filt.TF.dir %>% 
                                                          dplyr::filter(.data$TF_peak.r_bin %in% relBins) %>% 
                                                          dplyr::pull(.data$row.ID))
                                  }
                              }
                              
                          } # end for both directions
                      } # end for all TF-peak link types
                  } # end for all TF
                  
                  
                  grn.filt = grn.filt %>%
                      dplyr::filter(.data$row.ID %in% idsRowsKeep) %>%
                      dplyr::select(-"row.ID")
                  
                  futile.logger::flog.info(paste0("  Number of TF-peak rows after filtering TFs: ", nrow(grn.filt)))
                  
              }
              
          }
          
          
          if (!is.null(filterTFs)) {
              futile.logger::flog.info(paste0(" Filter network to the following TF IDs: ", paste0(filterTFs, collapse = ",")))
              futile.logger::flog.info(paste0("  Number of TF-peak rows before filtering TFs: ", nrow(grn.filt)))
              grn.filt = dplyr::filter(grn.filt, .data$TF.ID %in% filterTFs)
              futile.logger::flog.info(paste0("  Number of TF-peak rows after filtering TFs: ", nrow(grn.filt)))
          }
          
          if (!is.null(filterPeaks)) {
              futile.logger::flog.info(paste0(" Filter network to the following peak IDs: ", paste0(filterPeaks, collapse = ",")))
              futile.logger::flog.info(paste0("  Number of TF-peak rows before filtering peaks: ", nrow(grn.filt)))
              grn.filt = dplyr::filter(grn.filt, .data$peak.ID %in% filterPeaks)
              futile.logger::flog.info(paste0("  Number of TF-peak rows after filtering peaks: ", nrow(grn.filt)))
          }
          

          if (!is.null(peak.SNP_filter) && peak.SNP_filter$min_nSNPs > 0 && peak.SNP_filter$filterType == "extra") {
              
              
              futile.logger::flog.info(paste0(" Filter peaks based on their minimum number of SNPs they overlap with. Threshold here: ", peak.SNP_filter$min_nSNPs))
              futile.logger::flog.info(paste0("  Number of TF-peak rows before filtering peaks: ", nrow(grn.filt)))
              
              grn.filt = grn.filt %>%
                  dplyr::left_join(GRN@data$peaks$counts_metadata %>% dplyr::select("peakID", tidyselect::starts_with("SNP_")), by = c("peak.ID" = "peakID")) %>%
                  dplyr::rename(peak.SNP_n = "SNP_n", peak.SNP_rsid = "SNP_rsid", peak.SNP_start = "SNP_start")

              grn.filt = dplyr::filter(grn.filt, .data$SNP_n >= peak.SNP_filter$min_nSNPs)
              futile.logger::flog.info(paste0("  Number of TF-peak rows after filtering for peaks with at least ", peak.SNP_filter$min_nSNPs, " overlapping SNPs: ", nrow(grn.filt)))
          }
          
          
          # Filters on peak-genes
          
          futile.logger::flog.info("2. Filter peak-gene connections")
          
          if (!is.null(filterGenes)) {
              futile.logger::flog.info(paste0(" Filter peak-gene connections for the following gene IDs: ", paste0(filterGenes, collapse = ",")))
              futile.logger::flog.info(paste0("  Number of rows before filtering genes: ", nrow(peakGeneCorrelations)))
              peakGeneCorrelations = dplyr::filter(peakGeneCorrelations, .data$gene.ENSEMBL %in% filterGenes)
              futile.logger::flog.info(paste0("  Number of rows after filtering genes: ", nrow(peakGeneCorrelations)))
          }
          
          if (!is.null(peak_gene.maxDistance)) {
              futile.logger::flog.info(paste0(" Filter peak-gene connections for their distance and keep only connections with a maximum distance of  ", peak_gene.maxDistance))
              futile.logger::flog.info(paste0("  Number of peak-gene rows before filtering connection types: ", nrow(peakGeneCorrelations)))
              peakGeneCorrelations = dplyr::filter(peakGeneCorrelations, .data$peak_gene.distance < peak_gene.maxDistance)
              futile.logger::flog.info(paste0("  Number of peak-gene rows after filtering connection types: ", nrow(peakGeneCorrelations)))
          }
          
          if (!"all" %in% gene.types) {
              futile.logger::flog.info(paste0(" Filter genes by gene type, keep only the following gene types: ", paste0(gene.types, collapse = ", ")))
              futile.logger::flog.info(paste0("  Number of peak-gene rows before filtering by gene type: ", nrow(peakGeneCorrelations)))
              peakGeneCorrelations = dplyr::filter(peakGeneCorrelations, .data$gene.type %in% gene.types)
              futile.logger::flog.info(paste0("  Number of peak-gene rows after filtering by gene type: ", nrow(peakGeneCorrelations)))
          }
          
         
   
          if (!is.null(peak.SNP_filter) && peak.SNP_filter$min_nSNPs > 0 && peak.SNP_filter$filterType == "extra") {
              
              futile.logger::flog.info(paste0(" Filter peak-genes based on their minimum number of SNPs they overlap with. Threshold here: ", peak.SNP_filter$min_nSNPs))
              futile.logger::flog.info(paste0("  Number of peak-gene rows before filtering peaks: ", nrow(peakGeneCorrelations)))
              
              peakGeneCorrelations = peakGeneCorrelations %>%
                  dplyr::left_join(GRN@data$peaks$counts_metadata %>% dplyr::select("peakID", tidyselect::starts_with("SNP_")), by = c("peak.ID" = "peakID")) %>%
                  dplyr::rename(peak.SNP_n = "SNP_n", peak.SNP_rsid = "SNP_rsid", peak.SNP_start = "SNP_start")
              
              peakGeneCorrelations = dplyr::filter(peakGeneCorrelations, .data$SNP_n >= peak.SNP_filter$min_nSNPs)
              futile.logger::flog.info(paste0("  Number of peak-gene rows after filtering for peaks with at least ", peak.SNP_filter$min_nSNPs, " overlapping SNPs: ", nrow(peakGeneCorrelations)))
          }
          
          
          
          futile.logger::flog.info(paste0("3. Merging TF-peak with peak-gene connections and filter the combined table..."))
          # Now we need the connected genes. All fitters that are independent of that have been done
          # Don't warn about the coercing of factors etc
          

          if (allowMissingTFs | (!is.null(peak.SNP_filter) && peak.SNP_filter$min_nSNPs > 0 && peak.SNP_filter$filterType == "orthogonal")) {
              grn.filt = suppressWarnings(dplyr::full_join(grn.filt, peakGeneCorrelations, by = "peak.ID"))
              
              if (!allowMissingTFs) {
                  
                  # Add SNP data first time
                  grn.filt = grn.filt %>%
                      dplyr::left_join(GRN@data$peaks$counts_metadata %>% dplyr::select("peakID", tidyselect::starts_with("SNP_")), by = c("peak.ID" = "peakID")) %>%
                      dplyr::rename(peak.SNP_n = "SNP_n", peak.SNP_rsid = "SNP_rsid", peak.SNP_start = "SNP_start")
                  
                  futile.logger::flog.info(paste0(" Keeping peaks if they either have a TF connected OR if they overlap with at least ", peak.SNP_filter$min_nSNPs, " SNPs"))

                  
                  # Keep only connections without a TF iff the peak overlaps SNPs
                  grn.filt = grn.filt %>%
                      dplyr::filter(.data$peak.SNP_n >= peak.SNP_filter$min_nSNPs | !is.na(.data$TF.ID))
              }
              
          } else {
              grn.filt = suppressWarnings(dplyr::left_join(grn.filt, peakGeneCorrelations, by = "peak.ID"))
          }
          
          
          futile.logger::flog.info(paste0("Inital number of rows left before filtering steps: ", nrow(grn.filt)))
          
          if (filterLoops) {
              futile.logger::flog.info(paste0(" Filter TF-TF self-loops"))
              futile.logger::flog.info(paste0("  Number of rows before filtering genes: ", nrow(grn.filt)))
              
              # Be aware of NA values here in the selection, depending on allowMissingTFs
              grn.filt = dplyr::filter(grn.filt, 
                                       is.na(.data$TF.ENSEMBL) | 
                                           (!is.na(.data$TF.ENSEMBL) & (as.character(.data$gene.ENSEMBL) != as.character(.data$TF.ENSEMBL))))
              
              futile.logger::flog.info(paste0("  Number of rows after filtering genes: ", nrow(grn.filt)))
          }
          
          
          
          if (allowMissingGenes) {
              # Nothing to do here
          } else {
              
              futile.logger::flog.info(paste0(" Filter rows with missing ENSEMBL IDs"))
              futile.logger::flog.info(paste0("  Number of rows before filtering: ", nrow(grn.filt)))
              grn.filt = dplyr::filter(grn.filt, !is.na(.data$gene.ENSEMBL))
              futile.logger::flog.info(paste0("  Number of rows after filtering: ", nrow(grn.filt)))
              
          }
          
          # TODO: Make order more logical
          
          if (!is.null(peak_gene.r_range)) {
              
              futile.logger::flog.info(paste0(" Filter network and retain only rows with peak_gene.r in the following interval: (", 
                                              peak_gene.r_range[1], " - ", peak_gene.r_range[2], "]"))
              
              futile.logger::flog.info(paste0("  Number of rows before filtering: ", nrow(grn.filt)))
              grn.filt = dplyr::filter(grn.filt, 
                                       is.na(.data$peak_gene.r) | .data$peak_gene.r  > peak_gene.r_range[1], 
                                       is.na(.data$peak_gene.r) | .data$peak_gene.r <= peak_gene.r_range[2])
              futile.logger::flog.info(paste0("  Number of rows after filtering: ", nrow(grn.filt)))
              
          }
          
          
          if (!is.null(peak_gene.p_raw.threshold)) {
              
              futile.logger::flog.info(paste0(" Filter network and retain only rows with peak-gene connections with p.raw < ", peak_gene.p_raw.threshold))
              futile.logger::flog.info(paste0("  Number of rows before filtering TFs: ", nrow(grn.filt)))
              grn.filt = dplyr::filter(grn.filt, is.na(.data$peak_gene.p_raw) | .data$peak_gene.p_raw < peak_gene.p_raw.threshold)
              futile.logger::flog.info(paste0("  Number of rows after filtering TFs: ", nrow(grn.filt)))
              
          }
          
          if (!is.null(peak_gene.fdr.threshold)) {
              
              futile.logger::flog.info(paste0(" Calculate FDR based on remaining rows, filter network and retain only rows with peak-gene connections with an FDR < ",  peak_gene.fdr.threshold))
              
              futile.logger::flog.info(paste0("  Number of rows before filtering genes (including/excluding NA): ", nrow(grn.filt), "/", nrow(grn.filt %>% dplyr::filter(!is.na(.data$peak_gene.p_raw)))))
              
              
              # Adjusted p-value is calculated dynamically here and therefore, for different filters, the numbers may vary
              # After a discussion in the group, this procedure was agreed upon even though it can sometimes yield confusing results when compared among each other
              
              if (peak_gene.fdr.method == "IHW") {
                  
                  # Identify those entries for which both p-value and covariate are not NA
                  
                  covariate_val = grn.filt %>% dplyr::pull(!!(peak_gene.IHW.covariate))
                  indexes = which(!is.na(grn.filt$peak_gene.p_raw) & !is.na(covariate_val))
                  
                  if (length(indexes) < nrow(grn.filt)) {
                      message = paste0("For permutation ", permutationCur, ", only ", length(indexes), " rows out of ", nrow(grn.filt), " can be used for IHW because some entries for either p-value or covariate were NA. The remaining ", nrow(grn.filt) - length(indexes), " rows will be ignored for p-value adjustment and set to NA.")
                      .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
                  }
                  
                  
                  if (peak_gene.fdr.method == "IHW" && length(indexes) < 1000) {
                      message = paste0("filterGRNAndConnectGenes: IHW should only be performed with at least 1000 p-values, but only ", length(indexes), " are available. Switching to BH adjustment as fallback. This is to be expected for the background data but not for the real one.")
                      
                      if (permutationCur == 0) {
                          .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
                      } else {
                          futile.logger::flog.info(message)
                      }
                      
                      peak_gene.fdr.method = "BH"
                  }
                  
              }
              
              if (peak_gene.fdr.method %in% c("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none")) {
                  
                  grn.filt = dplyr::mutate(grn.filt, peak_gene.p_adj = stats::p.adjust(.data$peak_gene.p_raw, method = peak_gene.fdr.method))
                  
              } else { # Do IHW
                  
                  suffixFile = .getPermutationSuffixStr(permutationCur)
                  
                  outputFolder = .checkOutputFolder(GRN, outputFolder)
                  outputFile_IHW = paste0(outputFolder, .getOutputFileName("plot_peakGene_IHW_diag"), suffixFile, ".pdf")
          
                  IHW.res = .performIHW(pvalues = grn.filt$peak_gene.p_raw[indexes], 
                                        covariates = covariate_val[indexes] %>% unlist() %>% unname(), 
                                        alpha = peak_gene.fdr.threshold, nbins = peak_gene.IHW.nbins,
                                        permutation = permutationCur,
                                        pdfFile = outputFile_IHW)
                  
                  
                  grn.filt$peak_gene.p_adj  = NA
                  grn.filt$peak_gene.p_adj[indexes] = IHW::adj_pvalues(IHW.res$ihwResults)
                  
              }
              
              if (allowMissingGenes) {
                  grn.filt = dplyr::filter(grn.filt, is.na(.data$peak_gene.p_adj) | .data$peak_gene.p_adj <  peak_gene.fdr.threshold) # keep NA here due to completeCases variable 
              } else {
                  grn.filt = dplyr::filter(grn.filt, .data$peak_gene.p_adj <  peak_gene.fdr.threshold) 
              }
              
              futile.logger::flog.info(paste0("  Number of rows after filtering genes (including/excluding NA): ", nrow(grn.filt), "/", nrow(grn.filt %>% dplyr::filter(!is.na(.data$peak_gene.p_adj)))))
          }
          
          
          if (peak_gene.selection == "closest") {
              
              # Select only the closest gene for each peak
              # Currently, this filter is applied BEFORE any of the other peak-gene filters
              futile.logger::flog.info(paste0(" Filter network and retain only the closest genes for each peak. Note that previous filters may already have eliminated the overall closest gene for a particular peak. To make sure to always use the closest gene in the network, set the other peak_gene filters to NULL."))
              
              # NA distances should be kept, only genes with non NA-values should be filtered
              
              grn.filt = grn.filt %>%
                  dplyr::filter(!is.na(.data$peak_gene.distance)) %>%
                  dplyr::group_by(.data$peak.ID) %>%
                  dplyr::slice(which.min(.data$peak_gene.distance)) %>%
                  dplyr::ungroup() %>%
                  rbind(dplyr::filter(grn.filt, is.na(.data$peak_gene.distance))) # rbind the na rows separately here
              
              futile.logger::flog.info(paste0("  Number of rows after filtering: ", nrow(grn.filt)))
              
          }
          
          
          grn.filt = grn.filt %>%
              dplyr::select(tidyselect::starts_with("TF."), 
                            tidyselect::starts_with("TF_peak."), 
                            tidyselect::starts_with("peak."), 
                            tidyselect::starts_with("peak_gene."),
                            tidyselect::starts_with("gene."),
                            tidyselect::everything()) %>%
              dplyr::mutate(peak.ID      = as.factor(.data$peak.ID),
                            gene.ENSEMBL = as.factor(.data$gene.ENSEMBL),
                            TF.ID      = as.factor(.data$TF.ID),
                            TF.name      = as.factor(.data$TF.name))
          
          
          GRN@connections$all.filtered[[permIndex]] = grn.filt
          
          futile.logger::flog.info(paste0("Final number of rows left after all filtering steps: ", nrow(grn.filt)))
          
          if (nrow(grn.filt) == 0 & permutationCur == 0 & !silent) {
              message = "filterGRNAndConnectGenes: No connections passed the filter steps. Rerun the function and be less stringent."
              .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
          }

          
      } #end for all permutations
      
      
      if (length(GRN@graph) > 0 & resetGraphAndStoreInternally) {
          message = "filterGRNAndConnectGenes: To avoid object inconsistencies and unexpected/non-reproducible results, the graph slot in the object has been reset. For all network-related functions as well as eGRN visualization, rerun the method build_eGRN_graph and all other network-related ans enrichment functions to update to the new set of filtered connections"
          .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
          GRN@graph = list()
      }

      
  } else {
      .printDataAlreadyExistsMessage()
  }
  
  if (silent) futile.logger::flog.threshold(futile.logger::INFO)
  
  if (!silent) .printExecutionTime(start)
  
  return(GRN)
 
}




# Internal helper: runs Independent Hypothesis Weighting via IHW::ihw() on a vector of
# p-values and, if requested, assembles a set of diagnostic ggplot2 plots.
#
# Arguments:
#   pvalues, covariates  Vectors of equal length (checked below) that are handed to IHW::ihw().
#   alpha                Handed to IHW::ihw(); also the cutoff used to count BH rejections for comparison.
#   permutation          Only used to label log messages and warnings (via .getPermStr()).
#   doDiagnostics        If TRUE (and more than one bin was chosen), the diagnostic plots are created.
#   pValThreshold_diagnosticPlots
#                        Upper bound on the adjusted p-value for the "zoom" version of the raw-vs-adjusted plot.
#   pdfFile              If not NULL, the diagnostic plots are additionally written to this file.
#   verbose              Validated but not otherwise used inside this function.
#   all other arguments and ...
#                        Validated (where a check exists) and forwarded to IHW::ihw().
#
# Returns a list with the element "ihwResults" (the object returned by IHW::ihw()) and, when
# diagnostic plots were generated, the element "ihwPlots" (a named list of plots).
# If IHW ends up with a single bin, a warning is logged and the list is returned without plots.
.performIHW <- function(pvalues, covariates, alpha = 0.1, covariate_type = "ordinal",
                        nbins = "auto", permutation,
                        m_groups = NULL, quiet = TRUE, nfolds = 5L,
                        nfolds_internal = 5L, nsplits_internal = 1L, lambdas = "auto",
                        seed = 1L, distrib_estimator = "grenander", lp_solver = "lpsymphony",
                        adjustment_type = "BH", return_internal = FALSE, 
                        doDiagnostics = TRUE, pValThreshold_diagnosticPlots = 0.2,
                        pdfFile = NULL, verbose = TRUE, ...) {
  
  start <- Sys.time()
  
  # ---- Argument validation ----
  checkmate::assertNumeric(pvalues, lower = 0, upper = 1, any.missing = TRUE)
  checkmate::assert(checkmate::checkNumeric(covariates), checkmate::checkFactor(covariates))
  stopifnot(length(pvalues) == length(covariates))
  checkmate::assertNumber(alpha, lower = 0, upper = 1)
  checkmate::assertChoice(covariate_type, c("ordinal", "nominal"))
  checkmate::assert(checkmate::checkInt(nbins, lower = 1), checkmate::checkSubset(nbins, c("auto")))
  checkmate::assert(checkmate::checkNull(m_groups), checkmate::checkInteger(m_groups, len = length(covariates)))
  checkmate::assertLogical(quiet)
  checkmate::assertInt(nfolds, lower = 1)
  checkmate::assertInt(nfolds_internal, lower = 1)
  checkmate::assertInt(nsplits_internal, lower = 1)
  checkmate::assert(checkmate::checkNumeric(lambdas), checkmate::checkSubset(lambdas, c("auto")))
  checkmate::assert(checkmate::checkNull(seed), checkmate::checkInteger(seed)) 
  checkmate::assertChoice(distrib_estimator, c("grenander", "ECDF"))
  checkmate::assertChoice(lp_solver, c("lpsymphony", "gurobi"))
  checkmate::assertChoice(adjustment_type, c("BH", "bonferroni"))
  checkmate::assertLogical(return_internal)
  checkmate::assertLogical(doDiagnostics)
  checkmate::assert(checkmate::checkNull(pdfFile), checkmate::checkDirectoryExists(dirname(pdfFile), access = "w"))
  checkmate::assertLogical(verbose)
  
  # ---- Run IHW ----
  futile.logger::flog.info(paste0("Perform IHW based on ", length(pvalues), " p-values"))
  
  ihwFit <- IHW::ihw(pvalues, covariates, alpha, 
                     covariate_type = covariate_type,
                     nbins = nbins, 
                     m_groups = m_groups, 
                     quiet = quiet, 
                     nfolds = nfolds,
                     nfolds_internal = nfolds_internal, 
                     nsplits_internal = nsplits_internal, 
                     lambdas = lambdas,
                     seed = seed, 
                     distrib_estimator = distrib_estimator, 
                     lp_solver = lp_solver,
                     adjustment_type = adjustment_type, 
                     return_internal = return_internal, 
                     ...)
  
  result <- list()
  result[["ihwResults"]] <- ihwFit
  
  # With a single bin there is nothing to diagnose: warn and return early
  nBinsChosen <- IHW::nbins(ihwFit)
  if (nBinsChosen == 1) {
    msg <- "filterGRNAndConnectGenes: Only 1 bin, IHW reduces to Benjamini Hochberg (uniform weights). Skipping diagnostic plots"
    .checkAndLogWarningsAndErrors(NULL, msg, isWarning = TRUE)
    return(result)
  }
  futile.logger::flog.info(paste0("  Number of chosen bins (should be >1): ", nBinsChosen))
  
  # No diagnostics requested: we are done
  if (!doDiagnostics) {
    .printExecutionTime(start)
    return(result)
  }
  
  # ---- Diagnostics ----
  futile.logger::flog.info(paste0("  Generate diagnostic plots for IHW results and data for ", .getPermStr(permutation), "..."))
  
  # Reference point: plain Benjamini-Hochberg on the p-values alone
  nRejectBH  <- sum(stats::p.adjust(pvalues, method = "BH") <= alpha, na.rm = TRUE)
  nRejectIHW <- IHW::rejections(ihwFit)
  
  futile.logger::flog.info(paste0("  Number of rejections for IHW: ", 
                                  nRejectIHW, " (for comparison, number of rejections for BH: ", 
                                  nRejectBH, " [the latter should be lower])"))
  
  if (nRejectIHW < nRejectBH) {
    msg <- paste0("filterGRNAndConnectGenes: For ", .getPermStr(permutation), ", the number of rejections for IHW (", nRejectIHW, ") is smaller than for BH (", nRejectBH, "), something might be wrong. The covariate chosen might not be appropriate (if this happens for real data) or it is simply a consequence of the data being permuted.")
    .checkAndLogWarningsAndErrors(NULL, msg, isWarning = TRUE)
  }
  
  # The plots are collected in this order; the same order is used when they are written to the PDF
  plots <- list()
  
  ## 1. Plots provided by IHW itself: estimated weights and decision boundary
  
  # Plot No. 1
  plots[["estimatedWeights"]] <- IHW::plot(ihwFit, what = "weights") + 
    ggplot2::ggtitle("Estimated weights")
  
  # Plot No. 2
  plots[["decisionBoundary"]] <- IHW::plot(ihwFit, what = "decisionboundary") + 
    ggplot2::ggtitle("Decision boundary")
  
  # Plot No. 3: raw vs. adjusted p-values, colored by group
  plots[["rawVsAdjPVal_all"]] <- as.data.frame(ihwFit) %>%
    ggplot2::ggplot(ggplot2::aes(x = .data$pvalue, y = .data$adj_pvalue, col = .data$group)) + 
    ggplot2::geom_point(size = 0.25) + 
    ggplot2::scale_colour_hue(l = 70, c = 150, drop = FALSE) + 
    ggplot2::ggtitle("Raw versus adjusted p-values")
  
  # Plot No. 4: same plot, restricted to rows with an adjusted p-value up to the threshold
  # (rows for which the comparison yields NA are dropped)
  ihwTable <- IHW::as.data.frame(ihwFit)
  isBelowThreshold <- ihwTable[["adj_pvalue"]] <= pValThreshold_diagnosticPlots
  ihwTable.zoom <- ihwTable[isBelowThreshold & !is.na(isBelowThreshold), , drop = FALSE]
  
  plots[["rawVsAdjPVal_subset"]] <- plots[["rawVsAdjPVal_all"]] %+% ihwTable.zoom + 
    ggplot2::ggtitle("raw versus adjusted p-values (zoom)")
  
  ## 2. p-value histograms
  
  histData <- data.frame(pValues = pvalues, covariate = covariates)
  
  # Plot No. 5: p-value histogram before any multiple testing procedure
  plots[["pValHistogram"]] <- ggplot2::ggplot(histData, ggplot2::aes(x = .data$pValues)) + 
    ggplot2::geom_histogram(binwidth = 0.025, boundary = 0) + 
    ggplot2::ggtitle("p-Value histogram independent of the covariate")
  
  # Plot No. 6: histograms stratified by covariate group (as many groups as IHW bins)
  histData$covariate_group <- IHW::groups_by_filter(histData$covariate, ihwFit@nbins)
  
  plots[["pValHistogramGroupedByCovariate"]] <- ggplot2::ggplot(histData, ggplot2::aes(x = .data$pValues)) + 
    ggplot2::geom_histogram(binwidth = 0.025, boundary = 0) + 
    ggplot2::facet_wrap( ~covariate_group, nrow = 2) + 
    ggplot2::ggtitle("p-Value histogram grouped by the covariate")
  
  # Plot No. 7: ECDF per covariate group
  plots[["pValHistogramGroupedByCovariateECDF"]] <- 
    ggplot2::ggplot(histData, ggplot2::aes(x = .data$pValues, col = .data$covariate_group)) + 
    ggplot2::stat_ecdf(geom = "step")  + 
    ggplot2::ggtitle("p-Value histogram grouped by the covariate (ECDF)")
  
  # Plot No. 8: -log10(p-value) against the (scaled) rank of the covariate, based on complete rows only.
  # Used to check whether the covariate is informative about power under the alternative.
  rankData <- stats::na.omit(histData)
  rankData$covariateRank <- rank(rankData$covariate)/nrow(rankData)
  
  plots[["pValAgainstRankCovariate"]] <- 
    ggplot2::ggplot(rankData, ggplot2::aes(x = .data$covariateRank, y = -log10(.data$pValues))) + 
    ggplot2::geom_hex(bins = 100) + 
    ggplot2::ggtitle("p-Value against rank of the covariate")
  
  result$ihwPlots <- plots
  
  if (!is.null(pdfFile)) {
    .printMultipleGraphsPerPage(result$ihwPlots, nCol = 1, nRow = 1, pdfFile = pdfFile)
    futile.logger::flog.info(paste0("Diagnostic plots written to file ", pdfFile))
  }
  
  .printExecutionTime(start)
  result
  
}

#' Add TF-gene correlations to a \code{\linkS4class{GRN}} object. 
#' 
#' For every distinct TF-gene pair in the filtered connections (\code{GRN@connections$all.filtered}), 
#' the RNA counts of the TF and of the gene are correlated. This is done once per permutation index 
#' (0 = non-permuted counts, any other index = permuted counts).
#' The information is currently stored in \code{GRN@connections$TF_genes.filtered}. Note that raw p-values are not adjusted.
#' 
#' @export
#' @template GRN
#' @template corMethod
#' @template nCores
#' @template forceRerun
#' @return An updated \code{\linkS4class{GRN}} object, with additional information added from this function.
#' @examples 
#' # See the Workflow vignette on the GRaNIE website for examples
#' GRN = loadExampleObject()
#' GRN = add_TF_gene_correlation(GRN, forceRerun = FALSE)
add_TF_gene_correlation <- function(GRN, corMethod = "pearson", nCores = 1, forceRerun = FALSE) {
  
  start = Sys.time() 
  
  checkmate::assertClass(GRN, "GRN")  
  
  GRN = .makeObjectCompatible(GRN)
  
  checkmate::assertChoice(corMethod, c("pearson", "bicor", "spearman"))
  checkmate::assertIntegerish(nCores, lower = 1)
  checkmate::assertFlag(forceRerun)
  
  .checkPackageRobust(corMethod)
  
  # Both operands are scalars (forceRerun is asserted to be a flag above), so the scalar || is the right operator
  if (is.null(GRN@connections$TF_genes.filtered) || forceRerun) {
    
    GRN = .addFunctionLogToObject(GRN) 
    GRN@connections$TF_genes.filtered = list()
    GRN@config$parameters$corMethod_TF_gene = corMethod
    
    .checkExistanceFilteredConnections(GRN)
    
    futile.logger::flog.info(paste0("Calculate correlations for TF and genes from the filtered set of connections"))
    
    for (permutationCur in 0:.getMaxPermutation(GRN)) {
      
      futile.logger::flog.info(paste0(" ", .getPermStr(permutationCur)))
      
      permIndex  = as.character(permutationCur)
      isPermuted = as.logical(permutationCur) # FALSE for index 0, TRUE otherwise
      
      # Get all distinct TF-gene pairs to check; the join adds the annotation columns of each TF (TF.ENSEMBL is used below)
      TF_genePairs = GRN@connections$all.filtered[[permIndex]] %>%
        dplyr::filter(!is.na(.data$gene.ENSEMBL)) %>%
        dplyr::select("TF.ID", "gene.ENSEMBL") %>%
        dplyr::distinct() %>%
        dplyr::left_join(GRN@annotation$TFs, by = c("TF.ID"), suffix = c("", ".transl")) # %>%
      # dplyr::distinct(ENSEMBL, ENSEMBL.transl)
      # TODO: Improve: Only loop over distinct ENSMBL_TF and ENSEMBL_gene pairs
      
      maxRow = nrow(TF_genePairs)
      if (maxRow > 0) {
        
        futile.logger::flog.info(paste0("  Iterate through ", maxRow, " TF-gene combinations and (if possible) calculate correlations using ", nCores, " cores. This may take a few minutes."))
        
        countsRNA.clean = getCounts(GRN, type = "rna",  permuted = isPermuted, includeIDColumn = FALSE)
        
        # Retrieve the ID column once and reuse it for both the matching and the back-translation below.
        # The matched positions are only meaningful if the ID order is the same for every lookup.
        ensemblIDs.RNA = getCounts(GRN, type = "rna", permuted = isPermuted)$ENSEMBL
        
        map_TF   = match(TF_genePairs$TF.ENSEMBL,   ensemblIDs.RNA)
        map_gene = match(TF_genePairs$gene.ENSEMBL, ensemblIDs.RNA)
        
        # Some NAs might be expected, given our annotation contains all known genes
        stopifnot(!all(is.na(map_TF)))
        stopifnot(!all(is.na(map_gene)))
        
        # The pairs are processed in chunks of this size
        chunksize = 10000
        startIndexMax = ceiling(maxRow / chunksize) - 1 # -1 because we count from 0 onwards
        
        res.l = .execInParallelGen(nCores, returnAsList = TRUE, listNames = NULL, iteration = 0:startIndexMax, verbose = FALSE, functionName = .correlateDataWrapper, 
                                   chunksize = chunksize, maxRow = maxRow, counts1 = countsRNA.clean, counts2 = countsRNA.clean, 
                                   map1 = map_TF, map2 = map_gene, corMethod = corMethod)
        
        res.m  = do.call(rbind, res.l)
        
        selectColumns = c("gene.ENSEMBL", "TF.ENSEMBL", "r", "p.raw", "TF.ID")
        
        futile.logger::flog.info(paste0("  Done. Construct the final table, this may result in an increased number of TF-gene pairs due to different TF names linked to the same Ensembl ID."))
        
        # Make the final data frame. Raw p-values are kept as they are, no adjustment is done here
        res.df = suppressMessages(tibble::as_tibble(res.m) %>%
                                    dplyr::mutate(TF.ENSEMBL   = ensemblIDs.RNA[map_TF],
                                                  gene.ENSEMBL = ensemblIDs.RNA[map_gene]) %>%
                                    dplyr::filter(!is.na(.data$gene.ENSEMBL), !is.na(.data$TF.ENSEMBL)) %>%  # For some TF-gene combinations, no RNA-Seq data was available, these NAs are filtered
                                    dplyr::left_join(GRN@annotation$TFs, by = c("TF.ENSEMBL"), multiple = "all") %>%
                                    dplyr::select(tidyselect::all_of(selectColumns))) %>%
          dplyr::mutate(gene.ENSEMBL = as.factor(.data$gene.ENSEMBL), 
                        TF.ENSEMBL   = as.factor(.data$TF.ENSEMBL),
                        TF.ID        = as.factor(.data$TF.ID)) %>%
          dplyr::rename(TF_gene.r     = "r", 
                        TF_gene.p_raw = "p.raw") %>%
          dplyr::select("TF.ID", "TF.ENSEMBL", "gene.ENSEMBL", tidyselect::everything())
        
      } else {
        
        futile.logger::flog.info(paste0(" Nothing to do, skip."))
        
        # Empty table with the same column names and column types as in the non-empty case
        # (factor IDs; r and p-value presumably numeric), so that both cases can be handled alike downstream
        res.df = tibble::tibble(TF.ID         = factor(),
                                TF.ENSEMBL    = factor(),
                                gene.ENSEMBL  = factor(),
                                TF_gene.r     = numeric(),
                                TF_gene.p_raw = numeric())
        
      }
      
      GRN@connections$TF_genes.filtered[[permIndex]] = res.df
      
    } # end for each permutation
    
  } else {
    .printDataAlreadyExistsMessage()
  }
  
  .printExecutionTime(start, prefix = "")
  
  GRN
}




######### SNP functions ################

#' Add SNP data to a \code{\linkS4class{GRN}} object and associate SNPs to peaks. 
#' 
#' This function accepts a vector of SNP IDs (rsID), retrieves their genomic positions and 
#' overlaps them with the peaks to extend the peak metadata (`GRN@data$peaks$counts_metadata`) by storing the number, positions and rsids of all
#' overlapping SNPs per peak  (new columns starting with `SNP_`). 
#' Optionally, SNPs in LD with the user-provided SNPs can be identified using the \code{LDlinkR} package. Note that only SNPs in LD are associated with a peak for those SNPs directly overlapping a peak. 
#' That is, if a user-provided SNP does not overlap with any peak, neither the SNP itself nor any of the SNPs in LD will be associated with any peak, even if an LD SNP overlaps another peak.
#' The results are stored in \code{GRN@annotation$SNPs} (full, unfiltered table) and \code{GRN@annotation$SNPs_filtered} (filtered table), 
#' and rapid re-filtering is possible without re-querying the database (which is time-consuming).
#' 
#' `biomaRt` is used to retrieve genomic positions for the user-defined SNPs, which can take a long time depending
#' on the number of SNPs provided. Similarly, querying the \code{LDlink} servers may take a long time. 
#' 
#' @export
#' @template GRN
#' @param SNP_IDs Character vector. No default. Vector of SNP IDs (rsID) that should be integrated and overlapped with the peaks.
#' @param EnsemblVersion \code{NULL} or Character(1). Default \code{NULL}. Only relevant if \code{source} is not set to \code{custom}, ignored otherwise. The Ensembl version to use for the retrieval of gene IDs from their provided database names (e.g., JASPAR) via \code{biomaRt}.
#' By default (\code{NULL}), the newest version is used for the most recent genome assembly versions (see \code{biomaRt::listEnsemblArchives()} for supported versions). This parameter can override this to use a custom (older) version instead.
#' @param add_SNPs_LD \code{TRUE} or \code{FALSE}. Default \code{FALSE}. Should SNPs in LD with any of the user-provided SNPs that overlap a peak be identified and added to the peak? 
#' If set to \code{TRUE}, \code{LDlinkR::LDproxy_batch} will be used to identify SNPs in LD based on the 
#' user-provided \code{SNP_IDs} argument, a specific (set of) populations (argument \code{population}) and a value for \code{r2d}.
#' The full table (stored in \code{GRN@annotation$SNPs}) is then subsequently filtered, see also the \code{filter} argument.
#' @param requeryLD \code{TRUE} or \code{FALSE}. Default \code{FALSE}. Only applicable if \code{add_SNPs_LD = TRUE} and ignored otherwise.
#' Should \code{LDlinkR::LDproxy_batch} be re-executed if already present? As this
#' is very time-consuming, querying the database is only performed if this parameter is set to \code{TRUE} or if \code{GRN@annotation$SNPs} is missing.
#' @param population Character vector. Default \code{CEU}. Only applicable if \code{add_SNPs_LD = TRUE} and ignored otherwise.
#' Population code(s) from the 1000 Genomes project to be used for \code{LDlinkR::LDproxy_batch}.
#' Multiple codes are allowed. For all valid codes, see \url{https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/README_populations.md} 
#' @param r2d \code{r2} or \code{d}. Default \code{r2}. Only applicable if \code{add_SNPs_LD = TRUE} and ignored otherwise.
#' See the help of the function \code{LDlinkR::LDproxy_batch} for more details.
#' @param token Character or \code{NULL}. Default \code{NULL}. Only applicable if \code{add_SNPs_LD = TRUE} and ignored otherwise. 
#' \code{LDlink} provided user token. Register for token at https://ldlink.nih.gov/?tab=apiaccess. 
#' Registration has to be done only once and is simple and straightforward, but unfortunately necessary. An example of a (non-functional) token is \code{2c49a3b54g04}.
#' @param filter Character. Default \code{R2 > 0.8}. Only applicable if \code{add_SNPs_LD = TRUE} and ignored otherwise.
#' Filter criteria for the output table as generated by \code{LDlinkR::LDproxy_batch}.
#' \code{dplyr::filter} style is used to specify filters, and multiple filtering criteria can be used (e.g., \code{R2 > 0.8 & MAF > 0.01}).
#' The filtered table is stored in \code{GRN@annotation$SNPs_filtered}. Note that re-filtering is quick without the need to re-query the database unless \code{requeryLD = TRUE}.
#' @template forceRerun
#' @return An updated \code{\linkS4class{GRN}} object, with additional information added from this function.
#' @examples 
#' # See the Workflow vignette on the GRaNIE website for examples
#' GRN = loadExampleObject()
#' GRN = addSNPData(GRN, SNP_IDs = c("rs7570219", "rs6445264", "rs12067275"), forceRerun = FALSE)
addSNPData <- function(GRN, SNP_IDs, EnsemblVersion = NULL, 
                       add_SNPs_LD = FALSE, requeryLD = FALSE, population = "CEU", r2d = "r2", token = NULL, 
                       filter = "R2 > 0.8",
                       forceRerun = FALSE) {

    # Overview of the steps performed below:
    #  1. Optionally (add_SNPs_LD = TRUE) query LDlink for SNPs in LD with the user-provided rsids and 
    #     store the raw and filtered tables in GRN@annotation$SNPs and GRN@annotation$SNPs_filtered
    #  2. Query biomaRt for the genomic positions of the user-provided rsids
    #  3. Overlap these positions with the peaks
    #  4. Summarize the overlaps per peak and add them as SNP_* columns to GRN@data$peaks$counts_metadata
    
    start = Sys.time()   
    
    checkmate::assertClass(GRN, "GRN")
    
    GRN = .makeObjectCompatible(GRN)
    
    checkmate::assertCharacter(SNP_IDs, min.chars = 4, any.missing = FALSE, min.len = 1)
    
    # checkmate::assert combines check* functions, which return TRUE or an error message. 
    # An assert* function returns its input on success, which checkmate::assert would interpret as a failure message, 
    # so that any valid, non-NULL EnsemblVersion would be rejected. Therefore, checkSubset has to be used here.
    # The Ensembl archive list is only retrieved if EnsemblVersion is not NULL.
    checkmate::assert(checkmate::checkNull(EnsemblVersion), 
                      checkmate::checkSubset(as.character(EnsemblVersion), biomaRt::listEnsemblArchives()$version))
    checkmate::assertFlag(add_SNPs_LD)
    checkmate::assertFlag(requeryLD)
    checkmate::assertFlag(forceRerun)
    
    
    SNP_IDs = unique(SNP_IDs)

    # Only (re)run if the SNP columns are not yet part of the peak metadata or if explicitly requested
    if (!all(c("SNP_n", "SNP_rsid", "SNP_start") %in% colnames(GRN@data$peaks$counts_metadata)) || forceRerun) {
        
        # Identify SNPs in LD
        if (add_SNPs_LD) {

            
            # Check genome, must be hg19 or hg38
            genomeQuery = dplyr::case_when(GRN@config$parameters$genomeAssembly == "hg38" ~ "grch38_high_coverage",
                                           GRN@config$parameters$genomeAssembly == "hg19" ~ "grch37",
                                           TRUE ~ NA)

            if (is.na(genomeQuery)) {
                message = paste0("For querying SNPS for LD, the genome must be either hg19 or hg38 but not ", GRN@config$parameters$genomeAssembly)
                .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
            }
            
            # Name of the file that is read in after calling LDlinkR::LDproxy_batch with append = TRUE.
            # NOTE(review): the file is looked up in the current working directory; a file left over from an earlier run 
            # would also satisfy the existence check below - confirm whether it should be removed before querying.
            file_SNPs = paste0("combined_query_snp_list_", genomeQuery, ".txt")
            
            if (requeryLD || is.null(GRN@annotation$SNPs)) {
                
                packageMessage = paste0("The package LDlinkR is not installed but required for this function.")
                .checkPackageInstallation("LDlinkR", packageMessage, isWarning = FALSE)
                
                # These arguments are only needed (and therefore only checked) when the database is actually queried
                checkmate::assertCharacter(token, len = 1, min.chars = 5)
                checkmate::assertCharacter(population, min.len = 1)
                checkmate::assertChoice(r2d, c("r2", "d"))
                
                futile.logger::flog.info(paste0(" Calling LDlinkR::LDproxy_batch for ", length(SNP_IDs), " rsid. This may take a while, particularly if the number of provided IDs is large"))
                
                # Errors are only logged as a warning here; whether the query worked is decided via the existence of the output file below
                tryCatch({ 
                    LDlinkR::LDproxy_batch(SNP_IDs, pop = population, token = token, 
                                           append = TRUE, r2d = r2d, genome_build = genomeQuery)
                    
                }, error = function(e) {
                    # Use conditionMessage to log only the message text. Pasting the condition object itself 
                    # would paste each of its elements (message and call) separately
                    futile.logger::flog.warn(paste0("An error occured for LDlinkR::LDproxy_batch. The error message was: ", conditionMessage(e)))
                })
            
                if (!file.exists(file_SNPs)) {
                    
                    error_Biomart = paste0("A temporary error occured with LDlinkR::LDproxy_batch. This may be caused by an unresponsive webserver or the expted file ", file_SNPs, " could not be found/read.", 
                                           " Try again at a later time. Note that this error is not caused by GRaNIE but external services.")
                    .checkAndLogWarningsAndErrors(NULL, error_Biomart, isWarning = FALSE)
                    return(NULL)
                    
                } 
                
                
                futile.logger::flog.info(paste0(" Finished retrieving all SNPs in LD"))
                
                # The first line of the file is skipped and the column names are assigned manually afterwards
                snps.df = readr::read_tsv(file_SNPs, skip = 1, col_names = FALSE, show_col_types = FALSE)
                
                colnames(snps.df) = c("row_number", "query_snp", "RS_Number", "Coord", "Alleles", "MAF", "Distance" ,  "Dprime", "R2", 
                                      "Correlated_Alleles", "FORGEdb", "RegulomeDB", "Function")
                
                # Keep only the relevant columns and remove rows without an rsid (RS_Number is ".")
                GRN@annotation$SNPs = snps.df %>%
                    dplyr::select("query_snp", "RS_Number", "Coord", "MAF", "Distance", "Dprime", "R2") %>%
                    dplyr::filter(.data$RS_Number != ".")
                
                # Split the Coord column at ":" into two columns, SNP_chr and SNP_start (both of type character)
                pos.df = stringr::str_split_fixed(GRN@annotation$SNPs$Coord, pattern = ":", n = 2)
                colnames(pos.df) = c("SNP_chr", "SNP_start")
                
                GRN@annotation$SNPs = cbind(GRN@annotation$SNPs, pos.df) %>%
                    dplyr::select(-"Coord")
                    
            } else {
                futile.logger::flog.info(paste0(" Not re-querying LDlink database as results already found in object"))
                
                checkmate::assertSubset(c("query_snp", "RS_Number", "SNP_chr", "SNP_start", "MAF", "Distance", "Dprime", "R2"), colnames(GRN@annotation$SNPs))
            }
            
            checkmate::assertCharacter(filter, len = 1, min.chars = 1)
            
            # FILTER
            # The user-provided filter string is parsed into expression(s) and spliced into dplyr::filter
            futile.logger::flog.info(paste0(" Filtering SNP information using the filter ", filter))
            futile.logger::flog.info(paste0("  Number of rows before filtering: ", nrow(GRN@annotation$SNPs)))
            GRN@annotation$SNPs_filtered = dplyr::filter(GRN@annotation$SNPs, !!!rlang::parse_exprs(paste(filter, collapse = ";")))
            futile.logger::flog.info(paste0("  Number of rows after filtering : ", nrow(GRN@annotation$SNPs_filtered)))
          
        } else {
            futile.logger::flog.info(paste0(" Not retrieving SNPS in LD, use only user-provided SNPs"))
            
        }
       
        
        # Remove all SNP_* columns from a potential previous run before adding them again
        GRN@data$peaks$counts_metadata = dplyr::select(GRN@data$peaks$counts_metadata, -tidyselect::starts_with("SNP_") )
        
        futile.logger::flog.info(paste0(" Quering biomaRt for ", length(SNP_IDs), " rsid annotation, this may take a while, particularly if the number of provided IDs is large"))
        
        
        genomeAssembly = GRN@config$parameters$genomeAssembly
        params.l = .getBiomartParameters(genomeAssembly, suffix = "_snp")
        
        ensembl = .biomart_getEnsembl(biomart = "snp" , version = EnsemblVersion, host = params.l[["host"]],  dataset = params.l[["dataset"]])
        
        results.df = .callBiomart(mart =  ensembl, attributes = c("refsnp_id", "chr_name", "chrom_start", "chrom_end"), 
                                  filters = "snp_filter",
                                  values = SNP_IDs, maxAttempts = 5) 
        
        # Rename the biomaRt columns and prefix the chromosome names with "chr"
        results.df =  results.df %>%
            dplyr::rename(seqnames = "chr_name", start = "chrom_start", end = "chrom_end", annotation = "refsnp_id") %>%
            dplyr::mutate(seqnames = paste0("chr", .data$seqnames)) %>%
            dplyr::select("seqnames", "start", "end", "annotation")
        
        futile.logger::flog.info(paste0(" Retrieved annotation for a total of ", nrow(results.df), " SNPs out of ", length(SNP_IDs), " that were provided" ))
        
        # Filter SNPs with chromosomes that are not in the genome assembly
        results.filt.df = results.df %>%
            dplyr::filter(.data$seqnames %in% names(.getChrLengths(genomeAssembly)))
        
        nFiltered = nrow(results.df) - nrow(results.filt.df)
        if (nFiltered > 0) futile.logger::flog.info(paste0("  Filtered ", nFiltered , " due to chromosome names" ))
        
        
        # Overlap peaks with SNPs and add a metadata column in peak annotation
        
        # TODO Clarify zeroBased, this changes the SNP position by 0 if set to TRUE
        SNPs.gr = .constructGRanges(results.filt.df, seqlengths = .getChrLengths(genomeAssembly), genomeAssembly = genomeAssembly, zeroBased = FALSE)
        
        peaks.gr  = .constructGRanges(GRN@data$peaks$counts_metadata, seqlengths = .getChrLengths(genomeAssembly), genomeAssembly)
     
        futile.logger::flog.info(paste0(" Overlapping peaks and SNPs"))
        
        overlapsAll = GenomicRanges::findOverlaps(peaks.gr, SNPs.gr, 
                                                  minoverlap = 1,
                                                  type = "any", select = "all",
                                                  ignore.strand = TRUE)
        

        # Query = peaks, subject = SNPs
        query_row_ids   = S4Vectors::queryHits(overlapsAll)
        subject_row_ids = S4Vectors::subjectHits(overlapsAll)
        
        query_overlap_df     = as.data.frame(S4Vectors::elementMetadata(peaks.gr)[query_row_ids, "peakID"])
        subject_overlap_df   = as.data.frame( SNPs.gr)[subject_row_ids, c("seqnames", "start", "annotation")]
        
        # One row per peak-SNP overlap
        overlaps.df = cbind.data.frame(query_overlap_df,subject_overlap_df) %>% dplyr::mutate(seqnames = as.character(.data$seqnames)) %>% tibble::as_tibble()
        colnames(overlaps.df) = c("peakID", "SNP_chr", "SNP_start", "SNP_rsid")
        
        if (add_SNPs_LD) {
            
            # Add all SNPs in LD for each overlapping user-provided SNP. As SNP_chr and SNP_start are present in both tables, 
            # they receive the suffixes .x (from the overlap table) and .y (from the LD table)
            overlaps.df = dplyr::left_join(overlaps.df, 
                                            GRN@annotation$SNPs_filtered %>% dplyr::select("query_snp", "RS_Number", "SNP_chr", "SNP_start"), 
                                            by = c("SNP_rsid" = "query_snp"), multiple = "all") %>%
                dplyr::mutate(association = dplyr::if_else(.data$SNP_rsid == .data$RS_Number, "overlap", "LD"))
            

            # Fill NA values and correct start positions
            # Rows without a match in the LD table keep the user-provided SNP and its position and are labeled "user"
            NA_rows = which(is.na(overlaps.df$RS_Number))
            overlaps.df$RS_Number[NA_rows] =  overlaps.df$SNP_rsid[NA_rows]
            overlaps.df$association[NA_rows] =  "user"
            overlaps.df$SNP_chr.y[NA_rows] =  overlaps.df$SNP_chr.x[NA_rows]
            overlaps.df$SNP_start.y[NA_rows] =  overlaps.df$SNP_start.x[NA_rows]
            
            
            # From here on, SNP_rsid, SNP_chr and SNP_start refer to the columns from the LD table
            overlaps.df = overlaps.df %>%
                dplyr::select(-"SNP_chr.x", -"SNP_start.x", -"SNP_rsid") %>%
                dplyr::rename(SNP_chr = "SNP_chr.y", SNP_start = "SNP_start.y", SNP_rsid = "RS_Number")
            
            # Some SNPs may now be classified as "LD" although they ALSO overlap the peak in fact
            
            
            # TODO: NA values for SNP and wrong SNP position for LD cases
        } else {
            overlaps.df = dplyr::mutate(overlaps.df, association = "user")
        }
        
        
        # Summarize on the peak level before integrating
        # Multiple SNPs per peak are collapsed into comma-separated strings
        overlaps.grouped.df = overlaps.df %>%
            dplyr::group_by(.data$peakID) %>%
            dplyr::summarise(SNP_n = dplyr::n(), 
                             SNP_rsid = paste0(.data$SNP_rsid, collapse = ","), 
                             SNP_start = paste0(.data$SNP_start, collapse = ","),
                             SNP_origin = paste0(.data$association, collapse = ",")) 
        
        GRN@data$peaks$counts_metadata = dplyr::left_join(GRN@data$peaks$counts_metadata, overlaps.grouped.df, by = "peakID", multiple = "all")
        
        # Peaks without any associated SNP have NA after the join, set their count to 0
        GRN@data$peaks$counts_metadata$SNP_n[which(is.na(GRN@data$peaks$counts_metadata$SNP_n))] = 0
        
        futile.logger::flog.info(paste0(" Added SNP information to GRN@data$peaks$counts_metadata"))
        
        futile.logger::flog.info(paste0(" Statistics: "))
        for (i in sort(unique(GRN@data$peaks$counts_metadata$SNP_n))) {
            futile.logger::flog.info(paste0("  Number of peaks with ", i, " SNPs associated: ", dplyr::filter(GRN@data$peaks$counts_metadata, .data$SNP_n == i) %>% nrow()))
        }
        
    } else {
        
        .printDataAlreadyExistsMessage()
        
    }
    
    .printExecutionTime(start, prefix = "")
    GRN
        
}




####### STATS #########


#' Generate a summary for the number of connections for different filtering criteria for a \code{\linkS4class{GRN}} object. 
#' 
#' This function calls \code{\link{filterGRNAndConnectGenes}} repeatedly and stores the total number of connections and other statistics each time to summarize them afterwards. 
#' All arguments are identical to the ones in \code{\link{filterGRNAndConnectGenes}}, see the help for this function for details.
#' The function \code{\link{plot_stats_connectionSummary}} can be used afterwards for plotting.
#' If \code{TF_peak.connectionTypes} contains \code{"all"}, the function iterates over all TF-peak connection types that are stored in the object.
#' 
#' @export
#' @template GRN 
#' @param TF_peak.fdr Numeric vector[0,1]. Default \code{c(0.001, 0.01, 0.05, 0.1, 0.2)}. TF-peak FDR values to iterate over.
#' @template TF_peak.connectionTypes
#' @param peak_gene.fdr Numeric vector[0,1]. Default \code{c(0.001, 0.01, 0.05, 0.1, 0.2)}. Peak-gene FDR values to iterate over.
# #' @param peak_gene.p_raw  Numeric vector[0,1]. Default \code{NULL}. Peak-gene raw p-value values to iterate over. Skipped if set to NULL.
#' @param peak_gene.r_range Numeric vector of length 2[-1,1]. Default \code{c(0,1)}. The correlation range of peak-gene connections to keep.
#' @template gene.types
#' @param allowMissingGenes Logical vector of length 1 or 2. Default \code{c(FALSE, TRUE)}. Allow genes to be missing for peak-gene connections? If both \code{FALSE} and \code{TRUE} are given, the code loops over both
#' @param allowMissingTFs Logical vector of length 1 or 2. Default \code{c(FALSE)}. Allow TFs to be missing for TF-peak connections?  If both \code{FALSE} and \code{TRUE} are given, the code loops over both
#' @template forceRerun
#' @seealso \code{\link{plot_stats_connectionSummary}}
#' @seealso \code{\link{filterGRNAndConnectGenes}}
#' @return An updated \code{\linkS4class{GRN}} object, with additional information added from this function.
#' @examples 
#' # See the Workflow vignette on the GRaNIE website for examples
#' GRN = loadExampleObject()
#' GRN = generateStatsSummary(GRN, TF_peak.fdr = c(0.01, 0.1), peak_gene.fdr = c(0.01, 0.1))
#' 
generateStatsSummary <- function(GRN, 
                                 TF_peak.fdr = c(0.001, 0.01, 0.05, 0.1, 0.2),
                                 TF_peak.connectionTypes = "all",
                                 peak_gene.fdr = c(0.001, 0.01, 0.05, 0.1, 0.2),
#                                 peak_gene.p_raw = NULL,
                                 peak_gene.r_range = c(0,1),
                                 gene.types = c("all"),
                                 allowMissingGenes = c(FALSE, TRUE),
                                 allowMissingTFs = c(FALSE),
                                 forceRerun = FALSE) {
  
  start = Sys.time()   
  
  checkmate::assertClass(GRN, "GRN")
 
  
  GRN = .makeObjectCompatible(GRN)

  checkmate::assertNumeric(TF_peak.fdr, lower = 0, upper = 1, min.len = 1)
  checkmate::assertSubset(TF_peak.connectionTypes, c("all", GRN@config$TF_peak_connectionTypes), empty.ok = FALSE)
  checkmate::assertNumeric(peak_gene.fdr, lower = 0, upper = 1, min.len = 1)
  #   checkmate::assert(checkmate::checkNull(peak_gene.p_raw), checkmate::checkNumeric(peak_gene.p_raw, lower = 0, upper = 1, min.len = 1))
  checkmate::assertNumeric(peak_gene.r_range, lower = -1, upper = 1, len = 2)
  checkmate::assertSubset(gene.types, c("all", unique(as.character(GRN@annotation$genes$gene.type))) %>% stats::na.omit(), empty.ok = FALSE)
  checkmate::assertSubset(allowMissingGenes, c(TRUE, FALSE))
  checkmate::assertSubset(allowMissingTFs, c(TRUE, FALSE))
  checkmate::assertFlag(forceRerun)
  
  # Only (re)calculate if the summary is not yet stored in the object or if explicitly requested
  if (is.null(GRN@stats$connections) || is.null(GRN@stats$connectionDetails.l) || forceRerun) {
    
    GRN = .addFunctionLogToObject(GRN)
    
    # Start with an empty summary table, one row is added for each parameter combination below
    GRN@stats$connections = .initializeStatsDF()
    
    # TF_peak.connectionTypes may contain more than one element (see the assertSubset above). Use %in% rather than ==, 
    # as a condition of length > 1 within if() is an error in R >= 4.2
    if ("all" %in% TF_peak.connectionTypes) {
      TF_peak.connectionTypesAllComb = unique(GRN@config$TF_peak_connectionTypes)
    } else {
      TF_peak.connectionTypesAllComb = unique(TF_peak.connectionTypes)
    }
    
    
    futile.logger::flog.info(paste0("Generating summary. This may take a while..."))
    
    
    # The detailed statistics are stored in a nested list that is indexed, in this order, by the (character versions of) 
    # permutation, TF-peak FDR, peak-gene FDR, allowMissingTFs, allowMissingGenes and TF-peak connection type
    for (permutationCur in 0:.getMaxPermutation(GRN)) {
      
      futile.logger::flog.info(paste0("\n", .getPermStr(permutationCur), "...\n"))
      
      permIndex = as.character(permutationCur)
      GRN@stats$connectionDetails.l[[permIndex]] = list()
      
      # Iterate over different stringency thresholds and collect statistics
      
      for (TF_peak.fdr_cur in TF_peak.fdr) {
        
        TF_peak.fdr_cur.str = as.character(TF_peak.fdr_cur)
        
        futile.logger::flog.info(paste0("Calculate network stats for TF-peak FDR of ", TF_peak.fdr_cur))
        GRN@stats$connectionDetails.l[[permIndex]] [[TF_peak.fdr_cur.str]] = list()
        
        futile.logger::flog.debug(paste0("Iterating over different peak-gene FDR thresholds..."))
        for (peak_gene.fdr_cur in peak_gene.fdr) {
          
          futile.logger::flog.debug(paste0("Peak-gene FDR = ", peak_gene.fdr_cur))
          peak_gene.fdr_cur.str = as.character(peak_gene.fdr_cur)
          GRN@stats$connectionDetails.l[[permIndex]] [[TF_peak.fdr_cur.str]] [[peak_gene.fdr_cur.str]] = list()
          
          for (allowMissingTFsCur in allowMissingTFs) {
            
            futile.logger::flog.debug(paste0("  allowMissingTFs = ", allowMissingTFsCur))
            allowMissingTFsCur.str = as.character(allowMissingTFsCur)
            GRN@stats$connectionDetails.l[[permIndex]] [[TF_peak.fdr_cur.str]] [[peak_gene.fdr_cur.str]] [[allowMissingTFsCur.str]] = list()
            
            for (allowMissingGenesCur in allowMissingGenes) {
              
              futile.logger::flog.debug(paste0("  allowMissingGenes = ", allowMissingGenesCur))
              allowMissingGenesCur.str = as.character(allowMissingGenesCur)
              GRN@stats$connectionDetails.l[[permIndex]] [[TF_peak.fdr_cur.str]] [[peak_gene.fdr_cur.str]] [[allowMissingTFsCur.str]] [[allowMissingGenesCur.str]] = list()
              
              for (TF_peak.connectionTypeCur in TF_peak.connectionTypesAllComb) {
                
                GRN@stats$connectionDetails.l[[permIndex]] [[TF_peak.fdr_cur.str]] [[peak_gene.fdr_cur.str]] [[allowMissingTFsCur.str]] [[allowMissingGenesCur.str]] [[TF_peak.connectionTypeCur]] = list()
                
                futile.logger::flog.debug(paste0("    TF_peak.connectionType = ", TF_peak.connectionTypeCur))
                
                # Temporarily raise the logging threshold to WARN while filtering and set it to INFO afterwards.
                # NOTE(review): the threshold is always set to INFO afterwards, irrespective of the threshold that was 
                # active before, and it stays at WARN if filterGRNAndConnectGenes raises an error - confirm that this is intended
                futile.logger::flog.threshold(futile.logger::WARN)
                GRN = filterGRNAndConnectGenes(GRN, 
                                               TF_peak.fdr.threshold = TF_peak.fdr_cur, 
                                               TF_peak.connectionTypes = TF_peak.connectionTypeCur, 
                                               peak_gene.p_raw.threshold = NULL, 
                                               peak_gene.fdr.threshold = peak_gene.fdr_cur,
                                               gene.types = gene.types, 
                                               allowMissingGenes = allowMissingGenesCur, 
                                               allowMissingTFs = allowMissingTFsCur,
                                               peak_gene.r_range = peak_gene.r_range,
                                               filterTFs = NULL, filterGenes = NULL, filterPeaks = NULL,
                                               resetGraphAndStoreInternally = FALSE,
                                               silent = TRUE,
                                               forceRerun = TRUE)
                
                futile.logger::flog.threshold(futile.logger::INFO)
                
                # Add one row to the summary table and collect the detailed statistics for the current parameter combination
                results.l = .addStats(GRN@stats$connections, GRN@connections$all.filtered[[permIndex]], 
                                      perm = permutationCur, 
                                      TF_peak.fdr = TF_peak.fdr_cur, TF_peak.connectionType = TF_peak.connectionTypeCur,
                                      peak_gene.p_raw = NA,
                                      peak_gene.fdr = peak_gene.fdr_cur, 
                                      peak_gene.r_range = paste0(peak_gene.r_range, collapse = ","),
                                      gene.types = paste0(gene.types, collapse = ","),
                                      allowMissingGenes = allowMissingGenesCur, 
                                      allowMissingTFs   = allowMissingTFsCur)
                
                GRN@stats$connections  = results.l[["summary"]]
                
                GRN@stats$connectionDetails.l[[permIndex]] [[TF_peak.fdr_cur.str]] [[peak_gene.fdr_cur.str]] [[allowMissingTFsCur.str]] [[allowMissingGenesCur.str]] [[TF_peak.connectionTypeCur]] = 
                  results.l[["details"]]
                
              } # end of  for (TF_peak.connectionTypeCur in TF_peak.connectionTypesAllComb)
              
            } # end of  for (allowMissingGenesCur in allowMissingGenes) 
            
          } # end of  for (allowMissingTFsCur in allowMissingTFs)
          
        } # end of for (peak_gene.fdr_cur in peak_gene.fdr)
        
        
        # REMOVED FOR NOW, WAS INCOMPLETE AND NOT NEEDED ANYWAY
        # if (!is.null(peak_gene.p_raw)) {
        #   
        #   futile.logger::flog.info(paste0(" Iterating over different peak-gene raw p-value thresholds..."))
        #   for (peak_gene.p_raw_cur in peak_gene.p_raw) {
        #     
        #     futile.logger::flog.info(paste0("  Peak-gene raw p-value = ", peak_gene.p_raw_cur))
        #     
        #     # TODO: Add the other ones here also
        #     for (allowMissingGenesCur in allowMissingGenes) {
        #       
        #       GRN = filterGRNAndConnectGenes(GRN, 
        #                                      TF_peak.fdr.threshold = TF_peak.fdr_cur, peak_gene.p_raw.threshold = peak_gene.p_raw_cur, 
        #                                      peak_gene.fdr.threshold= NULL,
        #                                      gene.types = gene.types, 
        #                                      allowMissingGenes = allowMissingGenesCur, peak_gene.r_range = peak_gene.r_range,
        #                                      filterTFs = NULL, filterGenes = NULL, filterPeaks = NULL,
        #                                      silent = TRUE)
        #       
        #       
        #       GRN@stats$connections = .addStats(GRN@stats$connections, GRN@connections$all.filtered[[permIndex]], 
        #                                         perm = permutationCur, 
        #                                         TF_peak.fdr = TF_peak.fdr_cur, TF_peak.connectionType = TF_peak.connectionTypeCur,
        #                                         peak_gene.fdr = NA, 
        #                                         peak_gene.p_raw = peak_gene.p_raw_cur,
        #                                         peak_gene.r_range = paste0(peak_gene.r_range, collapse = ","),
        #                                         gene.types = paste0(gene.types, collapse = ","),
        #                                         allowMissingGenes = allowMissingGenesCur,
        #                                         allowMissingTFs   = allowMissingTFsCur)
        #       
        #     } # end of for (allowMissingGenesCur in allowMissingGenes)
        #     
        #   } # end of   for (peak_gene.p_raw_cur in peak_gene.p_raw) 
        # } # end of  if (!is.null(peak_gene.p_raw))
        
        
      }
    } # end for each permutation
    
  } else {
      .printDataAlreadyExistsMessage()
  }
  
  .printExecutionTime(start, prefix = "")
  
  GRN
  
}


# Append one row with summary statistics for the given connections table to stats.df.
#
# stats.df:        summary table to which the new row is added
# connections.df:  table with the columns TF.ID, peak.ID and gene.ENSEMBL
# all other args:  values that are stored as-is in the columns of the same name
#
# Returns a list with two elements: "summary" (stats.df with the added row) and 
# "details" (list with the four frequency tables TF, gene, peak_gene and peak.TF)
.addStats <- function(stats.df, connections.df, perm, 
                      TF_peak.fdr, TF_peak.connectionType, 
                      peak_gene.p_raw, peak_gene.fdr, peak_gene.r_range, 
                      gene.types,
                      allowMissingGenes, allowMissingTFs) {
  
  # c(min, mean, median, max) of a frequency table, or four zeros if the table is empty
  summarizeCounts <- function(counts) {
    if (length(counts) == 0) {
      return(rep(0,4))
    }
    c(min(counts, na.rm = TRUE), 
      mean(counts, na.rm = TRUE), 
      median(counts, na.rm = TRUE), 
      max(counts, na.rm = TRUE))
  }
  
  TFPeak.df   = dplyr::select(connections.df, "TF.ID", "peak.ID")
  peakGene.df = dplyr::select(connections.df, "peak.ID", "gene.ENSEMBL")
  
  # How often does each TF occur among the rows with a peak?
  TFPeak.withPeak.df = dplyr::filter(TFPeak.df, !is.na(.data$peak.ID))
  TF.stats = dplyr::pull(TFPeak.withPeak.df, .data$TF.ID) %>% as.character() %>% table()
  
  # How often does each gene occur among the rows with a gene?
  peakGene.withGene.df = dplyr::filter(peakGene.df, !is.na(.data$gene.ENSEMBL))
  gene.stats = dplyr::pull(peakGene.withGene.df, .data$gene.ENSEMBL) %>% as.character() %>% table()
  
  # How often does each peak occur among the rows with both a gene and a peak?
  peakGene.complete.df = dplyr::filter(peakGene.df, !is.na(.data$gene.ENSEMBL), !is.na(.data$peak.ID))
  peak_gene.stats = dplyr::pull(peakGene.complete.df, .data$peak.ID) %>% as.character() %>% table()
  
  # How often does each peak occur among the rows with both a TF and a peak?
  TFPeak.complete.df = dplyr::filter(TFPeak.df, !is.na(.data$TF.ID), !is.na(.data$peak.ID))
  peak.TF.stats = dplyr::pull(TFPeak.complete.df, .data$peak.ID) %>% as.character() %>% table()
  
  TF.connections        = summarizeCounts(TF.stats)
  gene.connections      = summarizeCounts(gene.stats)
  peak_gene.connections = summarizeCounts(peak_gene.stats)
  peak_TF.connections   = summarizeCounts(peak.TF.stats)
  
  # NOTE(review): dplyr::n_distinct is called with its default arguments, so NA presumably 
  # counts as a separate value for nGenes, nPeaks and nTFs - confirm that this is intended
  summary.df = tibble::add_row(
    stats.df, 
    
    # Parameter combination
    perm                   = perm, 
    TF_peak.fdr            = TF_peak.fdr,
    TF_peak.connectionType = TF_peak.connectionType,
    peak_gene.p_raw        = peak_gene.p_raw,
    peak_gene.fdr          = peak_gene.fdr,
    peak_gene.r_range      = peak_gene.r_range,
    gene.types             = gene.types,
    allowMissingGenes      = allowMissingGenes,
    allowMissingTFs        = allowMissingTFs,
    
    # Number of distinct elements
    nGenes = dplyr::n_distinct(connections.df$gene.ENSEMBL),
    nPeaks = dplyr::n_distinct(connections.df$peak.ID),
    nTFs   = dplyr::n_distinct(connections.df$TF.ID),
    
    # Connection statistics, in the order min, mean, median, max
    TF.connections_min           = TF.connections[1],
    TF.connections_mean          = TF.connections[2],
    TF.connections_median        = TF.connections[3],
    TF.connections_max           = TF.connections[4],
    
    peak_TF.connections_min      = peak_TF.connections[1],
    peak_TF.connections_mean     = peak_TF.connections[2],
    peak_TF.connections_median   = peak_TF.connections[3],
    peak_TF.connections_max      = peak_TF.connections[4],
    
    peak_gene.connections_min    = peak_gene.connections[1],
    peak_gene.connections_mean   = peak_gene.connections[2],
    peak_gene.connections_median = peak_gene.connections[3],
    peak_gene.connections_max    = peak_gene.connections[4],
    
    gene.connections_min         = gene.connections[1],
    gene.connections_mean        = gene.connections[2],
    gene.connections_median      = gene.connections[3],
    gene.connections_max         = gene.connections[4]
  )
  
  details.l = list(TF = TF.stats, gene = gene.stats, peak_gene = peak_gene.stats, peak.TF = peak.TF.stats)
  
  list(summary = summary.df, details = details.l)
}


# Create the empty summary table (column headers only, no rows) that .addStats adds rows to.
.initializeStatsDF <- function() {
  
  tibble::tribble(
    # Parameter combination
    ~perm, 
    ~TF_peak.fdr, 
    ~TF_peak.connectionType,
    ~peak_gene.p_raw, 
    ~peak_gene.fdr, 
    ~peak_gene.r_range, 
    ~gene.types,
    ~allowMissingGenes, 
    ~allowMissingTFs,
    
    # Number of distinct elements
    ~nGenes, 
    ~nPeaks, 
    ~nTFs,
    
    # Connection statistics
    ~TF.connections_min, 
    ~TF.connections_mean, 
    ~TF.connections_median, 
    ~TF.connections_max,
    
    ~peak_TF.connections_min, 
    ~peak_TF.connections_mean, 
    ~peak_TF.connections_median,  
    ~peak_TF.connections_max,
    
    ~peak_gene.connections_min, 
    ~peak_gene.connections_mean, 
    ~peak_gene.connections_median, 
    ~peak_gene.connections_max,
    
    ~gene.connections_min, 
    ~gene.connections_mean, 
    ~gene.connections_median,  
    ~gene.connections_max
  )
  
}


####### Retrieve GRN objects or parts of it #########


#' Load example GRN dataset
#' 
#' Loads an example GRN object with 6 TFs, ~61.000 peaks, ~19.000 genes, 259 filtered connections and pre-calculated enrichments. 
#' This function uses \code{BiocFileCache} if installed to cache the example object, which is 
#' considerably faster than re-downloading the file anew every time the function is executed.
#' If not, the file is re-downloaded every time anew. Thus, to enable caching, you may install the package \code{BiocFileCache}.
#' While the function is running, the R option \code{timeout} is raised to at least 200 seconds; the previous value is restored when the function exits.
#' 
#' @export
#' @param forceDownload \code{TRUE} or \code{FALSE}. Default \code{FALSE}. Should the download be enforced even if the local cached file is already present?
#' @param fileURL Character. Default \url{https://git.embl.de/grp-zaugg/GRaNIE/-/raw/master/data/GRN.rds}. URL to the GRN example object in rds format.
#' @examples 
#' GRN = loadExampleObject()
#' @return A small example \code{\linkS4class{GRN}} object
loadExampleObject <- function(forceDownload = FALSE, fileURL = "https://git.embl.de/grp-zaugg/GRaNIE/-/raw/master/data/GRN.rds") {
    
    checkmate::assertFlag(forceDownload)
    checkmate::assertString(fileURL, min.chars = 1)
    
    # Raise the download timeout for the duration of this call only: never lower a larger value that the user 
    # has set, and restore the previous value on exit so that the global option is not changed permanently
    oldOptions = options(timeout = max(200, getOption("timeout")))
    on.exit(options(oldOptions), add = TRUE)
    
    if (!is.installed("BiocFileCache")) {
        
        message = paste0("loadExampleObject: The package BiocFileCache is not installed, but recommended if you want to speed-up the retrieval of the GRN example object ",
                         "via this function when using it multiple times. If not installed, the example object has to be downloaded anew every time you use this function.")
        .checkPackageInstallation("BiocFileCache", message, isWarning = TRUE)
        
        # Without a cache, read the object directly from the URL. 
        # Close the connection explicitly afterwards so that it does not linger until garbage collection
        con = url(fileURL)
        on.exit(try(close(con), silent = TRUE), add = TRUE)
        GRN = readRDS(con)
        
    } else {
        
        
        # Taken and modified from https://www.bioconductor.org/packages/release/bioc/vignettes/BiocFileCache/inst/doc/BiocFileCache.html
        
        bfc <- .get_cache()
        
        # Enforce a fresh download by removing the cache and creating it anew
        if (forceDownload) {
            BiocFileCache::removebfc(bfc, ask = FALSE)
            bfc <- .get_cache()
            
        }
        
        # Look up the example object in the cache and add it if it is not yet part of it
        rid <- BiocFileCache::bfcquery(bfc, "GRaNIE_object_example")$rid
        if (!length(rid)) {
            rid <- names(BiocFileCache::bfcadd(bfc, "GRaNIE_object_example", fileURL))
        }
        if (!isFALSE(BiocFileCache::bfcneedsupdate(bfc, rid)) || forceDownload) {
            messageStr = paste0("Downloading GRaNIE example object from ", fileURL)
            message(messageStr)
            
            # Not needed, redundant it seems, as file is already downloaded through the code above
            # filePath = BiocFileCache::bfcdownload(bfc, rid, ask = FALSE)
        }
        
        
        filePath = BiocFileCache::bfcrpath(bfc, rids = rid)
        
        # Now we can read in the locally stored file
        GRN = readRDS(filePath)
    }
    
    
    # Change the default path to the current working directory
    GRN@config$directories$outputRoot = "."
    GRN@config$directories$output_plots = "."
    GRN@config$directories$motifFolder = "."
    GRN@config$files$output_log = "GRN.log"
    
    # Keep only the file names of the original input files, without their directories
    GRN@config$metadata$file_peaks = basename(GRN@config$metadata$file_peaks)
    GRN@config$metadata$file_rna = basename(GRN@config$metadata$file_rna)
    GRN@config$metadata$file_sampleMetadata = basename(GRN@config$metadata$file_sampleMetadata)
    GRN@config$parameters$packageVersion = as.character(packageVersion("GRaNIE"))
    
    GRN = .makeObjectCompatible(GRN)
    
    message("Finished successfully. You may explore the example object. Start by typing the object name to the console to see a summaty. Happy GRaNIE'ing!")

    GRN
    
}



#' Get counts for the various data defined in a \code{\linkS4class{GRN}} object
#' 
#' Get counts for the various data defined in a \code{\linkS4class{GRN}} object.
#' \strong{Note: This function, as all \code{get} functions from this package, does NOT return a \code{\linkS4class{GRN}} object.}
#' 
#' @template GRN 
#' @param type Character. Either \code{peaks} or \code{rna}. \code{peaks} corresponds to the counts for the open chromatin data, while \code{rna} refers to the RNA-seq counts. If set to \code{rna}, both real (foreground) and background data can be retrieved, while for \code{peaks}, only the real data (i.e., the one with index \code{0}) can be retrieved.
#' @param asMatrix Logical. \code{TRUE} or \code{FALSE}. Default \code{FALSE}. If set to \code{FALSE}, counts are returned as a data frame with or without an ID column (see \code{includeIDColumn}). If set to \code{TRUE}, counts are returned as a matrix with the ID column as row names.
#' @param includeIDColumn Logical. \code{TRUE} or \code{FALSE}. Default \code{TRUE}. Only relevant if \code{asMatrix = FALSE}. If set to \code{TRUE}, an explicit ID column is returned (no row names). If set to \code{FALSE}, the IDs are in the row names instead.
#' @param includeFiltered  Logical. \code{TRUE} or \code{FALSE}. Default \code{FALSE}. If set to \code{FALSE}, genes or peaks marked as filtered (after running the function \code{filterData}) will not be returned. If set to \code{TRUE}, all elements are returned regardless of the currently active filter status.
#' @template permuted
#' @export
#' @import tibble
#' @examples 
#' # See the Workflow vignette on the GRaNIE website for examples
#' GRN = loadExampleObject()
#' counts.df = getCounts(GRN, type = "peaks", permuted = FALSE)
#' @return Data frame of counts, with the type as indicated by the function parameters. This function does **NOT** return a \code{\linkS4class{GRN}} object.
getCounts <- function(GRN, type,  permuted = FALSE, asMatrix = FALSE, includeIDColumn = TRUE, includeFiltered = FALSE) {
    
    checkmate::assertClass(GRN, "GRN")
    
    GRN = .makeObjectCompatible(GRN)
    
    checkmate::assertChoice(type, c("peaks", "rna"))
    checkmate::assertFlag(asMatrix)
    checkmate::assertFlag(includeIDColumn)
    checkmate::assertFlag(includeFiltered)
    checkmate::assertFlag(permuted)
    
    # Permuted counts are only available for RNA; requesting them for peaks is reported as an error
    if (type == "peaks" && permuted) {
        messageStr = "Could not find permuted peak counts in GRN object. Peaks are not stored as permuted, set permuted = FALSE for type = \"peaks\""
        .checkAndLogWarningsAndErrors(NULL, messageStr, isWarning = FALSE)
    }
    
    # Pick the count slot that belongs to the requested data type
    if (type == "peaks") {
        counts   = GRN@data$peaks$counts
        slotName = "GRN@data$peaks$counts"
    } else {
        counts   = GRN@data$RNA$counts
        slotName = "GRN@data$RNA$counts"
    }
    
    # Counts are stored either as a regular matrix or as a sparse dgCMatrix; 
    # the latter is converted with the internal helper
    classCounts = class(counts)
    if ("matrix" %in% classCounts) {
        result = counts
    } else if ("dgCMatrix" %in% classCounts) {
        result = .asMatrixFromSparse(counts, convertZero_to_NA = FALSE)
    } else {
        messageStr = paste0("Unsupported class for ", slotName, ". Contact the authors.")
        .checkAndLogWarningsAndErrors(NULL, messageStr, isWarning = FALSE)
    }
    
    if (permuted && type == "rna") {
        
        # Columns are shuffled so that non-matching samples are compared throughout the pipeline for all correlation-based analyses
        colnames(result) = colnames(result)[GRN@data$RNA$counts_permuted_index]
    }
    
    if (!includeFiltered) {
        
        if (type == "rna") {
            nonFiltered = which(!GRN@data$RNA$counts_metadata$isFiltered)
        } else {
            nonFiltered = which(!GRN@data$peaks$counts_metadata$isFiltered)
        }
        
        # drop = FALSE keeps the matrix structure (and thereby the row names / IDs) 
        # even if only a single feature passes the filter
        result = result[nonFiltered, , drop = FALSE]
    }
    
    if (asMatrix) {
        return(result)
    }
    
    result.df = as.data.frame(result)
    
    if (includeIDColumn)  {
        
        # Move the row names into an explicit ID column, named after the data type
        ID_column = if (type == "rna") "ENSEMBL" else "peakID"
        result.df = tibble::rownames_to_column(result.df, ID_column)
    }
    
    result.df
}


#' Extract connections or links from a \code{\linkS4class{GRN}} object as a data frame.
#' 
#' Returns stored connections/links (either TF-peak, peak-genes, TF-genes or the filtered set of connections as produced by \code{\link{filterGRNAndConnectGenes}}).
#' Additional meta columns (TF, peak and gene metadata) can be added optionally. 
#' \strong{Note: This function, as all \code{get} functions from this package, does NOT return a \code{\linkS4class{GRN}} object.}
#' 
#' @export
#' @template GRN 
#' @template background
#' @param type Character. One of \code{TF_peaks}, \code{peak_genes}, \code{TF_genes} or \code{all.filtered}. Default \code{all.filtered}. The type of connections to retrieve.
#' @param include_TF_gene_correlations Logical. \code{TRUE} or \code{FALSE}. Default \code{FALSE}. Should TFs and gene correlations be returned as well? If set to \code{TRUE}, they must have been computed beforehand with \code{\link{add_TF_gene_correlation}}.
#' @param include_TFMetadata Logical. \code{TRUE} or \code{FALSE}. Default \code{FALSE}. Should TF metadata be returned as well?
#' @param include_peakMetadata Logical. \code{TRUE} or \code{FALSE}. Default \code{FALSE}. Should peak metadata be returned as well?
#' @param include_geneMetadata Logical. \code{TRUE} or \code{FALSE}. Default \code{FALSE}. Should gene metadata be returned as well?
#' @param include_variancePartitionResults Logical. \code{TRUE} or \code{FALSE}. Default \code{FALSE}. 
#' Should the results from the function \code{\link{add_featureVariation}} be included? 
#' If set to \code{TRUE}, they must have been computed beforehand with \code{\link{add_featureVariation}}; otherwise, an error is thrown.
#' @return A data frame with the requested connections. This function does **NOT** return a \code{\linkS4class{GRN}} object. Depending on the arguments, the
#' data frame that is returned has different columns, which however can be divided into the following classes according to their name:
#' \itemize{
#' \item TF-related: Starting with \code{TF.}:
#'  \itemize{
#'  \item \code{TF.name} and \code{TF.ID}: Name / ID of the TF
#'  \item \code{TF.ENSEMBL}: Ensembl ID (unique)
#'  }
#' \item peak-related: Starting with \code{peak.}: 
#'  \itemize{
#'  \item \code{peak.ID}: ID (coordinates)
#'  \item \code{peak.mean}, \code{peak.median}, \code{peak.CV}: peak mean, median and its coefficient of variation (CV) across all samples
#'  \item \code{peak.annotation}: Peak annotation as determined by \code{ChIPseeker} such as Promoter, 5’ UTR, 3’ UTR, Exon, Intron, Downstream, Intergenic
#'  \item \code{peak.nearestGene*}: Additional metadata for the nearest gene such as position (\code{chr}, \code{start}, \code{end}, \code{strand}), 
#'  name (\code{name}, \code{symbol} and \code{ENSEMBL}), and distance to the TSS (\code{distanceToTSS})
#'  \item \code{peak.GC.perc}: GC percentage
#'  }
#' \item gene-related: Starting with \code{gene.}:
#'  \itemize{
#'  \item \code{gene.name} and \code{gene.ENSEMBL}: gene name and Ensembl ID
#'  \item \code{gene.type}: gene type (such as \code{protein_coding}, \code{lincRNA}) as retrieved by \code{biomaRt}
#'  \item \code{gene.mean}, \code{gene.median}, \code{gene.CV}: gene mean, median and its coefficient of variation (CV) across all samples
#'  }
#'  \item TF-peak-related: Starting with \code{TF_peak.}:
#'  \itemize{
#'  \item \code{TF_peak.r} and \code{TF_peak.r_bin}: Correlation coefficient of the TF-peak pair and its correlation bin (in bins of width 0.05, such as (-0.55,-0.5] for r = -0.53)
#'  \item \code{TF_peak.fdr} and \code{TF_peak.fdr_direction}: TF-peak FDR and the directionality from which it was derived (see Methods in the paper, \code{pos} or \code{neg})
#'  \item \code{TF_peak.connectionType}: TF-peak connection type. This is by default \code{expression}, meaning that expression was used to construct the TF-peak connection
#'  }      
#'  \item peak-gene-related: Starting with \code{peak_gene.}:
#'  \itemize{
#'  \item \code{peak_gene.source}: Source/Origin of the identified connection. Either \code{neighborhood}, \code{TADs} or \code{knownLinks}, 
#'  depending on the parameters used when running the function \code{\link{addConnections_peak_gene}}
#'  \item \code{peak_gene.bait_OE_ID}: Only present when known links have been provided (see \code{\link{addConnections_peak_gene}}). This column denotes the original IDs of the bait and OE coordinates that identified this link.
#'  \item \code{peak_gene.tad_ID}: Only present when TADs have been provided (see \code{\link{addConnections_peak_gene}}). This column denotes the original ID of the TAD ID that identified this link.
#'  \item \code{peak_gene.distance}: Peak-gene distance (usually taken the TSS of the gene as reference unless specified otherwise, see the parameter \code{overlapTypeGene} for more information from \code{\link{addConnections_peak_gene}}).
#'  If the peak-gene connection is across chromosomes (as defined by the known links, see \code{\link{addConnections_peak_gene}}), the distance is set to NA.
#'  \item \code{peak_gene.r}: Correlation coefficient of the peak-gene pair
#'  \item \code{peak_gene.p_raw} and \code{peak_gene.p_adj}: Raw and adjusted p-value of the peak-gene pair
#'  }
#'  \item TF-gene-related: Starting with \code{TF_gene.}:
#'  \itemize{
#'  \item \code{TF_gene.r}: Correlation coefficient of the TF-gene pair
#'  \item \code{TF_gene.p_raw}: Raw  p-value of the TF-gene pair
#'  }
#' }
#' 
#' @seealso \code{\link{filterGRNAndConnectGenes}}
#' @seealso \code{\link{add_featureVariation}}
#' @seealso \code{\link{add_TF_gene_correlation}}
#' @examples 
#' # See the Workflow vignette on the GRaNIE website for examples
#' GRN = loadExampleObject()
#' GRN_con.all.df = getGRNConnections(GRN)
getGRNConnections <- function(GRN, type = "all.filtered",  background = FALSE, 
                              include_TF_gene_correlations = FALSE, 
                              include_TFMetadata = FALSE,
                              include_peakMetadata = FALSE,
                              include_geneMetadata = FALSE,
                              include_variancePartitionResults = FALSE) {
    
    checkmate::assertClass(GRN, "GRN")  
    GRN = .addFunctionLogToObject(GRN)
    
    GRN = .makeObjectCompatible(GRN)
    
    checkmate::assertChoice(type, c("TF_peaks", "peak_genes", "TF_genes", "all.filtered"))
    checkmate::assertFlag(background)
    checkmate::assertFlag(include_TF_gene_correlations)
    checkmate::assertFlag(include_variancePartitionResults)
    checkmate::assertFlag(include_TFMetadata)
    checkmate::assertFlag(include_peakMetadata)
    checkmate::assertFlag(include_geneMetadata)
    
    # Connections are stored per data set under the names "0" (real) and "1" (background)
    permIndex = if (background) "1" else "0"
    
    ## 1. Retrieve the requested base table ##
    
    if (type == "all.filtered") {
        
        merged.df = GRN@connections$all.filtered[[permIndex]]
        
        # Without this check, a missing table would only surface further below 
        # as an unrelated error from dplyr::select
        if (is.null(merged.df)) {
            messageStr = "Could not find the filtered connections in the object. Please run the function filterGRNAndConnectGenes first. "
            .checkAndLogWarningsAndErrors(NULL, messageStr, isWarning = FALSE)
        }
        
    } else if (type == "TF_peaks") {
        
        # The TF-peak table is complemented with the TF name and Ensembl ID from the TF annotation
        merged.df = tibble::as_tibble(GRN@connections$TF_peaks[[permIndex]]$main) %>%
            dplyr::left_join(GRN@annotation$TFs %>% dplyr::select("TF.ID", "TF.name", "TF.ENSEMBL"), by = "TF.ID")
        
    } else if (type == "peak_genes") {
        
        merged.df = tibble::as_tibble(GRN@connections$peak_genes[[permIndex]])
        
    } else if (type == "TF_genes") {
        
        if (is.null(GRN@connections$TF_genes.filtered)) {
            messageStr = "Please run the function add_TF_gene_correlation first. "
            .checkAndLogWarningsAndErrors(NULL, messageStr, isWarning = FALSE)
        }
        
        merged.df = tibble::as_tibble(GRN@connections$TF_genes.filtered[[permIndex]])
        
    } 
    
    ## 2. Optionally add TF-gene correlations ##
    
    if (include_TF_gene_correlations) {
        
        if (is.null(GRN@connections$TF_genes.filtered)) {
            messageStr = "Please run the function add_TF_gene_correlation first. "
            .checkAndLogWarningsAndErrors(NULL, messageStr, isWarning = FALSE)
        }
        
        if (type %in% c("TF_peaks", "peak_genes") ) {
            messageStr = "TF-gene correlations can only be added for types that include both TFs and genes, but not for TF-peaks or peak-genes. Please adjust the parameter type accordingly or set include_TF_gene_correlations = FALSE"
            .checkAndLogWarningsAndErrors(NULL, messageStr, isWarning = FALSE)
        }
        
        # Merge with TF-gene table
        merged.df = merged.df %>%
            dplyr::left_join(GRN@connections$TF_genes.filtered[[permIndex]], 
                             by = c("TF.ID", "TF.ENSEMBL", "gene.ENSEMBL")) 
    }
    
    ## 3. Optionally add variancePartition results for TFs, genes and peaks ##
    
    if (include_variancePartitionResults) {
        
        # The results are recognized by their column name prefix in the gene and peak annotation
        nCols_genes = ncol(GRN@annotation$genes %>% dplyr::select(tidyselect::starts_with("variancePartition")))
        if (nCols_genes == 0 ||
            ncol(GRN@annotation$peaks %>% dplyr::select(tidyselect::starts_with("variancePartition"))) == 0) {
            messageStr = "Please run the function add_featureVariation first. "
            .checkAndLogWarningsAndErrors(NULL, messageStr, isWarning = FALSE)
        }
        
        merged.df = merged.df %>%
            dplyr::left_join(GRN@annotation$TFs %>% 
                                 dplyr::select("TF.ENSEMBL", tidyselect::starts_with("TF.variancePartition")), 
                             by = "TF.ENSEMBL")  %>%
            dplyr::left_join(GRN@annotation$genes %>% 
                                 dplyr::select("gene.ENSEMBL", tidyselect::starts_with("gene.variancePartition")), 
                             by = "gene.ENSEMBL")  %>%
            dplyr::left_join(GRN@annotation$peaks %>% 
                                 dplyr::select("peak.ID", tidyselect::starts_with("peak.variancePartition")), 
                             by = "peak.ID")
        
    }
    
    ## 4. Optionally add TF, peak and gene metadata ##
    # In all three cases, the annotation is only joined if it contributes at least one new column
    
    if (include_TFMetadata) {
        
        if (type %in% c("peak_genes") ) {
            messageStr = "TF metadata can only be added for types that include TFs, but not for peak-genes. Please adjust the parameter type accordingly or set include_TFMetadata = FALSE"
            .checkAndLogWarningsAndErrors(NULL, messageStr, isWarning = FALSE)
        }
        
        colsMissing = setdiff(colnames(GRN@annotation$TFs), colnames(merged.df))
        if (length(colsMissing) > 0) {
            merged.df = merged.df %>%
                dplyr::left_join(GRN@annotation$TFs, by = c("TF.ID", "TF.ENSEMBL"))
        }
    }
    
    if (include_peakMetadata) {
        
        if (type %in% c("TF_genes") ) {
            messageStr = "Peak metadata can only be added for types that include peaks, but not for TF-genes. Please adjust the parameter type accordingly or set include_peakMetadata = FALSE"
            .checkAndLogWarningsAndErrors(NULL, messageStr, isWarning = FALSE)
        }
        
        colsMissing = setdiff(colnames(GRN@annotation$peaks), colnames(merged.df))
        if (length(colsMissing) > 0) {
            # Dont specify id here so that by default, the join is for all common columns
            
            merged.df = suppressMessages(merged.df %>% dplyr::left_join(GRN@annotation$peaks))
        }
    }
    
    if (include_geneMetadata) {
        
        if (type %in% c("TF_peaks") ) {
            messageStr = "Gene metadata can only be added for types that include genes, but not for TF-peaks. Please adjust the parameter type accordingly or set include_geneMetadata = FALSE"
            .checkAndLogWarningsAndErrors(NULL, messageStr, isWarning = FALSE)
        }
        
        colsMissing = setdiff(colnames(GRN@annotation$genes), colnames(merged.df))
        if (length(colsMissing) > 0) {
            # gene.type and gene.name are excluded from the annotation before joining
            merged.df = merged.df %>%
                dplyr::left_join(GRN@annotation$genes %>% dplyr::select(-"gene.type", -"gene.name"), by = "gene.ENSEMBL", multiple = "all")
        }
    }
    
    ## 5. Bring the columns into a consistent order, grouped by their prefix ##
    
    merged.df = merged.df %>%
        dplyr::select(tidyselect::starts_with("TF."), 
                      tidyselect::starts_with("peak."), 
                      tidyselect::starts_with("TF_peak."), 
                      tidyselect::starts_with("peak_gene."), 
                      tidyselect::starts_with("gene."), 
                      tidyselect::starts_with("TF_gene."), 
                      tidyselect::everything()) %>%
        tibble::as_tibble()
    
    # Drop the geneId column if present, it is not part of the returned table
    if ("geneId" %in% colnames(merged.df)) {
        merged.df = dplyr::select(merged.df, -"geneId")
    }
    
    merged.df
    
}



#' Retrieve parameters for previously used function calls and general parameters for a \code{\linkS4class{GRN}} object. 
#' 
#' \strong{Note: This function, as all \code{get} functions from this package, does NOT return a \code{\linkS4class{GRN}} object.}
#' 
#' @export
#' @template GRN 
#' @param name Character. Default \code{all}. Name of parameter or function name to retrieve. Set to the special keyword \code{all} to retrieve all parameters.
#' @param type Character. Either \code{function} or \code{parameter}. Default \code{parameter}. When set to \code{function}, a valid \code{GRaNIE} function name must be given that has been run before. When set to \code{parameter}, in combination with \code{name}, returns a specific parameter (as specified in \code{GRN@config}).
#' @return The requested parameters. This function does **NOT** return a \code{\linkS4class{GRN}} object.
#' @examples 
#' # See the Workflow vignette on the GRaNIE website for examples
#' GRN = loadExampleObject()
#' params.l = getParameters(GRN, type = "parameter", name = "all")
getParameters <- function(GRN, type = "parameter", name = "all") {
    
    checkmate::assertClass(GRN, "GRN")
    GRN = .addFunctionLogToObject(GRN)
    checkmate::assertChoice(type, c("function", "parameter"))
    # name is used as a single list key (and compared against "all") for both types
    checkmate::assertCharacter(name, any.missing = FALSE, len = 1)
    
    GRN = .makeObjectCompatible(GRN)
    
    if (type == "function") {
        
        # Parameters of previous function calls are stored by function name
        functionParameters = GRN@config$functionParameters[[name]]
        if (is.null(functionParameters)) {
            messageStr = paste0("Could not find details for function \"", name, "\" in the object. Either the function has not been run or it does not exist.\nOnly the following function names exist in the object: ", 
                                paste0(names(GRN@config$functionParameters), collapse = ", "))
            .checkAndLogWarningsAndErrors(NULL, messageStr, isWarning = FALSE)
        } 
        
        return(functionParameters)
        
    }
    
    # type == "parameter"
    if (name == "all") {
        return(GRN@config)
    }
    
    # First try the top-level elements of the config ...
    parameters = GRN@config[[name]]
    
    if (is.null(parameters)) {
        # ... and otherwise the general parameters. The assertion fails with the 
        # list of valid names if the name is unknown there as well. Previously,
        # a valid parameter name passed the assertion but NULL was returned.
        checkmate::assertChoice(name, names(GRN@config$parameters))
        parameters = GRN@config$parameters[[name]]
    } 
    
    parameters
    
}



#' Summarize a \code{\linkS4class{GRN}} object to a named list for comparison with other \code{\linkS4class{GRN}} objects. 
#' 
#' \strong{Note: This function, as all \code{get} functions from this package, does NOT return a \code{\linkS4class{GRN}} object.}
#' 
#' @template GRN 
#' @param silent \code{TRUE} or \code{FALSE}. Default \code{FALSE}. Should the function be silent and print nothing? 
#' @return A named list summarizing the GRN object. This function does **NOT** return a \code{\linkS4class{GRN}} object, but instead a named list with the
#' following elements: 
#' \itemize{
#' \item \code{data}:
#'  \itemize{
#'  \item \code{peaks}, \code{genes} and \code{TFs}: 
#'  \item \code{sharedSamples}: 
#'  \item \code{metadata}: 
#'  }
#' \item \code{parameters} and \code{config}: GRN parameters and config information
#'  \item \code{connections}: Connection summary for different connection types
#'  \itemize{
#'  \item \code{TF_peak}: TF-peak  (number of connections for different FDR thresholds)
#'  \item \code{peak_genes}: Peak-gene 
#'  \item \code{TF_peak_genes}: TF-peak-gene
#'  }      
#'  \item \code{network}: Network-related summary, including the number of nodes, edges, communities and enrichment for both the TF-peak-gene and TF-gene network
#'  \itemize{
#'  \item \code{TF_gene}
#'  \item \code{TF_peak_gene}
#'  }
#' }
#' @examples 
#' # See the Workflow vignette on the GRaNIE website for examples
#' GRN = loadExampleObject()
#' summary.l = getGRNSummary(GRN)
#' @export
getGRNSummary <- function(GRN, silent = FALSE) {
    
    start = Sys.time()
    checkmate::assertClass(GRN, "GRN")
    
    GRN = .makeObjectCompatible(GRN)
    
    # Thresholds for which the number of enriched terms (raw p-value <= threshold) is counted
    fdr_toTest = c(0.001, 0.01, 0.05, 0.1, 0.2, 0.3)
    
    # Fraction of entries in a count matrix that are zero
    fractionOfZeros <- function(counts) {
        (length(counts) - Matrix::nnzero(counts)) / length(counts)
    }
    
    res.list = list()
    
    ## DATA ##
    res.list$data = list()
    
    if (!is.null(GRN@data$peaks$counts_metadata)) {
        nPeaks_filt  = nPeaks(GRN, filter = TRUE)
        nPeaks_all   = nPeaks(GRN, filter = FALSE)
    } else {
        nPeaks_filt =  nPeaks_all = NA
    }
    
    # Only compute the scarcity if counts are present: getCounts raises an error 
    # for a missing count slot, which would make the NA fallbacks in this function pointless
    if (!is.null(GRN@data$peaks$counts)) {
        scarcity_peaks_all  = fractionOfZeros(GRN@data$peaks$counts)
        scarcity_peaks_filt = fractionOfZeros(getCounts(GRN, type = "peaks", asMatrix = TRUE))
    } else {
        scarcity_peaks_all = scarcity_peaks_filt = NA
    }
    
    res.list$data$peaks = list("nFiltered" = nPeaks_filt, "all" =  nPeaks_all,
                               "scarcity_all" = scarcity_peaks_all,
                               "scarcity_filt" = scarcity_peaks_filt)
    
    if (!is.null(GRN@data$RNA$counts_metadata)) {
        
        nGenes_filt = nGenes(GRN, filter = TRUE)
        nGenes_all  = nGenes(GRN, filter = FALSE)
        
    } else {
        nGenes_filt =  nGenes_all = NA
    }
    
    if (!is.null(GRN@data$RNA$counts)) {
        scarcity_RNA_all  = fractionOfZeros(GRN@data$RNA$counts)
        scarcity_RNA_filt = fractionOfZeros(getCounts(GRN, type = "rna", asMatrix = TRUE))
    } else {
        scarcity_RNA_all = scarcity_RNA_filt = NA
    }
    
    res.list$data$genes = list("nFiltered" = nGenes_filt, "all" =  nGenes_all, 
                               "scarcity_all" = scarcity_RNA_all,
                               "scarcity_filt" = scarcity_RNA_filt)
    
    if (!is.null(GRN@data$RNA$counts_metadata) && !is.null(GRN@data$peaks$counts_metadata)) {
        nSharedSamples = length(GRN@config$sharedSamples)
    } else {
        nSharedSamples = NA
    }
    res.list$data$sharedSamples = nSharedSamples
    
    if (!is.null(GRN@annotation$TFs)) {
        nTF_withRNA = nrow(GRN@annotation$TFs)
    } else {
        nTF_withRNA = NA
    }
    
    res.list$data$TFs = nTF_withRNA
    
    res.list$data$metadata = GRN@data$metadata
    
    res.list$parameters = GRN@config$parameters
    
    res.list$config = GRN@config
    
    ## CONNECTIONS  ##
    res.list$connections = list()
    res.list$connections$TF_peak = list()
    
    # Number of real TF-peak connections for each FDR threshold (NA if not yet available)
    fdr_list = list()
    
    if (!is.null(GRN@connections$TF_peaks$`0`$main)) {
        
        for (fdrCur in .getFDR_TF_peak_vector(GRN)) {
            nCon = GRN@connections$TF_peaks$`0`$main %>%
                dplyr::filter(.data$TF_peak.fdr <= fdrCur) %>%
                nrow()
            fdr_list[[paste0(fdrCur)]] = nCon
        }
        
    } else {
        
        for (fdrCur in .getFDR_TF_peak_vector(GRN)) {
            fdr_list[[paste0(fdrCur)]] = NA
        }
    }
    res.list$connections$TF_peak$nConnections_fdr = fdr_list
    
    res.list$connections$peak_genes = list()
    
    if (!is.null(GRN@connections$peak_genes$`0`)) {
        res.list$connections$peak_genes$nConnections = nrow(GRN@connections$peak_genes$`0`)
    } else {
        res.list$connections$peak_genes$nConnections = NA
    }
    
    if (!is.null(GRN@stats$peak_genes)) {
        res.list$connections$peak_genes$QC_summary = GRN@stats$peak_genes
    } else {
        res.list$connections$peak_genes$QC_summary = NA
    }
    
    
    if (!is.null(GRN@stats$connections)) {
        res.list$connections$TF_peak_genes$summary = GRN@stats$connections
    } else {
        res.list$connections$TF_peak_genes$summary = NA
    }
    
    res.list$connections$TF_peak_genes$all.filtered = list()
    
    if (!is.null(GRN@connections$all.filtered$`0`)) {
        
        # The thresholds are taken from the last stored call of filterGRNAndConnectGenes
        nEntries = length(GRN@config$functionParameters$filterGRNAndConnectGenes)
        max_TF_peak_FDR = GRN@config$functionParameters$filterGRNAndConnectGenes[[nEntries]]$parameters$TF_peak.fdr.threshold
        max_peak_gene_FDR = GRN@config$functionParameters$filterGRNAndConnectGenes[[nEntries]]$parameters$peak_gene.fdr.threshold
        
        res.list$connections$TF_peak_genes$all.filtered$max_TF_peak_FDR = max_TF_peak_FDR
        res.list$connections$TF_peak_genes$all.filtered$max_peak_gene_FDR = max_peak_gene_FDR
        res.list$connections$TF_peak_genes$all.filtered$nConnections_real = nrow(GRN@connections$all.filtered$`0`)
        res.list$connections$TF_peak_genes$all.filtered$table_real = GRN@connections$all.filtered$`0`
        
        res.list$connections$TF_peak_genes$all.filtered$nConnections_permuted = nrow(GRN@connections$all.filtered$`1`)
        res.list$connections$TF_peak_genes$all.filtered$table_permuted = GRN@connections$all.filtered$`1`
        
        
    } else {
        res.list$connections$TF_peak_genes$all.filtered = NA
    }
    
    ## NETWORK ##
    res.list$network = list()
    res.list$network$TF_gene = list()
    res.list$network$TF_peak_gene = list()
    
    
    if (!is.null(GRN@graph$TF_gene)) {
        
        res.list$network$TF_gene$nNodes = length(igraph::V(GRN@graph$TF_gene$graph))
        res.list$network$TF_gene$nEdges = length(igraph::E(GRN@graph$TF_gene$graph))
        
        res.list$network$TF_peak_gene$nNodes = length(igraph::V(GRN@graph$TF_peak_gene$graph))
        res.list$network$TF_peak_gene$nEdges = length(igraph::E(GRN@graph$TF_peak_gene$graph))
        
        # Community identification (no, yes and how many and how many nodes each)
        
        df = igraph::vertex.attributes(GRN@graph[["TF_gene"]]$graph) %>% as.data.frame() 
        
        if (!is.null(df) && "community" %in% colnames(df)) {
            res.list$network$TF_gene$communities = df %>% 
                dplyr::count(.data$community) %>% 
                dplyr::arrange(dplyr::desc(.data$n))
        } else {
            res.list$network$TF_gene$communities = NA
        }
        
        res.list$network$TF_gene$enrichment = list()
        
        ## General enrichment ##
        if (!is.null(GRN@stats$Enrichment$general)) {
            
            res.list$network$TF_gene$enrichment$general = list()
            # One row per ontology and threshold, with the number of terms passing the threshold
            summary.df = tibble::tribble(~ontology, ~rawp, ~nTerms)
            for (ontologyCur in names(GRN@stats$Enrichment$general)) {
                res.list$network$TF_gene$enrichment$general[[ontologyCur]] = list()
                res.list$network$TF_gene$enrichment$general[[ontologyCur]]$parameters = GRN@stats$Enrichment$general[[ontologyCur]]$parameters
                
                for (rawp_cur in fdr_toTest) {
                    nCon = GRN@stats$Enrichment$general[[ontologyCur]]$results %>%
                        dplyr::filter(.data$pval <= rawp_cur) %>%
                        nrow()
                    summary.df = tibble::add_row(summary.df, ontology = ontologyCur, rawp = rawp_cur, nTerms = nCon)
                }
                
            }
            res.list$network$TF_gene$enrichment$general$summary_allOntologies = summary.df
            
            
        } else {
            res.list$network$TF_gene$enrichment$general = NA
        }
        
        ## Community and TF enrichment ##
        for (entityCur in c("byCommunity", "byTF")) {
            
            if (!is.null(GRN@stats$Enrichment[[entityCur]])) {
                
                res.list$network$TF_gene$enrichment[[entityCur]] = list()
                # Same summary as for the general enrichment, with the feature as additional column
                summary.df = tibble::tribble(~ontology, ~feature, ~rawp, ~nTerms)
                
                
                for (featureCur in names(GRN@stats$Enrichment[[entityCur]])) {
                    
                    # Different structure, skip for now
                    if (featureCur == "combined") next
                    
                    res.list$network$TF_gene$enrichment[[entityCur]][[featureCur]] = list()
                    
                    for (ontologyCur in names(GRN@stats$Enrichment[[entityCur]][[featureCur]])) {
                        res.list$network$TF_gene$enrichment[[entityCur]][[featureCur]][[ontologyCur]] = list()
                        res.list$network$TF_gene$enrichment[[entityCur]][[featureCur]][[ontologyCur]]$parameters = GRN@stats$Enrichment[[entityCur]][[featureCur]][[ontologyCur]]$parameters
                        
                        for (rawp_cur in fdr_toTest) {
                            nCon = GRN@stats$Enrichment[[entityCur]][[featureCur]] [[ontologyCur]]$results %>%
                                dplyr::filter(.data$pval <= rawp_cur) %>%
                                nrow()
                            summary.df = tibble::add_row(summary.df, ontology = ontologyCur, feature = featureCur, rawp = rawp_cur, nTerms = nCon)
                        }
                        
                    }
                    
                }
                res.list$network$TF_gene$enrichment[[entityCur]]$summary_allOntologies = summary.df
                
                
            } else {
                res.list$network$TF_gene$enrichment[[entityCur]] = NA
            }
        }
        
        
        
    } # end if (!is.null(GRN@graph$TF_gene)) {
    
    if (!silent) .printExecutionTime(start)
    res.list
    
}

#' Optional convenience function to delete intermediate data from the function \code{\link{AR_classification_wrapper}} and summary statistics that may occupy a lot of space
#' @export
#' @template GRN
#' @return An updated \code{\linkS4class{GRN}} object, with some slots being deleted (\code{GRN@data$TFs$classification} as well as \code{GRN@stats$connectionDetails.l})
#' @examples 
#' # See the Workflow vignette on the GRaNIE website for examples
#' GRN = loadExampleObject()
#' GRN = deleteIntermediateData(GRN)
deleteIntermediateData <- function(GRN) {
    
    checkmate::assertClass(GRN, "GRN")
    GRN = .addFunctionLogToObject(GRN)
    GRN = .makeObjectCompatible(GRN)
    
    # Classification results are stored per permutation, using the permutation 
    # number (as character) as list name
    allPermIndices = as.character(0:.getMaxPermutation(GRN))
    
    for (permIndexCur in allPermIndices) {
        
        # Median correlations (foreground and background)
        GRN@data$TFs$classification[[permIndexCur]]$TF_cor_median_foreground = NULL
        GRN@data$TFs$classification[[permIndexCur]]$TF_cor_median_background = NULL
        
        # TF-peak correlations (foreground and background)
        GRN@data$TFs$classification[[permIndexCur]]$TF_peak_cor_foreground = NULL
        GRN@data$TFs$classification[[permIndexCur]]$TF_peak_cor_background = NULL
        
        GRN@data$TFs$classification[[permIndexCur]]$act.rep.thres.l = NULL
    }
    
    # Detailed connection statistics are removed as a whole
    GRN@stats$connectionDetails.l = NULL
    
    GRN
}

#' Change the output directory of a GRN object
#' 
#' @export
#' @template GRN
#' @param outputDirectory Character. Default \code{.}. New output directory for all output files unless overwritten via the parameter \code{outputFolder}.
#' @return An updated \code{\linkS4class{GRN}} object, with the output directory being adjusted accordingly
#' @examples 
#' GRN = loadExampleObject()
#' GRN = changeOutputDirectory(GRN, outputDirectory = ".")
changeOutputDirectory <- function(GRN, outputDirectory = ".") {
    
    checkmate::assertClass(GRN, "GRN")
    GRN = .addFunctionLogToObject(GRN)
    
    GRN = .makeObjectCompatible(GRN)
    
    # Missing values are rejected explicitly so that no paths such as "NA/plots/" end up in the object
    checkmate::assertCharacter(outputDirectory, len = 1, min.chars = 1, any.missing = FALSE)
    
    # The plot folder and the log file are located relative to the new output root.
    # Only the paths stored in the object are updated here.
    GRN@config$directories$outputRoot   = outputDirectory
    GRN@config$directories$output_plots = paste0(outputDirectory, .Platform$file.sep, "plots", .Platform$file.sep)
    GRN@config$files$output_log         = paste0(outputDirectory, .Platform$file.sep, "GRN.log")
    
    futile.logger::flog.info(paste0("Output directory changed in the object to " , outputDirectory))
    
    
    GRN
    
}

####### Internal functions #########

.printDataAlreadyExistsMessage <- function(slotName = NULL) {
    
    # Logs an info message that existing data are kept; the affected slot is 
    # named in the message if slotName is provided
    messageStr = if (is.null(slotName)) {
        paste0("Data already exists in object or the specified file already exists. Set forceRerun = TRUE to regenerate and overwrite.")
    } else {
        paste0("Data already exists in object (GRN@", slotName, "). Set forceRerun = TRUE to regenerate and overwrite.")
    }
    
    futile.logger::flog.info(messageStr)
    
}


# Converts from an older GRN object format to the most current one due to internal optimizations.
# GRN: object of class GRN, possibly created with an older package version.
# Returns the GRN object with legacy slot and column names migrated to the current names.
.makeObjectCompatible <- function(GRN) {
    
    # Remove the GRaNIE:: prefix in case it is present, this has been changed only in version 1.3.7
    names(GRN@config$functionParameters) = gsub("GRaNIE::", "", names(GRN@config$functionParameters), fixed = TRUE)
    
    # The TF translation table is expected in the annotation slot; move it there from its legacy location
    if (is.null(GRN@annotation$TFs) && !is.null(GRN@data$TFs$translationTable)) {
        GRN@annotation$TFs = GRN@data$TFs$translationTable
        GRN@data$TFs$translationTable = NULL
    }
    
    if (!is.null(GRN@annotation$TFs)) {
        
        # Legacy column names ENSEMBL / HOCOID are renamed to their TF.-prefixed counterparts
        if (!"TF.ENSEMBL" %in% colnames(GRN@annotation$TFs)) {
            GRN@annotation$TFs = dplyr::rename(GRN@annotation$TFs, TF.ENSEMBL = "ENSEMBL")
        }
        if (!"TF.HOCOID" %in% colnames(GRN@annotation$TFs) && "HOCOID" %in% colnames(GRN@annotation$TFs)) {
            GRN@annotation$TFs = dplyr::rename(GRN@annotation$TFs, TF.HOCOID = "HOCOID")
        }
        if (!"TF.ID" %in% colnames(GRN@annotation$TFs)) {
            GRN@annotation$TFs = dplyr::rename(GRN@annotation$TFs, TF.ID = "TF.HOCOID")
        }
        
        # Drop a leftover ENSEMBL column that may coexist with TF.ENSEMBL
        if ("ENSEMBL" %in% colnames(GRN@annotation$TFs)) {
            GRN@annotation$TFs = dplyr::select(GRN@annotation$TFs, -"ENSEMBL")
        }
    }
    
    
    # Temporary fix related to a renaming of this gene type in biomaRt: the factor level "lncRNA" is recoded
    # to "lincRNA" here until we update the object directly
    # NOTE(review): presumably "lincRNA" is the label the rest of the package expects - confirm
    if ("lncRNA" %in% levels(GRN@annotation$genes$gene.type)) {
        GRN@annotation$genes = dplyr::mutate(GRN@annotation$genes, gene.type = dplyr::recode_factor(.data$gene.type, lncRNA = "lincRNA")) 
    }
    
    # Annotation slots consensusPeaks / consensusPeaks_obj were renamed to peaks / peaks_obj
    if (is.null(GRN@annotation$peaks) && !is.null(GRN@annotation$consensusPeaks)) {
        GRN@annotation[["peaks"]] = GRN@annotation[["consensusPeaks"]]
        GRN@annotation[["consensusPeaks"]] = NULL
        GRN@annotation[["peaks_obj"]] = GRN@annotation[["consensusPeaks_obj"]]
        GRN@annotation[["consensusPeaks_obj"]] = NULL
        
    }
    
    # Due to a recent renaming for less confusion
    if (!is.null(GRN@annotation$peaks)) {
        colnames(GRN@annotation$peaks) = gsub(pattern = "peak.gene.", replacement = "peak.nearestGene." , colnames(GRN@annotation$peaks), fixed = TRUE)
    }
   
    
    # Renamed count slots and their structure
    if (length(GRN@data) > 0) {
        
        # IDs of features that were flagged as filtered in the legacy count tables. They start out empty so
        # that the counts_metadata blocks below also work when no legacy table is converted in this call
        # (previously, these variables were undefined in that case and caused an "object not found" error)
        peaksFiltered = character(0)
        genesFiltered = character(0)
        
        # 1. peaks
        if (!is.null(GRN@data$peaks[["counts_orig"]])) {
            GRN@data$peaks[["counts_orig"]] = NULL
        }
        # Only convert when the legacy table is actually there, in analogy to the RNA counts below
        if (is.null(GRN@data$peaks[["counts"]]) && !is.null(GRN@data$peaks[["counts_norm"]])) {
            
            peakCountsLegacy.df = GRN@data$peaks[["counts_norm"]]
            
            GRN@data$peaks[["counts"]] = .storeAsMatrixOrSparseMatrix(GRN, df = peakCountsLegacy.df %>% dplyr::select(-"isFiltered"), 
                                                                      ID_column = "peakID", slotName = "GRN@data$peaks$counts")
            
            # Record previously filtered peaks, they are lost otherwise
            peaksFiltered = peakCountsLegacy.df %>% dplyr::filter(.data$isFiltered) %>% dplyr::pull(.data$peakID)
            
        }
        if (!is.null(GRN@data$peaks[["counts_norm"]])) {
            GRN@data$peaks[["counts_norm"]] = NULL
        }
        
        if (is.null(GRN@data$peaks[["counts_metadata"]]) && !is.null(GRN@data$peaks[["counts"]])) {
            GRN@data$peaks[["counts_metadata"]] = .createConsensusPeaksDF(rownames(GRN@data$peaks[["counts"]])) 
            stopifnot(c("chr", "start", "end", "peakID", "isFiltered") %in% colnames(GRN@data$peaks$counts_metadata))
            
            # Restore peaks previously set to filtered
            GRN@data$peaks[["counts_metadata"]]$isFiltered[GRN@data$peaks[["counts_metadata"]]$peakID %in% peaksFiltered] = TRUE
            
        }
        if (!is.null(GRN@data$peaks[["consensusPeaks"]])) {
            GRN@data$peaks[["consensusPeaks"]] = NULL
        }
        
        # 2. RNA
        if (!is.null(GRN@data$RNA[["counts_orig"]])) {
            GRN@data$RNA[["counts_orig"]] = NULL
        }
        if (is.null(GRN@data$RNA[["counts"]]) && !is.null(GRN@data$RNA$counts_norm.l[["0"]])) {
            
            rnaCountsLegacy.df = GRN@data$RNA$counts_norm.l[["0"]]
            
            GRN@data$RNA[["counts"]] = .storeAsMatrixOrSparseMatrix(GRN, df = rnaCountsLegacy.df %>% dplyr::select(-"isFiltered"), 
                                                                    ID_column = "ENSEMBL", slotName = "GRN@data$RNA$counts")
            
            # Record previously filtered genes, they are lost otherwise
            genesFiltered = rnaCountsLegacy.df %>% dplyr::filter(.data$isFiltered) %>% dplyr::pull(.data$ENSEMBL)
            
        }
        if (!is.null(GRN@data$RNA[["counts_norm.l"]])) {
            GRN@data$RNA[["counts_norm.l"]] = NULL
        }
        
        # [["counts"]] is used instead of $counts on purpose: $ does partial matching on lists and could
        # otherwise pick up counts_metadata or counts_permuted_index when counts itself is missing
        if (is.null(GRN@data$RNA[["counts_metadata"]]) && !is.null(GRN@data$RNA[["counts"]])) {
            GRN@data$RNA[["counts_metadata"]] = tibble::tibble(ID = rownames(GRN@data$RNA[["counts"]]), isFiltered = FALSE)
            
            # Restore RNA previously set to filtered
            GRN@data$RNA[["counts_metadata"]]$isFiltered[GRN@data$RNA[["counts_metadata"]]$ID %in% genesFiltered] = TRUE
        }
        
        if (is.null(GRN@data$RNA[["counts_permuted_index"]]) && !is.null(GRN@data$RNA[["counts"]])) {
            nSamplesRNA = ncol(GRN@data$RNA[["counts"]])
            GRN@data$RNA[["counts_permuted_index"]] = sample.int(nSamplesRNA, nSamplesRNA)
        }
    }
    
    # Rename TF.name in TF-peak connections
    if (!is.null(GRN@connections$TF_peaks)) {
        for (i in as.character(0:.getMaxPermutation(GRN))) {
            if (!is.null(GRN@connections$TF_peaks[[i]])) {
                if (!"TF.ID" %in% colnames(GRN@connections$TF_peaks[[i]]$main)) {
                    GRN@connections$TF_peaks[[i]]$main = dplyr::rename(GRN@connections$TF_peaks[[i]]$main, TF.ID = "TF.name")
                }
                if (!"TF.ID" %in% colnames(GRN@connections$TF_peaks[[i]]$connectionStats)) {
                    GRN@connections$TF_peaks[[i]]$connectionStats = dplyr::rename(GRN@connections$TF_peaks[[i]]$connectionStats, TF.ID = "TF.name")
                }
            }
        }
    }
    
    # Same renaming for the filtered TF-gene connections
    if (!is.null(GRN@connections$TF_genes.filtered)) {
        for (i in as.character(0:.getMaxPermutation(GRN))) {
            if (!is.null(GRN@connections$TF_genes.filtered[[i]])) {
                if (!"TF.ID" %in% colnames(GRN@connections$TF_genes.filtered[[i]])) {
                    GRN@connections$TF_genes.filtered[[i]] = dplyr::rename(GRN@connections$TF_genes.filtered[[i]], TF.ID = "TF.name")
                }
            }
        }
    }
    
    # Same renaming for the filtered TF-peak-gene connections
    if (!is.null(GRN@connections$all.filtered)) {
        for (i in as.character(0:.getMaxPermutation(GRN))) {
            if (!is.null(GRN@connections$all.filtered[[i]])) {
                if (!"TF.ID" %in% colnames(GRN@connections$all.filtered[[i]])) {
                    GRN@connections$all.filtered[[i]] = dplyr::rename(GRN@connections$all.filtered[[i]], TF.ID = "TF.name")
                }
            }
        }
    }
    
    GRN
}


# Checks whether filtered connections (GRN@connections$all.filtered[["0"]]) are present in the object.
# returnLogical: if TRUE, the presence is returned as TRUE/FALSE and nothing is reported.
#                if FALSE, missing connections are reported via .checkAndLogWarningsAndErrors
#                (with isWarning = FALSE) and NULL is returned.
.checkExistanceFilteredConnections <- function(GRN, returnLogical = FALSE) {
    
    connectionsFound = !is.null(GRN@connections$all.filtered[["0"]])
    
    if (returnLogical) {
        return(connectionsFound)
    }
    
    if (!connectionsFound) {
        message = "Could not find filtered connections (the slot GRN@connections$all.filtered[[\"0\"]] is NULL). Run the function filterGRNAndConnectGenes."
        .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
    }
    
    return(NULL)
}

# Verifies that the package needed for a particular correlation method is installed.
# corMethod: name of the correlation method. Only "bicor" triggers a check (for the package WGCNA),
#            for all other values nothing is done.
.checkPackageRobust <- function(corMethod) {
    
    # No additional package is checked for any other method
    if (corMethod != "bicor") {
        return(invisible(NULL))
    }
    
    packagename = "WGCNA"
    packageMessage = paste0("The package ", packagename, " is not installed, but needed here due to corMethod = \"", corMethod, "\". Please install it and re-run this function or change the value of corMethod.")
    .checkPackageInstallation(packagename, packageMessage)  
    
}

# Determines the folder into which output is written and makes sure that it exists.
# GRN:          GRN object; only GRN@config$directories$output_plots is read, and only if outputFolder is NULL.
# outputFolder: NULL or a single directory path. If not NULL, it takes precedence over the path stored
#               in the object.
# Returns the folder path (character), always ending with .Platform$file.sep.
.checkOutputFolder <- function(GRN, outputFolder) {
  
  if (!is.null(outputFolder)) {
    
    checkmate::assertString(outputFolder, min.chars = 1)
    
    # Create the folder BEFORE asserting on it: assertDirectory fails for directories that do not exist,
    # which previously made the creation of a missing folder unreachable
    if (!dir.exists(outputFolder)) {
      dir.create(outputFolder, recursive = TRUE)
    }
    
    checkmate::assertDirectory(outputFolder, access = "w")
    
    if (!endsWith(outputFolder, .Platform$file.sep)) {
      outputFolder = paste0(outputFolder, .Platform$file.sep)
    }
    
    return(outputFolder)
  }
  
  #  Re-create the output folder here and adjust to the OS-specific path separator, do not rely on what is stored in the object
  plotFolder = GRN@config$directories$output_plots
  
  if (.Platform$OS.type == "windows") {
    plotFolder = gsub('/', ('\\'), plotFolder, fixed = TRUE)
  } else {
    plotFolder = gsub("\\", "/", plotFolder, fixed = TRUE)
  }
  
  if (!endsWith(plotFolder, .Platform$file.sep)) {
    plotFolder = paste0(plotFolder, .Platform$file.sep)
  }
  
  if (!dir.exists(plotFolder)) {
    dir.create(plotFolder, recursive = TRUE)
  }
  
  return(plotFolder)
  
}


# Converts the columns TF.ID, TF_peak.r_bin, peak.ID, TF_peak.fdr_direction and TF_peak.connectionType
# of a connection table to factors.
# df: data frame that contains these columns. A NULL value is reported via .checkAndLogWarningsAndErrors.
# Returns the data frame with the converted columns.
.optimizeSpaceGRN <- function(df) {
  
  if (is.null(df)) {
    message = "Data frame for optimization not found"
    .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
  } 
  
  dplyr::mutate(
    df,
    TF.ID                  = as.factor(.data$TF.ID),
    TF_peak.r_bin          = as.factor(.data$TF_peak.r_bin),
    peak.ID                = as.factor(.data$peak.ID),
    TF_peak.fdr_direction  = as.factor(.data$TF_peak.fdr_direction),
    TF_peak.connectionType = as.factor(.data$TF_peak.connectionType)
  )
  
}



# Translates an internal plot identifier into the base name of the corresponding output file.
# name: identifier, must be one of the names of the list allNames defined below.
#       Unknown identifiers are reported via .checkAndLogWarningsAndErrors (isWarning = FALSE).
# Returns the file base name as defined in allNames.
.getOutputFileName <- function(name) {
  
  # List of all output file names
  allNames = list(
    "plot_pca"                       = "PCA_sharedSamples",
    "plot_TFPeak_fdr"                = "TF_peak.fdrCurves",
    "plot_TFPeak_fdr_GC"             = "TF_peak.GCCorrection",
    "plot_TFPeak_TFActivity_QC"      = "TF_peak.TFActivity_QC",
    "plot_class_density"             = "TF_classification_densityPlots",
    "plot_class_medianClass"         = "TF_classification_stringencyThresholds",
    "plot_class_densityClass"        = "TF_classification_summaryHeatmap",
    "plot_peakGene_diag"             = "peakGene_diagnosticPlots",
    "plot_peakGene_IHW_diag"         = "peakGene_IHW.diagnosticPlots",
    "plot_connectionSummary_heatmap" = "GRN.connectionSummary_heatmap",
    "plot_connectionSummary_boxplot" = "GRN.connectionSummary_boxplot",
    "plot_generalEnrichment"         = "GRN.overall_enrichment",
    "plot_communityStats"            = "GRN.community_stats",
    "plot_communityEnrichment"       = "GRN.community_enrichment",
    "plot_generalNetworkStats"       = "GRN.overall_stats",
    "plot_TFEnrichment"              = "GRN.TF_enrichment",
    "plot_network"                   = "GRN.network_visualisation",
    "plot_correlations"              = "correlationPlots"
  )
  
  # Look up once and reuse the result for both the check and the return value
  fileName = allNames[[name]]
  
  if (is.null(fileName)) {
    message = paste0("Name ", name, " not defined in list allNames.")
    .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
  } 
  
  fileName
  
}

# Returns the file name suffix for a permutation index:
# "_original" for permutation 0 and "_background" for any other value.
.getPermutationSuffixStr <- function(permutation) {
  
  if (permutation == 0) {
    return("_original")
  }
  
  "_background"
}


# Helper function to automatically record the function calls to keep a better record
# GRN: GRN object. A new entry is added to GRN@config$functionParameters, indexed first by the name of
#      the calling function and then by a timestamp.
# Returns the updated GRN object (or the unmodified object if the derived list name is too long).
# Note: must be called directly from within the function that is to be logged, because the call is
# retrieved from the frame of the caller (parent.frame(1)).
.addFunctionLogToObject <- function(GRN) {
  
  #listName = gsub("\\(|\\)", "", match.call()[1], perl = TRUE)
  # Evaluate match.call() in the frame of the caller and keep only its first element (the function name)
  functionName = evalq(match.call(), parent.frame(1))[1]
  # Strip parentheses and the package prefix so that the name can be used as a list name
  listName = gsub("\\(|\\)", "", functionName, perl = TRUE)
  listName = gsub("GRaNIE::", "", listName, fixed = TRUE)
  
  # Implausibly long names are not stored, the object is returned unchanged in this case
  if (nchar(listName) > 200) {
      message = paste0("Skipped storing function log as list name exceeds 200 characters. This should not happen and only occurs when functions from GraNIE are called in indirect ways via do.call, for example")
     futile.logger::flog.info(message)
     return(GRN)
  }
  
  # Compatibility with old objects
  if (is.null(GRN@config$functionParameters)) {
    GRN@config$functionParameters = list()
  }
  
  # Timestamp used as list name for this call, with spaces replaced by underscores
  currentDate = gsub(" ", "_", as.character(Sys.time()))
  
  if (is.null(GRN@config$functionParameters[[listName]])) {
      GRN@config$functionParameters[[listName]] = list()
  }
  
  # An existing entry with an identical timestamp string is replaced
  GRN@config$functionParameters[[listName]] [[currentDate]] = list()
  
  # match.call.defaults is defined elsewhere; presumably it returns the call of the logged function
  # including default argument values, either as call (asList = FALSE) or as list - TODO confirm
  GRN@config$functionParameters[[listName]][[currentDate]]$call = match.call.defaults(asList = FALSE)
  GRN@config$functionParameters[[listName]][[currentDate]]$parameters = match.call.defaults()
  
  GRN
}

# Returns the number of permutations stored in the configuration of the object.
# The current location (GRN@config$parameters$internal$nPermutations) is preferred; if it is NULL,
# the location used by previous object versions (GRN@config$parameters$nPermutations) is returned.
.getMaxPermutation <- function(GRN) {
  
  nPermutations = GRN@config$parameters$internal$nPermutations
  
  # Compatibility mode for previous object versions
  if (is.null(nPermutations)) {
    nPermutations = GRN@config$parameters$nPermutations
  }
  
  nPermutations
}

# Returns the labels of the correlation bins that are defined by the FDR steps stored in the object.
# The steps are read from GRN@config$parameters$internal$stepsFDR or, if that is NULL, from
# GRN@config$parameters$stepsFDR. All labels use closed brackets on both sides, e.g. "[-1,-0.5]".
.get_combined_TF_peak_bins <- function(GRN) {
    
    fdr_steps = GRN@config$parameters$internal$stepsFDR
    if (is.null(fdr_steps)) {
        fdr_steps = GRN@config$parameters$stepsFDR
    }
    
    # Only the level labels are needed and they depend solely on the breaks. An empty numeric vector is
    # therefore sufficient as input and, unlike drawing random numbers, leaves the state of the random
    # number generator of the session untouched
    bin.order.pos = levels(cut(numeric(0), breaks = fdr_steps, right = FALSE, include.lowest = TRUE))
    
    # Combine the slightly different bins and merge them together
    bin.order.combined = gsub(")", "]", bin.order.pos, fixed = TRUE)
    bin.order.combined = gsub("(", "[", bin.order.combined, fixed = TRUE)
    
    bin.order.combined
    
}

# Returns the FDR thresholds to test for TF-peak connections, restricted to values that do not exceed
# the parameter maxFDRToStore with which addConnections_TF_peak was run.
# maxFDRToStore is read from the last stored entry of GRN@config$functionParameters$addConnections_TF_peak
# or, for the old object format, directly from its parameters element.
# Stops if no threshold remains.
.getFDR_TF_peak_vector <- function(GRN) {
    
    functionLog = GRN@config$functionParameters[["addConnections_TF_peak"]]
    
    nEntries = length(functionLog)
    maxFDR = functionLog[[nEntries]]$parameters$maxFDRToStore
    if (is.null(maxFDR)) {
        # Old format. The element is addressed by its full name; the previously used abbreviated name only
        # worked via partial matching of $ and returned NULL as soon as the abbreviation was ambiguous
        maxFDR = functionLog$parameters$maxFDRToStore
    }
    fdr_toTest = c(0.0001, 0.001, 0.01, 0.05, 0.1, seq(0.2, 1, by = 0.1))
    # The small offset makes the comparison robust against floating point imprecision
    fdr_toTest_filt = fdr_toTest[which(fdr_toTest <= (maxFDR + 0.001))]
    stopifnot(length(fdr_toTest_filt) > 0)
    
    fdr_toTest_filt
}


# Reports (via .checkAndLogWarningsAndErrors with isWarning = FALSE) when a name equals one of the
# reserved names.
# name:           name to check
# forbiddenNames: vector of reserved names that must not be used
.checkForbiddenNames <- function(name, forbiddenNames) {
  
  # Names that are not reserved need no further action
  if (!(name %in% forbiddenNames)) {
    return(invisible(NULL))
  }
  
  message = paste0("Name must not be one of the following reserved names: ", paste0(forbiddenNames, collapse = ","), ". Please choose a different one.")
  .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
}

# Creates the summary tables that underlie the peak-gene QC plots.
# peakGeneCorrelations.all.cur: data frame with (at least) the columns r_positive, class and
#                               peak_gene.p.raw.class; the latter is used as a factor (its levels are read)
# networkType_details:          vector of class labels; the first element is treated as the real and the
#                               second as the background network when computing normFactor_real
# colors_vec:                   named vector, only names(colors_vec)[1] (real) and [2] (background) are used
# range:                        value pasted behind "real_" and "random_" to build class labels
# NOTE(review): classes are addressed in three ways here (networkType_details, names(colors_vec) and
# paste0("real_"/"random_", range)); presumably these all denote the same two labels - confirm with callers
# Returns a named list with the tables d, d2, d3, d4, d4_melt and d_merged.
.createTables_peakGeneQC <- function(peakGeneCorrelations.all.cur, networkType_details, colors_vec, range) {
    
    # Number of peak-gene pairs per sign of correlation, class and raw p-value class
    d = peakGeneCorrelations.all.cur %>% 
        dplyr::group_by(.data$r_positive, class, .data$peak_gene.p.raw.class) %>%
        dplyr::summarise(n = dplyr::n()) %>%
        dplyr::ungroup()
    
    # Some classes might be missing, add them here with explicit zeros
    for (r_pos in c(TRUE, FALSE)) {
        for (classCur in networkType_details) {
            for (pclassCur in levels(peakGeneCorrelations.all.cur$peak_gene.p.raw.class)) {
                
                row = which(d$r_positive == r_pos & d$class == classCur & as.character(d$peak_gene.p.raw.class) == as.character(pclassCur))
                if (length(row) == 0) {
                    d = tibble::add_row(d, r_positive = r_pos, class = classCur, peak_gene.p.raw.class = pclassCur, n = 0)
                }
            }
        }
    }
    
    # Restore the "ordered" factor for class
    d$peak_gene.p.raw.class = factor(d$peak_gene.p.raw.class, ordered = TRUE, levels =  levels(peakGeneCorrelations.all.cur$peak_gene.p.raw.class))
    
    
    # Normalization factors
    dsum = d %>%
        dplyr::group_by(.data$r_positive, .data$class) %>%
        dplyr::summarise(sum_n = sum(.data$n))
    
    
    # Summarize per bin
    # Per class and p-value class: counts for positive and negative correlations and their ratio / fractions
    d2 = d %>%
        dplyr::group_by(class, .data$peak_gene.p.raw.class) %>%
        dplyr::summarise(sum_pos = .data$n[.data$r_positive],
                         sum_neg = .data$n[!.data$r_positive]) %>%
        dplyr::mutate(ratio_pos_raw = .data$sum_pos / .data$sum_neg,
                      fraction_pos = .data$sum_pos / (.data$sum_pos + .data$sum_neg),
                      fraction_neg = 1 - .data$fraction_pos) %>%
        dplyr::ungroup()
    
    # Compare between real and background
    # Ratio of the total number of entries of the first and the second class of networkType_details
    normFactor_real = dplyr::filter(dsum, class ==  !! (networkType_details[1])) %>%  dplyr::pull(.data$sum_n) %>% sum() /
        dplyr::filter(dsum, class ==  !! (networkType_details[2])) %>%  dplyr::pull(.data$sum_n) %>% sum()
    
    # ratio_norm not used currently, no normalization necessary here or not even useful because we dont want to normalize the r_pos and r_neg ratios: These are signal in a way. Only when comparing between real and background, we have to account for sample size for corrections
    # Per p-value class and sign of correlation: counts for real and background and their (normalized) ratios
    d3 = d %>%
        dplyr::group_by(.data$peak_gene.p.raw.class, .data$r_positive) %>%
        dplyr::summarise(n_real     = .data$n[class == !! (names(colors_vec)[1]) ],
                         n_background = .data$n[class == !! (names(colors_vec)[2]) ]) %>%
        dplyr::ungroup() %>%
        dplyr::mutate(ratio_real_raw = .data$n_real / .data$n_background,
                      ratio_real_norm = .data$ratio_real_raw / normFactor_real,
                      enrichment_real      = .data$n_real / (.data$n_real + .data$n_background),
                      enrichment_real_norm = (.data$n_real / normFactor_real) / ((.data$n_real / normFactor_real) + .data$n_background)) 
    
    
    stopifnot(identical(levels(d2$peak_gene.p.raw.class), levels(d3$peak_gene.p.raw.class)))
    # 2 enrichment bar plots but combined using facet_wrap
    d2$set = "r+ / r-"; d3$set = "real / background" 
    # Stack the ratios of d2 and d3 into one long table; classAll holds the class label for rows from d2
    # and the sign of correlation (as character, due to c()) for rows from d3
    d_merged <- tibble::tibble(peak_gene.p.raw.class = c(as.character(d2$peak_gene.p.raw.class), 
                                                         as.character(d3$peak_gene.p.raw.class)),
                               ratio = c(d2$ratio_pos_raw, d3$ratio_real_norm),
                               classAll = c(as.character(d2$class), d3$r_positive),
                               set = c(d2$set, d3$set)) %>%
        dplyr::mutate(classAll = factor(.data$classAll, levels = c(paste0("real_",range), paste0("random_",range), "TRUE", "FALSE")),
                      peak_gene.p.raw.class = factor(.data$peak_gene.p.raw.class, levels = levels(d2$peak_gene.p.raw.class)))
    
    # Wide table with one row per p-value class, filled row by row in the loop below
    d4 = tibble::tibble(peak_gene.p.raw.class = unique(d$peak_gene.p.raw.class), 
                        n_rpos_real = NA_integer_, n_rpos_random = NA_integer_,
                        n_rneg_real = NA_integer_, n_rneg_random = NA_integer_,
                        ratio_background_real_rpos_norm = NA_real_,
                        ratio_background_real_rneg_norm = NA_real_)
    
    for (i in seq_len(nrow(d4))) {
        # Counts for the real class (first element of networkType_details), exactly one matching row is required
        row_d2 = which(d2$class == networkType_details[1] & d2$peak_gene.p.raw.class == d4$peak_gene.p.raw.class[i])
        stopifnot(length(row_d2) == 1)
        d4[i, "n_rpos_real"] = d2[row_d2, "sum_pos"] %>% unlist()
        d4[i, "n_rneg_real"] = d2[row_d2, "sum_neg"] %>% unlist()
        # Counts for the class labeled "random_<range>"
        row_d2 = which(d2$class == paste0("random_",range) & d2$peak_gene.p.raw.class == d4$peak_gene.p.raw.class[i])
        d4[i, "n_rpos_random"] = d2[row_d2, "sum_pos"] %>% unlist()
        d4[i, "n_rneg_random"] = d2[row_d2, "sum_neg"] %>% unlist()
        
        # 1 - normalized real/background ratio, separately for positive and negative correlations
        row_d3 = which(d3$r_positive == TRUE & d3$peak_gene.p.raw.class == d4$peak_gene.p.raw.class[i])
        d4[i, "ratio_background_real_rpos_norm"] = 1 - d3[row_d3, "ratio_real_norm"] %>% unlist()
        row_d3 = which(d3$r_positive == FALSE & d3$peak_gene.p.raw.class == d4$peak_gene.p.raw.class[i])
        d4[i, "ratio_background_real_rneg_norm"] = 1 - d3[row_d3, "ratio_real_norm"] %>% unlist()
    }
    
    # Fraction of negative correlations for real and random, plus a numeric bin index used for sorting
    d4 = d4 %>%
        dplyr::mutate(ratio_rneg_rpos_real = .data$n_rneg_real / (.data$n_rneg_real + .data$n_rpos_real),
                      ratio_rneg_rpos_random = .data$n_rneg_random / (.data$n_rneg_random + .data$n_rpos_random),
                      peak_gene.p.raw.class.bin = as.numeric(.data$peak_gene.p.raw.class)) %>%
        dplyr::arrange(.data$peak_gene.p.raw.class.bin)
    
    # Long format of d4, restricted to the variables that contain "ratio" in their name
    d4_melt = reshape2::melt(d4, id  = c("peak_gene.p.raw.class.bin", "peak_gene.p.raw.class")) %>%
        dplyr::filter(grepl("ratio", .data$variable))
    
    
    list(d = d, d2 = d2, d3 = d3, d4 = d4, d4_melt = d4_melt, d_merged = d_merged)
    
}

# Creates labels of the form "<name> (<value>)" from a named frequency vector or table.
# tbl_freq: named vector or table; its names and values are pasted together element-wise.
.classFreq_label <- function(tbl_freq) {
    classNames = names(tbl_freq)
    paste(classNames, " (", tbl_freq, ")", sep = "")
}


# Computes mean RNA expression per gene (also separately for TFs) and mean accessibility per peak and
# stores the three tables as a list in GRN@visualization$metadata.
# GRN:        GRN object
# forceRerun: flag. If FALSE, already existing metadata in GRN@visualization$metadata is kept as is.
# Returns the (updated) GRN object.
.getBasic_metadata_visualization <- function(GRN, forceRerun = FALSE) {
    
    checkmate::assertClass(GRN, "GRN")
    GRN = .addFunctionLogToObject(GRN) 
    checkmate::assertFlag(forceRerun)
    
    # TODO: Do we need this for the shuffled one?
    
    # Nothing to do if the metadata is already there and no rerun is requested
    if (!is.null(GRN@visualization$metadata) && !forceRerun) {
        return(GRN)
    }
    
    # Base mean expression for genes; IDs are stored without the part after the first dot
    rnaCounts.m = getCounts(GRN, type = "rna", permuted = FALSE, asMatrix = TRUE)
    geneMeans   = rowMeans(rnaCounts.m)
    geneIDs     = getCounts(GRN, type = "rna", permuted = FALSE, includeIDColumn = TRUE)$ENSEMBL
    
    expression.df = dplyr::mutate(tibble::tibble(ENSEMBL_ID = geneIDs, baseMean = geneMeans),
                                  ENSEMBL_ID = gsub("\\..+", "", .data$ENSEMBL_ID, perl = TRUE),
                                  baseMean_log = log2(.data$baseMean + 0.01))
    
    # Subset to TFs and add their annotation
    expression_TF.df = dplyr::left_join(dplyr::filter(expression.df, .data$ENSEMBL_ID %in% GRN@annotation$TFs$TF.ENSEMBL),
                                        GRN@annotation$TFs, by = c("ENSEMBL_ID" = "TF.ENSEMBL"), multiple = "all")
    
    # Mean accessibility for peaks
    peakIDs   = getCounts(GRN, type = "peaks", permuted = FALSE)$peakID
    peakMeans = rowMeans(getCounts(GRN, type = "peaks", permuted = FALSE, asMatrix = TRUE))
    
    meanPeaks.df = dplyr::mutate(tibble::tibble(peakID = peakIDs, mean = peakMeans),
                                 mean_log = log2(.data$mean + 0.01))
    
    GRN@visualization$metadata = list("RNA_expression_genes" = expression.df,
                                      "RNA_expression_TF"    = expression_TF.df,
                                      "Peaks_accessibility"   = meanPeaks.df)
    
    GRN
    
}


######## Quantify source of variation ########

#' Quantify and interpret multiple sources of biological and technical variation for features (TFs, peaks, and genes) in a \code{\linkS4class{GRN}} object
#' 
#' Runs the main function \code{fitExtractVarPartModel} of the package \code{variancePartition}: Fits a linear (mixed) model to estimate contribution of multiple sources of variation while simultaneously correcting for all other variables for the features in a GRN object (TFs, peaks, and genes) given particular metadata. The function reports the fraction of variance attributable to each metadata variable.
#' \strong{Note: The results are not added to \code{GRN@connections$all.filtered}, rerun the function \code{\link{getGRNConnections}} and set \code{include_variancePartitionResults} to \code{TRUE} to do so}.
#' The results object is stored in \code{GRN@stats$variancePartition} and can be used for the various diagnostic and plotting functions from \code{variancePartition}.
#' 
#' The normalized count matrices are used as input for \code{fitExtractVarPartModel}. 
#' 
#' @template GRN 
#' @param formula Character(1). Either \code{auto} or a manually defined formula to be used for the model fitting. Default \code{auto}. Must include only terms that are part of the metadata as specified with the \code{metadata} parameter. If set to \code{auto}, the formula will be built automatically based on all metadata variables as specified with the \code{metadata} parameter. By default, numerical variables will be modeled as fixed effects, while variables that are defined as factors or can be converted to factors (characters and logical variables) are modeled as random effects as recommended by the \code{variancePartition} package.
#' @param metadata Character vector. Default \code{all}. Vector of column names from the metadata data frame that was provided when using the function 
#' \code{\link{addData}}. Must either contain the special keyword \code{all} to denote that all (!) metadata columns from \code{GRN@data$metadata} are taken
#' or if not, a subset of the column names from \code{GRN@data$metadata} to include in the model fitting for \code{fitExtractVarPartModel}.
#' @param features Character(1). Either \code{all_filtered} or \code{all}. Default \code{all_filtered}. Should \code{variancePartition} only be run for the features (TFs, peaks and genes) from the filtered set of connections (the result of \code{\link{filterGRNAndConnectGenes}}) or for all genes that are defined in the object? If set to \code{all}, the running time is greatly increased.
#' @template nCores
#' @template forceRerun
#' @param ... Additional parameters passed on to \code{variancePartition::fitExtractVarPartModel} beyond \code{exprObj}, \code{formula} and \code{data}. See the function help for more information
# #' @seealso \code{\link{plotDiagnosticPlots_featureVariation}}
#' @seealso \code{\link{addData}}
#' @seealso \code{\link{getGRNConnections}}
#' @return An updated \code{\linkS4class{GRN}} object, with additional information added from this function to \code{GRN@stats$variancePartition} as well as the elements \code{genes}, \code{consensusPeaks} and \code{TFs} within \code{GRN@annotation}. 
#' As noted above, the results are not added to \code{GRN@connections$all.filtered}; rerun the function \code{\link{getGRNConnections}} and set \code{include_variancePartitionResults} to \code{TRUE} to include the results in the eGRN output table.
#' @examples 
#' # See the Workflow vignette on the GRaNIE website for examples
#' # GRN = loadExampleObject()
#' # GRN = add_featureVariation(GRN, metadata = c("mt_frac"), forceRerun = TRUE)
#' @export
add_featureVariation <- function(GRN, 
                                    formula = "auto", 
                                    metadata = c("all"),
                                    features = "all_filtered", 
                                    nCores = 1,
                                    forceRerun = FALSE, 
                                    ...) {
    
    start = Sys.time()  
    
    checkmate::assertClass(GRN, "GRN")
    GRN = .addFunctionLogToObject(GRN)
    
    GRN = .makeObjectCompatible(GRN)
    
    checkmate::assertCharacter(formula, min.chars = 2, len = 1)
    
    packageMessage = paste0("The package variancePartition is not installed but required for this function.")
    .checkPackageInstallation("variancePartition", packageMessage, isWarning = FALSE)

    if (is.null(GRN@data$metadata)) {
        message = paste0("No metadata was found (GRN@data$metadata is NULL), cannot run variancePartition without metadata. Reren the addData function and provide metadata.")
        
        .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)    
    }
    checkmate::assert(checkmate::checkChoice(metadata, "all"), checkmate::checkSubset(metadata, colnames(GRN@data$metadata)))

    checkmate::assertChoice(features, c("all_filtered", "all"))
    checkmate::assertIntegerish(nCores, lower = 1)
    checkmate::assertFlag(forceRerun)
    
    if (is.null(GRN@stats$variancePartition$RNA) | is.null(GRN@stats$variancePartition$peaks) | forceRerun) {
        
        if (features == "all_filtered") {
            .checkExistanceFilteredConnections(GRN)
        }
        
        # Prepare the metadata
        if ("all" %in% metadata) {
            columnsToSelect = colnames(GRN@data$metadata)
        } else {
            columnsToSelect = unique(c(metadata, "has_both"))
        }
        meta <- GRN@data$metadata %>% 
            dplyr::filter(.data$has_both == TRUE) %>%
            dplyr::select(tidyselect::one_of(columnsToSelect)) %>%
            dplyr::mutate_if(is.character, as.factor) %>%
            dplyr::mutate_if(is.logical, as.factor) %>%
            dplyr::select(-"has_both")
        
        # Remove factors with only one level
        coltypes = meta %>% dplyr::summarise_all(class)
        factorVariables  = which(coltypes[1,] == "factor")
        numericVariables = which(coltypes[1,] == "numeric")
        factorVariablesNames = colnames(coltypes)[factorVariables]
        numericVariablesNames = colnames(coltypes)[numericVariables]

        nLevels = sapply(factorVariables, function(x) {nlevels(dplyr::pull(meta, colnames(coltypes)[x]))})
        nLevelOne = which(nLevels == 1)
        if (length(nLevelOne) > 0) {
            meta = dplyr::select(meta, -tidyselect::one_of(factorVariablesNames[nLevelOne]))
            factorVariablesNames = intersect(colnames(meta), factorVariablesNames)
        }
        
        if (formula == "auto") {
            
            futile.logger::flog.info(paste0("Due to formula = \"auto\", all factors will be modeled as random effects and all numerical variables as fixed effects, as recommended in the variancePartition vignette."))
            
            # See https://bioconductor.org/packages/release/bioc/vignettes/variancePartition/inst/doc/variancePartition.pdf for details
            # Factor variables are modeled as random effect
            if (length(factorVariablesNames) > 0) {
                formulaElems = paste0("(1|", factorVariablesNames, ")")
            } else {
                formulaElems  = ""
            }
            
            if (length(numericVariablesNames) > 0) {
                if (length(factorVariablesNames) > 0) {
                    separator = " + " 
                } else {
                    separator = ""
                }
                # Numeric variables are modeled as fixed effect
                formulaElems2 = paste0(numericVariablesNames)
                
                message = paste0("add_featureVariation: Make sure that all variables from the metadata that are numeric are indeed really numeric and not (hidden) factors. The following variables will be treated as numeric: ", paste0(numericVariablesNames, collapse = ","), ". If one of these is in fact a factor, change the type in GRN@data$metadata and re-run, or provide the design formula manually")
                .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
                
            } else {
                separator = ""
                formulaElems2 = c()
            }
            
            designFormula <- stats::as.formula(paste0("~ ", paste0(formulaElems, collapse = " + "), separator, paste0(formulaElems2, collapse = " + "))) 
            
        } else {
            
            message = paste0("add_featureVariation: A custom formula has been provides. This is currently not being checked for correctness and therefore may reuslt in errors when running variancePartition. In that case, make sure the provided formula is correct.")
            .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
            designFormula = formula
        }
        
        futile.logger::flog.info(paste0("The following formula will be used for model fitting: \"", deparse1(designFormula), "\". Make sure this is appropriate."))
        
        
        # RNA and ATAC norm counts
        if (features == "all_filtered") {
            genesToInclude = unique(c(GRN@connections$all.filtered[["0"]]$TF.ENSEMBL, GRN@connections$all.filtered[["0"]]$gene.ENSEMBL))
            peaksToInclude = unique(GRN@connections$all.filtered[["0"]]$peak.ID)
            
            futile.logger::flog.info(paste0("Using ", length(peaksToInclude), " peaks and ", length(genesToInclude), " TF/genes from the filtered connections."))
            
        } else if (features == "all") {
            genesToInclude = GRN@data$RNA$counts_metadata$ID
            peaksToInclude = GRN@data$peaks$counts_metadata$peakID
            
            futile.logger::flog.info(paste0("Using all ", length(peaksToInclude), " peaks and ", length(genesToInclude), " TF/genes. This may take a long time. Consider setting features = \"all_filtered\"."))
        }
        
        data.l = list()
        data.l[["RNA"]]   <- getCounts(GRN, type = "rna",   permuted = FALSE, includeFiltered = FALSE, includeIDColumn = FALSE)
        data.l[["peaks"]] <- getCounts(GRN, type = "peaks", permuted = FALSE, includeFiltered = FALSE, includeIDColumn = FALSE)
        
        
        for (dataType in c("RNA", "peaks")) {
            
            # row_order <- matrixStats::rowVars(as.matrix(data.l[[dataType]]) ) %>% order(decreasing = T)
            # data_set <- data.l[[dataType]][row_order,]
            
            # fit model
            futile.logger::flog.info(paste0("Running variancePartition and fit models for data type ", dataType, " using ", nCores, " core(s). This may take a while."))
            varPart <- variancePartition::fitExtractVarPartModel(exprObj = data.l[[dataType]], 
                                                                 formula = designFormula, 
                                                                 data = meta, 
                                                                 BPPARAM = .initBiocParallel(nCores),
                                                                 ...)
            GRN@stats$variancePartition[[dataType]]  = varPart
            
            res = as.data.frame(GRN@stats$variancePartition[[dataType]]) %>%
                dplyr::rename_all( .funs = ~ paste0("variancePartition_", .x)) %>%
                tibble::as_tibble(rownames = "ID")
            
            # Update annotation slots
            if (dataType == "RNA") {
                
                colnames(res)[2:ncol(res)] = paste0("gene.", colnames(res)[2:ncol(res)])
                
                GRN@annotation$genes = GRN@annotation$genes %>%
                    dplyr::select(-tidyselect::starts_with("gene.variancePart")) %>%
                    dplyr::left_join(res, by = c("gene.ENSEMBL" = "ID"))
                
                colnames(res) = gsub("gene.", "TF.", colnames(res))
                
                GRN@annotation$TFs = GRN@annotation$TFs %>%
                    dplyr::select(-tidyselect::starts_with("TF.variancePart")) %>%
                    dplyr::left_join(res, by = c("TF.ENSEMBL" = "ID"))
                
                
            } else {
                
                colnames(res)[2:ncol(res)] = paste0("peak.", colnames(res)[2:ncol(res)])
                
                GRN@annotation$peaks = GRN@annotation$peaks %>%
                    dplyr::select(-tidyselect::starts_with("variancePart")) %>%
                    dplyr::left_join(res, by = c("peak.ID" = "ID"))
            }
            
        }
        
        futile.logger::flog.info(paste0("The result objects have been stored in GRN@stats$variancePartition for both RNA and peaks."))
        
    } else {
        .printDataAlreadyExistsMessage()
    }
    
    
    .printExecutionTime(start, prefix = "")
    
    GRN
    
}

# Build a display label for a TF, optionally annotated with its name and/or Ensembl ID.
#
# The label always starts with `ID`. Depending on the flags, further details are 
# looked up in GRN@annotation$TFs (rows where TF.ID equals `ID`) and appended in parentheses:
#   - printName only:       "ID (TF.name)"  (the name is omitted if it is identical to ID)
#   - printEnsemblID only:  "ID (TF.ENSEMBL)"
#   - both:                 "ID (TF.name, TF.ENSEMBL)" or, if name equals ID, "ID (TF.ENSEMBL)"
#   - neither:              "ID"
#
# GRN            Object with an @annotation$TFs table containing the columns TF.ID, TF.name and TF.ENSEMBL.
# ID             TF identifier to label; matched against the TF.ID column.
# printName      Logical. Append the TF name? Stops if the lookup does not yield exactly one name.
# printEnsemblID Logical. Append the Ensembl ID?
#
# Returns the combined label.
.printTF <- function(GRN, ID, printName = TRUE, printEnsemblID = FALSE) {
    
    nameCombined = paste0(ID)
    
    # TRUE once " (<TF.name>" has been appended, i.e., a parenthesis is open and still has to be closed.
    # Has to be initialized unconditionally because it is also read when printName is FALSE
    # (previously, printName = FALSE with printEnsemblID = TRUE failed with "object 'printComma' not found")
    printComma = FALSE
    
    if (printName || printEnsemblID) {
        
        if (printName) {
            TF.name = GRN@annotation$TFs %>% dplyr::filter(.data$TF.ID == ID) %>% dplyr::pull(.data$TF.name)
            stopifnot(length(TF.name) == 1)
            
            # Only add the name if it provides information beyond the ID itself
            if (TF.name != ID) {
                nameCombined = paste0(nameCombined, " (", TF.name)
                printComma = TRUE
            }
        }
        
        if (printEnsemblID) {
            TF.ENSEMBL = GRN@annotation$TFs %>% dplyr::filter(.data$TF.ID == ID) %>% dplyr::pull(.data$TF.ENSEMBL)
            
            if (printComma) {
                # Parenthesis already opened by the name: continue the list, it is closed below
                nameCombined = paste0(nameCombined, ", ", TF.ENSEMBL)
            } else {
                # No name was printed: the Ensembl ID gets its own pair of parentheses
                nameCombined = paste0(nameCombined, " (", TF.ENSEMBL, ")")
            }
        }
        
        # Close the parenthesis that was opened when the name was added
        if (printComma) nameCombined = paste0(nameCombined, ")")
    }
    
    nameCombined
}


# Build a display label for a gene.
#
# GRN        Object with an @annotation$genes table containing the columns gene.ENSEMBL and gene.name.
# ID         Gene identifier to label; matched against the gene.ENSEMBL column.
# printName  Logical. If TRUE, the gene.name value(s) of the matching row(s) are appended in parentheses.
#
# Returns "ID (gene.name)" if printName is TRUE and ID as character otherwise.
.printGene <- function(GRN, ID, printName = TRUE) {
    
    if (printName) {
        # Look up the name(s) annotated for this gene ID
        matchingGenes <- dplyr::filter(GRN@annotation$genes, .data$gene.ENSEMBL == ID)
        geneLabel     <- dplyr::pull(matchingGenes, .data$gene.name)
        
        return(paste0(ID, " (", geneLabel, ")"))
    }
    
    # No name requested: the label is just the ID
    paste0(ID)
}


  