#' Overlap the genomic features related to the sRNA clusters
#'
#' @details
#' Based on genomic coordinates, assign sRNA clusters with matching annotation
#' information. This function can be used to find the genomic features from
#' which the sRNA clusters originate. This includes genes or repetitive
#' regions. When `match = "genes"`, an additional buffer region is added to the
#' start/end of each gene to improve hits, and to align with the assumptions
#' about promoter regions.
#'
#' When a cluster overlaps several features, each appended column holds the
#' first non-missing value found among the overlapping features, taken in the
#' order the overlaps are reported. Columns that contain only missing values
#' are removed from the output.
#'
#' It is important that any alterations which were made to the genome reference
#' (FASTA) used for alignment/clustering, such as alterations to the chromosome
#' name, are carried forth to the genome annotation file. See
#' [mobileRNA::RNAmergeGenomes()] and [mobileRNA::RNAmergeAnnotations()] for
#' more information.
#'
#' @param data data.frame; originally generated by [mobileRNA::RNAimport()] or
#' containing `chr`, `start` and `end` columns.
#'
#' @param annotation path; URL, connection or GFFFile object. A genome
#' reference annotation file (.gff/.gff1/.gff2/.gff3). An already imported
#' `GRanges` object is also accepted.
#'
#' @param match character; must be either "within" or "genes". Where
#' "within" (the default) will return matches where the clusters can be found
#' within any annotation, while "genes" will return matches where the clusters
#' can be found within only genes.
#'
#' @param bufferRegion numeric; a buffer region in base-pairs to extend the
#' start and end coordinates upstream and downstream respectively. Only used
#' when `match = "genes"`.
#'
#'
#' @return
#' The supplied data with the attribute columns from the GFF file appended,
#' based on overlapping genomic regions. Factor attributes (such as the
#' feature type) are returned as character values.
#'
#'
#'
#' @export
#' @importFrom rtracklayer import
#' @importFrom GenomicRanges GRanges
#' @importFrom GenomicRanges findOverlaps
#' @importFrom S4Vectors queryHits
#' @importFrom S4Vectors subjectHits
#' @importFrom dplyr mutate
#' @importFrom rlang sym
#' @importFrom dplyr select_if
#' @importFrom IRanges IRanges
#' @importFrom S4Vectors mcols
#' @importFrom IRanges ranges
#' @importFrom S4Vectors elementMetadata
#' @importFrom GenomeInfoDb seqnames
#' @examples
#'  # load data
#' data("sRNA_data")
#'
#' attributes_df <- RNAattributes(data = sRNA_data,
#'                     annotation = system.file("extdata",
#'                     "prefix_reduced_chr2_Tomato.gff.gz", package="mobileRNA"),
#'                     match = "genes")
#'
RNAattributes <- function(data, annotation, match = c("within", "genes"),
                          bufferRegion = 1000){
  # NROW() (not nrow()) so that NULL or a bare vector yields a length-one
  # condition instead of a zero-length one.
  if (base::missing(data) || is.null(data) || NROW(data) == 0) {
    stop("data is missing. data must be an object of class matrix, data.frame, 
           DataFrame")
  }

  if (missing(annotation) || is.null(annotation)) {
    stop("annotation parameter is missing or empty.")
  }

  # `$` access below does not work on matrices; promote them to a data.frame.
  if (is.matrix(data)) {
    data <- as.data.frame(data)
  }

  absent_cols <- setdiff(c("chr", "start", "end"), colnames(data))
  if (length(absent_cols) > 0) {
    stop("data must contain the columns `chr`, `start` and `end`; missing: ",
         paste(absent_cols, collapse = ", "))
  }

  # Collapse the vector default to a single validated choice.
  match <- match.arg(match)

  if (!is.numeric(bufferRegion) || length(bufferRegion) != 1L ||
      is.na(bufferRegion)) {
    stop("bufferRegion must be a single numeric value.")
  }

  annotation <- .RNAattributesLoadAnnotation(annotation)

  if (match == "genes") {
    features_gr <- .RNAattributesGeneRanges(annotation, bufferRegion)
  } else {
    features_gr <- annotation
  }

  # convert the sRNA clusters to GRanges
  data_gr <- GenomicRanges::GRanges(
    seqnames = data$chr,
    ranges = IRanges::IRanges(start = as.numeric(data$start),
                              end = as.numeric(data$end))
  )

  # Find overlaps between the clusters and the features. Warnings are
  # suppressed as in the original implementation (presumably the notice raised
  # when the two objects do not share all sequence names).
  overlaps <- suppressWarnings(
    GenomicRanges::findOverlaps(data_gr, features_gr)
  )
  query_idx <- S4Vectors::queryHits(overlaps)
  subject_idx <- S4Vectors::subjectHits(overlaps)

  if (length(subject_idx) == 0) {
    stop("No genomic features matched the sRNA clusters")
  }

  features_df <- as.data.frame(features_gr)
  if (match == "genes") {
    # The gene metadata is nested under `metadata`, so the conversion prefixes
    # every attribute column with "metadata."; strip it again.
    names(features_df) <- sub("^metadata\\.", "", names(features_df))
  }

  .RNAattributesFill(data, features_df, query_idx, subject_idx)
}

# Resolve `annotation` to a GRanges object: pass GRanges through, import
# paths/URLs/connections/GFFFile objects, and reject everything else.
.RNAattributesLoadAnnotation <- function(annotation) {
  if (methods::is(annotation, "GRanges")) {
    return(annotation)
  }

  if (is.character(annotation)) {
    if (length(annotation) != 1L || is.na(annotation)) {
      stop("annotation must be a single path or URL.")
    }
    # Remote locations cannot be checked with file.exists().
    is_url <- grepl("^(https?|ftp|file)://", annotation)
    if (!is_url && !file.exists(annotation)) {
      stop("The annotation file does not exist: ", annotation)
    }
    return(suppressMessages(rtracklayer::import(annotation)))
  }

  if (inherits(annotation, "connection") ||
      methods::is(annotation, "GFFFile")) {
    return(suppressMessages(rtracklayer::import(annotation)))
  }

  stop("The annotation does not exist, must either be a path or GRange object.")
}

# Select the features labelled "gene" and widen each by `bufferRegion` bases
# on both sides. The gene metadata is carried along under `metadata`.
.RNAattributesGeneRanges <- function(annotation, bufferRegion) {
  meta <- as.data.frame(S4Vectors::mcols(annotation))

  # Columns holding at least one value that is exactly "gene".
  has_gene <- vapply(meta, function(x) any(grepl("^gene$", x)),
                     FUN.VALUE = logical(1))
  gene_cols <- names(meta)[has_gene]
  if (length(gene_cols) == 0) {
    stop("No features labelled \"gene\" were found in the annotation.")
  }
  # Several columns may qualify; prefer the GFF feature-type column.
  gene_col <- if ("type" %in% gene_cols) "type" else gene_cols[1]

  # %in% (rather than ==) so missing values never select a feature.
  genes <- annotation[meta[[gene_col]] %in% "gene"]

  gene_coords <- as.data.frame(IRanges::ranges(genes))
  adjusted_ranges <- IRanges::IRanges(
    start = gene_coords$start - bufferRegion,
    end = gene_coords$end + bufferRegion
  )

  GenomicRanges::GRanges(
    GenomeInfoDb::seqnames(genes),
    ranges = adjusted_ranges,
    metadata = S4Vectors::mcols(genes)
  )
}

# Append the feature attribute columns to `data`. `query_idx`/`subject_idx`
# are the paired row indices of overlapping clusters and features. For every
# cluster the first non-missing value among its overlapping features is used.
.RNAattributesFill <- function(data, features_df, query_idx, subject_idx) {
  skip_cols <- c("seqnames", "width", "strand", "source", "score", "phase")
  new_cols <- setdiff(colnames(features_df), c(colnames(data), skip_cols))
  n_rows <- nrow(data)

  for (col in new_cols) {
    values <- features_df[[col]]
    # Keep the labels of factor attributes rather than their integer codes.
    if (is.factor(values)) {
      values <- as.character(values)
    }

    present <- !is.na(values[subject_idx])
    hit_query <- query_idx[present]
    hit_subject <- subject_idx[present]
    first_hit <- !duplicated(hit_query)

    filled <- rep(NA, n_rows)
    filled[hit_query[first_hit]] <- values[hit_subject[first_hit]]
    data[[col]] <- filled
  }

  # remove columns holding only missing values; drop = FALSE keeps the
  # tabular shape even when a single column remains
  keep <- vapply(data, function(x) any(!is.na(x)), FUN.VALUE = logical(1))
  data[, keep, drop = FALSE]
}
