#' Summarise the distribution of sRNA clusters across genomic features
#'
#' @description Counts the genomic features of each type present in the 
#' supplied annotation and counts the sRNA clusters which overlap with each of
#' these feature types. The feature types are taken from the `type` column of
#' the provided annotation.
#'
#' @details
#' `RNAfeatures` calculates the number or percentage of sRNA clusters which 
#' overlap with genomic features based on their genomic coordinates. Overlaps
#' are computed ignoring strand.
#'
#' If the annotation contains `gene` features, an additional `promoters`
#' feature is derived from them. For genes on the `+` strand (and for
#' unstranded genes, which are treated like `+`) the promoter spans from
#' `promoterRegions` bases before the gene start to the gene start. For genes
#' on the `-` strand the promoter spans from the gene end to `promoterRegions`
#' bases after the gene end.
#'
#' The characters `-`, `_` and `.` in feature names are replaced by spaces in
#' the returned table.
#' 
#' @seealso [mobileRNA::RNAmergeAnnotations()] to merge 2 GFF files into 1. 
#'
#' @param data data.frame; generated by [mobileRNA::RNAimport()]. Must contain
#' the columns `chr`, `start` and `end` holding the cluster coordinates.
#'
#' @param annotation path; URL or connection to a GFFFile object. A genome
#' reference annotation file (.gff/.gff1/.gff2/.gff3). Can be in compressed 
#' format (gzip).
#'
#' @param repeats path; URL or connection to a GFFFile object. A genome
#' reference annotation file, which only contains information on repeat
#' sequences in the genome (.gff/.gff1/.gff2/.gff3). By default, this is not
#' required, however if there is a specific repeats annotation file for the
#' genome it is suggested to supply it. Can be in compressed format (gzip).
#' 
#' @param promoterRegions numeric; defines the upstream promoter region of 
#' genes. Default is 1000, which refers to promoters set at 1Kb upstream of
#' genes.
#' 
#' @param percentage logical; define whether to return the results as a 
#' percentage of the total or returned as a count value representing the 
#' number of sRNA clusters that overlap with a given genomic feature. Default is
#' `TRUE`. 
#'
#'
#' @return Returns a table with one row per genomic feature type and the 
#' columns `Genome` (number of features of that type in the annotation) and
#' `Dataset` (number of sRNA clusters overlapping that feature type). When
#' `percentage = TRUE` a data.frame of formatted percentages (each column
#' relative to its own total) is returned, otherwise a numeric matrix of
#' counts.
#' 
#' @examples
#' data("sRNA_data")
#' features <- RNAfeatures(data = sRNA_data,
#'                        annotation = system.file("extdata",
#'                        "reduced_chr2_Tomato.gff.gz", package="mobileRNA"))
#'                        
#'@importFrom rtracklayer import
#'@importFrom GenomicRanges findOverlaps
#'@importFrom S4Vectors queryHits
#'@importFrom stats start
#'@importFrom stats end
#'@importFrom BiocGenerics strand
#'@importFrom dplyr select
#'@importFrom dplyr mutate
#'@importFrom dplyr filter
#'@importFrom GenomicRanges makeGRangesFromDataFrame
#'@importFrom scales label_percent
#'@importFrom dplyr %>%
#'@importFrom IRanges overlapsAny
#'@export
RNAfeatures <- function(data, annotation,
                        repeats = NULL,
                        promoterRegions = 1000,
                        percentage = TRUE) {
  # ---- input validation (runs before any file is read) ----
  if (base::missing(data)) {
    stop("data is missing. data must be an object of class matrix, data.frame, 
         DataFrame. ")
  }
  if (!base::inherits(data, c("matrix", "data.frame", "DataFrame"))) {
    stop("data must be an object of class matrix, data.frame, DataFrame.")
  }
  if (missing(annotation) || is.null(annotation)) {
    stop("annotation parameter is missing or empty.")
  }
  # Only character input can be checked here; anything else (e.g. a
  # connection) is handed to rtracklayer::import() as-is.
  if (is.character(annotation)) {
    if (length(annotation) != 1L || is.na(annotation) || !nzchar(annotation)) {
      stop("annotation parameter is missing or empty.")
    }
    # A URL cannot be checked with file.exists(), so only local paths are.
    is_url <- grepl("^(https?|ftp)://", annotation)
    if (!is_url && !file.exists(annotation)) {
      stop("annotation file does not exist: ", annotation)
    }
  }
  if (!is.numeric(promoterRegions) || length(promoterRegions) != 1L ||
      is.na(promoterRegions)) {
    stop("promoterRegions must be a single numeric value.")
  }

  # Work on a plain data.frame so that column selection also works when a
  # matrix or DataFrame was supplied.
  cluster_df <- as.data.frame(data)
  required_cols <- c("chr", "start", "end")
  missing_cols <- setdiff(required_cols, colnames(cluster_df))
  if (length(missing_cols) > 0) {
    stop("data must contain the columns 'chr', 'start' and 'end'; missing: ",
         paste(missing_cols, collapse = ", "))
  }

  # ---- collect the ranges of every feature type ----
  annotation_info <- rtracklayer::import(annotation)
  type_values <- as.character(annotation_info$type)
  feature_types <- levels(annotation_info$type)
  if (is.null(feature_types)) {
    # 'type' is not a factor: fall back to the observed values
    feature_types <- sort(unique(type_values))
  }

  # A plain named list (one set of ranges per feature type) is enough for the
  # counting below and lets promoters/repeats be appended even though they
  # carry different metadata columns than the annotation.
  store_f <- lapply(feature_types, function(feature_type) {
    annotation_info[which(type_values == feature_type)]
  })
  names(store_f) <- feature_types

  # if genes are present, derive their promoter regions
  if ("gene" %in% names(store_f)) {
    gene_df <- as.data.frame(store_f[["gene"]])
    # keep only the coordinate columns; annotation attributes are not needed
    gene_df <- gene_df[, c("seqnames", "start", "end", "strand"), drop = FALSE]

    # "Upstream" depends on the strand: before the start for '+', after the
    # end for '-'. Unstranded ('*') genes are treated like '+'.
    on_minus <- as.character(gene_df$strand) == "-"
    promoter_df <- gene_df
    promoter_df$start[!on_minus] <- gene_df$start[!on_minus] - promoterRegions
    promoter_df$end[!on_minus] <- gene_df$start[!on_minus]
    promoter_df$start[on_minus] <- gene_df$end[on_minus]
    promoter_df$end[on_minus] <- gene_df$end[on_minus] + promoterRegions

    store_f[["promoters"]] <-
      GenomicRanges::makeGRangesFromDataFrame(promoter_df)
  }

  # if a repeats annotation was supplied, add it as its own feature type
  if (!is.null(repeats)) {
    store_f[["repeats"]] <- rtracklayer::import(repeats)
  }

  # replace special characters in the feature names by spaces
  names(store_f) <- gsub("[-_.]", " ", names(store_f))

  # ---- count features (Genome) and overlapping clusters (Dataset) ----
  cluster_ranges <- GenomicRanges::makeGRangesFromDataFrame(
    cluster_df[, required_cols, drop = FALSE])

  # Number of sRNA clusters that overlap at least one of the given feature
  # ranges. The clusters are the query, so each cluster is counted once no
  # matter how many features it touches.
  count_overlapping_clusters <- function(feature_ranges) {
    hits <- suppressWarnings(
      GenomicRanges::findOverlaps(cluster_ranges, feature_ranges,
                                  ignore.strand = TRUE))
    as.numeric(length(unique(S4Vectors::queryHits(hits))))
  }

  sRNA_features_df <- matrix(0, nrow = 2, ncol = length(store_f),
                             dimnames = list(c("Genome", "Dataset"),
                                             names(store_f)))
  sRNA_features_df["Genome", ] <- vapply(
    store_f, function(feature_ranges) as.numeric(length(feature_ranges)),
    numeric(1))
  sRNA_features_df["Dataset", ] <- vapply(
    store_f, count_overlapping_clusters, numeric(1))

  if (!isTRUE(percentage)) {
    return(t(sRNA_features_df))
  }

  # Express each column as a share of its own total. An all-zero column gives
  # NA after formatting, which is reported as "0%".
  as_percent <- function(counts) {
    formatted <- scales::label_percent()(counts / sum(counts))
    formatted[is.na(formatted)] <- "0%"
    formatted
  }
  sRNA_features_df <- data.frame(t(sRNA_features_df))
  sRNA_features_df$Genome <- as_percent(sRNA_features_df$Genome)
  sRNA_features_df$Dataset <- as_percent(sRNA_features_df$Dataset)
  sRNA_features_df
}
