###

library(BSgenome)

.load_raw_snps <- function(filepath)
{
    COL2CLASS <- c(RefSNP_id="character",
                   alleles="character",
                   avg_het="NULL",
                   se_het="NULL",
                   chr="factor",
                   chr_pos="integer",
                   strand="factor")
    read.table(filepath, quote="", col.names=names(COL2CLASS),
               na.strings="?", colClasses=unname(COL2CLASS),
               stringsAsFactors=FALSE)
}

.bad_snps <- function(raw_snps, bad_idx)
{
    bad_snps <- raw_snps[bad_idx, "RefSNP_id"]
    nbad <- length(bad_snps)
    if (nbad > 6L)
        bad_snps <- c(head(bad_snps, n=5L), "...")
    bad_snps <- paste(bad_snps, collapse=", ")
    if (nbad > 6L)
        bad_snps <- paste0(bad_snps, " [", nbad - 5L, " more]")
    bad_snps
}

### Performs some basic sanity checks.
.check_raw_snps <- function(raw_snps, seqname)
{
    stopifnot(isSingleString(seqname))

    ## Check RefSNP_id column.
    RefSNP_id <- raw_snps[ , "RefSNP_id"]
    if (any(is.na(RefSNP_id)))
        stop(wmsg("RefSNP id is NA for some SNPs on chromosome ", seqname))
    bad_idx <- which(substr(RefSNP_id, 1L, 2L) != "rs")
    if (length(bad_idx) != 0L)
        stop(wmsg("SNP(s) on chromosome ", seqname, " ",
                  "with an RefSNP id that does not start with \"rs\": ",
                  .bad_snps(raw_snps, bad_idx)))
    bad_idx <- which(substr(RefSNP_id, 1L, 3L) == "rs0")
    if (length(bad_idx) != 0L)
        stop(wmsg("SNP(s) on chromosome ", seqname, " ",
                  "with an RefSNP id that starts with \"rs0\": ",
                  .bad_snps(raw_snps, bad_idx)))

    ## Check alleles column.
    na_idx <- which(is.na(raw_snps[ , "alleles"]))
    if (length(na_idx) != 0L)
        stop(wmsg("SNP(s) on chromosome ", seqname, " ",
                  "with unspecified alleles: ",
                  .bad_snps(raw_snps, na_idx)))

    ## Check chr column.
    bad_chr_idx <- which(!(raw_snps[ , "chr"] %in% seqname))
    if (length(bad_chr_idx) != 0L)
        stop(wmsg("SNP(s) with unexpected chr information: ",
                  .bad_snps(raw_snps, bad_chr_idx)))

    ## Check chr_pos column.
    na_idx <- which(is.na(raw_snps[ , "chr_pos"]))
    ## Should never happen because we've already filtered out SNPs with an
    ## unspecified position (see filter3_ds_flat.sh).
    if (length(na_idx) != 0L)
        stop(wmsg("SNP(s) on chromosome ", seqname, " ",
                  "at an unspecified position (despite our previous ",
                  "filtering, see filter3_ds_flat.sh): ",
                  .bad_snps(raw_snps, na_idx)))

    ## Check strand column.
    strand_levels <- levels(raw_snps[ , "strand"])
    bad_levels <- setdiff(strand_levels, c("+", "-"))
    if (length(bad_levels) != 0L)
        stop(wmsg("unexpected level(s) in column \"strand\": ",
                  paste(bad_levels, collapse=", ")))
}

### Produce a DataFrame with 3 columns:
###   1. rsid: integer vector (RefSNP_id with "rs" prefix removed)
###   2. pos: integer vector (the input chr_pos col)
###   3. alleles: raw vector (alleles as an IUPAC letter, complemented
###      if strand is -, and turned into byte value).
.cook_raw_snps <- function(raw_snps)
{
    RefSNP_id <- raw_snps[ , "RefSNP_id"]
    rsid <- substr(RefSNP_id, 3L, nchar(RefSNP_id))  # trim "rs" prefix
    rsid <- as.integer(rsid)
    stopifnot(identical(paste0("rs", rsid), RefSNP_id))

    alleles <- gsub("/", "", raw_snps[ , "alleles"], fixed=TRUE)
    minus_strand_idx <- which(raw_snps[ , "strand"] == "-")
    alleles[minus_strand_idx] <- chartr("ACGT", "TGCA",
                                        alleles[minus_strand_idx])
    alleles <- BSgenome:::encode_letters_as_bytes(mergeIUPACLetters(alleles))

    ans <- data.frame(rsid=rsid,
                      pos=raw_snps[ , "chr_pos"],
                      alleles=alleles,
                      stringsAsFactors=FALSE)
    ans <- ans[order(ans[ , "pos"]), , drop=FALSE]
    row.names(ans) <- NULL
    ans
}

### Believe it or not, but some SNPs in dbSNP are reported to be at a position
### that is beyond the end of the chromosome. For example, rs553244808 is
### reported to be at position 143544518 on chromosome 14 in GRCh38.p7, even
### though the length of this chromosome is 107043718. We drop these SNPs.
.drop_out_of_bounds_snps <- function(cooked_snps, seqlength)
{
    pos <- cooked_snps[ , "pos"]
    is_out_of_bounds <- pos < 1L | pos > seqlength
    nb_out_of_bounds <- sum(is_out_of_bounds)
    if (nb_out_of_bounds != 0L) {
        cat("  DROP ", nb_out_of_bounds, " OUT OF BOUNDS SNPS! ... ", sep="")
        keep_idx <- which(!is_out_of_bounds)
        cooked_snps <- cooked_snps[keep_idx, , drop=FALSE]
        cat("OK\n")
    }
    cooked_snps
}

.build_spatial_index <- function(pos, batchsize, seqname, seqinfo)
{
    stopifnot(is.integer(pos))
    if (is.unsorted(pos))
        stop(wmsg("'pos' must be sorted"))
    chunks <- breakInChunks(length(pos), batchsize)
    spatial_ranges <- range(relist(pos, chunks))
    GRanges(seqname, IRanges(spatial_ranges[ , 1L],
                             spatial_ranges[ , 2L]),
            batchsize=width(chunks),
            seqinfo=seqinfo)
}

### 'seqnames' must be a single string (e.g. "20 21 22")
build_OnDiskLongTable <- function(tmp_dir, seqnames, chr_prefix="ch",
                                  batchsize=200000L)
{
    cat("\n")
    cat("***************** START build_OnDiskLongTable() ******************\n")

    seqnames <- strsplit(seqnames, " ", fixed=TRUE)[[1L]]

    seqinfo <- BSgenome:::read_seqinfo_table("seqinfo.txt", genome="GRCh38.p7")

    rowids <- vector("list", length=length(seqnames))
    names(rowids) <- seqnames

    append <- FALSE
    for (seqname in seqnames) {
        cat("\n")
        cat("Process SNPs for chromosome ", seqname, ":\n", sep="")

        filename <- paste0(chr_prefix, seqname, "_raw_snps.txt")
        cat("  Load the raw SNPs from ", filename, " ... ", sep="")
        filepath <- file.path(tmp_dir, filename)
        raw_snps <- .load_raw_snps(filepath)
        cat("OK [", nrow(raw_snps), " raw SNPs loaded]\n", sep="")

        cat("  Check the raw SNPs ... ", sep="")
        .check_raw_snps(raw_snps, seqname)
        cat("OK\n")

        cat("  Cook the raw SNPs ... ", sep="")
        cooked_snps <- .cook_raw_snps(raw_snps)
        cat("OK\n")

        seqlength <- seqlengths(seqinfo)[[seqname]]
        cooked_snps <- .drop_out_of_bounds_snps(cooked_snps, seqlength)

        rowids[[seqname]] <- cooked_snps[ , 1L]

        df <- cooked_snps[ , -1L, drop=FALSE]
        spatial_index <- .build_spatial_index(df[ , "pos"], batchsize,
                                              seqname, seqinfo)
        fmt <- "%s the SNPs %s OnDiskLongTable directory structure"
        if (!append) {
            msg <- sprintf(fmt, "Save", "as an")
        } else {
            msg <- sprintf(fmt, "Append", "to")
        }
        cat("  ", msg, " ... ", sep="")
        writeOnDiskLongTable(df, spatial_index=spatial_index,
                                 append=append)
        append <- TRUE
        cat("OK\n")
    }

    cat("\n")

    rowids <- unlist(rowids, recursive=FALSE, use.names=FALSE)
    cat("Add the RefSNP ids to OnDiskLongTable directory structure ... ")
    ## Using compress="xz" reduces the size on disk by < 2% but makes further
    ## loading of the row ids (with readRDS()) 8x slower. Not worth it!
    #writeOnDiskLongTableRowids(rowids, compress="xz")
    writeOnDiskLongTableRowids(rowids)
    cat("OK\n")

    cat("\n")
    cat("****************** END build_OnDiskLongTable() *******************\n")
    cat("Total number of SNPs written to disk: ", length(rowids), "\n", sep="")
}

