###

library(BSgenome)

loadingRawXtraSNPs <- function(filepath)
{
    COLNAMES <- c("RefSNP_id", "snpClass", "alleles",
                  "chr_pos", "ctg_start", "ctg_end", "loctype", "strand")
    snpClass_levels <- c("in-del", "heterozygous", "microsatellite",
                         "named-locus", "no-variation", "mixed",
                         "multinucleotide-polymorphism")
    df <- read.table(filepath, sep="\t", quote="",
                     col.names=COLNAMES, na.strings="?",
                     stringsAsFactors=FALSE)
    snpClass <- factor(df$snpClass, levels=snpClass_levels)
    idx <- which(is.na(df$snpClass))
    if (length(idx) != 0L) {
        cat("SNPs with an unknown class:\n")
        print(df[idx, ])
        stop("some SNPs have an unknown class (see above list)")
    }
    df$snpClass <- snpClass
    df
}

### Performs some basic sanity checking. Drop SNPs that are too insane.
checkRawXtraSNPs <- function(raw_xtra_snps, seqname)
{
    if (any(is.na(raw_xtra_snps$RefSNP_id)))
        stop("some RefSNP ids are NA on ", seqname)

    ## Should never happen because we've already filtered out SNPs with an
    ## unspecified location (see filter3_ds_flat.sh).
    chr_pos_is_NA <- is.na(raw_xtra_snps$chr_pos)
    if (any(chr_pos_is_NA)) {
        msg <- c("SNPs with RefSNP ids ",
                 paste(raw_xtra_snps$RefSNP_id[chr_pos_is_NA], collapse=", "),
                 " on ", seqname, " still have an unspecified location ",
                 "despite our previous filtering -- dropping them now")
        warning(paste(msg, collapse=""), immediate.=TRUE)
        raw_xtra_snps <- raw_xtra_snps[!chr_pos_is_NA , ]
    }

    idx2 <- which(raw_xtra_snps$loctype == 2L)
    if (length(idx2) != 0L) {
        cat("Extra SNPs with a loctype of 2:\n")
        print(raw_xtra_snps[idx2, ])
        warning("some xtra SNPs have a loctype of 2 ",
                "(see above list). Dropping them!", immediate.=TRUE)
        raw_xtra_snps <- raw_xtra_snps[-idx2, ]
    }

    idx3 <- which(raw_xtra_snps$loctype == 3L)
    idx <- which((raw_xtra_snps$ctg_end - raw_xtra_snps$ctg_start)[idx3] != 1L)
    if (length(idx) != 0L) {
        cat("Insertions with a non zero-width range:\n")
        print(raw_xtra_snps[idx3[idx], ])
        warning("some insertions have a non zero-width range ",
                "(see above list). Dropping them!", immediate.=TRUE)
        raw_xtra_snps <- raw_xtra_snps[-idx3[idx], ]
    }
    raw_xtra_snps
}

### Produce a data frame with 7 columns:
###   1. RefSNP_id: integer vector ("rs" prefix removed)
###   2. snpClass: factor
###   3. alleles: character vector (the original alleles col)
###   4. start: integer vector
###   5. width: integer vector
###   6. strand: raw vector
###   7. loctype: raw vector
cookRawXtraSNPs <- function(raw_xtra_snps, seqname)
{
    ids <- raw_xtra_snps$RefSNP_id
    if (!all(substr(ids, 1, 2) == "rs"))
        stop("some RefSNP ids do not start with \"rs\" on ", seqname)
    ids <- substr(ids, 3, nchar(ids))
    if (any(substr(ids, 1L, 1L) == "0"))
        stop("some RefSNP ids start with \"rs0\" on ", seqname)
    ids <- as.integer(ids)

    ## Infer the start and width based on chr_pos, ctg_start, ctg_end, and
    ## loctype.
    ## According to ftp://ftp.ncbi.nih.gov/snp/00readme.txt, loctypes 1 and 3
    ## are insertion and deletion, but when looking at the data it actually
    ## seems to be the other way around.
    ans_start <- raw_xtra_snps$chr_pos
    ans_width <- raw_xtra_snps$ctg_end - raw_xtra_snps$ctg_start + 1L

    ## loctype 1: deletion.

    ## loctype 2: exact. There should be none of these.

    ## loctype 3: insertion.
    idx3 <- which(raw_xtra_snps$loctype == 3L)
    idx <- which((raw_xtra_snps$ctg_end - raw_xtra_snps$ctg_start)[idx3] != 1L)
    if (length(idx) != 0L) {
        cat("Insertions with a non zero-width range:\n")
        print(raw_xtra_snps[idx3[idx], ])
        stop("some insertions have a non zero-width range (see above list)")
    }
    ans_start[idx3] <- ans_start[idx3] + 1L
    ans_width[idx3] <- 0L

    ## loctype 4: range-insertion.

    ## loctype 5: range-exact.

    ## loctype 6: range-deletion.

    is_on_plus <- raw_xtra_snps$strand == "+"
    is_on_minus <- raw_xtra_snps$strand == "-"
    idx <- which(!(is_on_plus | is_on_minus))
    if (length(idx) != 0L) {
        cat("SNPs with a non-recognized strand:\n")
        print(raw_xtra_snps[idx, ])
        stop("some SNPs have a non-recognized strand (see above list)")
    }
    ans_strand <- raw(nrow(raw_xtra_snps))
    minus_idx <- which(is_on_minus)
    ans_strand[minus_idx] <- as.raw(1L)

    ans <- data.frame(RefSNP_id=ids,
                      snpClass=raw_xtra_snps$snpClass,
                      alleles=raw_xtra_snps$alleles,
                      start=ans_start,
                      width=ans_width,
                      strand=ans_strand,
                      loctype=as.raw(raw_xtra_snps$loctype),
                      stringsAsFactors=FALSE)
    ans <- ans[order(ans$start), ]
    row.names(ans) <- NULL
    ans
}

### 'shortseqnames' must be a single string (e.g. "20 21 22")
loadAndSerializeXtraSNPs <- function(path, shortseqnames, chr_prefix="ch")
{
    cat("\n")
    cat("**************** START loadAndSerializeXtraSNPs() ****************\n")

    shortseqnames <- strsplit(shortseqnames, " ", fixed=TRUE)[[1L]]
    seqnames <- paste0(chr_prefix, shortseqnames)

    rowids <- vector("list", length=length(seqnames))
    names(rowids) <- seqnames
    append <- FALSE
    for (seqname in seqnames) {
        cat("\n")
        cat("Process raw xtra SNPs for ", seqname, ":\n", sep="")

        cat("Load the SNPs ... ", sep="")
        filepath <- file.path(path, paste0(seqname, "_raw_xtra_snps.txt"))
        raw_xtra_snps <- loadingRawXtraSNPs(filepath)
        cat("OK\n")

        cat("Check the SNPs ... ", sep="")
        raw_xtra_snps <- checkRawXtraSNPs(raw_xtra_snps, seqname)
        cat("OK\n")

        cat("Cook the SNPs ... ", sep="")
        xtra_snplocs <- cookRawXtraSNPs(raw_xtra_snps, seqname)
        cat("OK\n")

        rowids[[seqname]] <- xtra_snplocs[ , 1L]
        xtra_snplocs <- xtra_snplocs[ , -1L]
        if (!append)
            cat("Save the SNPs (", nrow(xtra_snplocs), ") ",
                "as an OnDiskLongTable object ... ", sep="")
        else
            cat("Append the SNPs (", nrow(xtra_snplocs), ") ",
                "to OnDiskLongTable object ... ", sep="")
        saveAsOnDiskLongTable(xtra_snplocs, append=append, batch_label=seqname)
        cat("OK\n")
        append <- TRUE
    }

    cat("\n")
    rowids <- unlist(rowids, recursive=FALSE, use.names=FALSE)
    cat("Saving RefSNP ids as OnDiskLongTable row ids ... ", sep="")
    saveRowidsForOnDiskLongTable(rowids)
    cat("OK\n")

    cat("\n")
    cat("***************** END loadAndSerializeXtraSNPs() *****************\n")
}

