\name{XtraSNPlocs.Hsapiens.dbSNP144.GRCh38}
\docType{package}

\alias{XtraSNPlocs.Hsapiens.dbSNP144.GRCh38-package}
\alias{XtraSNPlocs.Hsapiens.dbSNP144.GRCh38}


\title{The XtraSNPlocs.Hsapiens.dbSNP144.GRCh38 package}

\description{
  Extra SNP locations and alleles for Homo sapiens extracted
  from NCBI dbSNP Build 144. The source data files used for this
  package were created by NCBI on May 30, 2015, and contain SNPs
  mapped to reference genome GRCh38.p2 (a patched version of GRCh38
  that doesn't alter chromosomes 1-22, X, Y, MT).

  While the \pkg{SNPlocs.Hsapiens.dbSNP144.GRCh38} package contains only
  molecular variations of class \emph{snp}, this package contains molecular
  variations of other classes (\emph{in-del}, \emph{heterozygous},
  \emph{microsatellite}, \emph{named-locus}, \emph{no-variation},
  \emph{mixed}, and \emph{multinucleotide-polymorphism}).
}

\details{
  SNPs from dbSNP were filtered to keep only those satisfying the
  following 3 criteria:
  \itemize{
    \item The SNP is NOT a single-base substitution (i.e. its class
          is NOT \emph{snp}) but is a molecular variation that belongs to
          any other class supported by dbSNP: \emph{in-del},
          \emph{heterozygous}, \emph{microsatellite}, \emph{named-locus},
          \emph{no-variation}, \emph{mixed}, or
          \emph{multinucleotide-polymorphism}.

    \item The SNP is marked as \emph{notwithdrawn} in dbSNP.

    \item A \emph{single} location on the reference genome (GRCh38.p2)
          is reported for the SNP, and this location is on chromosomes
          1-22, X, Y, or MT.
  }
}

\note{
  The source data files used for this package are the same as those used for
  the \pkg{SNPlocs.Hsapiens.dbSNP144.GRCh38} package and were created by the
  dbSNP Development Team at NCBI on May 30, 2015.
}

\references{
  SNP Home at NCBI:
  \url{http://www.ncbi.nlm.nih.gov/snp}

  dbSNP Human BUILD 144 announcement:
  \url{http://www.ncbi.nlm.nih.gov/mailman/pipermail/dbsnp-announce/2015q2/000163.html}

  GRCh38.p2 assembly:
  \url{http://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.28/}

  hg38 genome at UCSC:
  \url{http://genome.ucsc.edu/cgi-bin/hgGateway?db=hg38}

  Note that hg38 and GRCh38 are the same assembly (i.e. the 455 genomic
  sequences in both of them are the same), except that they use different
  conventions to name the sequences (i.e. for the chromosome and scaffold
  names).
}

\author{H. Pages}

\seealso{
  \itemize{
    \item The \pkg{SNPlocs.Hsapiens.dbSNP144.GRCh38} package for SNPs of
          class \emph{snp}.

    \item The \link[BSgenome]{XtraSNPlocs} objects in the \pkg{BSgenome}
          software package for how to access the data stored in this package.

    \item The \link[GenomicRanges]{GRanges} class in the \pkg{GenomicRanges}
          package.

    \item The \pkg{VariantAnnotation} software package to annotate variants
          with respect to location and amino acid coding.
  }
}

\examples{
## ---------------------------------------------------------------------
## A. BASIC USAGE
## ---------------------------------------------------------------------
## The data object has the same name as the package.
snps <- XtraSNPlocs.Hsapiens.dbSNP144.GRCh38
snpcount(snps)

## Get the location, RefSNP id, and alleles for all "extra SNPs" on
## chromosome 22 and MT (sequence names are "ch22" and "chMT" here):
my_snps1 <- snpsBySeqname(snps, c("ch22", "chMT"),
                          columns=c("RefSNP_id", "alleles"))
my_snps1

## Get the location and alleles for some RefSNP ids:
my_rsids <- c("rs367617508", "rs398104919", "rs3831697", "rs372470289",
              "rs141568169", "rs34628976", "rs67551854")
my_snps2 <- snpsById(snps, my_rsids, columns=c("RefSNP_id", "alleles"))
my_snps2

## ---------------------------------------------------------------------
## B. COMPUTE AND ADD REFERENCE ALLELE AS AN ADDITIONAL METADATA COLUMN
## ---------------------------------------------------------------------
library(BSgenome.Hsapiens.UCSC.hg38)
genome <- BSgenome.Hsapiens.UCSC.hg38

## Before we can call getSeq(genome, my_snps1), we need to harmonize the
## seqinfo components of 'genome' and 'my_snps1', that is, make them use
## the same sequence naming style and the same genome name:
seqlevelsStyle(my_snps1)  # dbSNP
seqlevelsStyle(genome)  # UCSC
seqlevelsStyle(my_snps1) <- seqlevelsStyle(genome)
genome(my_snps1) <- "hg38"

## Extract the reference sequence at each SNP location. Empty sequences
## are replaced with "-" before being stored as a metadata column:
ref_allele1 <- getSeq(genome, my_snps1)
ref_allele1[ref_allele1 == ""] <- "-"
mcols(my_snps1)$ref_allele <- ref_allele1
my_snps1

## ---------------------------------------------------------------------
## C. COMPARE ALLELES REPORTED BY dbSNP WITH REFERENCE ALLELE
## ---------------------------------------------------------------------
## Split each "/"-separated allele string so that every SNP gets the
## vector of its reported alleles:
alleles1 <- mcols(my_snps1)$alleles
alleles1 <- CharacterList(strsplit(alleles1, "/", fixed=TRUE))

## A SNP "disagrees" when none of its reported alleles is equal to the
## computed reference allele:
disagrees_idx <- which(all(as.character(ref_allele1) != alleles1))
my_snps1[disagrees_idx]
length(disagrees_idx) / length(my_snps1)  # 0.003261601
## Conclusion: 0.33\% of the "extra SNPs" in dbSNP have reported alleles
## that disagree with the computed reference allele :-/
}

\keyword{package}
