\name{SNPlocs.Hsapiens.dbSNP150.GRCh38}
\docType{package}

\alias{SNPlocs.Hsapiens.dbSNP150.GRCh38-package}
\alias{SNPlocs.Hsapiens.dbSNP150.GRCh38}

\alias{COMPATIBLE_BSGENOMES}


\title{The SNPlocs.Hsapiens.dbSNP150.GRCh38 package}

\description{
  SNP positions and alleles for Homo sapiens extracted
  from NCBI dbSNP Build 150. The source data files used for this
  package were created by NCBI between March 12-14, 2017, and contain
  SNPs mapped to reference genome GRCh38.p7 (a patched version of GRCh38
  that doesn't alter chromosomes 1-22, X, Y, MT).
}

\details{
  SNPs from dbSNP were filtered to keep only those satisfying the 4
  following criteria:
  \enumerate{
    \item The SNP is a single-base substitution i.e. its class is \emph{snp}.
          Other classes supported by dbSNP are: \emph{in-del},
          \emph{heterozygous}, \emph{microsatellite}, \emph{named-locus},
          \emph{no-variation}, \emph{mixed}, and
          \emph{multinucleotide-polymorphism}.
          SNPs of these other classes are NOT included in
          \pkg{SNPlocs.Hsapiens.dbSNP150.GRCh38}
          but are available in the separate package
          \pkg{XtraSNPlocs.Hsapiens.dbSNP150.GRCh38}.

    \item The SNP is marked as \emph{notwithdrawn}.

    \item A \emph{single} position on the reference genome (GRCh38.p7)
          is reported for the SNP, and this position is on chromosome
          1-22, X, Y, or MT.

    \item The SNP is not \emph{out of bounds}, that is, its reported position
          is not beyond the end of the chromosome. Believe it or not, some
          SNPs in dbSNP are actually out of bounds. For example, rs553244808
          is reported to be at position 143544518 on chromosome 14 in assembly
          GRCh38.p7, even though the length of this chromosome is 107043718!
  }

  SNPlocs packages always store the alleles corresponding to the \emph{plus}
  strand, whatever the strand reported by dbSNP is (which is achieved by
  storing the complement of the alleles reported by dbSNP for SNPs located
  on the minus strand).
  In other words, in a SNPlocs package, all the SNPs are considered to be
  on the plus strand and everything is reported with respect to that strand. 
}

\note{
  The SNPs in this package can be \dQuote{injected} in
  \pkg{BSgenome.Hsapiens.NCBI.GRCh38} or \pkg{BSgenome.Hsapiens.UCSC.hg38}
  and will land at the correct position.

  See \code{?\link[BSgenome]{injectSNPs}} in the \pkg{BSgenome} software
  package for more information about the SNP injection mechanism.
}

\references{
  SNP Home at NCBI:
  \url{https://www.ncbi.nlm.nih.gov/snp}

  dbSNP Human BUILD 150 announcement:
  \url{https://www.ncbi.nlm.nih.gov/mailman/pipermail/dbsnp-announce/2017q2/000175.html}

  GRCh38.p7 assembly:
  \url{https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.33/}

  hg38 genome at UCSC:
  \url{http://genome.ucsc.edu/cgi-bin/hgGateway?db=hg38}

  Note that hg38 and GRCh38 are the same assembly (i.e. the 455 genomic
  sequences in both of them are the same), except that they use different
  conventions to name the sequences (i.e. for the chromosome and scaffold
  names).
}

\author{H. Pagès}

\seealso{
  \itemize{
    \item The \pkg{XtraSNPlocs.Hsapiens.dbSNP150.GRCh38} package for SNPs of
          class other than \emph{snp}.

    \item \code{\link[BSgenome]{snpcount}} in the \pkg{BSgenome} software
          package for how to access the data stored in this package.

    \item \code{\link[Biostrings]{IUPAC_CODE_MAP}} in the \pkg{Biostrings}
          package.

    \item The \link[GenomicRanges]{GRanges} class in the \pkg{GenomicRanges}
          package.

    \item \code{\link[BSgenome]{injectSNPs}} in the \pkg{BSgenome} software
          package for SNP injection.

    \item The \pkg{VariantAnnotation} software package to annotate variants
          with respect to location and amino acid coding.
  }
}

\examples{
## ---------------------------------------------------------------------
## A. BASIC USAGE
## ---------------------------------------------------------------------
## Use a shorter name for the data object provided by this package:
snps <- SNPlocs.Hsapiens.dbSNP150.GRCh38
snpcount(snps)

## Get the positions and alleles of all SNPs on chromosome 22:
chr22_snps <- snpsBySeqname(snps, "22")
chr22_snps

## Get the positions and alleles of all SNPs on chromosomes 22 and MT:
snpsBySeqname(snps, c("22", "MT"))

## ---------------------------------------------------------------------
## B. EXTRACT SNP INFORMATION FOR A SET OF RS IDS
## ---------------------------------------------------------------------
my_rsids <- c("rs2639606", "rs75264089", "rs73396229", "rs55871206",
              "rs10932221", "rs56219727", "rs73709730", "rs55838886",
              "rs3734153", "rs79381275", "rs1516535")
## Note that the 1st call to snpsById() takes a long time but subsequent
## calls are expected to be slightly faster.
my_snps <- snpsById(snps, my_rsids)
my_snps

## Translate the IUPAC ambiguity codes used to represent the alleles
## into nucleotides:
IUPAC_CODE_MAP[mcols(my_snps)$alleles_as_ambig]

## ---------------------------------------------------------------------
## C. INJECTION IN THE REFERENCE GENOME
## ---------------------------------------------------------------------
library(BSgenome.Hsapiens.UCSC.hg38)
genome <- BSgenome.Hsapiens.UCSC.hg38
genome

genome2 <- injectSNPs(genome, "SNPlocs.Hsapiens.dbSNP150.GRCh38")
genome2  # note the additional line "with SNPs injected from..."

## Compare the letter frequencies of chr22 before and after injection:
alphabetFrequency(genome$chr22)
alphabetFrequency(genome2$chr22)

## Get the number of nucleotides that were modified by this injection:
neditAt(genome$chr22, genome2$chr22)  # 3974040

## ---------------------------------------------------------------------
## D. SOME BASIC QUALITY CONTROL (WITH SURPRISING RESULTS!)
## ---------------------------------------------------------------------

## Note that dbSNP can assign distinct ids to SNPs located at the same
## position:
any(duplicated(mcols(chr22_snps)$RefSNP_id))  # rs ids are all distinct...
any(duplicated(chr22_snps))  # but some positions are repeated!

which(duplicated(chr22_snps))[1:5]  # 443, 614, 1506, 1564, 2429
chr22_snps[2428:2429]  # rs400232 and rs879813061 share the same position
                       # (11282150) and alleles (Y, i.e. C/T)

## Also note that not all SNP alleles are consistent with the GRCh38
## genomic sequences, that is, the alleles reported for a given SNP are
## not necessarily compatible with the nucleotide found at the SNP
## position in GRCh38. For example, to get the number of inconsistent
## SNPs on chr1:
chr1_snps <- snpsBySeqname(snps, "1")
chr1_alleles <- mcols(chr1_snps)$alleles_as_ambig
## Concatenate the per-SNP ambiguity codes into a single DNAString so
## they can be compared, letter by letter, with the reference
## nucleotides found at the SNP positions:
chr1_alleles <- DNAString(paste(chr1_alleles, collapse=""))
nchar(chr1_alleles)  # 23591605 SNPs on chr1
## fixed=FALSE makes the comparison treat IUPAC ambiguity codes as
## ambiguities instead of matching them as literal letters:
neditAt(genome$chr1[pos(chr1_snps)], chr1_alleles, fixed=FALSE)
## ==> 38412 SNPs (0.16\%) are inconsistent with GRCh38 chr1!
}

\keyword{package}
