% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/processStudy.R
\encoding{UTF-8}
\name{computePoolSyntheticAncestryGr}
\alias{computePoolSyntheticAncestryGr}
\title{Run a PCA analysis and a K-nearest neighbors analysis on a small set
of synthetic data using all 1KG profiles except the ones used to generate
the synthetic profiles}
\usage{
computePoolSyntheticAncestryGr(
  gdsProfile,
  sampleRM,
  spRef,
  studyIDSyn,
  np = 1L,
  listCatPop = c("EAS", "EUR", "AFR", "AMR", "SAS"),
  fieldPopInfAnc = "SuperPop",
  kList = seq(2, 15, 1),
  pcaList = seq(2, 15, 1),
  algorithm = c("exact", "randomized"),
  eigenCount = 32L,
  missingRate = 0.025,
  verbose = FALSE
)
}
\arguments{
\item{gdsProfile}{an object of class
\code{\link[SNPRelate:SNPGDSFileClass]{SNPRelate::SNPGDSFileClass}}, the
opened Profile GDS file.}

\item{sampleRM}{a \code{vector} of \code{character} strings representing
the identifiers of the 1KG reference profiles that should not be used to
create the reference PCA. There should be one per sub-continental
population. Those profiles are
removed because they have been used to generate the synthetic profiles
that are going to be analysed here. The sub-continental
identifiers are used as names for the \code{vector}.}

\item{spRef}{a \code{vector} of \code{character} strings representing the
known super population ancestry for the 1KG profiles. The 1KG profile
identifiers are used as names for the \code{vector}.}

\item{studyIDSyn}{a \code{character} string corresponding to the study
identifier.
The study identifier must be present in the Profile GDS file.}

\item{np}{a single positive \code{integer} representing the number of
threads. Default: \code{1L}.}

\item{listCatPop}{a \code{vector} of \code{character} strings
representing the list of possible ancestry assignments. Default:
\code{c("EAS", "EUR", "AFR", "AMR", "SAS")}.}

\item{fieldPopInfAnc}{a \code{character} string representing the name of
the column that will contain the inferred ancestry for the specified
dataset. Default: \code{"SuperPop"}.}

\item{kList}{a \code{vector} of \code{integer} representing the list of
values tested for the \emph{K} parameter. The \emph{K} parameter represents the
number of neighbors used in the K-nearest neighbor analysis. If \code{NULL},
the value \code{seq(2,15,1)} is assigned.
Default: \code{seq(2,15,1)}.}

\item{pcaList}{a \code{vector} of \code{integer} representing the list of
values tested for the \emph{D} parameter. The \emph{D} parameter represents the
number of dimensions used in the PCA analysis. If \code{NULL},
the value \code{seq(2,15,1)} is assigned.
Default: \code{seq(2,15,1)}.}

\item{algorithm}{a \code{character} string representing the algorithm used
to calculate the PCA. The 2 choices are "exact" (traditional exact
calculation) and "randomized" (fast PCA with randomized algorithm
introduced in Galinsky et al. 2016). Default: \code{"exact"}.}

\item{eigenCount}{a single \code{integer} indicating the number of
eigenvectors that will be in the output of the \link[SNPRelate]{snpgdsPCA}
function; if 'eigenCount' <= 0, then all eigenvectors are returned.
Default: \code{32L}.}

\item{missingRate}{a \code{numeric} value representing the threshold
missing rate above which the SNVs are discarded; only the SNVs with a
missing rate "<= missingRate" are retained in the
\link[SNPRelate]{snpgdsPCA} function; if \code{NaN}, no missing threshold
is applied. Default: \code{0.025}.}

\item{verbose}{a \code{logical} indicating if message information should be
printed. Default: \code{FALSE}.}
}
\value{
a \code{list} containing the following entries:
\describe{
\item{sample.id}{ a \code{vector} of \code{character} strings representing
the identifiers of the synthetic profiles. }
\item{sample1Kg}{ a \code{vector} of \code{character} strings representing
the identifiers of the reference 1KG profiles used to generate the
synthetic profiles. }
\item{sp}{ a \code{vector} of \code{character} strings representing the
known ancestry for the reference 1KG profiles used to generate the
synthetic profiles. }
\item{matKNN}{ a \code{data.frame} containing 4 columns. The first column
'sample.id' contains the name of the synthetic profile. The second column
'D' represents the dimension D used to infer the ancestry. The third column
'K' represents the number of neighbors K used to infer the ancestry. The
fourth column 'SuperPop' contains the inferred ancestry. }
}
}
\description{
The function runs a PCA analysis using 1 synthetic profile
from each sub-continental population. The reference profiles used to
create those synthetic profiles are first removed from the list
of 1KG reference profiles that generates the reference PCA. Then, the
retained synthetic
profiles are projected on the 1KG PCA space. Finally, a K-nearest neighbors
analysis using a range of K and D values is done.
}
\examples{

## Required library
library(gdsfmt)

## Load the known ancestry for the demo 1KG reference profiles
data(demoKnownSuperPop1KG)


## The name of the synthetic study
studyID <- "MYDATA.Synthetic"

## Identifiers of the 1KG reference profiles that must be excluded from the
## reference PCA (passed as the 'sampleRM' argument below)
samplesRM <- c("HG00246", "HG00325", "HG00611", "HG01173", "HG02165",
    "HG01112", "HG01615", "HG01968", "HG02658", "HG01850", "HG02013",
    "HG02465", "HG02974", "HG03814", "HG03445", "HG03689", "HG03789",
    "NA12751", "NA19107", "NA18548", "NA19075", "NA19475", "NA19712",
    "NA19731", "NA20528", "NA20908")
## The sub-continental population identifiers are used as names for the
## vector (one excluded profile per sub-continental population)
names(samplesRM) <- c("GBR", "FIN", "CHS","PUR", "CDX", "CLM", "IBS",
    "PEL", "PJL", "KHV", "ACB", "GWD", "ESN", "BEB", "MSL", "STU", "ITU",
    "CEU", "YRI", "CHB", "JPT", "LWK", "ASW", "MXL", "TSI", "GIH")

## Directory, located in this package, that contains the demo
## Profile GDS file
dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS")

## Open the Profile GDS file
gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds"))

## Run a PCA analysis and a K-nearest neighbors analysis on a small set
## of synthetic data, testing K and D values from 10 to 15
results <- computePoolSyntheticAncestryGr(gdsProfile=gdsProfile,
    sampleRM=samplesRM, studyIDSyn=studyID, np=1L,
    spRef=demoKnownSuperPop1KG,
    kList=seq(10,15,1), pcaList=seq(10,15,1), eigenCount=15L)

## The ancestry inference for the synthetic data using
## different K and D values
head(results$matKNN)

## Close the Profile GDS file (important)
closefn.gds(gdsProfile)

}
\references{
Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ,
Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution
of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72.
doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25.
}
\author{
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
}
