% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/processStudy.R
\encoding{UTF-8}
\name{inferAncestry}
\alias{inferAncestry}
\title{Run most steps leading to the ancestry inference call on a specific
DNA profile}
\usage{
inferAncestry(
  profileFile,
  pathProfileGDS,
  fileReferenceGDS,
  fileReferenceAnnotGDS,
  chrInfo,
  syntheticRefDF,
  genoSource = c("snp-pileup", "generic", "VCF", "bam"),
  np = 1L,
  verbose = FALSE
)
}
\arguments{
\item{profileFile}{a \code{character} string representing the path and the
file name of the genotype file or the BAM file. If \code{genoSource} is
'snp-pileup', the file extension must be .txt.gz; if it is 'VCF', the
extension must be .vcf.gz.}

\item{pathProfileGDS}{a \code{character} string representing the path to
the directory where the GDS Profile files will be created.}

\item{fileReferenceGDS}{a \code{character} string representing the file
name of the Population Reference GDS file. The file must exist.}

\item{fileReferenceAnnotGDS}{a \code{character} string representing the
file name of the Population Reference GDS Annotation file. The file
must exist.}

\item{chrInfo}{a \code{vector} of positive \code{integer} values
representing the length of the chromosomes. See 'details' section.}

\item{syntheticRefDF}{a \code{data.frame} containing a subset of
reference profiles for each sub-population present in the Reference GDS
file. The \code{data.frame} must have those columns:
\describe{
\item{sample.id}{ a \code{character} string representing the sample
identifier. }
\item{pop.group}{ a \code{character} string representing the
subcontinental population assigned to the sample. }
\item{superPop}{ a \code{character} string representing the
super-population assigned to the sample. }
}}

\item{genoSource}{a \code{character} string with four possible values:
'snp-pileup', 'generic', 'VCF' or 'bam'. It specifies if the genotype files
are generated by snp-pileup (Facets) or are a generic format CSV file
with at least those columns:
'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'.
The 'Count' is the depth at the specified position;
'File1R' is the depth of the reference allele and
'File1A' is the depth of the specific alternative allele.
Finally the file can be a VCF file with at least those genotype
fields: GT, AD, DP.}

\item{np}{a single positive \code{integer} specifying the number of
threads to be used. Default: \code{1L}.}

\item{verbose}{a \code{logical} indicating if messages should be printed
to show the progress of the different steps in the function.
Default: \code{FALSE}.}
}
\value{
a \code{list} containing 5 entries:
\describe{
\item{\code{pcaSample}}{ a \code{list} containing the information related
to the eigenvectors. The \code{list} contains those 3 entries:
\describe{
\item{\code{sample.id}}{ a \code{character} string representing the unique
identifier of the current profile.}
\item{\code{eigenvector.ref}}{ a \code{matrix} of \code{numeric} containing
the eigenvectors for the reference profiles.}
\item{\code{eigenvector}}{ a \code{matrix} of \code{numeric} containing the
eigenvectors for the current profile projected on the PCA from the
reference profiles.}
}
}
\item{\code{paraSample}}{ a \code{list} containing the results with
different \code{D} and \code{K} values that lead to optimal parameter
selection. The \code{list} contains those entries:
\describe{
\item{\code{dfPCA}}{ a \code{data.frame} containing statistical results
on all combined synthetic results done with a fixed value of \code{D} (the
number of dimensions). The \code{data.frame} contains those columns:
\describe{
\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
number of dimensions).}
\item{\code{median}}{ a \code{numeric} representing the median of the
minimum AUROC obtained (within super populations) for all combination of
the fixed \code{D} value and all tested \code{K} values. }
\item{\code{mad}}{ a \code{numeric} representing the MAD of the minimum
AUROC obtained (within super populations) for all combination of the fixed
\code{D} value and all tested \code{K} values. }
\item{\code{upQuartile}}{ a \code{numeric} representing the upper quartile
of the minimum AUROC obtained (within super populations) for all
combination of the fixed \code{D} value and all tested \code{K} values. }
\item{\code{k}}{ a \code{numeric} representing the optimal \code{K} value
(the number of neighbors) for a fixed \code{D} value. }
}
}
\item{\code{dfPop}}{ a \code{data.frame} containing statistical results on
all combined synthetic results done with different values of \code{D} (the
number of dimensions) and \code{K} (the number of neighbors).
The \code{data.frame} contains those columns:
\describe{
\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
number of dimensions).}
\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the
number of neighbors).}
\item{\code{AUROC.min}}{ a \code{numeric} representing the minimum accuracy
obtained by grouping all the synthetic results by super-populations, for
the specified values of \code{D} and \code{K}.}
\item{\code{AUROC}}{ a \code{numeric} representing the accuracy obtained
by grouping all the synthetic results for the specified values of \code{D}
and \code{K}.}
\item{\code{Accu.CM}}{ a \code{numeric} representing the value of accuracy
of the confusion matrix obtained by grouping all the synthetic results for
the specified values of \code{D} and \code{K}.}
}
}
\item{\code{dfAUROC}}{ a \code{data.frame} containing the summary of the
results by super-population. The \code{data.frame} contains
those columns:
\describe{
\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
number of dimensions).}
\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the
number of neighbors).}
\item{\code{Call}}{ a \code{character} string representing the
super-population.}
\item{\code{L}}{ a \code{numeric} representing the lower value of the 95\%
confidence interval for the AUROC obtained for the fixed values of
super-population, \code{D} and \code{K}.}
\item{\code{AUROC}}{ a \code{numeric} representing the AUROC obtained for the
fixed values of super-population, \code{D} and \code{K}.}
\item{\code{H}}{ a \code{numeric} representing the higher value of the 95\%
confidence interval for the AUROC obtained for the fixed values of
super-population, \code{D} and \code{K}.}
}
}
\item{\code{D}}{ a \code{numeric} representing the optimal \code{D} value
(the number of dimensions) for the specific profile.}
\item{\code{K}}{ a \code{numeric} representing the optimal \code{K} value
(the number of neighbors) for the specific profile.}
\item{\code{listD}}{ a \code{numeric} representing the optimal \code{D}
values (the number of dimensions) for the specific profile. More than one
\code{D} is possible.}
}
}
\item{\code{KNNSample}}{  a \code{data.frame} containing the inferred ancestry
for different values of \code{K} and \code{D}. The \code{data.frame}
contains those columns:
\describe{
\item{\code{sample.id}}{ a \code{character} string representing the unique
identifier of the current profile.}
\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
number of dimensions) used to infer the ancestry. }
\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the
number of neighbors) used to infer the ancestry. }
\item{\code{SuperPop}}{ a \code{character} string representing the inferred
ancestry for the specified \code{D} and \code{K} values.}
}
}
\item{\code{KNNSynthetic}}{  a \code{data.frame} containing the inferred ancestry
for each synthetic profile for different values of \code{K} and \code{D}.
The \code{data.frame} contains those columns ("sample.id", "D", "K",
"infer.superPop" and "ref.superPop"):
\describe{
\item{\code{sample.id}}{ a \code{character} string representing the unique
identifier of the current synthetic data.}
\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
number of dimensions) used to infer the ancestry. }
\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the
number of neighbors) used to infer the ancestry. }
\item{\code{infer.superPop}}{ a \code{character} string representing the inferred
ancestry for the specified \code{D} and \code{K} values.}
\item{\code{ref.superPop}}{ a \code{character} string representing the known
ancestry from the reference.}
}
}
\item{\code{Ancestry}}{ a \code{data.frame} containing the inferred
ancestry for the current profile. The \code{data.frame} contains those
columns:
\describe{
\item{\code{sample.id}}{ a \code{character} string representing the unique
identifier of the current profile.}
\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
number of dimensions) used to infer the ancestry.}
\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the
number of neighbors) used to infer the ancestry.}
\item{\code{SuperPop}}{ a \code{character} string representing the inferred
ancestry.}
}
}
}
}
\description{
This function runs most steps leading to the ancestry inference
call on a specific DNA profile. First, the function creates the
Profile GDS file for the specific profile using the information from an
RDS Sample description file and the Population Reference GDS file.
}
\examples{

## Required library for GDS
library(SNPRelate)

## Path to the 'extdata' directory of this package, which holds the demo files
dataDir <- system.file("extdata", package="RAIDS")

#################################################################
## The 1KG GDS file and the 1KG SNV Annotation GDS file
## need to be located in the same directory
## Note that the 1KG GDS file used for this example is a
## simplified version and CANNOT be used for any real analysis
#################################################################
path1KG <- file.path(dataDir, "tests")

fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds")
fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds")

#################################################################
## The Sample SNP pileup files (one per sample) need
## to be located in the same directory.
#################################################################
demoProfileEx1 <- file.path(dataDir, "example", "snpPileup", "ex1.txt.gz")

#################################################################
## The path where the Profile GDS Files (one per sample)
## will be created needs to be specified.
#################################################################
pathProfileGDS <- file.path(tempdir(), "out.tmp")

####################################################################
## Fix seed to ensure reproducible results
####################################################################
set.seed(3043)

gds1KG <- snpgdsOpen(fileReferenceGDS)
dataRef <- select1KGPop(gds1KG, nbProfiles=2L)
closefn.gds(gds1KG)

## Required library for this example to run correctly
if (requireNamespace("Seqinfo", quietly=TRUE) &&
     requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) {

    ## Chromosome length information
    ## chr23 is chrX, chr24 is chrY and chrM is 25
    chrInfo <- Seqinfo::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25]

    \donttest{

        res <- inferAncestry(profileFile=demoProfileEx1,
            pathProfileGDS=pathProfileGDS,
            fileReferenceGDS=fileReferenceGDS,
            fileReferenceAnnotGDS=fileAnnotGDS,
            chrInfo=chrInfo,
            syntheticRefDF=dataRef,
            genoSource="snp-pileup")

        unlink(pathProfileGDS, recursive=TRUE, force=TRUE)

    }
}

}
\references{
Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ,
Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution
of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72.
doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25.
}
\author{
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
}
