\name{GOHyperGAll}
\alias{GOHyperGAll}
\alias{GOHyperGAll_Subset}
\alias{GOHyperGAll_Simplify}
\alias{GOCluster_Report}
\alias{makeCATdb}
\alias{goBarplot}
\title{
	GO term enrichment analysis for large numbers of gene sets 
}
\description{
To test a sample population of genes for over-representation of GO terms, the
core function \code{GOHyperGAll} computes for all nodes in the three GO networks
(BP, CC and MF) an enrichment test based on the hypergeometric distribution and 
returns the corresponding raw and Bonferroni corrected p-values. 
Subsequently, a filter function supports GO Slim analyses using default or 
custom GO Slim categories. Several convenience functions are provided to process
large numbers of gene sets (e.g. clusters from partitioning results) and to 
visualize the results. 

Note: \code{GOHyperGAll} provides utilities similar to those of the \code{GOHyperG}
function in the \code{GOstats} package. The main difference is that
\code{GOHyperGAll} simplifies processing of large numbers of gene sets, as well
as the usage of custom array-to-gene and gene-to-GO mappings.
}
\usage{
## Generate gene-to-GO mappings and store as catDB object 
makeCATdb(myfile, lib = NULL, org = "", colno = c(1, 2, 3), idconv = NULL, 
            rootUK=FALSE)

## Enrichment function
GOHyperGAll(catdb, gocat = "MF", sample, Nannot = 2)

## GO slim analysis
GOHyperGAll_Subset(catdb, GOHyperGAll_result, sample = test_sample, 
            type = "goSlim", myslimv)

## Reduce GO term redundancy 
GOHyperGAll_Simplify(GOHyperGAll_result, gocat = "MF", cutoff = 0.001, correct = TRUE)

## Batch analysis of many gene sets
GOCluster_Report(catdb, setlist, id_type = "affy", method = "all", CLSZ = 10, 
            cutoff = 0.001, gocats = c("MF", "BP", "CC"), myslimv = "default",
            correct = TRUE, recordSpecGO = NULL, ...)

## Bar plot of GOCluster_Report results
goBarplot(GOBatchResult, gocat)
}
%- maybe also 'usage' for other objects documented here.
\arguments{
  \item{myfile}{
	File with gene-to-GO mappings. Sample files can be downloaded from geneontology.org (\url{http://geneontology.org/GO.downloads.annotations.shtml}) or from BioMart as shown in the example below.
}
  \item{colno}{
	Numbers of the three columns in \code{myfile} that contain GOID, GeneID and GOCAT, in that order.
}
  \item{org}{
	Optional argument. Currently, the only valid option is \code{org="Arabidopsis"} to get rid of transcript duplications in this particular annotation. 
}
  \item{lib}{
	If the gene-to-GO mappings are obtained from a \code{*.db} package from Bioconductor then the package name can be specified via the \code{lib} argument of \code{makeCATdb}. 
}
  \item{idconv}{
	Optional id conversion \code{data.frame}
}
  \item{catdb}{
	\code{catdb} object storing mappings of genes to annotation categories. For details, see ?"catDB-class".
}
  \item{rootUK}{
	If the argument \code{rootUK} is set to \code{TRUE} then the root nodes are treated as terminal nodes to account for the new unknown terms.
}
  \item{sample}{
	\code{character vector} containing the test set of gene identifiers 
}
  \item{Nannot}{
	Defines the minimum number of direct annotations per GO node from the sample 
        set to determine the number of tested hypotheses for the p-value adjustment.  
}
  \item{gocat}{
	Specifies the GO type, can be assigned one of the following character values: "MF", "BP" or "CC". 
}
  \item{GOHyperGAll_result}{
	\code{data.frame} generated by \code{GOHyperGAll}
}
  \item{type}{
	The function \code{GOHyperGAll_Subset} subsets the \code{GOHyperGAll}
	results by directly assigned GO nodes or custom \code{goSlim} categories. 
	The argument \code{type} can be assigned the values \code{goSlim} or \code{assigned}.
}
  \item{myslimv}{
	optional argument to provide custom \code{goSlim} vector
}
  \item{cutoff}{
	p-value cutoff for GO terms to show in result \code{data.frame}
}
  \item{correct}{
	If \code{TRUE} the function will favor the selection of terminal (information-rich) 
        GO terms that at the same time have a large number of sample matches. 
}
  \item{setlist}{
	\code{list} of \code{character vectors} containing gene IDs (or array feature 
	IDs). The names of the \code{list} components correspond to the set labels, 
	e.g. DEG comparisons or cluster IDs.
}
  \item{id_type}{
	specifies type of IDs in input, can be assigned \code{gene} or \code{affy}
}
  \item{method}{
	Specifies analysis type. Current options are \code{all} for \code{GOHyperGAll}, 
	\code{slim} for \code{GOHyperGAll_Subset} or \code{simplify} for 
	\code{GOHyperGAll_Simplify}.
}
  \item{CLSZ}{
	minimum gene set (cluster) size to consider. Gene sets below this cutoff 
	will be ignored.
}
  \item{gocats}{
	Specifies GO type, can be assigned the values "MF", "BP" and "CC". 
}
  \item{recordSpecGO}{
	argument to report in the result \code{data.frame} specific GO IDs for any 
	of the 3 ontologies regardless of whether they meet the specified p-value \code{cutoff}, e.g. \code{recordSpecGO=c("GO:0003674", "GO:0008150", "GO:0005575")} 
}
  \item{GOBatchResult}{
	\code{data.frame} generated by \code{GOCluster_Report}
}
  \item{\dots}{
	additional arguments to pass on
}
}
\details{
	\code{GOHyperGAll_Simplify}: The result data frame from \code{GOHyperGAll}
	will often contain several connected GO terms with significant scores which 
	can complicate the interpretation of large sample sets. To reduce this redundancy,
	the function \code{GOHyperGAll_Simplify} subsets the data frame
	by a user specified p-value cutoff and removes from it all GO nodes with 
        overlapping children sets (OFFSPRING), while the best scoring nodes are 
	retained in the result \code{data.frame}.

	\code{GOCluster_Report}: performs the three types of GO term enrichment 
	analyses in batch mode: \code{GOHyperGAll}, \code{GOHyperGAll_Subset} or 
	\code{GOHyperGAll_Simplify}. It processes many gene sets (e.g. gene expression 
        clusters) and returns the results conveniently organized in a single result data frame.
}
\value{
	\code{makeCATdb} generates a \code{catDB} object from file. \code{GOHyperGAll} and \code{GOCluster_Report} return their results as a \code{data.frame}.
}
\references{
This workflow has been published in Plant Physiol (2008) 147, 41-57.
}
\author{
Thomas Girke
}
\seealso{
\code{GOHyperGAll_Subset}, \code{GOHyperGAll_Simplify}, \code{GOCluster_Report}, \code{goBarplot}
}
\examples{
\dontrun{

## Obtain annotations from BioMart (listMarts, useMart, listDatasets,
## listAttributes and getBM are functions of the biomaRt package)
listMarts() # To choose BioMart database
m <- useMart("ENSEMBL_MART_PLANT"); listDatasets(m) 
m <- useMart("ENSEMBL_MART_PLANT", dataset="athaliana_eg_gene")
listAttributes(m) # Choose data types you want to download
go <- getBM(attributes=c("go_accession", "tair_locus", 
            "go_namespace_1003"), mart=m)
## Drop rows without a GO namespace and store the namespace as character
go <- go[go[,3]!="",]; go[,3] <- as.character(go[,3])
write.table(go, "GOannotationsBiomart_mod.txt", quote=FALSE, 
            row.names=FALSE, col.names=FALSE, sep="\t")

## Create catDB instance (takes a while but needs to be done only once)
catdb <- makeCATdb(myfile="GOannotationsBiomart_mod.txt", lib=NULL, org="",
                    colno=c(1,2,3), idconv=NULL)
catdb

## Create catDB from Bioconductor annotation package
## (optional alternative; uncomment both lines to run)
# catdb <- makeCATdb(myfile=NULL, lib="ath1121501.db", org="", 
#                     colno=c(1,2,3), idconv=NULL)

## AffyID-to-GeneID mappings when working with AffyIDs 
## (optional; uncomment the complete multi-line calls to run)
# affy2locusDF <- systemPipeR:::.AffyID2GeneID(map = "ftp://ftp.arabidopsis.org/home/tair/Microarrays/Affymetrix/affy_ATH1_array_elements-2010-12-20.txt",
#                                                 download=TRUE)
# catdb_conv <- makeCATdb(myfile="GOannotationsBiomart_mod.txt", lib=NULL, org="", 
#                             colno=c(1,2,3), idconv=list(affy=affy2locusDF))
# systemPipeR:::.AffyID2GeneID(catdb=catdb_conv, 
#                                 affyIDs=c("244901_at", "244902_at"))

## Next time catDB can be loaded from file
save(catdb, file="catdb.RData") 
load("catdb.RData")

## Perform enrichment test on single gene set
test_sample <- unique(as.character(catmap(catdb)$D_MF[1:100,"GeneID"]))
GOHyperGAll(catdb=catdb, gocat="MF", sample=test_sample, Nannot=2)[1:20,]

## GO Slim analysis by subsetting results accordingly
GOHyperGAll_result <- GOHyperGAll(catdb=catdb, gocat="MF", sample=test_sample, Nannot=2)
GOHyperGAll_Subset(catdb, GOHyperGAll_result, sample=test_sample, type="goSlim") 

## Reduce GO term redundancy in 'GOHyperGAll_results'
simplifyDF <- GOHyperGAll_Simplify(GOHyperGAll_result, gocat="MF", 
                                    cutoff=0.001, correct=TRUE)
# Returns the redundancy reduced data set. 
# Note: '%' starts a comment in Rd files, so the operator is escaped as \%in\% in the Rd source.
data.frame(GOHyperGAll_result[GOHyperGAll_result[,1] \%in\% simplifyDF[,1], -8], GO_OL_Match=simplifyDF[,2])

## Batch Analysis of Gene Clusters
testlist <- list(Set1=test_sample)
GOBatchResult <- GOCluster_Report(catdb=catdb, setlist=testlist, method="all", 
                                    id_type="gene", CLSZ=10, cutoff=0.001, 
                                    gocats=c("MF", "BP", "CC"), 
                                    recordSpecGO=c("GO:0003674", "GO:0008150", "GO:0005575"))

## Plot 'GOBatchResult' as bar plot
goBarplot(GOBatchResult, gocat="MF")

}
}
\keyword{utilities}
