% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/make_trim.R
\name{make_trim}
\alias{make_trim}
\title{Make trimmed and quality filtered fastq files}
\usage{
make_trim(
  input,
  output,
  indels = TRUE,
  concat = 12,
  check_mem = FALSE,
  threads = 1,
  chunk_size = NULL,
  polyG = c(type = NULL, min = NULL, mismatch = NULL),
  adapt_3_set = c(type = "hard_rm", min = 10, mismatch = 0.1),
  adapt_3 = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCACTA",
  adapt_5_set = c(type = NULL, min = NULL, mismatch = NULL),
  adapt_5 = NULL,
  nucleotide_range = c(min = NULL, max = NULL),
  quality = c(threshold = 20, percent = 0.8)
)
}
\arguments{
\item{input}{Character indicating the path to a directory containing fastq
formatted sequence files either compressed (.fastq.gz) or text (.fastq).}

\item{output}{Character indicating the path to a directory where the
compressed trimmed fastq files should be saved (as basename trim.fastq.gz).}

\item{indels}{Logical whether indels should be allowed in the mismatch between
read and adaptor sequences. Note that indel trimming will only be done on
fragments that have failed to be trimmed by other means. Default=TRUE.}

\item{concat}{Integer setting the threshold for trimming concatemer-like
adaptors. Even when adaptor synthesis and ligation are strictly controlled,
concatemer-like ("di-") adaptor sequences are formed. When \code{concat} is
an integer, \code{make_trim} will search all trimmed sequences for
additional adaptor sequence using a shorter version of the provided adaptor
in \code{adapt_3}. The length of the short adaptor is controlled by
\code{adapt_3_set$min}. If an additional, shorter, adaptor sequence is
found in a read, trimming will only occur if the resulting sequence is <=
\strong{concat} shorter in length than the first trimmed sequence. Thus, if
\code{concat}=12, a read with a NNNNN-XXXXX-YYYYYYYYY composition, where N
is 'real' nucleotides and X/Y are two independent adaptor sequences, the
trimming will result in NNNNN-XXXXX. If instead \code{concat}=5 then
trimming will result in NNNNN. Note that setting \code{concat} too low will
result in trimming of real nucleotides that just happend to share sequence
with the adaptor. As default \code{concat}=12, which have been carefully
evaluated in relation to the risk of trimming real sequence. If
\code{concat}=NULL, concatemer-like adaptor trimming will not be done.}

\item{check_mem}{Logical whether a memory check should be performed. A memory
check estimates the approximate memory usage given the fastq file sizes and
number of threads. The check happens before entering the parallel trimming
loop, and will give an immediate warning given intermediary risky memory
estimates, and an error if the risk is very high. Note, this option depends
on the parallel and benchmarkme packages. (default=FALSE)}

\item{threads}{Integer stating the number of parallel jobs. Note, that
reading multiple fastq files drains memory, using up to 10Gb per fastq
file. To avoid crashing the system due to memory shortage, make sure that
each thread on the machine have at least 10 Gb of memory available, unless
your fastq files are very small or very large. Use
\code{parallel::detectcores()} to see available threads on the machine.
Never use all threads! To run large fastq on low end computer use
'chunk_size'.}

\item{chunk_size}{Integer indicating the number of reads to be read into
memory in each chunk. When chunk_size=NULL (default), the whole fastq will
be processed as one. Thus, running 5 threads will keep 5 full fastq files
in memory simultaneously (fast but demanding). Setting chunk_size to an
integer will instead read only a portion of each fastq at a time and then
append the trimmed results on disc (slow but less demanding). Try using
chunk_size=1000000 reads, less will be extremely slow. Rather decrease
threads if you encounter memory problems.}

\item{polyG}{Character vector with three inputs named 'type', 'min' and
  'mismatch' that controls poly G trimming. This trimming might be necessary
  for fastq files from two channel illumina sequencers (e.g. NextSeq and
  NovaSeq) where no color signals are interpreted as 'G', but where read
  failure sometimes also results in long stretches of 'no-signal-Gs'.

  Works similar to \code{adapt_3_set} but instead of an adaptor sequence a
  poly 'G' string is constructed from the 'min' input option. Thus, if
  polyG=c(type="hard_rm", min=10, mismatch=0.1) then sequences containing
  'GGGGGGGGGGNNNNN...etc' with 10 percent mismatch will be removed from the
  output fastq file.}

\item{adapt_3_set}{Character vector with three inputs named 'type', 'min' and
'mismatch' that controls 3' trimming. As default,
adapt_3_set=c(type="hard_rm", min=10, mismatch=0.1).
 
 Options for each input are: \describe{
  \item{\strong{type}=}{
      '\emph{hard_trim}': Trims all up to the very last nucleotide.\cr
      '\emph{hard_rm}': Same as 'hard_trim' but removes untrimmed 
      sequences.\cr
      '\emph{hard_save}': Same as 'hard_trim' but saves all untrimmed 
      sequences in new fastq file.\cr
      '\emph{soft_trim}': No trimming on the last 3' nucleotides equal 
      to 1/2 of 'min'.\cr
      '\emph{soft_rm}': Same as 'soft_trim' but removes untrimmed 
      sequences.\cr
      '\emph{soft_save}': Same as 'soft_trim' but saves all untrimmed 
      sequences in new fastq file.
      }
  \item{\strong{min}=}{Integer that controls the short 
      adaptor/poly-G/concatamer trimming and
      soft trimming. When min=10, a short version of the adaptor is
      generated containing the 10 first nucleotides. The short version of
      the adaptor therefore controls the minimum number of overlaps between
      adaptor/poly-G. Short adaptor trimming will only occur on untrimmed
      reads that have failed trimming with full adaptor. The min option also
      controls how many terminal nucleotides that will escape trimming when
      type is set to 'soft' (1/2 of min).
      }
      
  \item{\strong{mismatch}=}{
      Numeric controlling the percent mismatch. 
      For instance, if min=10 and
      mismatch=0.1, then 1 mismatch is allowed in the minimum overlap.
      }
    }}

\item{adapt_3}{Character string defining the 3' adaptor sequence. In most
library preparation protocols, the 3' adaptor is the only adaptor needed
trimming.}

\item{adapt_5_set}{Currently not supported, but will be in future updates.
Same as \code{adapt_3_set} but controls the behavior of 5' trimming when a
sequence is provided in \code{adapt_5}.}

\item{adapt_5}{Character string defining a 5' adaptor sequence. 5' adaptor
sequences are only trimmed in specialized protocols or in very short
paired-end sequencing.}

\item{nucleotide_range}{Numeric vector with two inputs named 'min' and 'max', that
controls the sequence size filter. For example, if
\code{nucleotide_range=c(min=15, max=50)} the function will extract sequences in
the range between 15-50 nucleotides after trimming. As default,
\code{nucleotide_range=c(min=NULL, max=NULL)} and will retain all trimmed
sequences.}

\item{quality}{Numeric vector with two inputs named 'threshold' and 'percent'
that controls the fastq quality filter. For example, if
\code{quality=c(threshold=20, percent=0.8)} (default) the function will
extract sequences that pass a fastq phred score >=20 in 80 percent of the
nucleotides after trimming. If \code{quality=c(threshold=NULL,
percent=NULL)} then all sequences will be retained (not default!).}
}
\value{
Exports a compressed trimmed fastq file to the output folder (file
  name: basename+trim.fastq.gz). If any type="save" has been set, an
  additional fastq file (file name includes 'REMOVED') will be exported. In
  addition, overview statisics (progress report) will be returned as a
  data.frame.
}
\description{
\code{make_trim} uses parallel processing to generate trimmed reads.
}
\details{
Given a file path to sequence files in fastq format (extension .fastq.gz or
.fastq) this function will first trim the sequences from adaptor sequence, and
then apply sequence size and phred score quality filters. On systems with low
memory or very large fastq files, users may choose to handle each file in
chunks on the expense of time performance.
}
\examples{

############################################################ 
### Seqpac fastq trimming with the make_trim function 
### (for more streamline options see the make_counts function) 
 ############################################################ 
### Seqpac fastq trimming with the make_counts function 
### using default settings for NEBNext small RNA adaptor 

# Seqpac includes strongly down-sampled smallRNA fastq.
sys_path = system.file("extdata", package = "seqpac", mustWork = TRUE)
input <- list.files(path = sys_path, pattern = "fastq", all.files = FALSE,
                full.names = TRUE)

# Run make_trim using NEB-next adaptor
# (Here we store the trimmed files in the input folder)
# (but you may choose where to store them using the output option)

input <- unique(dirname(input))  # mak_trim searches for fastq in input dir
output <- tempdir()              # save in tempdir


list.files(input, pattern = "fastq") #before

prog_report  <-  make_trim(
       input=input, output=output, threads=1, 
       adapt_3_set=c(type="hard_rm", min=10, mismatch=0.1), 
       adapt_3="AGATCGGAAGAGCACACGTCTGAACTCCAGTCACTA", 
       polyG=c(type="hard_trim", min=20, mismatch=0.1),
       nucleotide_range=c(min=14, max=70),
       quality=c(threshold=20, percent=0.8))
       
list.files(path = output, pattern = "fastq") #after

# How did it go? Check progress report:  
prog_report     
       

## The principle:
# input <- "/some/path/to/untrimmed_fastq_folder"
# output <- "/some/path/to/output_folder_for_trimmed_fastq"
# 
# prog_report  <-  make_trim(
#      input=input, output=output, 
#      threads=<how_many_parallel>, 
#      check_mem=<should_memory_be_checked?>, 
#      adapt_3_set=<options_for_main_trimming, 
#      adapt_3=<sequence_to be trimmed, 
#      polyG=<Illumina_Nextseq_type_poly_G_trimming>,
#      nucleotide_range=<what_sequence_range_to_save>,
#      quality=<quality_filtering_options>))
}
\seealso{
\url{https://github.com/OestLab/seqpac} for updates on the current package.

Other PAC generation: 
\code{\link{PAC_check}()},
\code{\link{PAC_create}()},
\code{\link{make_PAC}()},
\code{\link{make_counts}()},
\code{\link{make_cutadapt}()},
\code{\link{make_pheno}()},
\code{\link{merge_lanes}()}
}
\concept{PAC generation}
