Filtering and counting spike-in reads

getSpikeInCounts(
  dataset.gr,
  si_pattern = NULL,
  si_names = NULL,
  field = "score",
  sample_names = NULL,
  expand_ranges = FALSE,
  ncores = getOption("mc.cores", 2L)
)

removeSpikeInReads(
  dataset.gr,
  si_pattern = NULL,
  si_names = NULL,
  field = "score",
  ncores = getOption("mc.cores", 2L)
)

getSpikeInReads(
  dataset.gr,
  si_pattern = NULL,
  si_names = NULL,
  field = "score",
  ncores = getOption("mc.cores", 2L)
)

Arguments

dataset.gr: A GRanges object or a list of GRanges objects.
si_pattern: A regular expression that matches spike-in chromosomes. Can be used in addition to, or as an alternative to si_names.
si_names: A character vector giving the names of the spike-in chromosomes. Can be used in addition to, or as an alternative to si_pattern.
field: The metadata field in dataset.gr that contains readcounts. If each range is an individual read, set field = NULL.
sample_names: An optional character vector used to rename the datasets in dataset.gr
expand_ranges: Logical indicating if ranges in dataset.gr should be treated as descriptions of single molecules (FALSE), or if ranges should be treated as representing multiple adjacent positions with the same signal (TRUE). See getCountsByRegions.
ncores: The number of cores to use for computations.

Value

A dataframe containing total readcounts, experimental (non-spike-in) readcounts, and spike-in readcounts.

Author

Mike DeBerardine

Examples

#--------------------------------------------------#
# Make list of dummy GRanges
#--------------------------------------------------#

gr1_rep1 <- GRanges(seqnames = c("chr1", "chr2", "spikechr1", "spikechr2"),
                    ranges = IRanges(start = 1:4, width = 1),
                    strand = "+")
gr2_rep2 <- gr2_rep1 <- gr1_rep2 <- gr1_rep1

# set readcounts
score(gr1_rep1) <- c(1, 1, 1, 1) # 2 exp + 2 spike = 4 total
score(gr2_rep1) <- c(2, 2, 1, 1) # 4 exp + 2 spike = 6 total
score(gr1_rep2) <- c(1, 1, 2, 1) # 2 exp + 3 spike = 5 total
score(gr2_rep2) <- c(4, 4, 2, 2) # 8 exp + 4 spike = 12 total

grl <- list(gr1_rep1, gr2_rep1,
            gr1_rep2, gr2_rep2)

names(grl) <- c("gr1_rep1", "gr2_rep1",
                "gr1_rep2", "gr2_rep2")

grl
#> $gr1_rep1
#> GRanges object with 4 ranges and 1 metadata column:
#>        seqnames    ranges strand |     score
#>           <Rle> <IRanges>  <Rle> | <numeric>
#>   [1]      chr1         1      + |         1
#>   [2]      chr2         2      + |         1
#>   [3] spikechr1         3      + |         1
#>   [4] spikechr2         4      + |         1
#>   -------
#>   seqinfo: 4 sequences from an unspecified genome; no seqlengths
#> 
#> $gr2_rep1
#> GRanges object with 4 ranges and 1 metadata column:
#>        seqnames    ranges strand |     score
#>           <Rle> <IRanges>  <Rle> | <numeric>
#>   [1]      chr1         1      + |         2
#>   [2]      chr2         2      + |         2
#>   [3] spikechr1         3      + |         1
#>   [4] spikechr2         4      + |         1
#>   -------
#>   seqinfo: 4 sequences from an unspecified genome; no seqlengths
#> 
#> $gr1_rep2
#> GRanges object with 4 ranges and 1 metadata column:
#>        seqnames    ranges strand |     score
#>           <Rle> <IRanges>  <Rle> | <numeric>
#>   [1]      chr1         1      + |         1
#>   [2]      chr2         2      + |         1
#>   [3] spikechr1         3      + |         2
#>   [4] spikechr2         4      + |         1
#>   -------
#>   seqinfo: 4 sequences from an unspecified genome; no seqlengths
#> 
#> $gr2_rep2
#> GRanges object with 4 ranges and 1 metadata column:
#>        seqnames    ranges strand |     score
#>           <Rle> <IRanges>  <Rle> | <numeric>
#>   [1]      chr1         1      + |         4
#>   [2]      chr2         2      + |         4
#>   [3] spikechr1         3      + |         2
#>   [4] spikechr2         4      + |         2
#>   -------
#>   seqinfo: 4 sequences from an unspecified genome; no seqlengths
#> 

#--------------------------------------------------#
# Count spike-in reads
#--------------------------------------------------#

# by giving names of all spike-in chromosomes
getSpikeInCounts(grl, si_names = c("spikechr1", "spikechr2"), ncores = 1)
#>     sample total_reads exp_reads spike_reads
#> 1 gr1_rep1           4         2           2
#> 2 gr2_rep1           6         4           2
#> 3 gr1_rep2           5         2           3
#> 4 gr2_rep2          12         8           4

# or by matching the string/regular expression "spike" in chromosome names
getSpikeInCounts(grl, si_pattern = "spike", ncores = 1)
#>     sample total_reads exp_reads spike_reads
#> 1 gr1_rep1           4         2           2
#> 2 gr2_rep1           6         4           2
#> 3 gr1_rep2           5         2           3
#> 4 gr2_rep2          12         8           4

#--------------------------------------------------#
# Filter out spike-in reads
#--------------------------------------------------#

removeSpikeInReads(grl, si_pattern = "spike", ncores = 1)
#> $gr1_rep1
#> GRanges object with 2 ranges and 1 metadata column:
#>       seqnames    ranges strand |     score
#>          <Rle> <IRanges>  <Rle> | <numeric>
#>   [1]     chr1         1      + |         1
#>   [2]     chr2         2      + |         1
#>   -------
#>   seqinfo: 2 sequences from an unspecified genome; no seqlengths
#> 
#> $gr2_rep1
#> GRanges object with 2 ranges and 1 metadata column:
#>       seqnames    ranges strand |     score
#>          <Rle> <IRanges>  <Rle> | <numeric>
#>   [1]     chr1         1      + |         2
#>   [2]     chr2         2      + |         2
#>   -------
#>   seqinfo: 2 sequences from an unspecified genome; no seqlengths
#> 
#> $gr1_rep2
#> GRanges object with 2 ranges and 1 metadata column:
#>       seqnames    ranges strand |     score
#>          <Rle> <IRanges>  <Rle> | <numeric>
#>   [1]     chr1         1      + |         1
#>   [2]     chr2         2      + |         1
#>   -------
#>   seqinfo: 2 sequences from an unspecified genome; no seqlengths
#> 
#> $gr2_rep2
#> GRanges object with 2 ranges and 1 metadata column:
#>       seqnames    ranges strand |     score
#>          <Rle> <IRanges>  <Rle> | <numeric>
#>   [1]     chr1         1      + |         4
#>   [2]     chr2         2      + |         4
#>   -------
#>   seqinfo: 2 sequences from an unspecified genome; no seqlengths
#> 

#--------------------------------------------------#
# Return spike-in reads
#--------------------------------------------------#

getSpikeInReads(grl, si_pattern = "spike", ncores = 1)
#> $gr1_rep1
#> GRanges object with 2 ranges and 1 metadata column:
#>        seqnames    ranges strand |     score
#>           <Rle> <IRanges>  <Rle> | <numeric>
#>   [1] spikechr1         3      + |         1
#>   [2] spikechr2         4      + |         1
#>   -------
#>   seqinfo: 2 sequences from an unspecified genome; no seqlengths
#> 
#> $gr2_rep1
#> GRanges object with 2 ranges and 1 metadata column:
#>        seqnames    ranges strand |     score
#>           <Rle> <IRanges>  <Rle> | <numeric>
#>   [1] spikechr1         3      + |         1
#>   [2] spikechr2         4      + |         1
#>   -------
#>   seqinfo: 2 sequences from an unspecified genome; no seqlengths
#> 
#> $gr1_rep2
#> GRanges object with 2 ranges and 1 metadata column:
#>        seqnames    ranges strand |     score
#>           <Rle> <IRanges>  <Rle> | <numeric>
#>   [1] spikechr1         3      + |         2
#>   [2] spikechr2         4      + |         1
#>   -------
#>   seqinfo: 2 sequences from an unspecified genome; no seqlengths
#> 
#> $gr2_rep2
#> GRanges object with 2 ranges and 1 metadata column:
#>        seqnames    ranges strand |     score
#>           <Rle> <IRanges>  <Rle> | <numeric>
#>   [1] spikechr1         3      + |         2
#>   [2] spikechr2         4      + |         2
#>   -------
#>   seqinfo: 2 sequences from an unspecified genome; no seqlengths
#>