Randomly subsample reads according to spike-in normalization

subsampleBySpikeIn(
  dataset.gr,
  si_pattern = NULL,
  si_names = NULL,
  ctrl_pattern = NULL,
  ctrl_names = NULL,
  batch_norm = TRUE,
  RPM_units = FALSE,
  field = "score",
  sample_names = NULL,
  expand_ranges = FALSE,
  ncores = getOption("mc.cores", 2L)
)

Arguments

dataset.gr, si_pattern, si_names, ctrl_pattern, ctrl_names, batch_norm, field, sample_names, expand_ranges, ncores

See getSpikeInNFs

RPM_units

If set to TRUE, the final readcount values will be converted to units equivalent to/directly comparable with RPM for the negative control(s). If field = NULL, the GRanges objects will be converted to disjoint "basepair-resolution" GRanges objects, with normalized readcounts contained in the "score" metadata column.

Value

An object parallel to dataset.gr, but with fewer reads. E.g. if dataset.gr is a list of GRanges, the output is a list of the same GRanges, but in which each GRanges has fewer reads.

Details

Note that if field = NULL,

Author

Mike DeBerardine

Examples

#--------------------------------------------------#
# Make list of dummy GRanges
#--------------------------------------------------#
gr1_rep1 <- GRanges(seqnames = c("chr1", "chr2", "spikechr1", "spikechr2"),
                    ranges = IRanges(start = 1:4, width = 1),
                    strand = "+")
gr2_rep2 <- gr2_rep1 <- gr1_rep2 <- gr1_rep1

# set readcounts
score(gr1_rep1) <- c(1, 1, 1, 1) # 2 exp + 2 spike = 4 total
score(gr2_rep1) <- c(2, 2, 1, 1) # 4 exp + 2 spike = 6 total
score(gr1_rep2) <- c(1, 1, 2, 1) # 2 exp + 3 spike = 5 total
score(gr2_rep2) <- c(4, 4, 2, 2) # 8 exp + 4 spike = 12 total

grl <- list(gr1_rep1, gr2_rep1,
            gr1_rep2, gr2_rep2)
names(grl) <- c("gr1_rep1", "gr2_rep1",
                "gr1_rep2", "gr2_rep2")

grl
#> $gr1_rep1
#> GRanges object with 4 ranges and 1 metadata column:
#>        seqnames    ranges strand |     score
#>           <Rle> <IRanges>  <Rle> | <numeric>
#>   [1]      chr1         1      + |         1
#>   [2]      chr2         2      + |         1
#>   [3] spikechr1         3      + |         1
#>   [4] spikechr2         4      + |         1
#>   -------
#>   seqinfo: 4 sequences from an unspecified genome; no seqlengths
#> 
#> $gr2_rep1
#> GRanges object with 4 ranges and 1 metadata column:
#>        seqnames    ranges strand |     score
#>           <Rle> <IRanges>  <Rle> | <numeric>
#>   [1]      chr1         1      + |         2
#>   [2]      chr2         2      + |         2
#>   [3] spikechr1         3      + |         1
#>   [4] spikechr2         4      + |         1
#>   -------
#>   seqinfo: 4 sequences from an unspecified genome; no seqlengths
#> 
#> $gr1_rep2
#> GRanges object with 4 ranges and 1 metadata column:
#>        seqnames    ranges strand |     score
#>           <Rle> <IRanges>  <Rle> | <numeric>
#>   [1]      chr1         1      + |         1
#>   [2]      chr2         2      + |         1
#>   [3] spikechr1         3      + |         2
#>   [4] spikechr2         4      + |         1
#>   -------
#>   seqinfo: 4 sequences from an unspecified genome; no seqlengths
#> 
#> $gr2_rep2
#> GRanges object with 4 ranges and 1 metadata column:
#>        seqnames    ranges strand |     score
#>           <Rle> <IRanges>  <Rle> | <numeric>
#>   [1]      chr1         1      + |         4
#>   [2]      chr2         2      + |         4
#>   [3] spikechr1         3      + |         2
#>   [4] spikechr2         4      + |         2
#>   -------
#>   seqinfo: 4 sequences from an unspecified genome; no seqlengths
#> 

#--------------------------------------------------#
# (The simple spike-in NFs)
#--------------------------------------------------#

# see examples for getSpikeInNFs for more
getSpikeInNFs(grl, si_pattern = "spike", ctrl_pattern = "gr1",
              method = "SNR", ncores = 1)
#> [1] 1.00 1.00 1.00 0.75

#--------------------------------------------------#
# Subsample the GRanges according to the spike-in NFs
#--------------------------------------------------#

ss <- subsampleBySpikeIn(grl, si_pattern = "spike", ctrl_pattern = "gr1",
                         ncores = 1)
ss
#> $gr1_rep1
#> GRanges object with 2 ranges and 1 metadata column:
#>       seqnames    ranges strand |     score
#>          <Rle> <IRanges>  <Rle> | <integer>
#>   [1]     chr1         1      + |         1
#>   [2]     chr2         2      + |         1
#>   -------
#>   seqinfo: 2 sequences from an unspecified genome; no seqlengths
#> 
#> $gr2_rep1
#> GRanges object with 2 ranges and 1 metadata column:
#>       seqnames    ranges strand |     score
#>          <Rle> <IRanges>  <Rle> | <integer>
#>   [1]     chr1         1      + |         2
#>   [2]     chr2         2      + |         2
#>   -------
#>   seqinfo: 2 sequences from an unspecified genome; no seqlengths
#> 
#> $gr1_rep2
#> GRanges object with 2 ranges and 1 metadata column:
#>       seqnames    ranges strand |     score
#>          <Rle> <IRanges>  <Rle> | <integer>
#>   [1]     chr1         1      + |         1
#>   [2]     chr2         2      + |         1
#>   -------
#>   seqinfo: 2 sequences from an unspecified genome; no seqlengths
#> 
#> $gr2_rep2
#> GRanges object with 2 ranges and 1 metadata column:
#>       seqnames    ranges strand |     score
#>          <Rle> <IRanges>  <Rle> | <integer>
#>   [1]     chr1         1      + |         3
#>   [2]     chr2         2      + |         3
#>   -------
#>   seqinfo: 2 sequences from an unspecified genome; no seqlengths
#> 

lapply(ss, function(x) sum(score(x))) # total reads in each
#> $gr1_rep1
#> [1] 2
#> 
#> $gr2_rep1
#> [1] 4
#> 
#> $gr1_rep2
#> [1] 2
#> 
#> $gr2_rep2
#> [1] 6
#> 

# Put in units of RPM for the negative control
ssr <- subsampleBySpikeIn(grl, si_pattern = "spike", ctrl_pattern = "gr1",
                          RPM_units = TRUE, ncores = 1)

ssr
#> $gr1_rep1
#> GRanges object with 2 ranges and 1 metadata column:
#>       seqnames    ranges strand |     score
#>          <Rle> <IRanges>  <Rle> | <numeric>
#>   [1]     chr1         1      + |     5e+05
#>   [2]     chr2         2      + |     5e+05
#>   -------
#>   seqinfo: 2 sequences from an unspecified genome; no seqlengths
#> 
#> $gr2_rep1
#> GRanges object with 2 ranges and 1 metadata column:
#>       seqnames    ranges strand |     score
#>          <Rle> <IRanges>  <Rle> | <numeric>
#>   [1]     chr1         1      + |     1e+06
#>   [2]     chr2         2      + |     1e+06
#>   -------
#>   seqinfo: 2 sequences from an unspecified genome; no seqlengths
#> 
#> $gr1_rep2
#> GRanges object with 2 ranges and 1 metadata column:
#>       seqnames    ranges strand |     score
#>          <Rle> <IRanges>  <Rle> | <numeric>
#>   [1]     chr1         1      + |     5e+05
#>   [2]     chr2         2      + |     5e+05
#>   -------
#>   seqinfo: 2 sequences from an unspecified genome; no seqlengths
#> 
#> $gr2_rep2
#> GRanges object with 2 ranges and 1 metadata column:
#>       seqnames    ranges strand |     score
#>          <Rle> <IRanges>  <Rle> | <numeric>
#>   [1]     chr1         1      + |   1500000
#>   [2]     chr2         2      + |   1500000
#>   -------
#>   seqinfo: 2 sequences from an unspecified genome; no seqlengths
#> 

lapply(ssr, function(x) sum(score(x))) # total signal in each
#> $gr1_rep1
#> [1] 1e+06
#> 
#> $gr2_rep1
#> [1] 2e+06
#> 
#> $gr1_rep2
#> [1] 1e+06
#> 
#> $gr2_rep2
#> [1] 3e+06
#>