R/normalization.R
subsampleBySpikeIn.Rd
Randomly subsample reads according to spike-in normalization
subsampleBySpikeIn(
dataset.gr,
si_pattern = NULL,
si_names = NULL,
ctrl_pattern = NULL,
ctrl_names = NULL,
batch_norm = TRUE,
RPM_units = FALSE,
field = "score",
sample_names = NULL,
expand_ranges = FALSE,
ncores = getOption("mc.cores", 2L)
)
See getSpikeInNFs
If set to TRUE
, the final readcount values will be
converted to units equivalent to/directly comparable with RPM
for
the negative control(s). If field = NULL
, the GRanges objects will
be converted to disjoint
"basepair-resolution"
GRanges
objects, with normalized readcounts contained in the "score" metadata
column.
An object parallel to dataset.gr
, but with fewer reads. E.g.
if dataset.gr
is a list of GRanges, the output is a list of the same
GRanges, but in which each GRanges has fewer reads.
Note that if field = NULL
,
#--------------------------------------------------#
# Make list of dummy GRanges
#--------------------------------------------------#
gr1_rep1 <- GRanges(seqnames = c("chr1", "chr2", "spikechr1", "spikechr2"),
ranges = IRanges(start = 1:4, width = 1),
strand = "+")
gr2_rep2 <- gr2_rep1 <- gr1_rep2 <- gr1_rep1
# set readcounts
score(gr1_rep1) <- c(1, 1, 1, 1) # 2 exp + 2 spike = 4 total
score(gr2_rep1) <- c(2, 2, 1, 1) # 4 exp + 2 spike = 6 total
score(gr1_rep2) <- c(1, 1, 2, 1) # 2 exp + 3 spike = 5 total
score(gr2_rep2) <- c(4, 4, 2, 2) # 8 exp + 4 spike = 12 total
grl <- list(gr1_rep1, gr2_rep1,
gr1_rep2, gr2_rep2)
names(grl) <- c("gr1_rep1", "gr2_rep1",
"gr1_rep2", "gr2_rep2")
grl
#> $gr1_rep1
#> GRanges object with 4 ranges and 1 metadata column:
#> seqnames ranges strand | score
#> <Rle> <IRanges> <Rle> | <numeric>
#> [1] chr1 1 + | 1
#> [2] chr2 2 + | 1
#> [3] spikechr1 3 + | 1
#> [4] spikechr2 4 + | 1
#> -------
#> seqinfo: 4 sequences from an unspecified genome; no seqlengths
#>
#> $gr2_rep1
#> GRanges object with 4 ranges and 1 metadata column:
#> seqnames ranges strand | score
#> <Rle> <IRanges> <Rle> | <numeric>
#> [1] chr1 1 + | 2
#> [2] chr2 2 + | 2
#> [3] spikechr1 3 + | 1
#> [4] spikechr2 4 + | 1
#> -------
#> seqinfo: 4 sequences from an unspecified genome; no seqlengths
#>
#> $gr1_rep2
#> GRanges object with 4 ranges and 1 metadata column:
#> seqnames ranges strand | score
#> <Rle> <IRanges> <Rle> | <numeric>
#> [1] chr1 1 + | 1
#> [2] chr2 2 + | 1
#> [3] spikechr1 3 + | 2
#> [4] spikechr2 4 + | 1
#> -------
#> seqinfo: 4 sequences from an unspecified genome; no seqlengths
#>
#> $gr2_rep2
#> GRanges object with 4 ranges and 1 metadata column:
#> seqnames ranges strand | score
#> <Rle> <IRanges> <Rle> | <numeric>
#> [1] chr1 1 + | 4
#> [2] chr2 2 + | 4
#> [3] spikechr1 3 + | 2
#> [4] spikechr2 4 + | 2
#> -------
#> seqinfo: 4 sequences from an unspecified genome; no seqlengths
#>
#--------------------------------------------------#
# (The simple spike-in NFs)
#--------------------------------------------------#
# see examples for getSpikeInNFs for more
getSpikeInNFs(grl, si_pattern = "spike", ctrl_pattern = "gr1",
method = "SNR", ncores = 1)
#> [1] 1.00 1.00 1.00 0.75
#--------------------------------------------------#
# Subsample the GRanges according to the spike-in NFs
#--------------------------------------------------#
ss <- subsampleBySpikeIn(grl, si_pattern = "spike", ctrl_pattern = "gr1",
ncores = 1)
ss
#> $gr1_rep1
#> GRanges object with 2 ranges and 1 metadata column:
#> seqnames ranges strand | score
#> <Rle> <IRanges> <Rle> | <integer>
#> [1] chr1 1 + | 1
#> [2] chr2 2 + | 1
#> -------
#> seqinfo: 2 sequences from an unspecified genome; no seqlengths
#>
#> $gr2_rep1
#> GRanges object with 2 ranges and 1 metadata column:
#> seqnames ranges strand | score
#> <Rle> <IRanges> <Rle> | <integer>
#> [1] chr1 1 + | 2
#> [2] chr2 2 + | 2
#> -------
#> seqinfo: 2 sequences from an unspecified genome; no seqlengths
#>
#> $gr1_rep2
#> GRanges object with 2 ranges and 1 metadata column:
#> seqnames ranges strand | score
#> <Rle> <IRanges> <Rle> | <integer>
#> [1] chr1 1 + | 1
#> [2] chr2 2 + | 1
#> -------
#> seqinfo: 2 sequences from an unspecified genome; no seqlengths
#>
#> $gr2_rep2
#> GRanges object with 2 ranges and 1 metadata column:
#> seqnames ranges strand | score
#> <Rle> <IRanges> <Rle> | <integer>
#> [1] chr1 1 + | 3
#> [2] chr2 2 + | 3
#> -------
#> seqinfo: 2 sequences from an unspecified genome; no seqlengths
#>
lapply(ss, function(x) sum(score(x))) # total reads in each
#> $gr1_rep1
#> [1] 2
#>
#> $gr2_rep1
#> [1] 4
#>
#> $gr1_rep2
#> [1] 2
#>
#> $gr2_rep2
#> [1] 6
#>
# Put in units of RPM for the negative control
ssr <- subsampleBySpikeIn(grl, si_pattern = "spike", ctrl_pattern = "gr1",
RPM_units = TRUE, ncores = 1)
ssr
#> $gr1_rep1
#> GRanges object with 2 ranges and 1 metadata column:
#> seqnames ranges strand | score
#> <Rle> <IRanges> <Rle> | <numeric>
#> [1] chr1 1 + | 5e+05
#> [2] chr2 2 + | 5e+05
#> -------
#> seqinfo: 2 sequences from an unspecified genome; no seqlengths
#>
#> $gr2_rep1
#> GRanges object with 2 ranges and 1 metadata column:
#> seqnames ranges strand | score
#> <Rle> <IRanges> <Rle> | <numeric>
#> [1] chr1 1 + | 1e+06
#> [2] chr2 2 + | 1e+06
#> -------
#> seqinfo: 2 sequences from an unspecified genome; no seqlengths
#>
#> $gr1_rep2
#> GRanges object with 2 ranges and 1 metadata column:
#> seqnames ranges strand | score
#> <Rle> <IRanges> <Rle> | <numeric>
#> [1] chr1 1 + | 5e+05
#> [2] chr2 2 + | 5e+05
#> -------
#> seqinfo: 2 sequences from an unspecified genome; no seqlengths
#>
#> $gr2_rep2
#> GRanges object with 2 ranges and 1 metadata column:
#> seqnames ranges strand | score
#> <Rle> <IRanges> <Rle> | <numeric>
#> [1] chr1 1 + | 1500000
#> [2] chr2 2 + | 1500000
#> -------
#> seqinfo: 2 sequences from an unspecified genome; no seqlengths
#>
lapply(ssr, function(x) sum(score(x))) # total signal in each
#> $gr1_rep1
#> [1] 1e+06
#>
#> $gr2_rep1
#> [1] 2e+06
#>
#> $gr1_rep2
#> [1] 1e+06
#>
#> $gr2_rep2
#> [1] 3e+06
#>