Subset regions of interest by quantiles of overlapping signal

A convenience function to subset regions of interest by the amount of signal they contain, according to their quantile (i.e. their signal ranks).

subsetRegionsBySignal(
  regions.gr,
  dataset.gr,
  quantiles = c(0.5, 1),
  field = "score",
  order.by.rank = FALSE,
  density = FALSE,
  keep.signal = FALSE,
  expand_ranges = FALSE
)

Arguments

regions.gr: A GRanges object containing regions of interest.
dataset.gr: A GRanges object in which signal is contained in metadata (typically in the "score" field).
quantiles: A value pair giving the lower quantile and upper quantile of regions to keep. Regions with signal quantiles below the lower quantile are removed, and likewise for regions with signal quantiles above the upper quantile. Quantiles must be in the range [0, 1]. An empty GRanges object is returned if the lower quantile is set to 1 or if the upper quantile is set to 0.
field: The metadata field of dataset.gr to be counted, typically "score".
order.by.rank: If TRUE, the output regions are sorted based on the amount of overlapping signal (in decreasing order). If FALSE (the default), regions are sorted by their positions.
density: A logical indicating whether signal counts should be normalized to the width (chromosomal length) of ranges in regions.gr. By default, no length normalization is performed.
keep.signal: Logical indicating if signal counts should be kept. If set to TRUE, the signal for each range (length-normalized if density = TRUE) is kept as a new Signal metadata column in the output GRanges object.
expand_ranges: Logical indicating if ranges in dataset.gr should be treated as descriptions of single molecules (FALSE), or if ranges should be treated as representing multiple adjacent positions with the same signal (TRUE). See getCountsByRegions.

Value

A GRanges object of length approximately length(regions.gr) * (upper_quantile - lower_quantile).

Author

Mike DeBerardine

Examples

data("PROseq") # load included PROseq data
data("txs_dm6_chr4") # load included transcripts

txs_dm6_chr4
#> GRanges object with 339 ranges and 2 metadata columns:
#>         seqnames          ranges strand |     tx_name     gene_id
#>            <Rle>       <IRanges>  <Rle> | <character> <character>
#>     [1]     chr4        879-5039      + | FBtr0346692 FBgn0267363
#>     [2]     chr4     42774-43374      + | FBtr0344900 FBgn0266617
#>     [3]     chr4     44774-46074      + | FBtr0340499 FBgn0265633
#>     [4]     chr4     56497-60974      + | FBtr0333704 FBgn0264617
#>     [5]     chr4     56497-63124      + | FBtr0333705 FBgn0264617
#>     ...      ...             ...    ... .         ...         ...
#>   [335]     chr4 1192419-1196848      - | FBtr0100543 FBgn0039924
#>   [336]     chr4 1192419-1196848      - | FBtr0100544 FBgn0039924
#>   [337]     chr4 1225089-1230713      - | FBtr0100406 FBgn0027101
#>   [338]     chr4 1225737-1230713      - | FBtr0100402 FBgn0027101
#>   [339]     chr4 1225737-1230713      - | FBtr0100404 FBgn0027101
#>   -------
#>   seqinfo: 7 sequences from dm6 genome

#--------------------------------------------------#
# get the top 50% of transcripts by signal
#--------------------------------------------------#

subsetRegionsBySignal(txs_dm6_chr4, PROseq)
#> GRanges object with 170 ranges and 2 metadata columns:
#>         seqnames          ranges strand |     tx_name     gene_id
#>            <Rle>       <IRanges>  <Rle> | <character> <character>
#>     [1]     chr4    69326-101419      + | FBtr0100246 FBgn0085432
#>     [2]     chr4    69326-108694      + | FBtr0089159 FBgn0085432
#>     [3]     chr4    69326-108694      + | FBtr0334301 FBgn0085432
#>     [4]     chr4    69326-109770      + | FBtr0100245 FBgn0085432
#>     [5]     chr4    69326-110059      + | FBtr0308615 FBgn0085432
#>     ...      ...             ...    ... .         ...         ...
#>   [166]     chr4 1192419-1196848      - | FBtr0100543 FBgn0039924
#>   [167]     chr4 1192419-1196848      - | FBtr0100544 FBgn0039924
#>   [168]     chr4 1225089-1230713      - | FBtr0100406 FBgn0027101
#>   [169]     chr4 1225737-1230713      - | FBtr0100402 FBgn0027101
#>   [170]     chr4 1225737-1230713      - | FBtr0100404 FBgn0027101
#>   -------
#>   seqinfo: 7 sequences from dm6 genome

#--------------------------------------------------#
# get the middle 50% of transcripts by signal
#--------------------------------------------------#

subsetRegionsBySignal(txs_dm6_chr4, PROseq, quantiles = c(0.25, 0.75))
#> GRanges object with 169 ranges and 2 metadata columns:
#>         seqnames          ranges strand |     tx_name     gene_id
#>            <Rle>       <IRanges>  <Rle> | <character> <character>
#>     [1]     chr4     56497-60974      + | FBtr0333704 FBgn0264617
#>     [2]     chr4     56497-63124      + | FBtr0333705 FBgn0264617
#>     [3]     chr4   136272-140083      + | FBtr0089450 FBgn0052000
#>     [4]     chr4   136272-143123      + | FBtr0089448 FBgn0052000
#>     [5]     chr4   136272-143123      + | FBtr0089449 FBgn0052000
#>     ...      ...             ...    ... .         ...         ...
#>   [165]     chr4 1192419-1196848      - | FBtr0100543 FBgn0039924
#>   [166]     chr4 1192419-1196848      - | FBtr0100544 FBgn0039924
#>   [167]     chr4 1225089-1230713      - | FBtr0100406 FBgn0027101
#>   [168]     chr4 1225737-1230713      - | FBtr0100402 FBgn0027101
#>   [169]     chr4 1225737-1230713      - | FBtr0100404 FBgn0027101
#>   -------
#>   seqinfo: 7 sequences from dm6 genome

#--------------------------------------------------#
# get the top 10% of transcripts by signal, and sort them by highest signal
#--------------------------------------------------#

subsetRegionsBySignal(txs_dm6_chr4, PROseq, quantiles = c(0.9, 1),
                      order.by.rank = TRUE)
#> GRanges object with 34 ranges and 2 metadata columns:
#>        seqnames        ranges strand |     tx_name     gene_id
#>           <Rle>     <IRanges>  <Rle> | <character> <character>
#>    [1]     chr4 649041-663103      - | FBtr0310542 FBgn0051992
#>    [2]     chr4  69653-114270      + | FBtr0309803 FBgn0085432
#>    [3]     chr4  69326-110059      + | FBtr0308615 FBgn0085432
#>    [4]     chr4  69326-109770      + | FBtr0100245 FBgn0085432
#>    [5]     chr4  69326-108694      + | FBtr0334301 FBgn0085432
#>    ...      ...           ...    ... .         ...         ...
#>   [30]     chr4 649971-659983      - | FBtr0089097 FBgn0051992
#>   [31]     chr4 184225-193489      - | FBtr0089150 FBgn0039890
#>   [32]     chr4 184225-193489      - | FBtr0089149 FBgn0039890
#>   [33]     chr4 132220-140083      + | FBtr0089447 FBgn0052000
#>   [34]     chr4 132220-140083      + | FBtr0089446 FBgn0052000
#>   -------
#>   seqinfo: 7 sequences from dm6 genome

#--------------------------------------------------#
# remove the lowest 5% and highest 5% of regions by signal, and keep scores
#--------------------------------------------------#

subsetRegionsBySignal(txs_dm6_chr4, PROseq, quantiles = c(0.05, 0.95),
                      keep.signal = TRUE)
#> GRanges object with 305 ranges and 3 metadata columns:
#>         seqnames          ranges strand |     tx_name     gene_id    Signal
#>            <Rle>       <IRanges>  <Rle> | <character> <character> <integer>
#>     [1]     chr4     42774-43374      + | FBtr0344900 FBgn0266617        59
#>     [2]     chr4     44774-46074      + | FBtr0340499 FBgn0265633        13
#>     [3]     chr4     56497-60974      + | FBtr0333704 FBgn0264617       126
#>     [4]     chr4     56497-63124      + | FBtr0333705 FBgn0264617       263
#>     [5]     chr4    90355-110059      + | FBtr0112657 FBgn0085432      1464
#>     ...      ...             ...    ... .         ...         ...       ...
#>   [301]     chr4 1192419-1196848      - | FBtr0100543 FBgn0039924       454
#>   [302]     chr4 1192419-1196848      - | FBtr0100544 FBgn0039924       454
#>   [303]     chr4 1225089-1230713      - | FBtr0100406 FBgn0027101       766
#>   [304]     chr4 1225737-1230713      - | FBtr0100402 FBgn0027101       665
#>   [305]     chr4 1225737-1230713      - | FBtr0100404 FBgn0027101       665
#>   -------
#>   seqinfo: 7 sequences from dm6 genome

Subset regions of interest by quantiles of overlapping signal

Arguments

Value

See also

Author

Examples