使用R按照指定宽度计算覆盖率

  • A+
所属分类:Script

如何使用R来按照bin(比如说每200碱基宽)来计数覆盖度并生成一个bedgraph文件呢?
这个其实就是所谓的resample问题。可以使用?tileGenome来解决。

library(GenomicRanges)
library(BSgenome.Hsapiens.UCSC.hg19) ## human
tiles <- tileGenome(seqinfo(Hsapiens), tilewidth=200, cut.last.tile.in.chrom=TRUE) ##
tail(tiles[seqnames(tiles)=="chr1"]) ##
head(tiles[seqnames(tiles)=="chr2"]) ##
library(Rsamtools)
library(GenomicAlignments)
binnedAverage <- function(bins, numvar, mcolname)
{
    stopifnot(is(bins, "GRanges"))
    stopifnot(is(numvar, "RleList"))
    stopifnot(identical(seqlevels(bins), names(numvar)))
    bins_per_chrom <- split(ranges(bins), seqnames(bins))
    means_list <- lapply(names(numvar),
        function(seqname) {
            views <- Views(numvar[[seqname]],
                           bins_per_chrom[[seqname]])
            viewMeans(views)
        })
    new_mcol <- unsplit(means_list, as.factor(seqnames(bins)))
    mcols(bins)[[mcolname]] <- new_mcol
    bins
}
 
library(rtracklayer)
bams <- dir(pattern="*.bam$")
for(bam in bams){
	dat <- readGAlignmentsFromBam(bam)
	dat <- granges(dat)
	score <- coverage(dat)
	tiles.s <- keepSeqlevels(tiles, names(score))
	bins <- binnedAverage(tiles.s, score, "score")
	export(bins, gsub(".bam", ".bedgraph", bam), "BedGraph")
}

注意到viewMeans是和rowMeans类似的一种函数,相应的,它也有viewSums, viewMins, viewMaxs等。

如果我们需要的是reads tags计数,而不是coverage,应该怎么办呢?

这个问题的解决思路是

1.将reads区分为两部分,一部分是不会跨窗口的,另外的是会跨窗口的。

2.将会跨窗口的按每个窗口分配一次,并使其的宽度设定为1.

3.将不会跨窗口的reads宽度设置为1.

4.计算coverage。

5.计算binnedSums。

library(GenomicRanges)
library(BSgenome.Mmusculus.UCSC.mm10) ## mouse
tiles <- tileGenome(seqinfo(Mmusculus), tilewidth=200, cut.last.tile.in.chrom=TRUE) ##
edges <- tiles
width(edges) <- 1
tail(tiles[seqnames(tiles)=="chr1"]) ##
head(tiles[seqnames(tiles)=="chr2"]) ##
library(Rsamtools)
library(GenomicAlignments)
binnedSums <- function(bins, numvar, mcolname)
{
    stopifnot(is(bins, "GRanges"))
    stopifnot(is(numvar, "RleList"))
    stopifnot(identical(seqlevels(bins), names(numvar)))
    bins_per_chrom <- split(ranges(bins), seqnames(bins))
    means_list <- lapply(names(numvar),
        function(seqname) {
            views <- Views(numvar[[seqname]],
                           bins_per_chrom[[seqname]])
            viewSums(views)
        })
    new_mcol <- unsplit(means_list, as.factor(seqnames(bins)))
    mcols(bins)[[mcolname]] <- new_mcol
    bins
}
 
library(rtracklayer)
bams <- dir(pattern="*.bam$")
for(bam in bams){
	dat <- readGAlignmentsFromBam(bam)
	dat <- granges(dat)
	ol <- findOverlaps(dat, edges)
	dat.ol <- dat[queryHits(ol)]
	dat.nol <- dat[-queryHits(ol)]
	dat.ols <- dat.ol ## note the bin size must larger than reads length
	width(dat.ols) <- 1
	dat.ole <- dat.ol
	start(dat.ole) <- end(dat.ole)
	width(dat.nol) <- 1
	dat <- c(dat.nol, dat.ols, dat.ole)
	dat <- sort(dat)
	score <- coverage(dat)
	tiles.s <- keepSeqlevels(tiles, names(score))
	bins <- binnedSums(tiles.s, score, "score")
	bins <- bins[bins$score>0]
	export(bins, gsub(".bam", ".bedgraph", bam), "BedGraph")
}

原文来自:http://pgfe.umassmed.edu/ou/archives/3709

 

ybzhao

发表评论

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen: