生信分析进阶3 - pysam操作bam文件统计unique reads和mapped reads高级技巧合辑

pysam操作bam文件统计unique reads和mapped reads高级技巧

1. Linux服务器读取bam文件

服务器查看bam常用方法。

# bam_path： bam文件路径
samtools view -h bam_path|grep -v '^@'|less -S

samtools view查看

2. samtools + python os库读取bam文件

缺点速度较慢。

import os

# 读取bam
# bam_path： bam文件路径
read_lines = os.popen("samtools view {} -h".format(bam_path)).readlines()

# 遍历bam文件每行
for index, line in enumerate(read_lines):
    # 去除末尾\n
    line = line.strip()
    
    # 过滤@开头行
    if line.startswith('@'):
        continue
    else:
        # 打印关键信息
        print(index, '\t', line)

3. 使用pysam对bam进行读取操作

速度快，需要注意：samtools的坐标是1-base，而pysam坐标是0-base。

import pysam

bam_path = "/path/sample.bam"

######  方法1 ###### 
# 读取bam文件
bam_file = pysam.AlignmentFile(bam_path, 'rb')
# 读取sam文件，可直接读取
# bam_file = pysam.AlignmentFile(bam_path, 'r')
# 读取cram文件
# bam_file = pysam.AlignmentFile(bam_path, 'rc')

for line in bam_file:
    record = line
    print(record)
    break

# 关闭bam
bam_file.close()

###### 方法2 ###### 
# 无需close关闭
with pysam.AlignmentFile(bam_path, 'rb') as bam_file :
	for line in bam_file :
		print(line )
		break

打印record

4. 使用pysam对bam进行写入操作

bam_path = "/path/sample.bam"
bam_out_path = "/path/sample-out.bam"

with pysam.AlignmentFile(bam_out_path, 'wb', template=bam_file) as write_bam_file:
   # 写入读取的record至bam文件末尾
	write_bam_file.write(record)

5. 使用pysam检查bam index是否存在

# 检查是否存在bam index, 存在返回True, 否则为False
bam_file.check_index()

# 与samtools联用, 不存在index则samtools index生成
if not bam_file.check_index():
    os.system("samtools index {}".format(bam_path))

6. 使用pysam对bam进行排序

bam_out_path = "/path/sample.sorted.bam"
# 对bam进行sort排序
pysam.sort("-o", bam_sort_path, bam_path)

7. 使用pysam提取基因组某个区域的比对结果

region = bam_file.fetch(contig='chr1', start=60000, stop=60200)
print(region)

8. 使用pysam提取基因组某个区域的reads数量

# 返回reads数目值
region_reads_count = bam_file.count(contig='chr1', start=500000, stop=1000000)
print(region_reads_count)
# 56604

9. 使用pysam提取基因组某个区域的每个碱基覆盖深度

# 返回区间每个碱基的的覆盖深度
region_coverage = bam_file.count_coverage(contig='chr5', start=5000000, stop=5000100)
print(region_coverage)

返回值

10. 使用pysam获取bam文件中每条染色体的mapped和unmapped reads数量

bam_file.get_index_statistics()
# [IndexStats(contig='chr1', mapped=3115437, unmapped=3233, total=3118670),
#  IndexStats(contig='chr2', mapped=2426504, unmapped=2287, total=2428791),
#  IndexStats(contig='chr3', mapped=2092895, unmapped=1806, total=2094701),
#  IndexStats(contig='chr4', mapped=1189202, unmapped=1058, total=1190260),
#  IndexStats(contig='chr5', mapped=1357775, unmapped=1167, total=1358942),
#  IndexStats(contig='chr6', mapped=1772859, unmapped=1572, total=1774431),
#  IndexStats(contig='chr7', mapped=1484016, unmapped=1528, total=1485544),
#  IndexStats(contig='chr8', mapped=1122131, unmapped=1034, total=1123165),
#  IndexStats(contig='chr9', mapped=1341301, unmapped=1486, total=1342787),
#  IndexStats(contig='chr10', mapped=1046607, unmapped=909, total=1047516),
#  IndexStats(contig='chr11', mapped=1731600, unmapped=1797, total=1733397),
#  IndexStats(contig='chr12', mapped=1829623, unmapped=1534, total=1831157),
#  IndexStats(contig='chr13', mapped=568294, unmapped=420, total=568714),
#  IndexStats(contig='chr14', mapped=1013704, unmapped=907, total=1014611),
#  IndexStats(contig='chr15', mapped=1270915, unmapped=1089, total=1272004),
#  IndexStats(contig='chr16', mapped=1937126, unmapped=2096, total=1939222),
#  IndexStats(contig='chr17', mapped=2175991, unmapped=2195, total=2178186),
#  IndexStats(contig='chr18', mapped=547370, unmapped=530, total=547900),
#  IndexStats(contig='chr19', mapped=1790984, unmapped=2367, total=1793351),
#  IndexStats(contig='chr20', mapped=693052, unmapped=775, total=693827),
#  IndexStats(contig='chr21', mapped=421361, unmapped=496, total=421857),
#  IndexStats(contig='chr22', mapped=967377, unmapped=1140, total=968517),
#  IndexStats(contig='chrX', mapped=2187343, unmapped=1680, total=2189023),
#  IndexStats(contig='chrY', mapped=35027, unmapped=27, total=35054),
#  IndexStats(contig='chrMT', mapped=1334891, unmapped=1494, total=1336385)]

11. 使用pysam获取bam文件每条染色体长度

# 比对上reads数量
bam_file.mapped

# 未比对上的reads数量
bam_file.unmapped

###################
chrom_names = bam_file.references
chrom_lengths = bam_file.lengths

# 打印数据类型
print(type(chrom_names))
# <class 'tuple'>

# zip压缩2个元组
for chrom, length in zip(chrom_names, chrom_lengths):
    print(chrom, ':', length)

# chr1 : 249250621
# chr2 : 243199373
# chr3 : 198022430
# chr4 : 191154276
# chr5 : 180915260
# chr6 : 171115067
# chr7 : 159138663
# chr8 : 146364022
# chr9 : 141213431
# chr10 : 135534747
# chr11 : 135006516
# chr12 : 133851895
# chr13 : 115169878
# chr14 : 107349540
# chr15 : 102531392
# chr16 : 90354753
# chr17 : 81195210
# chr18 : 78077248
# chr19 : 59128983
# chr20 : 63025520
# chr21 : 48129895
# chr22 : 51304566
# chrX : 155270560
# chrY : 59373566
# chrMT : 16569

12. 使用pysam操作bam文件用法


with pysam.AlignmentFile(bam_path, 'rb') as bam_file:
    # bam_file对象每一行都是一条reads
    for reads in bam_file:
        # 打印与reads配对的另一条reads
        line_match = bam_file.mate(reads)
        print(line_match)

        # 判断是否是PCR重复序列
        print(reads.is_duplicate)

        # 判断是否是成对reads
        print(reads.is_paired)

        # 判断是否是左端reads
        print(reads.is_read1)

        # 判断是否是右端reads
        print(reads.is_read2)

        # 判断是否是质控失败
        print(reads.is_qcfail)

        # 判断是否比对到参考基因组
        print(reads.is_unmapped)

        # 输出reads的比对质量
        print(reads.mapping_quality)

        # 输出reads比对到参考基因组上的起始和结束坐标
        print(reads.get_blocks())
        # [(10179, 10206), (10206, 10227), (10228, 10273), (10273, 10297), (10297, 10303), (10303, 10325)]

        # 输出在指定区域坐标的重叠区域长度
        print(reads.get_overlap(500000,600000))
        # 0
        
        # 输出reads在比对到参考基因组的上参考序列
        print(reads.get_reference_sequence())
        # tAACCCTAACCCTAACCCTAACCCTAACCCTAACCCtAACCCtAACCCTAAcCCCtAACCCtAACCCTAAACCCtAAACCCTAACCCTAACCCTAACCCtAACCCtAACCCcAACCCCAACCCCAaCCCCAACCCcAACCCcAACC
        
        break

13. 使用pysam计算指定区域的unique mapped的reads数量

# 计算指定区域的unique mapping的reads数目
import pysam

#### 输入参数 ####
chr='chr16'
start=1250000
stop=1260000

bam_file = pysam.AlignmentFile(bam_path, 'rb')

# 定义集合，避免reads重复，其大小就是unique reads的数量
region_reads = set()
count = 0

for read in bam_file.fetch(chr, start, stop):
    region_reads.add(read.query_name)
    count = count +1

# 打印结果
print('{} unique reads in {}:{}-{}'.format(len(region_reads), chr, start, stop))
print('{} alignment in {}:{}-{}'.format(count, chr, start, stop))

# 4778 unique reads in chr16:1250000-1260000
# 9514 alignment in chr16:1250000-1260000

14. 使用pysam计算非重叠区间（窗口）的unique mapping的 reads数量

import pysam

# 设置统计窗口大小100kb
bin_size = 100000

bam_file = pysam.AlignmentFile(bam_path, "rb")
print(bam_file.references)
print(bam_file.lengths)

# 将结果写入文件
with open("bam.window.reads.count", 'w') as fwrite:
    # 写入header
    fwrite.write("ref\tstart\tstop\tunique_reads\tmapped_reads\n")
    
    # 遍历bam文件中比对到参考基因组上的染色体名称
    for i in range(len(bam_file.references)):
        
        # 获取比对到参考基因组上的染色体名称和长度
        refname = bam_file.references[i]
        seqlen = bam_file.lengths[i]
    
        # 以bin_size窗口大小将染色体分割为j个窗口
        mapped_count = 0
        for j in range(1, seqlen, bin_size):
            
            # pysam为0-base, 坐标需-1
            # 获取stop坐标， j为start坐标
            if j + bin_size - 1 < bam_file.lengths[i]:
                stop = j + bin_size - 1
            else:
                stop = bam_file.lengths[i]
                
            region_reads = set()
            # 获取unique_reads, mapped_reads
            for read in bam_file.fetch(refname, j, stop):
                region_reads.add(read.query_name)
                mapped_count += 1

            # 写入文件
            fwrite.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(refname, j, stop, len(region_reads), mapped_count))