pysam操作bam文件统计unique reads和mapped reads高级技巧
1. Linux服务器读取bam文件
服务器查看bam常用方法。
# bam_path: bam文件路径
samtools view -h bam_path|grep -v '^@'|less -S
2. samtools + python os库读取bam文件
缺点速度较慢。
import os
# 读取bam
# bam_path: bam文件路径
read_lines = os.popen("samtools view {} -h".format(bam_path)).readlines()
# 遍历bam文件每行
for index, line in enumerate(read_lines):
# 去除末尾\n
line = line.strip()
# 过滤@开头行
if line.startswith('@'):
continue
else:
# 打印关键信息
print(index, '\t', line)
3. 使用pysam对bam进行读取操作
速度快,需要注意:samtools的坐标是1-base,而pysam坐标是0-base。
import pysam
bam_path = "/path/sample.bam"
###### 方法1 ######
# 读取bam文件
bam_file = pysam.AlignmentFile(bam_path, 'rb')
# 读取sam文件,可直接读取
# bam_file = pysam.AlignmentFile(bam_path, 'r')
# 读取cram文件
# bam_file = pysam.AlignmentFile(bam_path, 'rc')
for line in bam_file:
record = line
print(record)
break
# 关闭bam
bam_file.close()
###### 方法2 ######
# 无需close关闭
with pysam.AlignmentFile(bam_path, 'rb') as bam_file :
for line in bam_file :
print(line )
break
4. 使用pysam对bam进行写入操作
bam_path = "/path/sample.bam"
bam_out_path = "/path/sample-out.bam"
with pysam.AlignmentFile(bam_out_path, 'wb', template=bam_file) as write_bam_file:
# 写入读取的record至bam文件末尾
write_bam_file.write(record)
5. 使用pysam检查bam index是否存在
# 检查是否存在bam index, 存在返回True, 否则为False
bam_file.check_index()
# 与samtools联用, 不存在index则samtools index生成
if not bam_file.check_index():
os.system("samtools index {}".format(bam_path))
6. 使用pysam对bam进行排序
bam_out_path = "/path/sample.sorted.bam"
# 对bam进行sort排序
pysam.sort("-o", bam_sort_path, bam_path)
7. 使用pysam提取基因组某个区域的比对结果
region = bam_file.fetch(contig='chr1', start=60000, stop=60200)
print(region)
8. 使用pysam提取基因组某个区域的reads数量
# 返回reads数目值
region_reads_count = bam_file.count(contig='chr1', start=500000, stop=1000000)
print(region_reads_count)
# 56604
9. 使用pysam提取基因组某个区域的每个碱基覆盖深度
# 返回区间每个碱基的的覆盖深度
region_coverage = bam_file.count_coverage(contig='chr5', start=5000000, stop=5000100)
print(region_coverage)
10. 使用pysam获取bam文件中每条染色体的mapped和unmapped reads数量
bam_file.get_index_statistics()
# [IndexStats(contig='chr1', mapped=3115437, unmapped=3233, total=3118670),
# IndexStats(contig='chr2', mapped=2426504, unmapped=2287, total=2428791),
# IndexStats(contig='chr3', mapped=2092895, unmapped=1806, total=2094701),
# IndexStats(contig='chr4', mapped=1189202, unmapped=1058, total=1190260),
# IndexStats(contig='chr5', mapped=1357775, unmapped=1167, total=1358942),
# IndexStats(contig='chr6', mapped=1772859, unmapped=1572, total=1774431),
# IndexStats(contig='chr7', mapped=1484016, unmapped=1528, total=1485544),
# IndexStats(contig='chr8', mapped=1122131, unmapped=1034, total=1123165),
# IndexStats(contig='chr9', mapped=1341301, unmapped=1486, total=1342787),
# IndexStats(contig='chr10', mapped=1046607, unmapped=909, total=1047516),
# IndexStats(contig='chr11', mapped=1731600, unmapped=1797, total=1733397),
# IndexStats(contig='chr12', mapped=1829623, unmapped=1534, total=1831157),
# IndexStats(contig='chr13', mapped=568294, unmapped=420, total=568714),
# IndexStats(contig='chr14', mapped=1013704, unmapped=907, total=1014611),
# IndexStats(contig='chr15', mapped=1270915, unmapped=1089, total=1272004),
# IndexStats(contig='chr16', mapped=1937126, unmapped=2096, total=1939222),
# IndexStats(contig='chr17', mapped=2175991, unmapped=2195, total=2178186),
# IndexStats(contig='chr18', mapped=547370, unmapped=530, total=547900),
# IndexStats(contig='chr19', mapped=1790984, unmapped=2367, total=1793351),
# IndexStats(contig='chr20', mapped=693052, unmapped=775, total=693827),
# IndexStats(contig='chr21', mapped=421361, unmapped=496, total=421857),
# IndexStats(contig='chr22', mapped=967377, unmapped=1140, total=968517),
# IndexStats(contig='chrX', mapped=2187343, unmapped=1680, total=2189023),
# IndexStats(contig='chrY', mapped=35027, unmapped=27, total=35054),
# IndexStats(contig='chrMT', mapped=1334891, unmapped=1494, total=1336385)]
11. 使用pysam获取bam文件每条染色体长度
# 比对上reads数量
bam_file.mapped
# 未比对上的reads数量
bam_file.unmapped
###################
chrom_names = bam_file.references
chrom_lengths = bam_file.lengths
# 打印数据类型
print(type(chrom_names))
# <class 'tuple'>
# zip压缩2个元组
for chrom, length in zip(chrom_names, chrom_lengths):
print(chrom, ':', length)
# chr1 : 249250621
# chr2 : 243199373
# chr3 : 198022430
# chr4 : 191154276
# chr5 : 180915260
# chr6 : 171115067
# chr7 : 159138663
# chr8 : 146364022
# chr9 : 141213431
# chr10 : 135534747
# chr11 : 135006516
# chr12 : 133851895
# chr13 : 115169878
# chr14 : 107349540
# chr15 : 102531392
# chr16 : 90354753
# chr17 : 81195210
# chr18 : 78077248
# chr19 : 59128983
# chr20 : 63025520
# chr21 : 48129895
# chr22 : 51304566
# chrX : 155270560
# chrY : 59373566
# chrMT : 16569
12. 使用pysam操作bam文件用法
with pysam.AlignmentFile(bam_path, 'rb') as bam_file:
# bam_file对象每一行都是一条reads
for reads in bam_file:
# 打印与reads配对的另一条reads
line_match = bam_file.mate(reads)
print(line_match)
# 判断是否是PCR重复序列
print(reads.is_duplicate)
# 判断是否是成对reads
print(reads.is_paired)
# 判断是否是左端reads
print(reads.is_read1)
# 判断是否是右端reads
print(reads.is_read2)
# 判断是否是质控失败
print(reads.is_qcfail)
# 判断是否比对到参考基因组
print(reads.is_unmapped)
# 输出reads的比对质量
print(reads.mapping_quality)
# 输出reads比对到参考基因组上的起始和结束坐标
print(reads.get_blocks())
# [(10179, 10206), (10206, 10227), (10228, 10273), (10273, 10297), (10297, 10303), (10303, 10325)]
# 输出在指定区域坐标的重叠区域长度
print(reads.get_overlap(500000,600000))
# 0
# 输出reads在比对到参考基因组的上参考序列
print(reads.get_reference_sequence())
# tAACCCTAACCCTAACCCTAACCCTAACCCTAACCCtAACCCtAACCCTAAcCCCtAACCCtAACCCTAAACCCtAAACCCTAACCCTAACCCTAACCCtAACCCtAACCCcAACCCCAACCCCAaCCCCAACCCcAACCCcAACC
break
13. 使用pysam计算指定区域的unique mapped的reads数量
# 计算指定区域的unique mapping的reads数目
import pysam
#### 输入参数 ####
chr='chr16'
start=1250000
stop=1260000
bam_file = pysam.AlignmentFile(bam_path, 'rb')
# 定义集合,避免reads重复,其大小就是unique reads的数量
region_reads = set()
count = 0
for read in bam_file.fetch(chr, start, stop):
region_reads.add(read.query_name)
count = count +1
# 打印结果
print('{} unique reads in {}:{}-{}'.format(len(region_reads), chr, start, stop))
print('{} alignment in {}:{}-{}'.format(count, chr, start, stop))
# 4778 unique reads in chr16:1250000-1260000
# 9514 alignment in chr16:1250000-1260000
14. 使用pysam计算非重叠区间(窗口)的unique mapping的 reads数量
import pysam
# 设置统计窗口大小100kb
bin_size = 100000
bam_file = pysam.AlignmentFile(bam_path, "rb")
print(bam_file.references)
print(bam_file.lengths)
# 将结果写入文件
with open("bam.window.reads.count", 'w') as fwrite:
# 写入header
fwrite.write("ref\tstart\tstop\tunique_reads\tmapped_reads\n")
# 遍历bam文件中比对到参考基因组上的染色体名称
for i in range(len(bam_file.references)):
# 获取比对到参考基因组上的染色体名称和长度
refname = bam_file.references[i]
seqlen = bam_file.lengths[i]
# 以bin_size窗口大小将染色体分割为j个窗口
mapped_count = 0
for j in range(1, seqlen, bin_size):
# pysam为0-base, 坐标需-1
# 获取stop坐标, j为start坐标
if j + bin_size - 1 < bam_file.lengths[i]:
stop = j + bin_size - 1
else:
stop = bam_file.lengths[i]
region_reads = set()
# 获取unique_reads, mapped_reads
for read in bam_file.fetch(refname, j, stop):
region_reads.add(read.query_name)
mapped_count += 1
# 写入文件
fwrite.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(refname, j, stop, len(region_reads), mapped_count))