Multiple genome alignment

生信百科 · 公众号 · 医学 · 2017-06-16 07:38

正文

今天爆笑钮出差了，由我代发。

多个全基因组比对是个比较麻烦的事儿，基因组太大难免会带来各种比对错误。[Lastz] (http://www.bx.psu.edu/~rsharris/lastz/lastz-1.04.00.tar.gz) 是最近比较流行的方法用于全基因组比对。但是结果整理有点小复杂。

这里我们打算alignment 13个oryza属的choloraplast genome, 开始的时候打算用muscle直接进行全基因组比对，但是不知什么原因老是报错没法今进行。没办法只能试一下lastz了。

基本思路是：选一个标准序列，将其余12个genome分别和这个序列比对，然后提取具有多态的位点进行合并(目前只提取了SNP，indel有点麻烦，后面再update）

1. 准备fasta文件

将需要比对的不同基因组分别存为fasta格式文件

2. 分别比对

lastz基本比对命令为:

(输出格式为maf)

lastz Oryza_rufipogon.fa Oryza_longistaminata.fa --notransition --step=20 --nogapped --format=maf > ruf_longistaminata.maf

提供一个python程序，进行两两比对：

#!/usr/bin/python

# coding=utf-8

#python MultGenAln.py [fa folderName]

import glob,os,sys

Lst = glob.glob(sys.argv[1]+"*.fa")
ref = "Oryza_rufipogon.fa"refFa = ref.split("/")[-1]
faLst = []
for i in Lst:
    _fa = i.split("/")[-1]for fa in Lst:    
    if fa != ref:
        cmd = "lastz %s %s --notransition --step=20 --nogapped --format=maf > %s.maf"%(ref,fa,"ref_%s"%fa.split("_")[-1].split(".")[0])
        os.system(cmd)        
        print fa

3. 提取polymorphism位点

#!/usr/bin/python

# coding=utf-8

#python extractPoly.py [mafFile] [outHpFile]

import sys, re

maf = open(sys.argv[1],'rU')

out = open(sys.argv[2],'w')
for line in maf:    
if re.search(r"^s", line) and not re.search(r"rufipogon", line):
        acc = line.rstrip().split()[1]        
        break
maf.seek(0,0)

head = "Position\tO.rufipogon\t%s\n"%acc
out.write(head)
tagNum = 0
tag = 0
tag2 = 0
for line in maf:    
if re.search(r"^a",line):
        tag = 0
        tag2 = 0elif re.search(r"^s\ O\.rufipogon",line):
        tag = 1
        _line = line.rstrip().split()
        start_ruf = int(_line[2])
        end_ruf = int(_line[3])+1-start_ruf
        ruf = _line[6]    
elif re.search(r"^s",line) and not re.search(r"rufipogon",line):
        tag2 = 1
        _line = line.rstrip().split()
        seq = _line[6]    
else:
        tag = 3
        if tag == 1 and tag2 == 1:
            seqTag = []        
for i in range(len(ruf)):            
    if ruf[i]!=seq[i]:
                seqTag.append(ruf[i])
                count = seqTag.count("-")
                out.write(str(start_ruf+i-count)+"\t"+ruf[i]+"\t"+seq[i]+"\n")
        tagNum += 1
        print tagNum
maf.close()
out.close()

4. 将所有hp文件进行合并

#!/usr/bin/python

# coding=utf-8

# python mergeHp.py

import glob

lst = glob.glob("./*_*")

out = open("merged.hp",'w')

dic = {}
keys = []
for Lst in lst:
    Name = Lst.split("_")[1]    
    print Name
    dic[Name]={}
    f = open(Lst,'rU')
    f.readline()    
    for line in f:
        _line = line.rstrip().split("\t")        
        if _line[1]!="-":
            pos = "_".join(_line[:2])
            keys.append(pos)
            dic[Name][pos]=_line[-1]
    f.close()    
    print f
def sortNum(x):
    return int(x.split("_")[0])
keys = list(set(keys))
keys = sorted(keys,key=sortNum)#print keys

head = "Position\tO.rufipogon"
for i in lst:
    Name = i.split("_")[-1]
    head = head+"\t"+Name

out.write(head+"\n")
for i in keys:
    line = i.split("_")    
    for key in lst:
        key = key.split("_")[-1]        
        if i in dic[key].keys():
            line.append(dic[key][i])        
        else:
            line.append(line[1])
    _line = "\t".join(line)
    _line = _line+"\n"
    out.write(_line)    
    print _line
out.close()

欢迎关注分享：

QQ群：生信百科 575383226