1.序列比较算法(全局序列比对及局部序列比对的python实现)
前言算法思想介绍实现功能及实现方法运行结果演示源代码遇到的问题及总结
前言
阶段性地完成了DNA序列比较算法,还有很多不足和需要完善的地方有待日后改进。
算法思想介绍
一个很详细完整的算法介绍
双序列全局比对及算法 Needleman-Wunsch 算法:动态规划法 输入值:两条序列、替换记分矩阵以确定不同字母间的相似度得分,以及空位罚分
双序列局部比对算法 局部比对的计算公式在全局比对的基础上增加了第四个元素“0”。得分矩阵初始值仍是0,但第一行和第一列与全局比对不同,全是0。
实现功能及实现方法
使用已定义的记分矩阵 设置需比对的序列、gap大小及要进行的比对选择打印最高得分的序列比对结果 方法:倒序查找最终路进行序列比对打印得分矩阵及比对路径 方法:使用递归和栈记录最终路径
运行结果演示
双序列全局比对
输入序列:atcggtac;aatcgta 输入序列:atcggt;aacg
双序列局部比对
输入序列:atccga;tcga 输入序列:acgtc;cg
源代码
from numpy
import *
import copy
from matplotlib
import pyplot
as plt
from pandas
import *
def GlobalScoreMatrix(m
,n
,w
,replace
,s
,path
,senquence1
,senquence2
,gap
):
for i
in range(m
):
for j
in range(n
):
if i
== 0 and j
== 0:
s
[i
][j
] = 0
elif i
-1 < 0:
s
[i
][j
] = s
[i
][j
- 1] + gap
path
[i
,j
,0] = 1
elif j
-1 < 0:
s
[i
][j
] = s
[i
- 1][j
] + gap
path
[i
,j
,1] = 1
else:
s
[i
][j
] = max(s
[i
- 1][j
- 1] + w
[replace
[senquence2
[i
- 1]]][replace
[senquence1
[j
- 1]]],
s
[i
- 1][j
] + gap
, s
[i
][j
- 1] + gap
)
if s
[i
- 1][j
- 1] + w
[replace
[senquence2
[i
- 1]]][replace
[senquence1
[j
- 1]]] == s
[i
][j
]:
path
[i
,j
,2] = 1
if s
[i
- 1][j
] + gap
== s
[i
][j
]:
path
[i
,j
,1] = 1
if s
[i
][j
- 1] + gap
== s
[i
][j
]:
path
[i
,j
,0] = 1
def LocalScoreMatrix(m
,n
,w
,replace
,s
,path
,senquence1
,senquence2
,gap
):
for i
in range(1,m
):
for j
in range(1,n
):
s
[i
][j
] = max(0,s
[i
- 1][j
- 1] + w
[replace
[senquence2
[i
- 1]]][replace
[senquence1
[j
- 1]]],
s
[i
- 1][j
] + gap
, s
[i
][j
- 1] + gap
)
if s
[i
- 1][j
- 1] + w
[replace
[senquence2
[i
- 1]]][replace
[senquence1
[j
- 1]]] == s
[i
][j
]:
path
[i
,j
,2] = 1
if s
[i
- 1][j
] + gap
== s
[i
][j
]:
path
[i
,j
,1] = 1
if s
[i
][j
- 1] + gap
== s
[i
][j
]:
path
[i
,j
,0] = 1
def FindGlobalPath(i
,j
,path
,OnePath
,LastGlobalPath
):
if i
== 0 and j
== 0:
OnePath
.append
((i
, j
))
LastGlobalPath
.append
(copy
.deepcopy
(OnePath
))
OnePath
.pop
()
else:
for k
in range(3):
if path
[i
,j
,k
] == 1:
if k
== 0:
OnePath
.append
((i
,j
))
FindGlobalPath
(i
,j
- 1,path
,OnePath
,LastGlobalPath
)
OnePath
.pop
()
elif k
== 1:
OnePath
.append
((i
, j
))
FindGlobalPath
(i
- 1, j
, path
,OnePath
,LastGlobalPath
)
OnePath
.pop
()
else:
OnePath
.append
((i
, j
))
FindGlobalPath
(i
- 1, j
- 1, path
,OnePath
,LastGlobalPath
)
OnePath
.pop
()
def FindLocalPath(i
, j
, path
, OnePath
, LastLocalPath
):
if all(path
[i
][j
] == [0, 0, 0]):
OnePath
.append
((i
, j
))
LastLocalPath
.append
(copy
.deepcopy
(OnePath
))
OnePath
.pop
()
else:
for k
in range(3):
if path
[i
, j
, k
] == 1:
if k
== 0:
OnePath
.append
((i
, j
))
FindLocalPath
(i
, j
- 1, path
, OnePath
, LastLocalPath
)
OnePath
.pop
()
elif k
== 1:
OnePath
.append
((i
, j
))
FindLocalPath
(i
- 1, j
, path
, OnePath
, LastLocalPath
)
OnePath
.pop
()
else:
OnePath
.append
((i
, j
))
FindLocalPath
(i
- 1, j
- 1, path
, OnePath
, LastLocalPath
)
OnePath
.pop
()
def ShowContrastResult(LastPath
,senquence1
,senquence2
):
for k
,aPath
in enumerate(LastPath
):
rowS
= ''
colS
= ''
for i
in range(len(aPath
) -1,0,-1):
if aPath
[i
][0] == aPath
[i
- 1][0]:
rowS
+= senquence1
[aPath
[i
- 1][1] - 1]
colS
+= '-'
elif aPath
[i
][1] == aPath
[i
- 1][1]:
colS
+= senquence2
[aPath
[i
- 1][0] -1]
rowS
+= '-'
else:
rowS
+= senquence1
[aPath
[i
- 1][1] - 1]
colS
+= senquence2
[aPath
[i
- 1][0] - 1]
print("======比对结果",k
+1,"======")
print("序列1:",rowS
)
print("序列2:",colS
)
def judgePath(point
, LastPath
):
for aPath
in LastPath
:
if point
in aPath
:
return True
return False
def ShowPaths(senquence1
, senquence2
, LastPath
):
s1
= "0" + senquence1
s2
= "0" + senquence2
col
= list(s1
)
row
= list(s2
)
fig
= plt
.figure
(figsize
=(5, 5))
ax
= fig
.add_subplot
(111, frameon
=True, xticks
=[], yticks
=[], )
the_table
= plt
.table
(cellText
=s
, rowLabels
=row
, colLabels
=col
, rowLoc
='right',
loc
='center', cellLoc
='bottom right', bbox
=[0, 0, 1, 1])
the_table
.set_fontsize
(8)
for i
in range(m
):
for j
in range(n
):
for k
in range(3):
if path
[i
, j
, k
] == 1:
if k
== 0:
if judgePath
((i
, j
), LastPath
):
plt
.annotate
('', xy
=(j
/ n
, (2 * m
- 2 * i
- 1) / (2 * (m
+ 1))),
xytext
=((2 * j
+ 1) / (2 * n
), (2 * m
- 2 * i
- 1) / (2 * (m
+ 1))),
arrowprops
=dict(arrowstyle
="->", color
='r', connectionstyle
="arc3"))
else:
plt
.annotate
('', xy
=(j
/ n
, (2 * m
- 2 * i
- 1) / (2 * (m
+ 1))),
xytext
=((2 * j
+ 1) / (2 * n
), (2 * m
- 2 * i
- 1) / (2 * (m
+ 1))),
arrowprops
=dict(arrowstyle
="->", connectionstyle
="arc3"))
elif k
== 1:
if judgePath
((i
, j
), LastPath
):
plt
.annotate
('', xy
=((2 * j
+ 1) / (2 * n
), (2 * m
- 2 * i
- 1) / (2 * (m
+ 1))),
xytext
=((2 * j
+ 1) / (2 * n
), (m
- i
) / (m
+ 1)),
arrowprops
=dict(arrowstyle
="<-", color
='r', connectionstyle
="arc3"))
else:
plt
.annotate
('', xy
=((2 * j
+ 1) / (2 * n
), (2 * m
- 2 * i
- 1) / (2 * (m
+ 1))),
xytext
=((2 * j
+ 1) / (2 * n
), (m
- i
) / (m
+ 1)),
arrowprops
=dict(arrowstyle
="<-", connectionstyle
="arc3"))
elif k
== 2:
if judgePath
((i
, j
), LastPath
):
plt
.annotate
('', xy
=((2 * j
+ 1) / (2 * n
), (2 * m
- 2 * i
- 1) / (2 * (m
+ 1))),
xytext
=(j
/ n
, (m
- i
) / (m
+ 1)),
arrowprops
=dict(arrowstyle
="<-", color
='r', connectionstyle
="arc3"))
else:
plt
.annotate
('', xy
=((2 * j
+ 1) / (2 * n
), (2 * m
- 2 * i
- 1) / (2 * (m
+ 1))),
xytext
=(j
/ n
, (m
- i
) / (m
+ 1)),
arrowprops
=dict(arrowstyle
="<-", connectionstyle
="arc3"))
plt
.show
()
replace
= {'A':0,'G':1,'C':2,'T':3}
w
= [[10,-1,-3,-4],[-1,7,-5,-3],[-3,-5,9,0],[-4,-3,0,8]]
senquence1
= input("请输入序列1:").upper
()
senquence2
= input("请输入序列2:").upper
()
gap
= int(input("请输入gap:"))
choise
= int(input("请选择要进行的序列比对(1-全局序列比对 2-局部序列比对) : "))
m
= len(senquence2
) + 1
n
= len(senquence1
) + 1
s
= zeros
((m
,n
))
path
= zeros
((m
,n
,3))
OnePath
= []
LastGlobalPath
= []
LastLocalPath
= []
if choise
== 1:
GlobalScoreMatrix
(m
,n
,w
,replace
,s
,path
,senquence1
,senquence2
,gap
)
FindGlobalPath
(m
-1,n
-1,path
,OnePath
,LastGlobalPath
)
ShowContrastResult
(LastGlobalPath
,senquence1
,senquence2
)
ShowPaths
(senquence1
, senquence2
, LastGlobalPath
)
elif choise
== 2:
LocalScoreMatrix
(m
, n
, w
, replace
, s
, path
, senquence1
, senquence2
, gap
)
row
= where
(s
== np
.max(s
))[0]
col
= where
(s
== np
.max(s
))[1]
for i
in range(len(row
)):
FindLocalPath
(row
[i
], col
[i
], path
, OnePath
, LastLocalPath
)
ShowContrastResult
(LastLocalPath
, senquence1
, senquence2
)
ShowPaths
(senquence1
, senquence2
, LastLocalPath
)
else:
print("无效选择!")
遇到的问题及总结
思维误区: 最初在存储最终路径的问题上,认为在每次路径递归至初始结点时才将其入栈,导致遇到分岔路则无法记录前一条路的完整路径 经过高人指点后解决:每次到达一个结点就将其入栈,递归结束后将其出栈
思维误区2:最初以采用存储每个点的前一点坐标为存储方式,导致所有得分矩阵只能存储一条路径 再次经过高人指点后解决:改存储方式为存储每个点计算得来的方向
递归终止条件存储最终路径 -涉及问题:list的赋值、浅拷贝、深拷贝 Python List的赋值方法
画出路径矩阵表格 -涉及问题:表格与画布大小不一致且导致无法确定箭头坐标 解决方式:使用bbox参数
获取得分矩阵中最大值的索引 - 涉及问题:获取where结果的索引值
局部比对递归终止条件 - 涉及问题:列表比较 any() all()
总结 这次的作业因为拖延很晚才开始,遇到的问题也绝不仅仅只有以上几个,最深刻的还是最开始的思维误区,直接卡到崩溃,但其他问题也都耗费了或多或少的时间才解决,更有无数小问题调试了无数遍才得以做出这个半成品。其实还有很多待完善的地方,比如输入的字符判断,比如一致度和相似度的计算,比如图形化界面的编写,写出来的代码也还有很多不成熟的地方,但是因为时间问题,暂时就到这了,有时间再改进咯。