【精选】生物信息学（4）

您所在的位置：网站首页 › blast序列比对原理 › 【精选】生物信息学（4）

【精选】生物信息学（4）

2023-11-07 08:16| 来源: 网络整理| 查看: 265

生物信息学系列博客索引

生物信息学（1）——双序列比对之Needleman-Wunsch(NW)算法详解及C++实现生物信息学（2）——双序列比对之Smith-Waterman(SW)算法详解生物信息学（3）——双序列比对之BLAST算法简介生物信息学（4）——多序列比对之CLUSTAL算法详解及C++实现生物信息学（5）——基于CUDA的多序列比对并行算法的设计与代码实现项目gitee地址，内附源码、论文等文档信息

1. CLUSTAL简介

CLUSTAL算法由 Feng 和 Doolittle等人于1987年提出，是一个渐进比对算法。渐进比对算法的基本思想是重复地利用双序列比对算法, 先由两个序列的比对开始, 逐渐添加新序列, 直到一个序列簇中的所有序列都加入为止。但是不同的添加顺序会产生不同的比对结果，因此, 确定合适的比对顺序是渐进比对算法的一个关键问题。而两个序列越相似，就越能获取到高的比对效果，因此, 整个序列的比对应该从最相似的两个序列开始。

2. CLUSTAL算法过程详解 2.1 两两比对

构建一个n×n(n为序列簇中序列数量)的矩阵，横坐标与纵坐标均为整个序列簇成员，然后将序列两两比对，即用双序列比对算法计算矩阵的右上三角区域，记录两序列比对的相似度与结果。本博客使用生物信息学（1）介绍的NW算法进行两两比对。设有五个序列ABCDE,则可以构建如下图的矩阵，两两比对，获取得分在这里插入图片描述

2.2 构建向导树

CLUSTAL算法是利用邻接法，根据矩阵构建向导树，而我将其简化成将矩阵的右上三角元素入队，然后根据序列比对相似度排序，这样做更容易理解一些。在这里插入图片描述

2.3 渐近比对

根据向导树顺序或者队列出队顺序，逐步比对，比对规则为:如果序列 1 和序列 2 均不在已经添加的序列当中，默认序列 1 与 A 序列进行比对，并调整，序列 1 或序列 2 在已经添加的序列之中，以在添加序列的序列为基准，进行调整。

调整指的是已在最终多序列比对序列中的基准 A1 序列和待规划序列中的 A2 序列进行调整的过程。A1 与 A2 序列如下图 3-3，内容相同，中间会因为序列比对产生的不规律的’-’符，目的是将两个序列通过添加补充’-’达到最简完全相同。在这里插入图片描述

3. 运行结果

在这里插入图片描述

4. CLUSTAL算法C++实现 #include #include #include #include #include #include #include #include //声明命名空间std using namespace std; #define MATCH 1 #define DIS_MATCH -1 #define INDEL -3 #define INDEL_CHAR '-' class ResUnit { //一次双序列比对后的结果 public: string str1; //原始序列1 string str2; //原始序列2 string res1; //结果序列1 string res2; //结果序列2 int score; //序列总得分，反映两个序列的相似程度 int tag; //禁止迭代多次 }; class SingleSeq { //一个序列被整合后的样子 public: string str; //一个序列的原始序列 string res; //一个序列被整合后的样子 }; struct BacktrackingUnit { int goUp; //是否向上回溯 int goLeftUp; //是否向左上回溯 int goLeft; //是否向左回溯 int score; //得分矩阵第(i, j)这个单元的分值 }; typedef struct BacktrackingUnit *unitLine; int max3(int a, int b, int c); int myCompare(char a, char b); ResUnit traceback(unitLine** item, const int i, const int j, string str1, string str2, string res1, string res2, int n, ResUnit resUnit); ResUnit NeedlemanWunch(string str1, string str2); void getResUnitMatrix(string *s, int length, ResUnit **res); int ifStrInQueueFinish(string str, vector queue_finish); vector RegularTwo(ResUnit tag, ResUnit temp, vector queue_finish); vector RegularSeq1(ResUnit tag, vector queue_finish, int item); vector RegularSeq2(ResUnit tag, vector queue_finish, int item); struct SequenceUnit { string *str1; //匹配序列1 string *str2; //匹配序列2 int score; }; bool complare(const ResUnit &a, const ResUnit &b) { return a.score > b.score; } //主方法入口 int main() { clock_t startTime, endTime; startTime = clock();//计时开始 //clust算法 const int sequence_groups = 31; string *ss = new string[sequence_groups]; ss[0] = "ACCTTGGGAAAATTCCGGGACA"; ss[1] = "AACGGAAAATTCCGGGACCTT"; ss[2] = "AATTCCGGAATTCCGGGACA"; ss[3] = "AATTCGGAAAATTCCGGGACA"; ss[4] = "AATTCCGGAAAATTCCGACA"; ss[5] = "AACCCGGAAAATTCCGGGACA"; ss[6] = "AATTCCGGAAAACTCGGGACA"; ss[7] = "AATGGAAAATTCCGCGGCA"; ss[8] = "AATGTCCGGAAAATTCCGGGACA"; ss[9] = "ATGCGGAAAATTCCCAAGGGACA"; ss[10] = "AATCGGAAATTATTCCGGGACA"; ss[11] = "CCTCCCGGATTCCGGGAGCA"; ss[12] = "AACTGCGGAAAATTCCGGGACA"; ss[13] = "AATTCCGGAAAATTCCGGATCCA"; ss[14] = "AATATCCGAAAATTCCGGGACA"; ss[15] = "AATTCCGGAAAATCCATCCACA"; ss[16] = "AATCGGAAAATTCGGGACA"; ss[17] = "AACCGGAAAATTCCCCTGGGACA"; ss[18] = "ATTTCCGGAAATTCCGGGACA"; ss[19] = "AATTCCGGAAATTCCGGCCACA"; ss[20] = "AAGGCGGAAAATTCGCCGGACA"; ss[21] = "AATGGCGAAAATTCCGGGACA"; ss[22] = "AATCCGGAAAATTCCTCCACA"; ss[23] = "AATTTCGGGAAAATTCCGGGACA"; ss[24] = "AATTCCGGAAAATTCCGGGCTACA"; ss[25] = "AACCTCGGAAAATTCCGGGACA"; ss[26] = "AATTAACCGGAAAATGGGGACA"; ss[27] = "ACCATCCGGAAAATTCCGGGACA"; ss[28] = "AATTCCGGAAAATTCATGCGGACA"; ss[29] = "AATTCCCATGAAAATTCCGGGACA"; ss[30] = "AACCTAGGAAAATTCCGGGACA"; vector queue_initial(0); //定义等待整合的队列，是一个ResUnit对象vector vector queue_finish(0); //定义整合完毕的队列，是一个String类型的vector vector::iterator it; //迭代器，访问queue_initial vector::iterator it_finish; //迭代器，访问queue_finish //结果矩阵，，二维向量 ResUnit **res; res = new ResUnit*[sequence_groups]; for (int i = 0; i res[i][j] = ResUnit(); } } getResUnitMatrix(ss, sequence_groups, res); //开始多序列比对 //定义队列长度 int queue_length = ((sequence_groups - 1)*sequence_groups) / 2; cout //放入容器 queue_initial.push_back(res[i][j]); } } //排序 sort(queue_initial.begin(), queue_initial.end(), complare); //最多循环queue_length次 for (int i = 0;i SingleSeq singleSeq1, singleSeq2; singleSeq1.str = queue_initial.at(i).str1; singleSeq1.res = queue_initial.at(i).res1; singleSeq2.str = queue_initial.at(i).str2; singleSeq2.res = queue_initial.at(i).res2; //如果结果队列已经有元素，，且又来了俩不相干的，却很匹配的序列对 if (queue_finish.size()>0) { //将结果队列第一个的序列和queue_initial.at(i).str1进行双序列比对 ResUnit temp = NeedlemanWunch(queue_finish.front().str, queue_initial.at(i).str1); //进行规整操作 queue_finish = RegularTwo(queue_initial.at(i), temp, queue_finish); } else { queue_finish.push_back(singleSeq1); queue_finish.push_back(singleSeq2); } } //str1在，str2不在 else if (ifStrInQueueFinish(queue_initial.at(i).str1, queue_finish) > -1 && ifStrInQueueFinish(queue_initial.at(i).str2, queue_finish) int item = ifStrInQueueFinish(queue_initial.at(i).str2, queue_finish); queue_finish = RegularSeq2(queue_initial.at(i), queue_finish, item); } } //声明一个迭代器，来访问vector容器 cout cout if (E2[i] == E1[j]) { i++; j++; } else { if (E2[i] == '-') { E1.insert(j, "-"); F.insert(j, "-"); } else if (E1[j] == '-') { E2.insert(i, "-"); A2.insert(i, "-"); } } } if (i == E2.length()) { //E2先到头 for (int k = 0; k //E1先到头 for (int k = 0; k if (A1[i] == A2[j]) { i++; j++; } else { if (A1[i] == '-') { A2.insert(j, "-"); E1.insert(j, "-"); F.insert(j, "-"); } else if (A2[j] == '-') { A1.insert(i, "-"); for (it = queue_finish.begin();it != queue_finish.end();it++) { it->res = it->res.insert(i, "-"); } } } } if (i == A1.length()) { //A1先到头 for (int k = 0; k it->res = it->res + tempStr; } } else if (j == A2.length()) { //A2先到头 for (int k = 0; k SingleSeq main_seq = queue_finish.at(item);//找到和seq1相同的序列 string A1 = main_seq.res; string A2 = tag.res1; string E = tag.res2; string tempStr = ""; vector::iterator it;//声明一个迭代器，来访问vector容器 int i = 0, j = 0; while (A1 != A2 && i i++; j++; } else { if (A1[i] == '-') { A2.insert(j, "-"); E.insert(j, "-"); } else if (A2[j] == '-') { //遍历queue_finish,给queue_finish内res洗头 A1.insert(i, "-"); for (it = queue_finish.begin();it != queue_finish.end();it++) { it->res = it->res.insert(i, "-"); } } } } if (i == A1.length()) { //A1先到头 for (int k = 0; k it->res = it->res + tempStr; } } else if (j == A2.length()) { //A2先到头 for (int k = 0; k SingleSeq main_seq = queue_finish.at(item);//找到和seq1相同的序列 string A1 = main_seq.res; string A2 = tag.res2; string E = tag.res1; string tempStr = ""; vector::iterator it;//声明一个迭代器，来访问vector容器 int i = 0, j = 0; while (A1 != A2 && i i++; j++; } else { if (A1[i] == '-') { A2.insert(j, "-"); E.insert(j, "-"); } else if (A2[j] == '-') { //遍历queue_finish,给queue_finish内res洗头 A1.insert(i, "-"); for (it = queue_finish.begin();it != queue_finish.end();it++) { it->res = it->res.insert(i, "-"); } } } } if (i == A1.length()) { //A1先到头 for (int k = 0; k it->res = it->res + tempStr; } } else if (j == A2.length()) { //A2先到头 for (int k = 0; k int i = 0; vector::iterator it;//声明一个迭代器，来访问vector容器 for (it = queue_finish.begin();it != queue_finish.end();it++) { if (str == it->str) return i; i++; } return -1; } /** 循环比较一组序列的值，返回一个ResUnit对象数组，二维，且是个倒三角形状其中，s是一个字符串类型的数组，存储等待序列比对的一组数据 */ void getResUnitMatrix(string *s, int length, ResUnit **res) { int sLength = length; cout for (int j = i + 1;j int temp = a > b ? a : b; return temp > c ? temp : c; } /** 比较两个字符类型属于什么，match，dismatch，indel */ int myCompare(char a, char b) { if (a == b) return MATCH; else if (a == ' ' || b == ' ') return INDEL; else return DIS_MATCH; } ResUnit traceback(unitLine** item, const int i, const int j, string str1, string str2, string res1, string res2, int n, ResUnit resUnit) { unitLine temp = item[i][j]; if (resUnit.tag != 1) { if (!(i || j)) { // 到矩阵单元(0, 0)才算结束，这代表初始的两个字符串的每个字符都被比对到了 resUnit.str1 = str1; resUnit.str2 = str2; resUnit.res1 = res1; resUnit.res2 = res2; resUnit.tag = 1; return resUnit; } if (temp->goUp) { // 向上回溯一格 res1 = str1[i - 1] + res1; res2 = INDEL_CHAR + res2; resUnit = traceback(item, i - 1, j, str1, str2, res1, res2, n + 1, resUnit); } if (temp->goLeftUp) { // 向左上回溯一格 res1 = str1[i - 1] + res1; res2 = str2[j - 1] + res2; resUnit = traceback(item, i - 1, j - 1, str1, str2, res1, res2, n + 1, resUnit); } if (temp->goLeft) { // 向左回溯一格 res1 = INDEL_CHAR + res1; res2 = str2[j - 1] + res2; resUnit = traceback(item, i, j - 1, str1, str2, res1, res2, n + 1, resUnit); } return resUnit; } else { return resUnit; } } ResUnit NeedlemanWunch(string str1, string str2) { //字符串str1,str2长度 const int m = str1.length(); const int n = str2.length(); int m1, m2, m3, mm; unitLine **unit; // 初始化 if ((unit = (unitLine **)malloc(sizeof(unitLine*) * (m + 1))) == NULL) { fputs("Error: Out of space!\n", stderr); exit(1); } for (int i = 0; i fputs("Error: Out of space!\n", stderr); exit(1); } for (int j = 0; j fputs("Error: Out of space!\n", stderr); exit(1); } unit[i][j]->goUp = 0; unit[i][j]->goLeftUp = 0; unit[i][j]->goLeft = 0; } } unit[0][0]->score = 0; for (int i = 1; i unit[0][j]->score = INDEL * j; unit[0][j]->goLeft = 1; } // 动态规划算法计算得分矩阵每个单元的分值 for (int i = 1; i m1 = unit[i - 1][j]->score + INDEL; m2 = unit[i - 1][j - 1]->score + myCompare(str1[i - 1], str2[j - 1]); m3 = unit[i][j - 1]->score + INDEL; mm = max3(m1, m2, m3); unit[i][j]->score = mm; //判断路径来源 if (m1 == mm) unit[i][j]->goUp = 1; if (m2 == mm) unit[i][j]->goLeftUp = 1; if (m3 == mm) unit[i][j]->goLeft = 1; } } //开始回溯 ResUnit res; res.tag = 0; res = traceback(unit, m, n, str1, str2, "", "", 0, res); res.score = unit[m][n]->score; //释放内存 for (int i = 0; i free(unit[i][j]); } free(unit[i]); } free(unit); //返回值 return res; } 5. 总结

虽然CLUSTAL算法具有较高的精度，但是由于其构造导向树的距离，即两两比对的过程，需要迭代调用双序列比对算法，

这将使得算法的时间复杂度来到n的四次方，数据愈多，需要消耗的时间就指数增加，对CPU造成巨大压力。

下一篇将利用CUDA平台调度GPU提供的并行计算能力加速CLUSTAL执行速度。

如果有什么问题，请私信联系我或者在评论区留言码字不易，若有帮助，给个关注和赞呗

在这里插入图片描述

【本文地址】

【精选】生物信息学（4）

【精选】生物信息学（4）

今日新闻

推荐新闻