u_int64_tsubstr_kmer_int(string&str,int start,int length) {u_int64_t val =0; for (size_t i = start; i < start + length; i++) { val = val <<2; val += (str[i] >>1&3); } return val; }
然后我们使用下面的一组位运算来获得反向补码。
u_int64_trevComp(u_int64_t x,size_t sizeKmer) {u_int64_t res = x; res = ((res >>2&0x3333333333333333) | (res &0x3333333333333333) <<2); res = ((res >>4&0x0F0F0F0F0F0F0F0F) | (res &0x0F0F0F0F0F0F0F0F) <<4); res = ((res >>8&0x00FF00FF00FF00FF) | (res &0x00FF00FF00FF00FF) <<8); res = ((res >>16&0x0000FFFF0000FFFF) | (res &0x0000FFFF0000FFFF) <<16); res = ((res >>32&0x00000000FFFFFFFF) | (res &0x00000000FFFFFFFF) <<32); res = res ^0xAAAAAAAAAAAAAAAA; return (res >> (2* (32- sizeKmer))); }
计算 K-mer 直方图
我们可以扩展上述函数来获取序列的 K-mer 直方图,这在生物信息学分析中非常常用。在第一个函数中,我们获得给定 k 值的所有可能的 K-mer。这是为了通过仅更新查找表来使计算更快。
unordered_map<int,double> kSizeMers(int size) { unordered_map<int,double> kmers; for (u_int64_t i =0; i < (int)pow(4, size); i++) {kmers[min(i,revComp(i, size))] =0; } return kmers; } vector<double> getKmersProfile(string&line,int size,bool normalize,unordered_map<int,double> &KMERS) {double total =0;long len =0;u_int64_t val =0; unordered_map<int,double> kmers;kmers.insert(KMERS.begin(),KMERS.end()); vector<double>stats(kmers.size(),0); for (int i =0; i < (int)line.length(); i++) {if (!(line[i] =='A'||line[i] =='C'||line[i] =='G'||line[i] =='T')) { val =0; len =0;continue; } val = (val <<2); val = val & (int)(pow(2,2* size) -1); val += (line[i] >>1&3); len++; if (len == size) { // use val as the kmer for counting len--;kmers[min(val,revComp(val, size))]++; total++; } } int i =0;for (unordered_map<int,double>::iterator it =kmers.begin(); it !=kmers.end(); ++it) {if (normalize) {stats[i] =it->second /max(1.0, total); }else {stats[i] =it->second; } i++; } return stats; }