1 #include "htslib/sam.h"     2 #include "htslib/bgzf.h"    12 #include "trim_primer_quality.h"    13 #include "primer_bed.h"    15 int32_t get_pos_on_query(uint32_t *cigar, uint32_t ncigar, int32_t pos, int32_t ref_start){
    18   int32_t ql = 0, rl = ref_start;
    19   for (uint32_t i = 0; i < ncigar; ++i){
    20     cig  = bam_cigar_op(cigar[i]);
    21     n = bam_cigar_oplen(cigar[i]);
    22     if (bam_cigar_type(cig) & 2) { 
    24     if (bam_cigar_type(cig) & 1) 
    30     if (bam_cigar_type(cig) & 1) 
    36 int32_t get_pos_on_reference(uint32_t *cigar, uint32_t ncigar, uint32_t pos, uint32_t ref_start){
    39   uint32_t ql = 0, rl = ref_start;
    40   for (uint32_t i = 0; i < ncigar; ++i){
    41     cig  = bam_cigar_op(cigar[i]);
    42     n = bam_cigar_oplen(cigar[i]);
    43     if (bam_cigar_type(cig) & 1) { 
    45     if (bam_cigar_type(cig) & 2) 
    51     if (bam_cigar_type(cig) & 2) 
    57 void reverse_qual(uint8_t *q, 
int l){
    58   for (
int i = 0; i < l/2; ++i){
    65 void reverse_cigar(uint32_t *cigar, 
int l){
    66   for (
int i = 0; i < l/2; ++i){
    67     cigar[i]^=cigar[l-i-1];
    68     cigar[l-i-1]^=cigar[i];
    69     cigar[i]^=cigar[l-i-1];
    73 double mean_quality(uint8_t *a, 
int s, 
int e){
    75   for (
int i = s; i < e; ++i){
    82 cigar_ quality_trim(bam1_t* r, uint8_t qual_threshold, uint8_t sliding_window){
    83   uint32_t *ncigar = (uint32_t*) malloc(
sizeof(uint32_t) * (r->core.n_cigar + 1)), 
    84     *cigar = bam_get_cigar(r);
    85   uint8_t *qual = bam_get_qual(r);
    88     reverse_qual(qual, r->core.l_qseq);
    91   int del_len, cig, temp;
    92   uint32_t i = 0, j = 0;
    93   while(m >= qual_threshold && (i < r->core.l_qseq)){
    94     m = mean_quality(qual, i, i+sliding_window);
    96     if(i > r->core.l_qseq - sliding_window)
    99   del_len = r->core.l_qseq - i;
   100   start_pos = get_pos_on_reference(cigar, r->core.n_cigar, del_len, r->core.pos); 
   101   if(bam_is_rev(r) && start_pos <= r->core.pos) {
   111     reverse_cigar(cigar, r->core.n_cigar);
   113   reverse_cigar(cigar, r->core.n_cigar); 
   114   while(i < r->core.n_cigar){
   116       ncigar[j] = cigar[i];
   121     cig  = bam_cigar_op(cigar[i]);
   122     n = bam_cigar_oplen(cigar[i]);
   123     if ((bam_cigar_type(cig) & 1)){ 
   125     ncigar[j] = bam_cigar_gen(n, BAM_CSOFT_CLIP);
   126       } 
else if (del_len < n){
   127     ncigar[j] = bam_cigar_gen(del_len, BAM_CSOFT_CLIP);
   131       n = std::max(n - del_len, 0);
   132       del_len = std::max(del_len - temp, 0);
   134     ncigar[j] = bam_cigar_gen(n, cig);
   140   reverse_cigar(ncigar, j); 
   142     reverse_cigar(ncigar, j);
   151 void print_cigar(uint32_t *cigar, 
int nlength){
   152   for (
int i = 0; i < nlength; ++i){
   153     std::cout << ((cigar[i]) & BAM_CIGAR_MASK);
   154     std::cout << 
"-" << ((cigar[i]) >> BAM_CIGAR_SHIFT) << 
" ";
   158 cigar_ primer_trim(bam1_t *r, int32_t new_pos){
   159   uint32_t *ncigar = (uint32_t*) malloc(
sizeof(uint32_t) * (r->core.n_cigar + 1)), 
   160     *cigar = bam_get_cigar(r);
   161   uint32_t i = 0, j = 0;
   162   int del_len, cig, temp;
   164     del_len = bam_cigar2qlen(r->core.n_cigar, bam_get_cigar(r)) - get_pos_on_query(cigar, r->core.n_cigar, new_pos, r->core.pos) - 1;
   166     reverse_cigar(cigar, r->core.n_cigar);
   170     del_len = get_pos_on_query(cigar, r->core.n_cigar, new_pos, r->core.pos);
   173   while(i < r->core.n_cigar){
   175       ncigar[j] = cigar[i];
   180     cig  = bam_cigar_op(cigar[i]);
   181     n = bam_cigar_oplen(cigar[i]);
   182     if ((bam_cigar_type(cig) & 1)){ 
   184     ncigar[j] = bam_cigar_gen(n, BAM_CSOFT_CLIP);
   185       } 
else if (del_len < n){
   186     ncigar[j] = bam_cigar_gen(del_len, BAM_CSOFT_CLIP);
   190       n = std::max(n - del_len, 0);
   191       del_len = std::max(del_len - temp, 0);
   193     ncigar[j] = bam_cigar_gen(n, cig);
   200     reverse_cigar(ncigar, j);
   209 void replace_cigar(bam1_t *b, 
int n, uint32_t *cigar){
   210   if (n != b->core.n_cigar) {
   211     int o = b->core.l_qname + b->core.n_cigar * 4;
   212     if (b->l_data + (n - b->core.n_cigar) * 4 > b->m_data) {
   213       b->m_data = b->l_data + (n - b->core.n_cigar) * 4;
   214       kroundup32(b->m_data);
   215       b->data = (uint8_t*)realloc(b->data, b->m_data);
   217     memmove(b->data + b->core.l_qname + n * 4, b->data + o, b->l_data - o);
   218     memcpy(b->data + b->core.l_qname, cigar, n * 4);
   219     b->l_data += (n - b->core.n_cigar) * 4;
   221   } 
else memcpy(b->data + b->core.l_qname, cigar, n * 4);
   224 int16_t get_overlapping_primer_indice(bam1_t* r, std::vector<primer> primers){
   225   uint32_t query_pos, start_pos, *cigar = bam_get_cigar(r);
   227     start_pos = bam_endpos(r)-1;
   228     query_pos = start_pos + (bam_cigar2qlen(r->core.n_cigar, cigar) - get_pos_on_query(cigar, r->core.n_cigar, start_pos, r->core.pos)) - 1;
   230     start_pos = r->core.pos;
   231     query_pos = start_pos - get_pos_on_query(cigar, r->core.n_cigar, start_pos, r->core.pos);
   233   for(std::vector<int>::size_type i = 0; i!=primers.size();i++){
   234     if(query_pos >= primers[i].get_start() && query_pos <= primers[i].get_end() && start_pos >= primers[i].get_start() && start_pos <= primers[i].get_end()) 
   240 cigar_ remove_trailing_query_ref_consumption(uint32_t* cigar, uint32_t n){
   241   int i = 0, len = 0, cig, start_pos = 0;
   243     cig = bam_cigar_op(cigar[i]);
   244     len = bam_cigar_oplen(cigar[i]);
   245     if((bam_cigar_type(cig) & 2) && (bam_cigar_type(cig) & 1))
   247     if(bam_cigar_type(cig) & 1){    
   248       cigar[i] = bam_cigar_gen(len, BAM_CSOFT_CLIP);
   250     if(bam_cigar_type(cig) & 2){    
   251       for (
int j = i; j < n-1; ++j){
   252     cigar[j] = cigar[j+1];
   260   reverse_cigar(cigar, n);
   263     cig = bam_cigar_op(cigar[i]);
   264     len = bam_cigar_oplen(cigar[i]);
   265     if((bam_cigar_type(cig) & 2) && (bam_cigar_type(cig) & 1))
   267     if(bam_cigar_type(cig) & 1){    
   268       cigar[i] = bam_cigar_gen(len, BAM_CSOFT_CLIP);
   270     if(bam_cigar_type(cig) & 2){    
   271       for (
int j = i; j < n-1; ++j){
   272     cigar[j] = cigar[j+1];
   280   reverse_cigar(cigar, n);
   288 cigar_ condense_cigar(uint32_t* cigar, uint32_t n){
   289   int i = 0, len = 0, cig, next_cig, start_pos = 0;
   290   cigar_ t = remove_trailing_query_ref_consumption(cigar, n);
   293   start_pos = t.start_pos;
   295     cig = bam_cigar_op(cigar[i]);
   296     next_cig = bam_cigar_op(cigar[i+1]);
   298       len = bam_cigar_oplen(cigar[i])+bam_cigar_oplen(cigar[i+1]);
   299       cigar[i] = bam_cigar_gen(len, bam_cigar_op(cigar[i]));
   300       for(
int j = i+1; j < n - 1; j++){
   301     cigar[j] = cigar[j+1];
   315 int trim_bam_qual_primer(std::string bam, std::string bed, std::string bam_out, std::string region_, uint8_t min_qual, uint8_t sliding_window, 
int min_length = 30){
   316   std::vector<primer> primers = populate_from_file(bed);
   318     std::cout << 
"Bam file in empty." << std::endl;
   322   samFile *in = hts_open(bam.c_str(), 
"r");
   323   BGZF *out = bgzf_open(bam_out.c_str(), 
"w");
   325     std::cout << (
"Unable to open BAM/SAM file.") << std::endl;
   329   hts_idx_t *idx = sam_index_load(in, bam.c_str());
   331     std::cout << (
"Unable to open BAM/SAM index.") << std::endl; 
   335   bam_hdr_t *header = sam_hdr_read(in);
   338     std::cout << 
"Unable to open BAM/SAM header." << std::endl;
   340   if(bam_hdr_write(out, header) < 0){
   341     std::cout << 
"Unable to write BAM header to path." << std::endl;
   345   if (region_.empty()){
   346     std::cout << 
"Number of references: " << header->n_targets << std::endl;
   347     for (
int i = 0; i < header->n_targets; ++i){
   348       std::cout << 
"Reference Name: " << header->target_name[i] << std::endl;
   349       std::cout << 
"Reference Length: " << header->target_len[i] << std::endl;
   351     region_.assign(header->target_name[i]);
   354     std::cout << 
"Using Region: " << region_ << std::endl;
   356   std::string hdr_text(header->text);
   357   if (hdr_text.find(std::string(
"SO:coordinate"))) {
   358     std::cout << 
"Sorted By Coordinate" << std::endl; 
   359   } 
if(hdr_text.find(std::string(
"SO:queryname"))) {
   360     std::cout << 
"Sorted By Query Name" << std::endl; 
   362     std::cout << 
"Not sorted" << std::endl;
   365   hts_itr_t *iter = NULL;
   367   iter  = sam_itr_querys(idx, header, region_.c_str());
   368   if(header == NULL || iter == NULL) {
   370     std::cout << 
"Unable to iterate to region within BAM/SAM." << std::endl;
   374   bam1_t *aln = bam_init1();
   377   int16_t *p = (int16_t*)malloc(
sizeof(int16_t));
   378   uint32_t primer_trim_count = 0;
   379   while(sam_itr_next(in, iter, aln) >= 0) {
   380     *p = get_overlapping_primer_indice(aln, primers);
   384     t = primer_trim(aln, primers[*p].get_start() - 1);
   386     t = primer_trim(aln, primers[*p].get_end() + 1);
   387     aln->core.pos = primers[*p].get_end() + 1;
   389       replace_cigar(aln, t.nlength, t.cigar);
   391     t = quality_trim(aln, min_qual, sliding_window);    
   393       aln->core.pos = t.start_pos;
   394     t = condense_cigar(t.cigar, t.nlength);
   395     aln->core.pos += t.start_pos;
   396     replace_cigar(aln, t.nlength, t.cigar);
   397     if(bam_cigar2rlen(aln->core.n_cigar, bam_get_cigar(aln)) >= min_length){
   399     bam_aux_append(aln, 
"xa", 
'i', 4, (uint8_t*) p);
   400       if(bam_write1(out, aln) < 0){
   401     std::cout << 
"Not able to write to BAM" << std::endl;
   402     hts_itr_destroy(iter);
   403     hts_idx_destroy(idx);
   405     bam_hdr_destroy(header);
   412     if(ctr % 1000000 == 0){
   413       std::cout << 
"Processed " << ctr << 
"reads ... " << std::endl;
   416   std::cout << 
"Results: " << std::endl;
   417   std::cout << 
"Trimmed primers from " << primer_trim_count << 
" reads." << std::endl;
   418   hts_itr_destroy(iter);
   419   hts_idx_destroy(idx);
   421   bam_hdr_destroy(header);