SHORE API
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
alignment_processing.hpp
Go to the documentation of this file.
1 
2 /*
3  * Copyright 2008,2009,2010,2011,2012 Stephan Ossowski, Korbinian Schneeberger,
4  * Felix Ott, Joerg Hagmann, Alf Scotland, Sebastian Bender
5  *
6  * This file is part of SHORE.
7  *
8  * SHORE is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * (at your option) any later version.
12  *
13  * SHORE is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with SHORE. If not, see <http://www.gnu.org/licenses/>.
20  */
21 
25 
26 #ifndef SHORE_ALIGNMENT_PROCESSING_HPP__
27 #define SHORE_ALIGNMENT_PROCESSING_HPP__
28 
29 #include <deque>
30 #include <iostream>
31 #include <list>
32 #include <map>
33 #include <set>
34 #include <utility>
35 #include <vector>
36 
43 #include "shore/datatype/coor.hpp"
44 #include "shore/base/stringops.hpp"
45 
46 namespace shore {
47 
48 class serialization_core_access;
49 
52 {
53  private:
54 
55  std::deque<std::string> m_ids;
56  std::deque<int> m_chromosomes;
57  std::deque<long> m_positions;
58  std::deque<size_t> m_sizes;
59  std::deque<shore::Strand> m_strands;
60  std::deque<int> m_hits;
61  std::deque<std::string> m_alignments;
62  std::deque<int> m_mismatches;
63  std::deque<int> m_indexes;
64  std::deque<shore::alignment::InsertState> m_pe_states;
65  std::deque<std::string> m_quals1;
66  std::deque<std::string> m_quals2;
67  std::deque<std::string> m_quals3;
68 
69 
70  std::deque<int> m_mapquals;
71  std::deque<std::string> m_readgroups;
72  std::deque<int> m_num_reads;
73  std::deque<shore::read::PeOrientation> m_oris;
74  std::deque<int> m_rnext_chromosomes;
75  std::deque<long> m_rnext_positions;
76  std::deque<shore::Strand> m_rnext_strands;
77 
78  size_t m_size;
79 
80  shore::alignment m_front;
81  shore::alignment m_back;
82 
83  bool m_fifo;
84 
85  public:
86 
89 
90  alignment_queue(const bool fifo=true);
91 
92  void push_back(const shore::alignment& f);
93 
94  void pop_front();
95  void pop_back();
96 
97  void get(const size_t i,shore::alignment& res) const;
98 
99  size_t size() const;
100 
101  bool empty() const;
102 
103  const shore::alignment& front() const;
104  const shore::alignment& back() const;
105  shore::alignment& front();
106  shore::alignment& back();
107 
108  void clear();
109 
110  const shore::alignment& current() const;
111  bool has_data() const;
112  void next();
113  void append(const shore::alignment& f);
114  void flush() {}
115 };
116 
119 {
120  private:
121 
122  typedef bool(*const FtOrd_t) (const shore::alignment&,const shore::alignment&);
123 
124  static bool cmp_coor(const shore::alignment& f0,const shore::alignment& f1);
125 
126  long m_xsize;
127 
128  alignment_queue m_fbuf;
129  std::multiset<shore::alignment,FtOrd_t> m_rbuf;
130 
131  shore::refseq_coor m_flushpos;
132 
133  alignment_helper m_ah;
134 
135  public:
136 
139 
140  alignment_3px(const long xsize);
141 
142  const shore::alignment& current() const;
143  bool has_data() const;
144  void next();
145  void append(const shore::alignment& f);
146  void flush();
147 };
148 
151 {
152  public:
153 
156 
157  class pf_core
158  {
159  public:
160 
161  struct pos
162  {
166  double n;
169  double m1;
171  // for right window.
172  double m2;
175  double v1;
178  double v2;
181  double d1;
184  double d2;
186  double d;
187  //double k1;
188  //double k2;
189  //double k;
192  double l;
193  };
194 
195  private:
196 
197  std::deque<pos> m_limits;
198 
199  size_t m_width;
200  size_t m_ws;
201 
203  shore::refseq_coor m_centre;
204  std::deque<double> m_window;
205 
206  bool m_verbose;
207 
208 
209  void move_window(const shore::refseq_coor& c);
210 
211  public:
212 
213  pf_core(const size_t w=24);
214  virtual ~pf_core() {}
215 
216  void set_verbose(const bool v)
217  {
218  m_verbose=v;
219  }
220 
221  virtual void add(const shore::refseq_coor& c,const double w);
222  virtual void flush();
223 
224  pos next_limit();
225  bool has_limit() const;
226  };
227 
228  private:
229 
230  alignment_queue m_inbuff;
231  alignment_queue m_outbuff;
232  alignment_queue m_inbufr;
233  alignment_queue m_outbufr;
234 
235  alignment_helper m_ah;
236 
237  pf_core m_pf;
238  pf_core m_pr;
239  alignment_3px m_3px;
240 
241  std::map<shore::refseq_coor,size_t> m_revlimits;
242 
243  bool m_isflushed;
244 
245  double m_lmax;
246 
247  shore::feed<shore::alignment> m_duplicates;
248 
249  public:
250 
251  poissonifier(const size_t w=24);
252 
253  shore::feed<shore::alignment> &duplicates();
254 
255  void set_verbose(const bool v,const bool w)
256  {
257  m_pf.set_verbose(v);
258  m_pr.set_verbose(w);
259  }
260 
261  const shore::alignment& current() const;
262  bool has_data() const;
263  void next();
264  void append(const shore::alignment& f);
265  void flush();
266 
267  double get_lmax() const { return m_lmax; }
268 };
269 
270 inline std::ostream& operator<<(std::ostream& os,const poissonifier::pf_core::pos& p)
271 {
272  os<<"chr="<<p.c.chromosome<<"; pos="<<p.c.position
273  <<"; n="<<p.n
274  <<"; m1="<<p.m1
275  <<"; m2="<<p.m2
276  <<"; v1="<<p.v1
277  <<"; v2="<<p.v2
278  <<"; d1="<<p.d1
279  <<"; d2="<<p.d2
280  <<"; d="<<p.d
281  <<"; l="<<p.l;
282  return os;
283 }
284 
287 {
288  private:
289 
290  std::vector<int> m_seqmap;
291  shore::alignment m_current;
292  bool m_hasdata;
293 
294  public:
295 
298 
299  seqid_translator(const std::vector<int> &seqmap);
300 
301  const shore::alignment &current() const;
302 
303  bool has_data() const;
304 
305  void next();
306 
307  void append(const shore::alignment &f);
308 
309  void flush() {}
310 };
311 
316 {
317  private:
318 
319  typedef std::pair<shore::Strand,std::pair<shore::refseq_coor,std::string> > key_type;
320 
321  alignment_queue m_queue;
322  shore::alignment m_fidwait;
323  bool m_hasdata;
324  bool m_idwait;
325  std::string m_currentid;
326 
327  int m_current_ed1;
328  int m_current_ed2;
329  int m_current_hits1;
330  int m_current_hits2;
331  std::set<key_type> m_current_alignments1;
332  std::set<key_type> m_current_alignments2;
333 
335  shore::signal<void> m_sigdiscardhit_flush;
336 
337 
338  void clearbesthit1();
339  void clearbesthit2();
340  bool tweak_front();
341  void prune_queue();
342 
343  public:
344 
347 
348  besthit_filter();
349 
350  const shore::alignment &current() const;
351 
352  bool has_data() const;
353 
354  void next();
355 
356  void append(const shore::alignment &f);
357 
358  void flush();
359 
361  shore::signal<void>& sigdiscardhit_flush();
362 };
363 
366 {
367  private:
368 
369  struct element
370  {
372  int sequence;
373  size_t size;
374  int index;
375  int edit_distance;
376  int hits;
377  char strand;
378 
379 
380  element() {}
381  element(const shore::alignment &f);
382  element(const shore::read &f);
383 
384 
386  template<class Archive>
387  void serialize(Archive& ar,const unsigned int version)
388  {
389  ar&sequence;
390  ar&size;
391  ar&index;
392  ar&edit_distance;
393  ar&hits;
394  ar&strand;
395  }
396  };
397 
398  typedef bool(*cmp_t)(const element &,const element &);
399 
400  std::map<element,size_t,cmp_t> m_stats;
401 
402  static bool cmp(const element &e1,const element &e2);
403 
405 
406  public:
407 
409 
411 
412  void append(const shore::alignment &f);
413  void append(const shore::read &f);
414  void append(const alignment_statistic &other);
415  void flush() {}
416 
417  void save(std::ostream &out) const;
418  void save(const std::string &outfile) const;
419 
420  void load(const std::string &statsfile);
421 
424  void update(const std::string &statsfile,const std::string &mapfile,
425  const std::string &reffile=std::string());
426 
427  void clear();
428 
433  double calc_total() const;
434 };
435 
441 :public shore::pipe_facade<shore::hits_correction_single_pipe,
442  shore::alignment,shore::alignment>
443 {
444  private:
445 
446  friend class pipeline_core_access;
447 
448  shore::alignment m_current;
449  alignment_queue m_queue;
450  long m_current_hits;
451 
452 
453  void next();
454  void prepare(const shore::alignment& f);
455  void append(const shore::alignment& f);
456 
457  void flush();
458 };
459 
462 :public shore::pipe_facade<shore::flatten_alignment_pipe,
463  shore::alignment,shore::read>
464 {
465  private:
466 
467  friend class pipeline_core_access;
468 
470 
471  std::vector<std::string> m_current_ids;
472  shore::read m_current;
473 
474  bool m_remove_redundant;
475 
476 
477  void next();
478  void prepare(const shore::alignment &f);
479  void append(const shore::alignment &f);
480  void flush();
481 
482  public:
483 
484  flatten_alignment_pipe(const bool remove_redundant=true);
485 };
486 
490 :public shore::pipe_facade<mapping_pair_finder_pipe,
491  shore::alignment,std::vector<shore::alignment> >
492 {
493  private:
494 
495  friend class pipeline_core_access;
496 
498  typedef std::pair<shore::Strand,std::string> pairkey_t;
499 
502  typedef std::pair<std::vector<shore::alignment>,bool> paircollect_t;
503 
505  pairkey_t m_key;
506  std::deque<paircollect_t> m_pairwait_q;
507  uint64_t m_queueofs;
508  std::map<shore::refseq_coor,std::map<pairkey_t,size_t> > m_nextones;
509 
510  bool m_find_all_concordant;
511  long m_find_unique_distance;
512 
513 
514  void emit_next();
515 
517  void next();
518 
520  void prepare(const shore::alignment &a);
522  void append(const shore::alignment &a);
524  void flush();
525 
526  public:
527 
528  mapping_pair_finder_pipe(bool conc,long any);
529 };
530 
533 :public shore::pipe_facade<mapping_pair_joiner_pipe,
534  std::vector<shore::alignment>,shore::alignment>
535 {
536  private:
537 
538  friend class pipeline_core_access;
539 
541  shore::alignment m_current;
542  std::string m_buf;
543 
545  void next();
546 
548  void prepare(const std::vector<shore::alignment> &a);
550  void append(const std::vector<shore::alignment> &a);
552  void flush();
553 };
554 
557 :public shore::pipe_facade<alignment_trimmer_pipe,shore::alignment,shore::alignment>
558 {
559  private:
560 
561  friend class pipeline_core_access;
562 
566  shore::alignment m_current;
568  const shore::refseq_range m_range;
569 
571  void append(const shore::alignment &a);
572 
573  public:
574 
577 };
578 
581 :public shore::pipe_facade<alignment_filter_pipe,
582  shore::alignment,shore::alignment>
583 {
584  public:
585 
586  class outofrange_exception
587  :public std::exception
588  {};
589 
590  struct config
591  {
593  int p3x;
594 
596  bool join_happy;
598  long join_unique;
599 
601  std::pair<size_t,size_t> hitrange;
603  std::pair<size_t,size_t> edrange;
605  shore::OptionalStrand strand;
607  std::set<size_t> lengths;
609  std::set<std::pair<shore::alignment::InsertState,int> > peflags;
610 
612  double duplicates;
613 
615  shore::refseq_range posrange;
616 
618  int wpoiss;
619 
620  bool position_sorted;
621 
622  config();
623 
624  void publish_options(shore::av_parser& p,
625  const char*const D="strand,T",
626  const char*const F="duplicate-estimation,F",
627  const char*const N="read-lengths,N",
628  const char*const X="p3fix,X",
629  const char*const B="duplicates,B",
630  const char*const R="region,R",
631  const char*const M="mm-range,M",
632  const char*const H="hits-range,H",
633  const char*const pe="peflags",
634  const char*const jh="join-happy",
635  const char*const ju="join-unique");
636 
637  void sanity_check(shore::av_parser& p);
638 
639  bool operator==(const config& c2) const
640  {
641  return
642  (p3x==c2.p3x)
643  &&(join_happy==c2.join_happy)
644  &&(join_unique==c2.join_unique)
645  &&(hitrange==c2.hitrange)
646  &&(edrange==c2.edrange)
647  &&(strand==c2.strand)
648  &&(lengths==c2.lengths)
649  &&(peflags==c2.peflags)
650  &&(duplicates==c2.duplicates)
651  &&(posrange==c2.posrange)
652  &&(wpoiss==c2.wpoiss);
653  }
654 
655  bool is_default() const
656  {
657  return (*this)==(config());
658  }
659  };
660 
661  private:
662 
663  friend class pipeline_core_access;
664 
665  const config m_defaults;
666  const config m_conf;
667 
669 
672 
674  shore::splitter<shore::alignment> m_rangefilter;
676  mapping_pair_finder_pipe m_pairfinder;
678  mapping_pair_joiner_pipe m_pairjoiner;
680  shore::splitter<shore::alignment> m_firstfilter;
687 
689 
690  bool flt_hits(const shore::alignment& f);
691  bool flt_editdist(const shore::alignment& f);
692  bool flt_strand(const shore::alignment& f);
693  bool flt_length(const shore::alignment& f);
694  bool flt_PE(const shore::alignment& f);
695  bool flt_position_sorted(const shore::alignment& f);
696  bool flt_position_unsorted(const shore::alignment& f);
697 
698  struct flt_duplicates
699  {
700  config conf;
701 
703 
704  double m_duplicacy_f;
705  shore::refseq_coor m_lastpos_f;
706  std::map<shore::refseq_coor,double> m_duplicacy_r;
707 
708  flt_duplicates(const config& c);
709 
710  bool operator()(const shore::alignment& f);
711  };
712 
713  void init_filters();
714 
715  template<typename T>
716  void connect_rangefilter(T &source)
717  {
718  if(!m_rangefilter.empty())
719  {
720  source|m_rangefilter;
721  connect_pejoin(m_rangefilter);
722  }
723  else connect_pejoin(source);
724  }
725 
726  template<typename T>
727  void connect_pejoin(T &source)
728  {
729  if(m_conf.join_happy||(m_conf.join_unique>0))
730  {
731  source|m_pairfinder|m_pairjoiner;
732  connect_filter1(m_pairjoiner);
733  }
734  else connect_filter1(source);
735  }
736 
737  template<typename T>
738  void connect_filter1(T &source)
739  {
740  if(!m_firstfilter.empty())
741  {
742  source|m_firstfilter;
743  m_firstfilter.failures()|m_failures;
744  connect_3primex(m_firstfilter);
745  }
746  else connect_3primex(source);
747  }
748 
749  template<typename T>
750  void connect_3primex(T &source)
751  {
752  if(m_conf.p3x!=0)
753  {
754  source|m_3primex;
755  connect_dup(m_3primex);
756  }
757  else connect_dup(source);
758  }
759 
760  template<typename T>
761  void connect_dup(T &source)
762  {
763  if(m_conf.wpoiss!=0)
764  {
765  source|m_dupfilter;
766  m_dupfilter->duplicates()|m_failures;
767  connect_last(m_dupfilter);
768  }
769  else connect_last(source);
770  }
771 
772  template<typename T>
773  void connect_last(T &source)
774  {
775  if(!m_lastfilter.empty())
776  {
777  source|m_lastfilter|m_extractor;
778  m_lastfilter.failures()|m_failures;
779  }
780  else source|m_extractor;
781  }
782 
784  void next();
785 
787  void prepare(const shore::alignment &a);
789  void append(const shore::alignment &a);
791  void flush();
792 
793  public:
794 
795  alignment_filter_pipe(const config &c);
796 
797  shore::thru<shore::alignment> &failures();
798 };
799 
803 :public shore::pipe_facade<alignment_condenser_pipe,
804  shore::alignment,shore::alignment>
805 {
806  public:
807 
808  enum IDStyle
809  {
810  ID_PRESERVE,
811  ID_PREFIXED,
812  ID_PLAIN
813  };
814 
815  private:
816 
817  friend class pipeline_core_access;
818 
819  int m_compact_aln;
820  int m_qgrid;
821  bool m_discard_qual_other;
822 
823  IDStyle m_readid_style;
824 
825  shore::alignment m_buf;
826 
828  std::vector<std::pair<bool,int> > m_tokbuf;
829 
831 
832  std::map<std::string,int> *m_readid_map;
833 
834 
836  template<typename Iter>
837  void condense_quality(Iter beg,Iter end) const
838  {
839  for(size_t i=0;(beg!=end)&&(i<m_tokbuf.size());++i)
840  {
841  // Match
842  if(m_tokbuf[i].first)
843  {
844  Iter save=beg;
845 
846  int qmin=uint8_t(*beg);
847  int qmax=uint8_t(*beg);
848 
849  for(int j=0;(beg!=end)&&(j<m_tokbuf[i].second);
850  ++j,++beg)
851  {
852  const int qcur=uint8_t(*beg);
853 
854  if((qcur>(qmin+m_qgrid))
855  ||(qcur<(qmax-m_qgrid)))
856  {
857  while(save!=beg)
858  {
859  (*save)=char(qmin);
860  ++save;
861  }
862 
863  qmin=qmax=qcur;
864  }
865  else
866  {
867  qmin=std::min(qmin,qcur);
868  qmax=std::max(qmax,qcur);
869  }
870  }
871  while(save!=beg)
872  {
873  (*save)=char(qmin);
874  ++save;
875  }
876  }
877  // Not a match
878  else
879  {
880  for(int j=0;(beg!=end)&&(j<m_tokbuf[i].second);
881  ++j,++beg)
882  {}
883  }
884  }
885  }
886 
887  void append(const shore::alignment &a);
888 
889  public:
890 
891  alignment_condenser_pipe(const int compact_aln=1,const int qgrid=0,
892  const bool discard_qual_other=false,
893  const IDStyle readid_style=ID_PRESERVE,
894  std::map<std::string,int> *const readid_map=0);
895 };
896 
897 S_MAKE_ENUM_TRAITS(alignment_condenser_pipe::IDStyle)
898 
899 } // namespace
900 
901 #endif // SHORE_ALIGNMENT_PROCESSING_HPP__
902