SHORE API
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
sam.hpp
Go to the documentation of this file.
1 
2 /*
3  * Copyright 2008,2009,2010,2011,2012 Stephan Ossowski, Korbinian Schneeberger,
4  * Felix Ott, Joerg Hagmann, Alf Scotland, Sebastian Bender
5  *
6  * This file is part of SHORE.
7  *
8  * SHORE is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * (at your option) any later version.
12  *
13  * SHORE is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with SHORE. If not, see <http://www.gnu.org/licenses/>.
20  */
21 
25 
26 #ifndef SHORE_IO_SAM_HPP__
27 #define SHORE_IO_SAM_HPP__
28 
29 #include <map>
30 #include <string>
31 
36 #include "shore/stream/streams.hpp"
37 #include "shore/fmtio/text.hpp"
38 
39 namespace shore {
40 
43 {
44  SAM_PE_PAIRED_END=0x001,
45  SAM_PE_PROPER_PAIR=0x002,
46  SAM_UNMAPPED_SELF=0x004,
47  SAM_PE_UNMAPPED_MATE=0x008,
48  SAM_REVERSE_SELF=0x010,
49  SAM_REVERSE_MATE=0x020,
50  SAM_PE_READ1=0x040,
51  SAM_PE_READ2=0x080,
52  SAM_NO_PRIMARY_ALIGNMENT=0x100,
53  SAM_QUALITY_FILTERED=0x200,
54  SAM_DUPLICATE_READ=0x400
55 };
56 
58 class sam_base
59 {
60  public:
61 
62  class leftover_pipe
63  :public shore::pipe_facade<leftover_pipe,shore::read,shore::read>
64  {
65  private:
66 
67  friend class sam_base;
68 
69  leftover_pipe();
70 
71  public:
72 
73  void next();
74  void prepare(const shore::read &r);
75  void append(const shore::read &r);
76  void flush();
77  };
78 
81  struct config
82  {
83  std::string refseq_fn;
84 
86  int max_ed;
87  int max_gaps;
88  int max_gapsize;
90  int max_hits;
91 
92  int forced_readindex;
93  shore::alignment::InsertState forced_pestate;
94 
95  bool discard_softclipped;
96 
97  bool ignore_xareads;
98 
99  bool bismark;
100 
101  config(const std::string &reffn=std::string());
102  };
103 
104  struct cigarstats
105  {
106  int edit_distance;
107  int ngaps;
108  int nsoftclipped;
109  };
110 
111  private:
112 
113  leftover_pipe m_leftovers;
114 
115  protected:
116 
117  config m_conf;
118 
120  static const int MAXTAGS=3224;
121 
123  std::pair<int,shore::alignment::InsertState> m_peconv[0x801];
125  std::pair<int,shore::read::InsertState> m_peconv_left[0x801];
126 
128  int m_tid[256][256];
129 
130  struct samheader
131  {
132  struct sh_hd
133  {
134  std::string vn;
135  std::string so;
136  };
137 
138  struct sh_sq
139  {
140  std::string sn;
141  size_t ln;
142  std::string as;
143  std::string m5;
144  std::string sp;
145  std::string ur;
146  };
147 
148  sh_hd hd;
149 
151  std::vector<sh_sq> sq;
153  std::map<std::string,size_t> m5_map;
156  std::map<std::string,size_t> sn_map;
157 
158 
160  void append(const std::string& s);
161 
162  void flush() {}
163  };
164 
166 
168  std::map<std::string,size_t> m_refmap;
169  typedef std::map<std::string,size_t>::const_iterator ichr_t;
171  std::vector<shore::sequence_record> m_refseq;
173  std::vector<int> m_bamid2seqid;
174 
176  void load_reference();
177 
179  void xa_parse(shore::alignment &f,std::string &seq,std::string &cigar,
180  bool &isrev,cigarstats &cigsta,size_t &xaofs,
181  const std::string &xa);
182 
183  public:
184 
185  sam_base(const config &c);
186 
187  leftover_pipe &leftovers();
188 
189  void set_max_ed(const int ed);
190  void set_max_gaps(const int g);
191  void set_max_gapsize(const int g);
192  void set_max_hits(const int h);
193 
194  void discard_softclipped(const bool d);
195 
198  static void cigar2aln(std::string &aln,cigarstats &cs,
199  const std::string &read,const std::string &cigar,
200  const shore::sequence_record &chr,
201  const size_t start,const int max_gapsize=1);
202 };
203 
206 :public sam_base
207 {
208  public:
209 
211 
212  private:
213 
215  static const int MAXCOLS=MAXTAGS+11;
216 
217  enum SAMCol
218  { // SAM doc column names
219  SC_ID=0, // QNAME
220  SC_FLAG=1, // FLAG
221  SC_CHRNAME=2, // RNAME
222  SC_POS=3, // POS
223  SC_MAPPINGQUALITY=4,// MAPQ
224  SC_CIGAR=5, // CIGAR
225  SC_MATE_SEQNAME=6, // MRNM
226  SC_MATE_POS=7, // MPOS
227  SC_INSERTSIZE=8, // ISIZE
228  SC_READSEQ=9, // SEQ
229  SC_QUALITY=10 // QUAL
230  };
231 
232  typedef boost::function<void (const std::string*const)> tagfun_t;
234  tagfun_t m_tagfuns[MAXTAGS+1];
235 
236  shore::line_reader m_in;
237 
238  shore::alignment m_buf1;
239  shore::alignment m_buf2;
240  shore::alignment *m_current;
241  shore::alignment *m_last;
242  shore::read m_current_left;
243 
244  std::string m_linebuf;
245 
246  std::string m_splitbuf[MAXCOLS];
247  int m_ntok;
248  cigarstats m_cigsta;
249 
250  bool m_hasdata;
251  bool m_needinit;
252 
253  const std::string* m_xastring;
254  size_t m_xapos;
255  shore::refseq_coor m_xa_parentcoor;
256 
257  std::string m_xrtag;
258  std::string m_xgtag;
259  std::string m_xmtag;
260  void convert_alignment_string();
261 
262  bool m_emit_as_leftover;
263  int m_currentsamflag;
264  bool m_current_is_revcomp;
265 
266  std::string m_rangeindexfile;
267 
269 
270  void tagfun_invalid(const std::string*const ptag);
271  void tagfun_MD(const std::string*const ptag);
272  void tagfun_X0(const std::string*const ptag);
273  void tagfun_XA(const std::string*const ptag);
274  void tagfun_NH(const std::string*const ptag);
275  void tagfun_RG(const std::string*const ptag);
276  void tagfun_XR(const std::string*const ptag);
277  void tagfun_XG(const std::string*const ptag);
278  void tagfun_XM(const std::string*const ptag);
279 
280  public:
281 
282  sam_reader(const std::string& fn,const config &c,
284 
285  ~sam_reader();
286 
287  bool has_data();
288 
289  const shore::alignment& current() const;
290 
291  void next();
292 
293  const std::string &current_line() const;
294 
295  void set_range(const shore::refseq_coor& s,const shore::refseq_coor& e,
296  const long maxrl=0l);
297 };
298 
301 
304 :public sam_base
305 {
306  public:
307 
309 
310  private:
311 
312  typedef boost::function<void (const char**)> tagfun_t;
314  tagfun_t m_tagfuns[MAXTAGS+1];
315 
316  static const char CIGARCONV[17];
317  static const char BASECONV[17];
318 
319  shore::istreams m_in;
320 
321  shore::alignment m_buf1;
322  shore::alignment m_buf2;
323  shore::alignment *m_current;
324  shore::alignment *m_last;
325  shore::read m_current_left;
326 
327  cigarstats m_cigsta;
328 
329  bool m_hasdata;
330  bool m_needinit;
331 
332  bool m_emit_as_leftover;
333  int m_currentsamflag;
334  bool m_current_is_revcomp;
335 
337 
338  size_t m_readbufsize;
339  char *m_readbuf;
340 
341  size_t m_nextblocklen;
342 
343  std::string m_cigarbuf;
344  std::string m_seqbuf;
345  std::string m_xabuf;
346  size_t m_xaofs;
347  shore::refseq_coor m_xa_parentcoor;
348 
349  void ensure_buf(size_t size);
350 
351  void read_header();
352 
353  void tagfun_invalid(const char **ptag);
354  void tagfun_default(const char **ptag);
355  void tagfun_MD(const char **ptag);
356  void tagfun_X0(const char **ptag);
357  void tagfun_XA(const char **ptag);
358  void tagfun_NH(const char **ptag);
359  void tagfun_RG(const char **ptag);
360 
361  public:
362 
363  bam_reader(const std::string& fn,const config &c,
365  ~bam_reader();
366 
367  bool has_data();
368 
369  const shore::alignment& current() const;
370 
371  void next();
372 
373  void set_range(const shore::refseq_coor& s,const shore::refseq_coor& e,
374  const long maxrl=0l);
375 };
376 
379 
382 {
383  private:
384 
385  shore::ostreams m_os;
386 
387  std::string m_mdbuf;
388  std::string m_cgbuf;
389  std::string m_sqbuf;
390  std::string m_qualbuf;
391 
392  int m_peconv[9];
393 
394  int m_seqmax;
395 
397 
398  std::set<std::string> m_allowed_readgroups;
399 
400  bool m_correct_leftover_flags;
401  bool m_orphan_partners_only;
402 
404  std::map<std::string,std::pair<shore::refseq_coor,shore::Strand> > m_orphan_coors;
405 
406  public:
407 
408  enum SamSorting
409  {
410  SAM_SO_UNKNOWN,
411  SAM_SO_UNSORTED,
412  SAM_SO_COORDINATE,
413  SAM_SO_QUERYNAME
414  };
415 
417 
418  sam_writer(const std::string& fn,const std::vector<std::string> &allowed_readgroups);
419  sam_writer(std::ostream*const os,const std::vector<std::string> &allowed_readgroups);
420 
423  void write_header(const std::string& fasta_fn=std::string(),
424  const SamSorting sort=SAM_SO_UNKNOWN);
425 
428  void set_correct_leftover_flags(const bool b);
429 
432  void set_orphan_leftovers_only(const bool b);
433 
434  void append(const shore::alignment& f);
435  void append(const shore::read& f);
436 
437  void flush();
438 };
439 
440 S_MAKE_ENUM_TRAITS(sam_writer::SamSorting)
441 
442 typedef shore::sink<sam_writer> sam_sink;
444 
446 typedef shore::sink<sam_writer,shore::read> sam_read_sink;
447 
448 } // namespace
449 
450 #endif // SHORE_IO_SAM_HPP__
451