SHORE API
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
datatypes.hpp
Go to the documentation of this file.
1 
2 /*
3  * Copyright 2008,2009,2010,2011,2012 Stephan Ossowski, Korbinian Schneeberger,
4  * Felix Ott, Joerg Hagmann, Alf Scotland, Sebastian Bender
5  *
6  * This file is part of SHORE.
7  *
8  * SHORE is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * (at your option) any later version.
12  *
13  * SHORE is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with SHORE. If not, see <http://www.gnu.org/licenses/>.
20  */
21 
24 
25 #ifndef DATATYPES_HPP__
26 #define DATATYPES_HPP__
27 
28 #include <string>
29 #include <vector>
30 #include <map>
31 
32 #include <boost/shared_ptr.hpp>
33 
36 #include "shore/base/memops.hpp"
37 #include "shore/datatype/coor.hpp"
38 
39 namespace shore {
40 
// Record type `read`: holds an identifier, a sequence string, two quality
// strings, an index / pe_state pair and a nested `accessory` struct, together
// with static helper declarations. Everything in this block is declaration
// only; the definitions are not part of this listing.
// NOTE(review): this is a rendered source listing (each line begins with its
// listing number) and several numbered lines are absent, including whatever
// preceded the two brace pairs at listing lines 53-62 and 68-77.
42 struct read
43 {
    // Integer constants; presumably ASCII offsets / markers for quality
    // encodings -- TODO confirm against the implementation.
45  static const int ASCIIENC_ILLUMINA=64;
47  static const int ASCIIENC_SANGER=33;
49  static const int ASCIIENC_CHASTITY=-10;
50 
    // NOTE(review): brace pair without a visible head line or contents;
    // presumably the InsertState enum used by `pe_state` below -- confirm.
53  {
62  };
63 
    // Five-element char table, defined elsewhere; presumably one code per
    // InsertState value -- confirm.
64  static const char insstate_enc[5];
65 
    // NOTE(review): second brace pair without a visible head line; presumably
    // the PeOrientation enum used by accessory::ori -- confirm.
68  {
77  };
78 
    // Four-element table of C strings, defined elsewhere; presumably textual
    // names for PeOrientation values -- confirm.
79  static const char *ori_conv[4];
80 
    // Identifier of the read.
82  std::string id;
83 
    // Sequence of the read, stored as a string.
85  std::string sequence;
86 
    // Quality string; the encoding is not visible here -- see get_qual_std().
88  std::string qual_std;
89 
    // Index and insert state; semantics are not visible in this listing
    // (presumably paired-end bookkeeping -- confirm).
94  int index;
95  InsertState pe_state;
96 
    // Second (auxiliary) quality string.
98  std::string qual_aux;
99 
    // Optional per-read tags: clipping offsets, barcode, read group and
    // orientation, plus reset/empty/write helpers.
101  struct accessory
102  {
    // "Not available" sentinels; their values are defined elsewhere.
104  static const uint16_t CLIP_RIGHT_NA;
106  static const int EDIT_DISTANCE_NA;
107 
    // NOTE(review): listing lines 108-112 are absent; a member matching
    // EDIT_DISTANCE_NA may have been declared there -- confirm.
113 
    // Clip positions; whether they are offsets or lengths is not visible here.
115  uint16_t clip_left;
117  uint16_t clip_right;
    // Barcode and read-group labels.
119  std::string barcode;
121  std::string readgroup;
122 
129 
    // Orientation tag of type PeOrientation.
130  PeOrientation ori;
131 
    // Default constructor (defined elsewhere).
133  accessory();
134 
    // Declared only; presumably restores the default / "NA" state -- confirm.
135  void reset();
136 
    // Declared only; presumably true when no tag carries information -- confirm.
137  bool empty() const;
138 
    // Writes the tags to `str`; `sep` is presumably the field separator -- confirm.
139  void write(std::ostream &str,const char *const sep) const;
140  };
141 
    // NOTE(review): listing lines 142-145 are absent; a member of type
    // `accessory` may have been declared there -- confirm.
144 
    // Default constructor (defined elsewhere).
146  read();
147 
    // Predicates, declared only; presumably derived from index / pe_state -- confirm.
149  bool is_firstread() const;
151  bool is_lastread() const;
152 
    // Returns an int for position `pos`; presumably the decoded value of
    // qual_std[pos] -- confirm.
154  int get_qual_std(const size_t pos) const;
155 
    // The four getters below take an output read `buf`. `set_tags` has no
    // default for get_part and defaults to false for the other three.
    // Meaning of left/right (offsets vs. lengths) is not visible here.
158  void get_part(shore::read &buf,size_t left,size_t right,bool set_tags) const;
159 
162  void get_clipped_read(shore::read &buf,bool set_tags=false) const;
163 
166  void get_read_prefix(shore::read &buf,bool set_tags=false) const;
167 
170  void get_read_suffix(shore::read &buf,bool set_tags=false) const;
171 
    // Pointer-to-function type for binary predicates over two reads;
    // cmp_id and cmp_seq below have this signature.
172  typedef bool(*compare_type)(const read &,const read &);
173 
    // Comparison by id (presumably a strict ordering usable for sorting -- confirm).
175  static bool cmp_id(const read &r1,const read &r2);
176 
    // Comparison by sequence (same caveat as cmp_id).
178  static bool cmp_seq(const read &r1,const read &r2);
179 
    // Same kind of comparison, taking two id strings directly.
181  static bool cmp_id_str(const std::string &id1,const std::string &id2);
182 
    // Quality-string converters taking a non-const reference; presumably
    // they rewrite `qual` in place -- confirm.
185  static void qual_sanger2illumina(std::string &qual);
188  static void qual_illumina2sanger(std::string &qual);
191  static void qual_solexa2sanger(std::string &qual);
192 
    // Single-value conversion between an int quality and its char encoding.
194  static char enc_qual_sanger(const int q);
196  static int dec_qual_sanger(const char q);
197 };
198 
// Invokes the S_MAKE_ENUM_TRAITS macro for read::PeOrientation. The macro is
// not defined in this file; presumably it comes from one of the shore
// includes and generates traits / conversion helpers for the enum -- confirm.
199 S_MAKE_ENUM_TRAITS(read::PeOrientation)
200 
201 
// Record type `alignment`: holds a read id, read length, hit count, an
// alignment string, an offset, index / pe_state, three quality strings and a
// nested `accessory` struct, plus accessor, predicate and comparator
// declarations. Declaration only; definitions are not part of this listing.
// NOTE(review): several numbered lines are absent (e.g. 242-245, 248-251,
// 257-260), so further members may exist in the real header.
204 struct alignment
205 {
    // Five-element char table, defined elsewhere; presumably strand codes -- confirm.
207  static const char strandconv_dp[5];
209  //static const bool peconv_legacy[9];
210 
    // NOTE(review): brace block without a visible head line and with most
    // enumerators missing; presumably the InsertState enum of this struct
    // (see `pe_state` below) -- confirm.
213  {
223  // size distribution
228  // \brief Partner is unmapped.
229  PE_MAPPING_ORPHAN,
232  // \brief Concordant, but another mapping has a more likely pairing
233  PE_SUBOPTIMAL
234  };
235 
    // Nine-element table of read::InsertState values and ten-element char
    // table, both defined elsewhere; presumably indexed by this struct's
    // insert state -- confirm.
236  static const read::InsertState insstate_flatten[9];
237  static const char insstate_enc[10];
238 
    // Identifier of the aligned read.
240  std::string id;
241 
244 
    // Length of the read.
246  size_t readlength;
247 
250 
    // Hit count; presumably the number of mapping locations -- confirm.
252  int hits;
253 
    // Alignment in string form; the format is not visible here.
255  std::string alignment_string;
256 
259 
    // Offset; its reference point is not visible in this listing.
261  size_t offset;
262 
    // Index and insert state (InsertState here resolves inside `alignment`).
264  int index;
265  InsertState pe_state;
266 
    // Three quality strings: standard, auxiliary and raw.
268  std::string qual_std;
269 
271  std::string qual_aux;
272 
274  std::string qual_raw;
275 
    // Optional per-alignment tags, constructible and assignable from
    // read::accessory.
277  struct accessory
278  {
    // Sentinel value for map_qual meaning "not available" (by name).
279  static const int MAP_QUAL_NA=255;
280 
283  int map_qual;
284 
    // Read-group label.
287  std::string readgroup;
288 
295 
297 
301 
    // Coordinate and strand of the next read; presumably the mate in a
    // pair -- confirm.
302  shore::refseq_coor nextread_coor;
303  shore::Strand nextread_strand;
304 
    // Default constructor (defined elsewhere).
306  accessory();
    // Explicit conversion from a read's tags.
308  explicit accessory(const shore::read::accessory &read_tags);
309 
    // Assignment from a read's tags; returns a reference to accessory.
311  accessory & operator=(const shore::read::accessory &read_tags);
312 
    // write/reset/empty mirror the helpers declared on read::accessory.
313  void write(std::ostream &os,const char *const sep) const;
314  void reset();
315  bool empty() const;
316  };
317 
320 
    // Default constructor (defined elsewhere).
322  alignment();
323 
324  // convenience functions ///////////////////////////////////////////////////
325 
    // Per-position quality accessors returning a char; presumably the raw
    // encoded character at `pos` -- confirm.
329  char get_qual_std_char(const size_t pos) const;
333  char get_qual_aux_char(const size_t pos) const;
334 
    // Per-position quality accessors returning an int; presumably the
    // decoded value at `pos` -- confirm.
338  int get_qual_std(const size_t pos) const;
342  int get_qual_aux(const size_t pos) const;
343 
345  //static int plain_peflag(const int pairing);
346 
348  //static bool is_first(const int pairing);
349 
    // Predicates, declared only; presumably derived from index / pe_state -- confirm.
351  bool is_firstread() const;
353  bool is_lastread() const;
354 
356  bool is_fwd_frag() const;
358  bool is_rev_frag() const;
359 
361  bool partner_mapped() const;
362 
363 
    // Pointer-to-function type for binary predicates over two alignments;
    // the three static functions below have this signature.
365  typedef bool(*cmp_type)(const alignment &,const alignment &);
366 
    // Comparisons by coordinate and by id; cmp_null's behavior is not
    // visible here (presumably a no-op ordering -- confirm).
368  static bool cmp_coor(const alignment &f1,const alignment &f2);
370  static bool cmp_id(const alignment &f1,const alignment &f2);
372  static bool cmp_null(const alignment &f1,const alignment &f2);
373 };
374 
376 
// Forward declaration; fasta_reader is named as a friend inside the block below.
377 class fasta_reader;
378 
// NOTE(review): the class-head line for this body is absent from the listing.
// The constructors below name the type `sequence_record`; whether it is
// declared with `class` or `struct` cannot be told from here.
// The type stores header/id/supplement/md5 strings and several alternative
// sequence representations, and exposes read-only accessors.
381 {
382  private:
383 
    // fasta_reader gets access to the private members.
384  friend class fasta_reader;
385 
    // Value returned by enumerated() by name; its meaning is not visible here.
387  long m_enumerated;
    // Strings returned by get_header(), get_id(), get_supplement() and
    // md5sum() respectively (by name).
389  std::string m_header;
391  std::string m_id;
393  std::string m_supplement;
395  std::string m_md5sum;
396 
    // Sequence storage in three forms: string, intpack, and a shared
    // mmapping with offset and size. intpack / mmapping are declared
    // elsewhere; which form is active is not visible here.
398  std::string m_sequence_str;
400  intpack m_sequence_pack;
402  boost::shared_ptr<shore::mmapping> m_sequence_mmpp;
403  std::streamoff m_mmpp_off;
404  std::streamsize m_mmpp_size;
405 
406 
    // Size plus begin/end pointers and iterators; presumably set up by
    // init() for the active representation -- confirm.
407  size_t m_size;
408  const char *m_cbegin;
409  const char *m_cend;
410  dna_iterator m_seqbegin;
411  dna_iterator m_seqend;
412  chardna_iterator m_seqbegin_c;
413  chardna_iterator m_seqend_c;
414 
415 
    // Private helper, declared only.
417  void init();
418 
419  public:
420 
    // Default constructor, copy constructor and copy assignment are all
    // user-declared (defined elsewhere).
421  sequence_record();
422  sequence_record(const sequence_record &other);
423  sequence_record &operator=(const sequence_record &other);
424 
    // String accessors returning const references.
426  const std::string &get_header() const;
428  const std::string &get_id() const;
430  const std::string &get_supplement() const;
431 
433  long enumerated() const;
434 
    // Iterator accessors. NOTE(review): listing lines 439-441 are absent; a
    // seqbegin_c() counterpart to seqend_c() may have been declared there.
436  dna_iterator seqbegin() const;
438  dna_iterator seqend() const;
442  chardna_iterator seqend_c() const;
443 
    // Substring extraction, into `res` or by value; `size` defaults to 0,
    // whose meaning (presumably "to the end") is not visible here -- confirm.
445  void substr(std::string &res,const size_t ofs,size_t size=0) const;
447  std::string substr(const size_t ofs,const size_t size=0) const;
    // Character at index i.
449  char operator[](const size_t i) const;
    // Compares a range (ofs, size) of the sequence against `str`; returns bool.
451  bool compare(const size_t ofs,const size_t size,const std::string &str) const;
452  bool empty() const;
453 
455  size_t size() const;
456 
    // Returns the md5 sum by value.
458  std::string md5sum() const;
459 
    // Writes the sequence to `out`; output format is not visible here.
461  void write_sequence(std::ostream &out) const;
462 };
463 
465 
// Record type `sff_read`: a pointer to a shared `commonhead`, plus a
// per-read `readhead` and `readdata`. Field names resemble the SFF
// (Standard Flowgram Format) layout -- TODO confirm against the SFF spec.
// NOTE(review): some numbered lines are absent from this listing, so the
// nested structs may have more fields in the real header.
467 struct sff_read
468 {
    // File-level header fields.
470  struct commonhead
471  {
476  uint32_t magic;
477 
479  char version[4];
480 
    // Location and length of the index section; see the block comment
    // after this struct.
486  uint64_t index_offset;
487  uint32_t index_length;
488 
490  uint32_t number_of_reads;
491 
498  uint16_t header_length;
499 
504  uint16_t key_length;
505 
511 
524 
530  std::string flow_chars;
531 
536  std::string key_sequence;
537 
542  // uint8_t* eight_byte_padding (omitted)
543  };
544 
545  /*
546  * If an index is included in the file, the index_offset and index_length values in the common
547  * header should point to the section of the file containing the index. To support different
548  * indexing methods, the index section should begin with the following two fields:
549  *
550  * - index_magic_number uint32_t
551  * - index_version char[4]
552  *
553  * ... and should end with an eight_byte_padding field, so that the length of the index section
554  * is divisible by 8. The format of the rest of the index section is specific to the indexing
555  * method used. The index_length given in the common header should include the bytes of
556  * these fields and the padding.
557  */
558 
    // Per-read header: name, base count and four clip positions.
560  struct readhead
561  {
568 
573  uint16_t name_length;
574 
579  uint32_t number_of_bases;
580 
603  uint16_t clip_qual_left;
604  uint16_t clip_qual_right;
605  uint16_t clip_adapter_left;
606  uint16_t clip_adapter_right;
607 
612  std::string name;
613 
618  //uint8_t[*]
619  //eight_byte_padding
620  };
621 
    // Per-read payload: flowgram values, flow indices per base, bases and
    // quality scores.
623  struct readdata
624  {
632  std::vector<uint16_t> flowgram_values;
633 
643  std::vector<uint8_t> flow_index_per_base;
644 
649  std::string bases;
650 
656  std::string quality_scores;
657 
662  //uint8_t[*] eight_byte_padding
663  };
664 
    // Raw pointer to a common header; nothing in this block allocates or
    // frees it, so ownership presumably lies with the caller -- confirm.
665  commonhead *chead;
666  readhead rhead;
667  readdata rdata;
668 
    // Default constructor: starts with a null common-header pointer.
669  sff_read()
670  :chead(0)
671  {}
672 
    // Pointer-to-function type for binary predicates over two sff_reads;
    // cmp_id and cmp_seq below have this signature.
673  typedef bool(*compare_type)(const sff_read &,const sff_read &);
674 
    // Comparison by id (defined elsewhere).
676  static bool cmp_id(const sff_read &r1,const sff_read &r2);
677 
    // Comparison by sequence (defined elsewhere).
679  static bool cmp_seq(const sff_read &r1,const sff_read &r2);
680 };
681 
683 
// NOTE(review): the head line of this definition is absent from the listing.
// The stream operators declared right after it refer to
// `annotation_entry::Phase`, so this is presumably the body of
// `annotation_entry` -- confirm. Field and attribute names resemble the GFF3
// column / attribute vocabulary -- TODO confirm.
688 {
    // Sequence id, source and feature type, all stored as strings.
693  std::string seqid;
694 
701  std::string source;
702 
709  std::string type;
710 
    // Start and end position; coordinate convention is not visible here.
720  long pos;
721  long end;
722 
    // Score, with a "not available" sentinel (defined elsewhere) and a
    // helper to test for it.
727  double score;
728  static const double SCORE_NA;
729  static bool is_score_NA(const double& score);
730 
    // NOTE(review): listing lines 731-748 are absent; further members may
    // have been declared there.
736 
    // Phase values: -1 for "not available", otherwise 0, 1 or 2.
749  enum Phase
750  {
751  PHASE_NA=-1,
752  PHASE_0=0,
753  PHASE_1=1,
754  PHASE_2=2
755  };
756 
757  Phase phase;
758 
    // Attributes without a dedicated member below (by name); the element
    // format is not visible here.
765  std::vector<std::string> other_attributes;
766 
    // Attribute-name constants, defined elsewhere; presumably the keys
    // matching the string members that follow -- confirm.
767  static const std::string ATTR_ID;
768  static const std::string ATTR_NAME;
769  static const std::string ATTR_ALIAS;
770  static const std::string ATTR_PARENT;
771  static const std::string ATTR_TARGET;
772  static const std::string ATTR_GAP;
773  static const std::string ATTR_DERIVES_FROM;
774  static const std::string ATTR_NOTE;
775  static const std::string ATTR_DBXREF;
776  static const std::string ATTR_ONTOLOGY_TERM;
777  static const std::string ATTR_IS_CIRCULAR;
778 
    // Attribute values stored as strings, one member per attribute.
785  std::string id;
786 
790  std::string name;
791 
796  std::string alias;
797 
802  std::string parent;
803 
809  std::string target;
810 
815  std::string gap;
816 
821  std::string derives_from;
822 
824  std::string note;
825 
828  std::string dbxref;
829 
832  std::string ontology_term;
833 
    // NOTE(review): no member matching ATTR_IS_CIRCULAR is visible; listing
    // lines 834-835 are absent and may have held it.
836 };
837 
// Stream extraction / insertion for annotation_entry::Phase. Declared only;
// the textual representation used is not visible in this file.
838 std::istream &operator>>(std::istream &is,annotation_entry::Phase &phase);
839 std::ostream &operator<<(std::ostream &os,const annotation_entry::Phase &phase);
840 
842 
// NOTE(review): the head line of this definition is absent from the listing,
// so the type name is unknown here. The body holds a reference / query
// sequence pair with sizes, start/end positions, two error-style counters
// and a list of integer deltas; presumably one record of an alignment delta
// file -- confirm.
845 {
846  std::string files;
847  std::string delta_type;
848 
    // Ids and sizes of the reference and query sequences.
849  std::string ref_seqid;
850  std::string qry_seqid;
851  size_t ref_seqsize;
852  size_t qry_seqsize;
853 
    // Start / end on the reference.
855  size_t ref_start;
857  size_t ref_end;
858 
    // Start / end on the query.
861  size_t qry_start;
862 
865  size_t qry_end;
866 
    // Error counter (by name an edit-distance style count -- confirm).
868  int err_edit;
    // NOTE(review): listing lines 869-872 are absent; more counters may
    // have been declared there.
871 
873  int nstop;
874 
    // Integer delta values; their encoding is not visible here.
875  std::vector<int> deltas;
876 };
877 
879 
// A refseq_region extended with an optional strand and a score.
// refseq_region, refseq_coor, OptionalStrand and STR_NA_OPT are declared
// elsewhere (presumably in shore/datatype/coor.hpp -- confirm).
881 struct coverage
882 :public shore::refseq_region
883 {
884  shore::OptionalStrand strand;
885 
886  double score;
887 
    // Default constructor: base region built from refseq_coor(1,1) and 0,
    // strand set to STR_NA_OPT, score set to 0.0. The meaning of the base
    // constructor arguments is not visible here.
888  coverage()
889  :refseq_region(refseq_coor(1,1),0),
890  strand(shore::STR_NA_OPT),score(0.0)
891  {}
892 };
893 
895 
// A refseq_region extended with an id, an optional strand, minimum and
// maximum coverage and a value `p` (presumably a p-value -- confirm).
// refseq_region, refseq_coor, OptionalStrand and STR_NA_OPT are declared
// elsewhere.
896 struct segment
897 :public shore::refseq_region
898 {
899  std::string id;
900 
901  shore::OptionalStrand strand;
902 
903  double min_coverage;
904  double max_coverage;
905  double p;
906 
    // Default constructor: base region built from refseq_coor(1,1) and 0,
    // strand set to STR_NA_OPT, and all three doubles set to -1
    // (presumably meaning "unset" -- confirm).
907  segment()
908  :refseq_region(refseq_coor(1,1),0),
909  strand(shore::STR_NA_OPT),
910  min_coverage(-1),max_coverage(-1),p(-1)
911  {}
912 
    // Pointer-to-function type for binary predicates over two segments.
914  typedef bool(*cmp_type)(const segment &,const segment &);
915 
    // Comparison by coordinate (defined elsewhere); matches cmp_type.
917  static bool cmp_coor(const segment &f1,const segment &f2);
918 };
919 
921 
// NOTE(review): the head line of this definition is absent from the listing,
// so the type name is unknown here. The body describes a sequence by numeric
// id, string id, size, md5 string and a supplement string.
924 {
925  int num_id;
926  std::string str_id;
927  size_t size;
928  std::string md5;
929  std::string supplement;
930 
    // Implicit conversion to shore::refseq_region (defined elsewhere); how
    // the region is derived from the fields is not visible here.
931  operator shore::refseq_region() const;
932 };
933 
935 
// NOTE(review): the head line of this definition is absent from the listing,
// so the type name is unknown here. The fields describe a single-base
// difference at a chromosome position (reference base vs. new base) with
// quality and support figures; presumably a SNP record -- confirm.
938 {
939  std::string ecotype;
940  std::string chromosome;
941  long position;
    // Base in the reference and the observed replacement.
942  char ref_base;
943  char new_base;
    // Quality, support count and two ratios; scales are not visible here.
944  int quality;
945  unsigned int support;
946  double concordance;
947  double repetitiveness;
948 };
949 
951 
// Record type `wga_record`: an id, a reference / query sequence pair with
// positions and sizes, a strand, an alignment string and four event
// counters. "wga" presumably stands for whole-genome alignment -- confirm.
952 struct wga_record
953 {
954  size_t id;
955 
    // Ids, positions and sizes on the reference and query sides.
956  std::string ref_seqid;
957  std::string qry_seqid;
958  long ref_pos;
959  long qry_pos;
960  int ref_size;
961  int qry_size;
962 
963  shore::Strand strand;
964 
    // Alignment in string form; the format is not visible here.
965  std::string aln;
966 
    // Counters, by name: substitutions, insertions, deletions, matches.
967  int nsubst;
968  int nins;
969  int ndel;
970  int nmatch;
971 
    // Default constructor (defined elsewhere).
972  wga_record();
    // Declared only; presumably (re)initializes the record with the given id
    // number and sequence ids -- confirm. Note idnum is int while `id` is size_t.
973  void init(const int idnum,const std::string &refid,const std::string &qryid);
974 };
975 
976 } // namespace shore
977 
978 #endif /* DATATYPES_HPP__ */