SHORE API
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
2dex.hpp
Go to the documentation of this file.
1 
2 /*
3  * Copyright 2008,2009,2010,2011,2012 Stephan Ossowski, Korbinian Schneeberger,
4  * Felix Ott, Joerg Hagmann, Alf Scotland, Sebastian Bender
5  *
6  * This file is part of SHORE.
7  *
8  * SHORE is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * (at your option) any later version.
12  *
13  * SHORE is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with SHORE. If not, see <http://www.gnu.org/licenses/>.
20  */
21 
25 
26 #ifndef SHORE_ALGO_TWODEX_HPP__
27 #define SHORE_ALGO_TWODEX_HPP__
28 
29 #include <iterator>
30 #include <set>
31 #include <string>
32 #include <vector>
33 
34 #include <boost/iterator/iterator_facade.hpp>
35 
36 #include "shore/algo/d2tree.hpp"
40 #include "shore/base/memops.hpp"
41 #include "shore/datatype/coor.hpp"
42 #include "shore/base/stringops.hpp"
43 
44 namespace shore {
45 
48 :public boost::iterator_facade<
49  size_iterator,size_t const,boost::random_access_traversal_tag>
50 {
51  private:
52 
53  friend class boost::iterator_core_access;
54 
55  size_t m_value;
56 
57  void increment()
58  {
59  ++m_value;
60  }
61 
62  void decrement()
63  {
64  --m_value;
65  }
66 
67  void advance(difference_type n)
68  {
69  m_value+=n;
70  }
71 
72  difference_type distance_to(const size_iterator& other) const
73  {
74  return other.m_value-m_value;
75  }
76 
77  bool equal(const size_iterator& other) const
78  {
79  return other.m_value==m_value;
80  }
81 
82  const reference dereference() const
83  {
84  return m_value;
85  }
86 
87  public:
88 
89  size_iterator(const size_t value=0)
90  :m_value(value)
91  {}
92 
93  size_iterator(const size_iterator& other)
94  :m_value(other.m_value)
95  {}
96 
97  size_iterator& operator=(const size_iterator& other)
98  {
99  m_value=other.m_value;
100  return *this;
101  }
102 
103  const size_t operator[](difference_type n) const
104  {
105  return m_value+n;
106  }
107 };
108 
112 {
113  const intpack::const_iterator& pos;
114 
116  :pos(p)
117  {}
118 
119  blockcmp_pos(const intpack& p)
120  :pos(p.begin())
121  {}
122 
123  bool operator()(const size_t i,const size_t j) const
124  {
125  return pos[i]<pos[j];
126  }
127 
128  bool operator()(const size_t i,const std::pair<size_t,size_t>& posend) const
129  {
130  return pos[i]<posend.first;
131  }
132 
133  bool operator()(const std::pair<long,long>& posend,const size_t i) const
134  {
135  return posend.first<long(pos[i]);
136  }
137 };
138 
142 {
143  const intpack::const_iterator& pos;
144  const intpack::const_iterator& sizes;
145 
147  :pos(p),sizes(s)
148  {}
149 
150  blockcmp_end(const intpack& p,const intpack& s)
151  :pos(p.begin()),sizes(s.begin())
152  {}
153 
154  bool operator()(const size_t i,const size_t j) const
155  {
156  return (pos[i]+sizes[i])<(pos[j]+sizes[j]);
157  }
158 
159  bool operator()(const size_t i,const std::pair<size_t,size_t>& posend) const
160  {
161  return (pos[i]+sizes[i])<posend.second;
162  }
163 
164  bool operator()(const std::pair<long,long>& posend,const size_t i) const
165  {
166  return posend.second<long(pos[i]+sizes[i]);
167  }
168 };
169 
173 {
174  const intpack::const_iterator& pos;
175  const intpack::const_iterator& sizes;
176 
178  :pos(p),sizes(s)
179  {}
180 
181  blockcmp_posend(const intpack& p,const intpack& s)
182  :pos(p.begin()),sizes(s.begin())
183  {}
184 
185  bool operator()(const size_t i,const size_t j) const
186  {
187  return pos[i]<(pos[j]+sizes[j]);
188  }
189 
190  bool operator()(const size_t i,const std::pair<size_t,size_t>& posend) const
191  {
192  return pos[i]<posend.second;
193  }
194 
195  bool operator()(const std::pair<long,long>& posend,const size_t i) const
196  {
197  return posend.first<long(pos[i]+sizes[i]);
198  }
199 };
200 
202 class twodex
203 {
204  public:
205 
206  enum D2Type
207  {
208  D2_SIZE,
209  D2_INCLUSIVE,
210  D2_EXCLUSIVE,
211  D2_MAPLIST,
212  D2_CIGAR
213  };
214 
217  {
224  };
225 
227  struct build_spec
228  {
229  std::string colspec;
230  size_t blocksize;
231  size_t maxgap;
232  D2Type d2type;
233  char commentchar;
234 
235  build_spec();
236  };
237 
238  struct seq_info
239  {
240  size_t id;
241  long offset;
242  long firstpos;
243 
244  seq_info(const size_t i)
245  :id(i),offset(std::numeric_limits<long>::min()),
246  firstpos(std::numeric_limits<long>::max())
247  {}
248  };
249 
251  struct header
252  {
253  size_t blocksize;
254 
255  int chr_col;
256  int pos_col;
257  int d2_col;
258  D2Type d2type;
259  char commentchar;
260 
261  bool sorted;
262 
263  std::streamsize seq_total;
264  std::streamsize file_total;
265  size_t nnzblocks;
266 
267  std::map<std::string,seq_info> sequences;
268  std::vector<std::map<std::string,seq_info>::iterator> seqv;
269 
270  int bitsperid;
271  int bitsperpos;
272  int bitspersize;
273 
274  size_t id_bytes;
275  size_t pos_bytes;
276  size_t size_bytes;
277 
278  header();
279 
280  size_t parse(std::istream& in);
281  void cleardata();
282 
283  void print(std::ostream& os) const;
284  };
285 
287  struct bytecount
288  {
289  std::streamsize count;
290 
291  bytecount();
292  void operator()(const std::string& s);
293  };
294 
295  private:
296 
297  struct block
298  {
299  typedef std::pair<shore::refseq_coor,shore::refseq_coor> range_t;
300 
301  std::vector<range_t> ranges;
302  std::vector<range_t> rangebuf;
303 
304  void add_range(const shore::refseq_coor& p,const long e,const long maxgap)
305  {
306  range_t orange(p,shore::refseq_coor(p.chromosome,e));
307 
308  for(size_t i=0;i<ranges.size();++i)
309  {
310  if(maxgap<0)
311  {
312  // maxgap not set, blocks may cross chromosome borders
313  if(ranges[i].first<orange.first)
314  orange.first=ranges[i].first;
315  if(orange.second<ranges[i].second)
316  orange.second=ranges[i].second;
317  }
318  else if((ranges[i].first.chromosome!=p.chromosome)
319  ||(ranges[i].second.position<=(p.position-maxgap))
320  ||((e+maxgap)<=ranges[i].first.position))
321  {
322  rangebuf.push_back(ranges[i]);
323  }
324  else
325  {
326  if(ranges[i].first.position<orange.first.position)
327  orange.first.position=ranges[i].first.position;
328 
329  if(orange.second.position<ranges[i].second.position)
330  orange.second.position=ranges[i].second.position;
331  }
332  }
333 
334  ranges.clear();
335  ranges.swap(rangebuf);
336  ranges.push_back(orange);
337  }
338  };
339 
340  header m_header;
341 
342  shore::mmapping m_mmapping;
343 
344  const char *m_data;
345  intpack::const_iterator m_idbeg;
346  intpack::const_iterator m_idend;
347  intpack::const_iterator m_posbeg;
348  intpack::const_iterator m_posend;
349  intpack::const_iterator m_sizebeg;
350  intpack::const_iterator m_sizeend;
351 
352  d2tree<blockcmp_pos,blockcmp_end,blockcmp_posend> m_tree;
353 
354  QueryMode m_querymode;
355 
356 
357  void load(const std::string &indexfile);
358 
359  public:
360 
370  static void build(const std::string &fn,const std::string &colspec,
371  const size_t blocksize,const size_t maxgap,
372  const D2Type d2type,const char commentchar,
373  std::ostream *const log=0);
374 
375  static void build(const std::string &fn,const build_spec &bs,
376  std::ostream *const log=0);
377 
379  static void parse_xend(long *const xend,const long pos,
380  const D2Type d2type,const std::string& d2,
382 
384  twodex(const std::string& indexfile,const QueryMode qm=QRY_OVERLAPPING);
385 
386  void set_querymode(const QueryMode qm);
387 
388  const header& get_header() const;
389 
390  void query(std::set<size_t>& res,const std::pair<long,long>& r) const;
391  void query(std::set<size_t>& res,const shore::genomic_range& r) const;
392 
393  std::pair<long,long> range_enc(const shore::genomic_range& r) const;
394 
395  int test(const shore::genomic_range& r1,const shore::genomic_range& r2) const;
396  int test(const shore::genomic_range& r1,const std::pair<long,long>& r2) const;
397 
402  static int test(const std::pair<long,long>& r1,
403  const std::pair<long,long>& r2,
404  const QueryMode querymode);
405 };
406 
407 S_MAKE_ENUM_TRAITS(shore::twodex::QueryMode)
408 S_MAKE_ENUM_TRAITS(shore::twodex::D2Type)
409 
410 } // namespace shore
411 
412 #endif // SHORE_ALGO_TWODEX_HPP__
413