26 #ifndef SHORE_ALGO_SUFFIX_INDEX_HPP__
27 #define SHORE_ALGO_SUFFIX_INDEX_HPP__
34 #include <boost/utility/enable_if.hpp>
35 #include <boost/type_traits.hpp>
60 typedef std::pair<raw_iterator,raw_iterator> rawrange;
61 typedef std::pair<iterator,iterator> range;
85 static const std::string HK[16];
86 static const std::string SE[3];
109 bool have_reverse_index;
112 std::map<size_t,size_t> offsets;
113 std::vector<std::string> labels;
114 std::vector<std::string> md5sums;
117 void parse(std::istream& in);
118 void print(std::ostream& out)
const;
132 packeddna_iterator m_seqbeg;
133 packeddna_iterator m_seqend;
134 std::reverse_iterator<packeddna_iterator> m_rev_seqbeg;
135 std::reverse_iterator<packeddna_iterator> m_rev_seqend;
137 raw_iterator m_idxbeg;
138 raw_iterator m_idxend;
140 raw_iterator m_hashbeg;
141 raw_iterator m_hashend;
143 raw_iterator m_lcpbeg;
144 raw_iterator m_lcpend;
146 raw_iterator m_rev_idxbeg;
147 raw_iterator m_rev_idxend;
149 raw_iterator m_rev_hashbeg;
150 raw_iterator m_rev_hashend;
152 raw_iterator m_rev_lcpbeg;
153 raw_iterator m_rev_lcpend;
158 const std::map<size_t,size_t> &offsets,
159 const uint64_t seqtotal,
160 const int reverse_offset);
162 template<
typename SeqIterator>
164 SeqIterator seq_beg,SeqIterator seq_end,
165 const raw_iterator arr_beg,
167 std::string &seed,
const int kmersize)
176 const packeddna_iterator f=packeddna_iterator::begin(seed);
177 packeddna_iterator t=packeddna_iterator::end(seed);
182 while(((t-1)!=f)&&((*(t-1))==0))
185 const rawrange r_new=
suffix_query(seq_beg,seq_end,r.first,r.second,f,t);
187 if(seed.size()==size_t(kmersize))
189 hash[
lhash(f,t,kmersize)]=r_new.first-arr_beg;
193 build_hash_recursive(hash,seq_beg,seq_end,arr_beg,r_new,seed,kmersize);
195 seed.resize(seed.size()-1);
203 const bool map_lcp=
false,
const bool map_rev=
false);
209 void decode_position(
shore::refseq_coor &res,
const size_t raw,
const int reverse_offset);
224 iterator coor_begin()
const;
225 iterator coor_end()
const;
229 template<
typename Iter>
231 typename boost::enable_if<boost::is_same<
typename std::iterator_traits<Iter>::value_type,
nuc::packed_base> >::type *dummy=0)
const;
234 template<
typename Iter>
236 typename boost::enable_if<boost::is_same<
typename std::iterator_traits<Iter>::value_type,
nuc::base> >::type *dummy=0)
const;
239 template<
typename Iter>
241 typename boost::enable_if<boost::is_same<
typename std::iterator_traits<Iter>::value_type,
char> >::type *dummy=0)
const;
244 template<
typename Iter>
245 range
match(Iter f,Iter t)
const;
247 range
match(
const std::string& str)
const;
251 template<
typename Iter>
252 std::pair<rawrange,Iter>
match5_raw(Iter f,Iter t,
253 typename boost::enable_if<boost::is_same<
typename std::iterator_traits<Iter>::value_type,
nuc::packed_base> >::type *dummy=0)
const;
257 template<
typename Iter>
259 typename boost::enable_if<boost::is_same<
typename std::iterator_traits<Iter>::value_type,
nuc::packed_base> >::type *dummy=0)
const;
263 template<
typename Iter>
264 std::pair<range,Iter>
match5(Iter f,Iter t)
const;
268 std::pair<range,std::string::const_iterator>
match5(
const std::string &query)
const;
272 template<
typename Iter>
277 std::pair<range,std::string::const_iterator>
match5_reverse(
const std::string &query)
const;
280 template<
typename Iter>
281 size_t count(Iter f,Iter t)
const;
283 size_t count(
const std::string &str)
const;
285 template<
typename Iter>
288 double calc_freq(
const std::string &str)
const;
294 static void build(
const std::string& fasta_fn,
const std::string& out_fn,
295 std::ostream*
const log=0,
const int kmersize=10,
296 const bool with_lcp=
false,
const bool with_rev=
false);
299 static void build(
const std::vector<std::string>& fasta_fn,
300 const std::string& out_fn,
301 std::ostream*
const log=0,
const int kmersize=10,
302 const bool with_lcp=
false,
const bool with_rev=
false);
305 static void build(
const std::vector<std::string>& fasta_fn,
307 std::ostream*
const log=0,
const int kmersize=10,
308 const bool with_lcp=
false,
const bool with_rev=
false);
317 template<
typename Iter>
318 static size_t lhash(Iter f,Iter t,
const size_t n,
319 typename boost::enable_if<boost::is_same<
typename std::iterator_traits<Iter>::value_type,
325 for(Iter i=f;(i!=t)&&(j<n);++i,++j)
337 ret=(ret<<((n-j)<<1));
352 template<
typename Iter>
353 static size_t rhash(Iter f,Iter t,
const size_t n,
354 typename boost::enable_if<boost::is_same<
typename std::iterator_traits<Iter>::value_type,
359 for(Iter i=f;(i!=t)&&(j<n);++i,++j)
369 ret=(ret<<((n-j)<<1));
373 template<
typename Iter>
374 static size_t lhash(Iter f,Iter t,
const size_t n,
375 typename boost::enable_if<boost::is_same<
typename std::iterator_traits<Iter>::value_type,
380 return lhash<typename base2packed_iterator<Iter>::proxy_type>(ff,tt,n);
383 template<
typename Iter>
384 static size_t lhash(Iter f,Iter t,
const size_t n,
385 typename boost::enable_if<boost::is_same<
typename std::iterator_traits<Iter>::value_type,
386 char> >::type *dummy=0)
388 const char2packed_iterator<Iter> ff(f);
389 const char2packed_iterator<Iter> tt(t);
390 return lhash<typename char2packed_iterator<Iter>::proxy_type>(ff,tt,n);
393 template<
typename Iter>
394 static size_t rhash(Iter f,Iter t,
const size_t n,
395 typename boost::enable_if<boost::is_same<
typename std::iterator_traits<Iter>::value_type,
398 const base2packed_iterator<Iter> ff(f);
399 const base2packed_iterator<Iter> tt(t);
400 return rhash<typename base2packed_iterator<Iter>::proxy_type>(ff,tt,n);
403 template<
typename Iter>
404 static size_t rhash(Iter f,Iter t,
const size_t n,
405 typename boost::enable_if<boost::is_same<
typename std::iterator_traits<Iter>::value_type,
406 char> >::type *dummy=0)
408 const char2packed_iterator<Iter> ff(f);
409 const char2packed_iterator<Iter> tt(t);
410 return rhash<typename char2packed_iterator<Iter>::proxy_type>(ff,tt,n);
415 static const std::string
unhash(
const size_t hash,
const size_t len);
420 :
public boost::iterator_facade<
421 iterator,const shore::refseq_coor,
422 boost::random_access_traversal_tag,const shore::refseq_coor>
426 friend class boost::iterator_core_access;
430 const std::map<size_t,size_t>* m_offsets;
431 const uint64_t m_seqtotal;
432 const int m_reverse_offset;
436 const std::map<size_t,size_t>*
const s,
437 const uint64_t seqtotal,
const int reverse_offset);
441 void advance(difference_type n);
442 difference_type distance_to(
const iterator& other)
const;
443 bool equal(
const iterator& other)
const;
445 reference dereference()
const;
475 template<
typename Iter>
477 typename boost::enable_if<boost::is_same<
typename std::iterator_traits<Iter>::value_type,
nuc::packed_base> >::type *dummy=0)
480 m_left_updated=m_right_updated=0;
481 while((m_hashshift>0)&&(f!=t))
486 m_lhash+=((1<<m_hashshift)-1);
487 m_left_updated+=(a_up+1);
496 const size_t bval=*f;
497 m_lhash|=(bval<<m_hashshift);
498 m_left_updated+=(a_up+1);
505 m_rhash+=(1<<m_hashshift);
514 size_t right()
const;
515 int left_updated()
const;
516 int right_updated()
const;
527 size_t m_query_offset;
544 void match(
const char b);
549 template<
typename Iter>
551 typename boost::enable_if<boost::is_same<
typename std::iterator_traits<Iter>::value_type,
556 if(m_hash.left_updated()>0)
557 m_range.first=m_parent->m_idxbeg+m_parent->m_hashbeg[m_hash.left()];
559 if(m_hash.right_updated()>0)
560 m_range.second=m_parent->m_idxbeg+m_parent->m_hashbeg[m_hash.right()];
562 const int hashed=std::min(m_hash.left_updated(),m_hash.right_updated());
567 m_query_offset+=hashed-1;
585 template<
typename Iter>
587 typename boost::enable_if<boost::is_same<
typename std::iterator_traits<Iter>::value_type,
592 match<typename base2packed_iterator<Iter>::proxy_type>(ff,tt);
597 template<
typename Iter>
599 typename boost::enable_if<boost::is_same<
typename std::iterator_traits<Iter>::value_type,
600 char> >::type *dummy=0)
604 match<typename char2packed_iterator<Iter>::proxy_type>(ff,tt);
607 const rawrange &get_raw_range()
const;
608 range get_range()
const;
609 size_t get_count()
const;
620 size_t m_query_offset;
634 void match(
const char b);
638 template<
typename Iter>
640 typename boost::enable_if<boost::is_same<
typename std::iterator_traits<Iter>::value_type,
645 if(m_hash.left_updated()>0)
646 m_range.first=m_parent->m_idxbeg+m_parent->m_rev_hashbeg[m_hash.left()];
648 if(m_hash.right_updated()>0)
649 m_range.second=m_parent->m_idxbeg+m_parent->m_rev_hashbeg[m_hash.right()];
651 const int hashed=std::min(m_hash.left_updated(),m_hash.right_updated());
656 m_query_offset+=hashed-1;
662 m_parent->m_rev_seqend,
674 template<
typename Iter>
676 typename boost::enable_if<boost::is_same<
typename std::iterator_traits<Iter>::value_type,
681 match<typename base2packed_iterator<Iter>::proxy_type>(ff,tt);
686 template<
typename Iter>
688 typename boost::enable_if<boost::is_same<
typename std::iterator_traits<Iter>::value_type,
689 char> >::type *dummy=0)
693 match<typename char2packed_iterator<Iter>::proxy_type>(ff,tt);
696 const rawrange &get_raw_range()
const;
697 range get_range()
const;
698 size_t get_count()
const;
704 template<
typename Iter>
706 typename boost::enable_if<boost::is_same<
typename std::iterator_traits<Iter>::value_type,
719 incremental_query iq(*
this);
721 return iq.get_raw_range();
724 template<
typename Iter>
726 typename boost::enable_if<boost::is_same<
typename std::iterator_traits<Iter>::value_type,
729 const base2packed_iterator<Iter> ff(f);
730 const base2packed_iterator<Iter> tt(t);
731 return match_raw<typename base2packed_iterator<Iter>::proxy_type>(ff,tt);
734 template<
typename Iter>
736 typename boost::enable_if<boost::is_same<
typename std::iterator_traits<Iter>::value_type,
737 char> >::type *dummy)
const
739 const char2packed_iterator<Iter> ff(f);
740 const char2packed_iterator<Iter> tt(t);
741 return match_raw<typename char2packed_iterator<Iter>::proxy_type>(ff,tt);
744 template<
typename Iter>
748 iterator df(r.first,&m_idxinfo.offsets,m_idxinfo.seqtotal,0);
749 iterator dt(r.second,&m_idxinfo.offsets,m_idxinfo.seqtotal,0);
753 template<
typename Iter>
756 const suffix_index::rawrange r=
match_raw(f,t);
757 return r.second-r.first;
760 template<
typename Iter>
763 const size_t size=t-f;
765 const double kmer_total=m_idxinfo.seqtotal-(size*m_idxinfo.nseq);
766 if(kmer_total<=0)
return 0.0;
767 return static_cast<double>(
count(f,t))/kmer_total;
770 template<
typename Iter>
771 std::pair<suffix_index::rawrange,Iter>
773 typename boost::enable_if<boost::is_same<
typename std::iterator_traits<Iter>::value_type,
776 if(m_idxinfo.kmersize==0)
778 const suffix_query_result<raw_iterator,Iter> q=
780 return std::make_pair(rawrange(q.array_beg,q.array_end),q.query_end);
783 const size_t x=m_hashbeg[
lhash(f,t,m_idxinfo.kmersize)];
784 const size_t y=m_hashbeg[
rhash(f,t,m_idxinfo.kmersize)];
788 const suffix_query_result<raw_iterator,Iter> q=
791 return std::make_pair(rawrange(q.array_beg,q.array_end),q.query_end);
794 const suffix_query_result<raw_iterator,Iter> q=
797 return std::make_pair(rawrange(q.array_beg,q.array_end),q.query_end);
800 template<
typename Iter>
803 const std::pair<rawrange,Iter> r=
match5_raw(f,t);
804 iterator df(r.first.first,&m_idxinfo.offsets,m_idxinfo.seqtotal,0);
805 iterator dt(r.first.second,&m_idxinfo.offsets,m_idxinfo.seqtotal,0);
806 return std::make_pair(range(df,dt),r.second);
809 template<
typename Iter>
810 std::pair<suffix_index::rawrange,Iter>
812 typename boost::enable_if<boost::is_same<
typename std::iterator_traits<Iter>::value_type,
816 throw std::runtime_error(
"suffix_index::match5_reverse_raw: "
817 "reverse index must be present and "
818 "mmapped for this method");
822 if(m_idxinfo.kmersize==0)
824 const suffix_query_result<raw_iterator,Iter> q=
826 return std::make_pair(rawrange(q.array_beg,q.array_end),q.query_end);
829 const size_t x=m_rev_hashbeg[
lhash(f,t,m_idxinfo.kmersize)];
830 const size_t y=m_rev_hashbeg[
rhash(f,t,m_idxinfo.kmersize)];
834 const suffix_query_result<raw_iterator,Iter> q=
837 return std::make_pair(rawrange(q.array_beg,q.array_end),q.query_end);
840 const suffix_query_result<raw_iterator,Iter> q=
843 return std::make_pair(rawrange(q.array_beg,q.array_end),q.query_end);
846 template<
typename Iter>
850 const int reverse_offset=f-r.second;
851 iterator df(r.first.first,&m_idxinfo.offsets,m_idxinfo.seqtotal,reverse_offset);
852 iterator dt(r.first.second,&m_idxinfo.offsets,m_idxinfo.seqtotal,reverse_offset);
853 return std::make_pair(range(df,dt),r.second);
858 #endif // SHORE_ALGO_SUFFIX_INDEX_HPP__