SHORE API
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
sort_file.hpp
Go to the documentation of this file.
1 
2 /*
3  * Copyright 2008,2009,2010,2011,2012 Stephan Ossowski, Korbinian Schneeberger,
4  * Felix Ott, Joerg Hagmann, Alf Scotland, Sebastian Bender
5  *
6  * This file is part of SHORE.
7  *
8  * SHORE is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * (at your option) any later version.
12  *
13  * SHORE is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with SHORE. If not, see <http://www.gnu.org/licenses/>.
20  */
21 
25 
26 #ifndef SHORE_ALGO_SORT_FILE_HPP__
27 #define SHORE_ALGO_SORT_FILE_HPP__
28 
29 #include <unistd.h>
30 #include <fcntl.h>
31 #include <sys/mman.h>
32 #include <sys/types.h>
33 #include <sys/stat.h>
34 
35 
36 #include <algorithm>
37 #include <cerrno>
38 #include <cstring>
39 #include <iostream>
40 #include <iterator>
41 #include <list>
42 #include <map>
43 #include <set>
44 #include <vector>
45 #include <string>
46 
47 #include <boost/filesystem.hpp>
48 
49 namespace bfs=boost::filesystem;
50 
51 #include "shore/base/memops.hpp"
52 #include "shore/base/util.hpp"
53 #include "shore/base/stringops.hpp"
54 #include "shore/base/pathops.hpp"
56 #include "shore/stream/streams.hpp"
58 
59 namespace shore {
60 
63 bool cmp_field0(const char*const c0,const char*const c1);
64 
67 bool scmp_field0(const std::string &s0,const std::string &s1);
68 
70 bool cmp_field0_lex(const char*const c0,const char*const c1);
71 
73 bool cmp_line(const char*const c0,const char*const c1);
74 
76 bool cmp_maplist(const char*const c0,const char*const c1);
77 
79 bool cmp_maplist_readid(const char*const c0,const char*const c1);
80 
82 bool cmp_sam(const char*const c0,const char*const c1);
83 
85 template<int K>
86 bool cmp_field(const char*const c0,const char*const c1)
87 {
88  // find the (K+1)'th field and then compare
89  const char* nc0=c0;
90  const char* nc1=c1;
91 
92  int ntab=0;
93  while(ntab<K)
94  {
95  if(nc0[0]=='\n') break;
96  if(nc0[0]=='\t') ++ntab;
97  ++nc0;
98  }
99  ntab=0;
100  while(ntab<K)
101  {
102  if(nc1[0]=='\n') break;
103  if(nc1[0]=='\t') ++ntab;
104  ++nc1;
105  }
106 
107  return cmp_field0(nc0,nc1);
108 }
109 
111 template<int K>
112 bool cmp_field_lex(const char*const c0,const char*const c1)
113 {
114  // find the (K+1)'th field and then compare
115  const char* nc0=c0;
116  const char* nc1=c1;
117 
118  int ntab=0;
119  while(ntab<K)
120  {
121  if(nc0[0]=='\n') break;
122  if(nc0[0]=='\t') ++ntab;
123  ++nc0;
124  }
125  ntab=0;
126  while(ntab<K)
127  {
128  if(nc1[0]=='\n') break;
129  if(nc1[0]=='\t') ++ntab;
130  ++nc1;
131  }
132 
133  return cmp_field0_lex(nc0,nc1);
134 }
135 
137 
140 {
141 public:
142 
143  enum CRes
144  {
145  C_LT=0,
146  C_EQ=1,
147  C_GT=2
148  };
149 
150  struct rowpair
151  {
152  const char* c0;
153  const char* c1;
154 
155  rowpair(const char*const p0,const char*const p1);
156  };
157 
158  typedef CRes(*Ord_t)(rowpair& rp);
159 
161  static CRes c_ign(rowpair& rp);
162 
164  static CRes c_text(rowpair& rp);
165 
167  static CRes c_uint(rowpair& rp);
168 
170  static CRes c_int(rowpair& rp);
171 
173  static CRes c_ufloat(rowpair& rp);
174 
176  static CRes c_float(rowpair& rp);
177 
178 private:
179 
181  std::vector<Ord_t> m_vo;
183  std::vector<bool> m_vr;
185  std::vector<size_t> m_vc;
187  std::set<size_t> m_sc;
189  std::vector<CRes> m_res;
190 
191  bool m_commentsfirst;
192 
193 public:
194 
196  cmp_tokens();
197 
199  void set_commentsfirst(const bool b);
200 
203  const size_t column,const Ord_t o,const bool reverse);
204 
205  const std::set<size_t>& get_columns() const;
206 
207  const std::vector<size_t>& get_columnsv() const;
208 
210  bool operator()(const char*const c0,const char*const c1);
211 
213  bool operator()(const std::string& s0,const std::string& s1);
214 };
215 
217 
219 template<class StrictWeakOrdering=bool(*)(const char*,const char*)>
221 {
222  private:
223 
224  StrictWeakOrdering m_cmp;
225 
226  std::string m_temp;
227 
228  size_t m_blocksizelimit;
229 
230  size_t m_maxopenfiles;
231 
232  // discard redundant lines when sorting / merging
233  bool m_uniq;
234  // fail instead of discarding if uniq is set & lines are redundant
235  bool m_assertuniq;
236 
237  bool m_safemerge;
238  bool m_mmapmerge;
239 
240  std::vector<const char*> m_linebuf;
241 
242 
244  void merge_files_rec(const std::vector<std::string>& infiles,
245  std::ostream& out)
246  {
247  std::multimap<long,std::string> inf;
248  for(size_t i=0;i<infiles.size();++i)
249  {
250  long fs=shore::istreams(infiles[i]).get_streamsize(0);
251  if(fs<0) fs=std::numeric_limits<long>::max();
252 
253  inf.insert(std::make_pair(fs,infiles[i]));
254  }
255 
256  shore::ostreams tmpout;
257 
258  // merge as few and as small as possible files to intermediate tempfile
259  while(inf.size()>m_maxopenfiles)
260  {
261  std::vector<std::string> merfn;
262 
263  const size_t nfiles=
264  std::min(m_maxopenfiles,inf.size()-m_maxopenfiles+1);
265 
266  while(merfn.size()<nfiles)
267  {
268  merfn.push_back(inf.begin()->second);
269  inf.erase(inf.begin());
270  }
271 
272  tmpout.open_tempfile(m_temp+"/shore-sort_merge");
273  merge_files(merfn,tmpout.back());
274  tmpout.close_all();
275 
276  long fs=shore::istreams(tmpout.get_streamname(tmpout.size()-1)).get_streamsize(0);
277  if(fs<0)
278  fs=std::numeric_limits<long>::max();
279 
280  inf.insert(std::make_pair(fs,tmpout.get_streamname(tmpout.size()-1)));
281  }
282 
283  // final merge
284  merge_files(shore::map_values(inf),out);
285  }
286 
288  void sort_slices(shore::stream_slicer &strcut,std::ostream& out,
289  const std::string &name)
290  {
291  bool incomplete_line=false;
292 
293  shore::ostreams tmpout;
294 
295  while(strcut.has_data())
296  {
297  const char*const data_begin=strcut.current().data;
298  const char*const data_end=data_begin+strcut.current().size;
299 
300  // complete a line larger than blocksizelimit
301  if(incomplete_line)
302  {
303  const char* first_eol=
304  std::find(data_begin,data_end,'\n');
305 
306  if(first_eol!=data_end)
307  {
308  incomplete_line=false;
309  ++first_eol;
310  }
311 
312  tmpout.back().write(data_begin,first_eol-data_begin);
313  strcut.move(first_eol-data_begin);
314 
315  if(!incomplete_line)
316  tmpout.close_all();
317  }
318  // if this is the last block
319  else if(strcut.current().eof)
320  {
321  // this is the only block and uniq is not set
322  // -> direct output
323  if((strcut.current().offset==0)&&(!m_uniq))
324  sort_block(data_begin,data_end,out);
325  // this is the last of multiple blocks or uniq is set
326  // -> write to temp file
327  else
328  {
329  tmpout.open_tempfile(m_temp
330  +"/shore-sort_"+name);
331  sort_block(data_begin,data_end,tmpout.back());
332  tmpout.close_all();
333  }
334  strcut.move(strcut.current().size);
335  }
336  // not the last block
337  // -> find the last eol of the block before sort
338  // -> write to temp files
339  else
340  {
341  std::reverse_iterator<const char*> rbeg(data_end);
342  std::reverse_iterator<const char*> rend(data_begin);
343 
344  const char*const last_eol=std::find(rbeg,rend,'\n').base();
345 
346  if(last_eol!=data_begin)
347  {
348  tmpout.open_tempfile(m_temp
349  +"/shore-sort_"+name);
350  sort_block(data_begin,last_eol,tmpout.back());
351  tmpout.close_all();
352  strcut.move(last_eol-data_begin);
353  }
354  // handle lines larger than blocksizelimit (no eol found)
355  else
356  {
357  tmpout.open_tempfile(m_temp
358  +"/shore-sort_"+name);
359 
360  tmpout.back().write(data_begin,
361  strcut.current().size);
362  strcut.move(strcut.current().size);
363 
364  incomplete_line=true;
365  }
366  }
367  }
368 
369  // reset the linebuf capacity to default
370  {
371  std::vector<const char*> tmp;
372  m_linebuf.swap(tmp);
373  }
374 
375  // merge temp files
376  if(tmpout.size()!=0)
377  {
378  std::vector<std::string> fnams(tmpout.size());
379  for(size_t i=0;i<tmpout.size();++i)
380  fnams[i]=tmpout.get_streamname(i);
381  merge_files(fnams,out);
382  }
383  }
384 
385 public:
386 
388  line_sorter(StrictWeakOrdering cmp,
389  const std::string& tmp=".",
390  const double max_blockMB=2048.0,
391  const bool uniq=false,
392  const bool assertuniq=false,
393  const size_t maxopenfiles=0)
394  :m_cmp(cmp),
395  m_temp(tmp.empty()?".":tmp),
396  m_blocksizelimit(static_cast<size_t>(std::ceil(max_blockMB*(1<<20)))),
397  m_maxopenfiles(maxopenfiles),
398  m_uniq(uniq),
399  m_assertuniq(assertuniq),
400  m_safemerge(true),
401  m_mmapmerge(false)
402  {
403  if(m_maxopenfiles==0)
404  {
405  try
406  {
407  m_maxopenfiles=shore::get_maxopenfiles()>>1;
408  }
409  catch(const std::exception &e)
410  {
411  std::cerr<<"# WARNING: "<<e.what()<<std::endl;
412  std::cerr<<"# maxopenfiles set to 128"<<std::endl;
413  m_maxopenfiles=128;
414  }
415  }
416  }
417 
420  {
421  m_safemerge=true;
422  return *this;
423  }
424 
427  {
428  m_safemerge=false;
429  return *this;
430  }
431 
432  line_sorter &mmapmerge()
433  {
434  m_mmapmerge=true;
435  return *this;
436  }
437 
438  line_sorter &nommapmerge()
439  {
440  m_mmapmerge=false;
441  return *this;
442  }
443 
445  void sort_block(const char*const data_begin,const char*const data_end,
446  std::ostream& out)
447  {
448  m_linebuf.push_back(data_begin);
449 
450  const char* eol=std::find(data_begin,data_end,'\n');
451  while(eol<(data_end-1))
452  {
453  m_linebuf.push_back(eol+1);
454  eol=std::find(eol+1,data_end,'\n');
455  }
456 
457  // ensure that the last line is also terminated by '\n'
458  const bool eofnewline=((*(data_end-1))=='\n');
459  const std::string lastline=eofnewline?
460  "":(std::string(m_linebuf.back(),data_end)+'\n');
461 
462  if(!eofnewline)
463  m_linebuf.back()=lastline.c_str();
464 
465  std::sort(m_linebuf.begin(),m_linebuf.end(),m_cmp);
466 
467  typedef std::vector<const char*>::const_iterator i_t;
468  for(i_t i=m_linebuf.begin();i!=m_linebuf.end();++i)
469  {
470  const char*const s=(*i);
471  const char* nl=s;
472 
473  while((*nl)!='\n')
474  ++nl;
475 
476  if(eofnewline||(s!=m_linebuf.back()))
477  ++nl;
478 
479  out.write(s,nl-s);
480  }
481 
482  m_linebuf.clear();
483  }
484 
486  void sort_file(const std::string& fn_in,std::ostream& out)
487  {
488  // Cut the file into slices.
489  shore::stream_slicer strcut(fn_in,m_blocksizelimit);
490 
491  sort_slices(strcut,out,shore::path_leaf(fn_in));
492 
493  /*
494  // decompress if necessary
495  if(shore::istreams::is_gzip(fn_in)||shore::istreams::is_xz(fn_in))
496  {
497  shore::ostreams tmp;
498  tmp.open_tempfile(m_temp+"/shore-sort_"+shore::path_leaf(fn_in),
499  shore::ostreams::CODEC_PLAIN);
500 
501  {
502  shore::istreams in(fn_in);
503  if(in[0].peek()!=EOF)
504  tmp[0]<<in[0].rdbuf();
505  in.read_error_check_rdbuf(0,"line_sorter::sort_file("+fn_in+")");
506  }
507 
508  tmp.close_all();
509  sort_file(tmp.get_streamname(0),out);
510  return;
511  }
512 
513  // open & stat input file
514  shore::mmapping mmpp(fn_in);
515  const size_t total_space=mmpp.get_filesize();
516 
517  bool incomplete_line=false;
518 
519  size_t processed_space=0;
520 
521  shore::ostreams tmpout;
522 
523  while(processed_space<total_space)
524  {
525  const size_t block_space=
526  std::min(m_blocksizelimit,total_space-processed_space);
527 
528  mmpp.map(processed_space,block_space);
529 
530  const char*const data_begin=mmpp.begin();
531  const char*const data_end=mmpp.end();
532 
533  // complete a line larger than blocksizelimit
534  if(incomplete_line)
535  {
536  const char* first_eol=
537  std::find(data_begin,data_end,'\n');
538 
539  if(first_eol!=data_end)
540  {
541  incomplete_line=false;
542  ++first_eol;
543  }
544 
545  tmpout.back().write(data_begin,first_eol-data_begin);
546  processed_space+=(first_eol-data_begin);
547 
548  if(!incomplete_line) tmpout.close_all();
549  }
550  // if this is the last block
551  else if((processed_space+block_space)==total_space)
552  {
553  // this is the only block and uniq is not set
554  // -> direct output
555  if((processed_space==0)&&(!m_uniq))
556  {
557  sort_block(data_begin,data_end,out);
558  }
559  // this is the last of multiple blocks or uniq is set
560  // -> write to temp file
561  else
562  {
563  tmpout.open_tempfile(m_temp+"/shore-sort_"+shore::path_leaf(fn_in));
564  sort_block(data_begin,data_end,tmpout.back());
565  tmpout.close_all();
566  }
567  processed_space+=block_space;
568  }
569  // not the last block
570  // -> find the last eol of the block before sort
571  // -> write to temp files
572  else
573  {
574  std::reverse_iterator<const char*> rbeg(data_end);
575  std::reverse_iterator<const char*> rend(data_begin);
576 
577  const char*const last_eol=std::find(rbeg,rend,'\n').base();
578 
579  if(last_eol!=data_begin)
580  {
581  tmpout.open_tempfile(m_temp+"/shore-sort_"
582  +shore::path_leaf(fn_in));
583  sort_block(data_begin,last_eol,tmpout.back());
584  tmpout.close_all();
585  processed_space+=(last_eol-data_begin);
586  }
587  // handle lines larger than blocksizelimit (no eol found)
588  else
589  {
590  tmpout.open_tempfile(m_temp+"/shore-sort_"
591  +shore::path_leaf(fn_in));
592 
593  tmpout.back().write(data_begin,block_space);
594  processed_space+=block_space;
595 
596  incomplete_line=true;
597  }
598  }
599  }
600 
601  mmpp.unmap();
602 
603  // reset the linebuf capacity to default
604  {
605  std::vector<const char*> tmp;
606  m_linebuf.swap(tmp);
607  }
608 
609  // merge temp files
610  if(tmpout.size()!=0)
611  {
612  std::vector<std::string> fnams(tmpout.size());
613  for(size_t i=0;i<tmpout.size();++i)
614  fnams[i]=tmpout.get_streamname(i);
615  merge_files(fnams,out);
616  }
617  */
618  }
619 
621  void sort_file(std::istream &is,std::ostream& out)
622  {
623  // Cut the stream into slices.
624  shore::stream_slicer strcut(is,m_blocksizelimit);
625 
626  sort_slices(strcut,out,"stream");
627  }
628 
630  void sort_file(const std::string& fn_in,const std::string& fn_out)
631  {
632  shore::ostreams out;
633 
634  out.open_file(fn_out);
635 
636  sort_file(fn_in,out[0]);
637 
638  out.close_all();
639  }
640 
642  void sort_file(const std::string& fn)
643  {
644  shore::ostreams to;
645 
647  shore::ostreams::EXISTS_REPLACE);
648 
649  sort_file(fn,to.back());
650 
651  // automatically replaces fn
652  to.close_all();
653  }
654 
656  void sort_files(const std::vector<std::string>& fn,std::ostream& out)
657  {
658  if(fn.empty())
659  return;
660 
661  shore::ostreams tmps;
662  std::vector<std::string> fnams(fn.size());
663  for(size_t i=0;i<fn.size();++i)
664  {
665  tmps.open_tempfile(m_temp+"/shore-sort_"+shore::path_leaf(fn[i]));
666  sort_file(fn[i],tmps.back());
667  tmps.close_stream(i);
668  fnams[i]=tmps.get_streamname(i);
669  }
670  merge_files(fnams,out);
671  }
672 
675  const std::vector<std::string>& fn,const std::string& fn_out)
676  {
677  shore::ostreams os(fn_out);
678  sort_files(fn,os[0]);
679  os.close_all();
680  }
681 
685  bool is_sorted(std::istream& in)
686  {
687  std::string buf,buf_last;
688  std::getline(in,buf_last);
689  buf_last+='\n';
690 
691  if(!m_uniq)
692  {
693  while(std::getline(in,buf))
694  {
695  buf+='\n';
696  if(m_cmp(buf.c_str(),buf_last.c_str()))
697  return false;
698  buf_last.swap(buf);
699  }
700  }
701  else
702  {
703  while(std::getline(in,buf))
704  {
705  buf+='\n';
706  if(!m_cmp(buf_last.c_str(),buf.c_str()))
707  return false;
708  buf_last.swap(buf);
709  }
710  }
711  shore::istreams::read_error_check(in,"<stream>","line_sorter::is_sorted (1)");
712  return true;
713  }
714 
719  bool is_sorted(std::istream& in,size_t*const line)
720  {
721  std::string buf,buf_last;
722  std::getline(in,buf_last);
723  buf_last+='\n';
724 
725  ++(*line);
726 
727  if(!m_uniq)
728  {
729  while(std::getline(in,buf))
730  {
731  buf+='\n';
732  if(m_cmp(buf.c_str(),buf_last.c_str()))
733  return false;
734  buf_last.swap(buf);
735  ++(*line);
736  }
737  }
738  else
739  {
740  while(std::getline(in,buf))
741  {
742  buf+='\n';
743  if(!m_cmp(buf_last.c_str(),buf.c_str()))
744  return false;
745  buf_last.swap(buf);
746  ++(*line);
747  }
748  }
749  shore::istreams::read_error_check(in,"<stream>","line_sorter::is_sorted (2)");
750  return true;
751  }
752 
756  bool is_sorted(const std::string& fn,size_t*const line=0)
757  {
758  shore::istreams in(fn);
759 
760  if(line==0)
761  return is_sorted(in.back());
762  return is_sorted(in.back(),line);
763  }
764 
769  const std::vector<std::string>& infiles,std::ostream& out)
770  {
771  if(infiles.empty())
772  return;
773 
774  // do a step-wise merge if the number of files is too high
775  if((m_maxopenfiles>1)&&(infiles.size()>m_maxopenfiles))
776  {
777  merge_files_rec(infiles,out);
778  }
779  // actual file merger part
780  else
781  {
782  shore::istreams in;
783  // mmap files to save file descriptors?
784  in.use_mmap(m_mmapmerge);
785  in.open_files(infiles.begin(),infiles.end());
786 
787  std::multimap<const char*,size_t,StrictWeakOrdering> filemap(m_cmp);
788 
789  std::vector<std::string> buffers(in.size());
790  std::string auxbuf;
791 
792  // build map
793  for(size_t i=0;i<in.size();++i)
794  {
795  if(std::getline(in[i],buffers[i]))
796  {
797  buffers[i]+='\n';
798  const char*const ptr=buffers[i].c_str();
799  filemap.insert(std::make_pair(ptr,i));
800  }
801  else
802  in.read_error_check(i,"line_sorter::merge_files (1)");
803  }
804 
805  bool assertinit=true;
806 
807  if(m_uniq&&(!filemap.empty()))
808  {
809  auxbuf=filemap.begin()->first;
810  out<<filemap.begin()->first;
811  }
812 
813  // merge
814  while(!filemap.empty())
815  {
816  if(!m_uniq||m_cmp(auxbuf.c_str(),filemap.begin()->first))
817  {
818  out<<filemap.begin()->first;
819  }
820  else if(m_assertuniq)
821  {
822  if(assertinit)
823  assertinit=false;
824  else
825  throw std::runtime_error(
826  "merge_files: line is not unique {\n"
827  +auxbuf+filemap.begin()->first+"}");
828  }
829 
830  const size_t fileid=filemap.begin()->second;
831 
832  filemap.erase(filemap.begin());
833 
834  if(m_safemerge||m_uniq)
835  buffers[fileid].swap(auxbuf);
836 
837  if(std::getline(in[fileid],buffers[fileid]))
838  {
839  buffers[fileid]+='\n';
840  const char*const ptr=buffers[fileid].c_str();
841 
842  // check sorting
843  if(m_safemerge&&m_cmp(ptr,auxbuf.c_str()))
844  throw std::runtime_error(
845  "merge_files: "+in.get_streamname(fileid)
846  +" is not sorted appropriately {"
847  +auxbuf.substr(0,80)+" ...}");
848 
849  filemap.insert(std::make_pair(ptr,fileid));
850  }
851  else if(!in[fileid].eof())
852  in.read_error_check(fileid,"line_sorter::merge_files (2)");
853  }
854  }
855  }
856 
859  const std::vector<std::string>& infiles,const std::string& fn_out)
860  {
861  shore::ostreams out(fn_out);
862  merge_files(infiles,out[0]);
863  out.close_all();
864  }
865 
870  std::pair<off_t,std::string> upper_bound(std::istream& in,
871  const std::string& what,
872  const off_t rangemin=0l,
873  off_t rangemax=-1l)
874  {
875  const std::string nlwhat=what+'\n';
876  std::string buf;
877 
878  if(rangemax==-1l)
879  {
880  in.seekg(0,std::ios::end);
881 
882  if(!in.good())
883  throw std::runtime_error("shore::upper_bound: seek failed");
884 
885  rangemax=in.tellg();
886 
887  if(rangemax<0l)
888  throw std::runtime_error("shore::upper_bound: tell failed");
889  }
890 
891  const off_t spos=rangemin+((rangemax-rangemin)>>1);
892 
893  in.seekg(spos);
894  if(!in.good())
895  throw std::runtime_error("shore::upper_bound: seek failed");
896 
897  std::getline(in,buf);
898  buf+='\n';
899 
900  if(!in)
901  {
902  if(in.eof())
903  in.clear();
904  else
905  shore::istreams::read_error_check(in,"<stream>",
906  "line_sorter::upper_bound (1)");
907  }
908 
909  off_t p0=in.tellg();
910  if(p0<0l)
911  throw std::runtime_error("shore::upper_bound: tell failed");
912 
913  if(p0>=rangemax)
914  {
915  p0=rangemin;
916  in.seekg(p0);
917  }
918  std::getline(in,buf);
919  buf+='\n';
920 
921  if(!in)
922  {
923  if(in.eof())
924  in.clear();
925  else
926  shore::istreams::read_error_check(in,"<stream>",
927  "line_sorter::upper_bound (2)");
928  }
929 
930  off_t p1=in.tellg();
931  if(p1<0l)
932  throw std::runtime_error("shore::upper_bound: tell failed");
933 
934  if((p1>=rangemax)&&(p0!=rangemin))
935  {
936  p0=rangemin;
937  in.seekg(p0);
938 
939  std::getline(in,buf);
940  buf+='\n';
941 
942  if(!in)
943  {
944  if(in.eof())
945  in.clear();
946  else
947  shore::istreams::read_error_check(in,"<stream>",
948  "line_sorter::upper_bound (3)");
949  }
950 
951  p1=in.tellg();
952  if(p1<0l)
953  throw std::runtime_error("shore::upper_bound: tell failed");
954  }
955 
956  if(p1>=rangemax)
957  {
958  if(m_cmp(nlwhat.c_str(),buf.c_str()))
959  {
960  in.seekg(p0);
961  return std::make_pair(p0,buf.substr(0,buf.size()-1));
962  }
963  else
964  return std::make_pair(p1,"");
965  }
966 
967  // if buf compares greater than what
968  if(m_cmp(nlwhat.c_str(),buf.c_str()))
969  {
970  return upper_bound(in,what,rangemin,p1);
971  }
972  else
973  {
974  return upper_bound(in,what,p1,rangemax);
975  }
976  }
977 
982  std::pair<off_t,std::string> lower_bound(std::istream& in,
983  const std::string& what,const off_t rangemin=0l,off_t rangemax=-1l)
984  {
985  const std::string nlwhat=what+'\n';
986  std::string buf;
987 
988  if(rangemax==-1l)
989  {
990  in.seekg(0,std::ios::end);
991 
992  if(!in.good())
993  throw std::runtime_error("shore::lower_bound: seek failed");
994 
995  rangemax=in.tellg();
996 
997  if(rangemax<0l)
998  throw std::runtime_error("shore::lower_bound: tell failed");
999  }
1000 
1001  const off_t spos=rangemin+((rangemax-rangemin)>>1);
1002 
1003  in.seekg(spos);
1004  if(!in.good())
1005  throw std::runtime_error("shore::lower_bound: seek failed");
1006 
1007  std::getline(in,buf);
1008  buf+='\n';
1009 
1010  if(!in)
1011  {
1012  if(in.eof())
1013  in.clear();
1014  else
1015  shore::istreams::read_error_check(in,"<stream>",
1016  "line_sorter::lower_bound (1)");
1017  }
1018 
1019  off_t p0=in.tellg();
1020  if(p0<0l)
1021  throw std::runtime_error("shore::lower_bound: tell failed");
1022 
1023  if(p0>=rangemax)
1024  {
1025  p0=rangemin;
1026  in.seekg(p0);
1027  }
1028  std::getline(in,buf);
1029  buf+='\n';
1030 
1031  if(!in)
1032  {
1033  if(in.eof())
1034  in.clear();
1035  else
1036  shore::istreams::read_error_check(in,"<stream>",
1037  "line_sorter::lower_bound (2)");
1038  }
1039 
1040  off_t p1=in.tellg();
1041  if(p1<0l)
1042  throw std::runtime_error("shore::lower_bound: tell failed");
1043 
1044  if((p1>=rangemax)&&(p0!=rangemin))
1045  {
1046  p0=rangemin;
1047  in.seekg(p0);
1048 
1049  std::getline(in,buf);
1050  buf+='\n';
1051 
1052  if(!in)
1053  {
1054  if(in.eof())
1055  in.clear();
1056  else
1057  shore::istreams::read_error_check(in,"<stream>",
1058  "line_sorter::lower_bound (3)");
1059  }
1060 
1061  p1=in.tellg();
1062  if(p1<0l)
1063  throw std::runtime_error("shore::lower_bound: tell failed");
1064  }
1065 
1066  if(p1>=rangemax)
1067  {
1068  if(!m_cmp(buf.c_str(),nlwhat.c_str()))
1069  {
1070  in.seekg(p0);
1071  return std::make_pair(p0,buf.substr(0,buf.size()-1));
1072  }
1073  else
1074  return std::make_pair(p1,"");
1075  }
1076 
1077  // if buf does not compare less than what
1078  if(!m_cmp(buf.c_str(),nlwhat.c_str()))
1079  {
1080  return lower_bound(in,what,rangemin,p1);
1081  }
1082  else
1083  {
1084  return lower_bound(in,what,p1,rangemax);
1085  }
1086  }
1087 };
1088 
1089 } // namespace
1090 
1091 #endif // SHORE_ALGO_SORT_FILE_HPP__
1092