26 #ifndef SHORE_ALGO_SORT_FILE_HPP__
27 #define SHORE_ALGO_SORT_FILE_HPP__
32 #include <sys/types.h>
47 #include <boost/filesystem.hpp>
49 namespace bfs=boost::filesystem;
63 bool cmp_field0(
const char*
const c0,
const char*
const c1);
67 bool scmp_field0(
const std::string &s0,
const std::string &s1);
73 bool cmp_line(
const char*
const c0,
const char*
const c1);
76 bool cmp_maplist(
const char*
const c0,
const char*
const c1);
82 bool cmp_sam(
const char*
const c0,
const char*
const c1);
86 bool cmp_field(
const char*
const c0,
const char*
const c1)
95 if(nc0[0]==
'\n')
break;
96 if(nc0[0]==
'\t') ++ntab;
102 if(nc1[0]==
'\n')
break;
103 if(nc1[0]==
'\t') ++ntab;
121 if(nc0[0]==
'\n')
break;
122 if(nc0[0]==
'\t') ++ntab;
128 if(nc1[0]==
'\n')
break;
129 if(nc1[0]==
'\t') ++ntab;
155 rowpair(
const char*
const p0,
const char*
const p1);
158 typedef CRes(*Ord_t)(rowpair& rp);
161 static CRes
c_ign(rowpair& rp);
164 static CRes
c_text(rowpair& rp);
167 static CRes
c_uint(rowpair& rp);
170 static CRes
c_int(rowpair& rp);
176 static CRes
c_float(rowpair& rp);
181 std::vector<Ord_t> m_vo;
183 std::vector<bool> m_vr;
185 std::vector<size_t> m_vc;
187 std::set<size_t> m_sc;
189 std::vector<CRes> m_res;
191 bool m_commentsfirst;
203 const size_t column,
const Ord_t o,
const bool reverse);
205 const std::set<size_t>& get_columns()
const;
207 const std::vector<size_t>& get_columnsv()
const;
210 bool operator()(
const char*
const c0,
const char*
const c1);
213 bool operator()(
const std::string& s0,
const std::string& s1);
219 template<
class StrictWeakOrdering=
bool(*)(const
char*,const
char*)>
224 StrictWeakOrdering m_cmp;
228 size_t m_blocksizelimit;
230 size_t m_maxopenfiles;
240 std::vector<const char*> m_linebuf;
244 void merge_files_rec(
const std::vector<std::string>& infiles,
247 std::multimap<long,std::string> inf;
248 for(
size_t i=0;i<infiles.size();++i)
251 if(fs<0) fs=std::numeric_limits<long>::max();
253 inf.insert(std::make_pair(fs,infiles[i]));
259 while(inf.size()>m_maxopenfiles)
261 std::vector<std::string> merfn;
264 std::min(m_maxopenfiles,inf.size()-m_maxopenfiles+1);
266 while(merfn.size()<nfiles)
268 merfn.push_back(inf.begin()->second);
269 inf.erase(inf.begin());
278 fs=std::numeric_limits<long>::max();
289 const std::string &name)
291 bool incomplete_line=
false;
295 while(strcut.has_data())
297 const char*
const data_begin=strcut.current().data;
298 const char*
const data_end=data_begin+strcut.current().size;
303 const char* first_eol=
304 std::find(data_begin,data_end,
'\n');
306 if(first_eol!=data_end)
308 incomplete_line=
false;
312 tmpout.
back().write(data_begin,first_eol-data_begin);
313 strcut.
move(first_eol-data_begin);
319 else if(strcut.current().eof)
323 if((strcut.current().offset==0)&&(!m_uniq))
330 +
"/shore-sort_"+name);
334 strcut.
move(strcut.current().size);
341 std::reverse_iterator<const char*> rbeg(data_end);
342 std::reverse_iterator<const char*> rend(data_begin);
344 const char*
const last_eol=std::find(rbeg,rend,
'\n').base();
346 if(last_eol!=data_begin)
349 +
"/shore-sort_"+name);
352 strcut.
move(last_eol-data_begin);
358 +
"/shore-sort_"+name);
360 tmpout.
back().write(data_begin,
361 strcut.current().size);
362 strcut.
move(strcut.current().size);
364 incomplete_line=
true;
371 std::vector<const char*> tmp;
378 std::vector<std::string> fnams(tmpout.
size());
379 for(
size_t i=0;i<tmpout.
size();++i)
389 const std::string& tmp=
".",
390 const double max_blockMB=2048.0,
391 const bool uniq=
false,
392 const bool assertuniq=
false,
393 const size_t maxopenfiles=0)
395 m_temp(tmp.empty()?
".":tmp),
396 m_blocksizelimit(static_cast<size_t>(std::ceil(max_blockMB*(1<<20)))),
397 m_maxopenfiles(maxopenfiles),
399 m_assertuniq(assertuniq),
403 if(m_maxopenfiles==0)
409 catch(
const std::exception &e)
411 std::cerr<<
"# WARNING: "<<e.what()<<std::endl;
412 std::cerr<<
"# maxopenfiles set to 128"<<std::endl;
445 void sort_block(
const char*
const data_begin,
const char*
const data_end,
448 m_linebuf.push_back(data_begin);
450 const char* eol=std::find(data_begin,data_end,
'\n');
451 while(eol<(data_end-1))
453 m_linebuf.push_back(eol+1);
454 eol=std::find(eol+1,data_end,
'\n');
458 const bool eofnewline=((*(data_end-1))==
'\n');
459 const std::string lastline=eofnewline?
460 "":(std::string(m_linebuf.back(),data_end)+
'\n');
463 m_linebuf.back()=lastline.c_str();
465 std::sort(m_linebuf.begin(),m_linebuf.end(),m_cmp);
467 typedef std::vector<const char*>::const_iterator i_t;
468 for(i_t i=m_linebuf.begin();i!=m_linebuf.end();++i)
470 const char*
const s=(*i);
476 if(eofnewline||(s!=m_linebuf.back()))
486 void sort_file(
const std::string& fn_in,std::ostream& out)
626 sort_slices(strcut,out,
"stream");
630 void sort_file(
const std::string& fn_in,
const std::string& fn_out)
647 shore::ostreams::EXISTS_REPLACE);
656 void sort_files(
const std::vector<std::string>& fn,std::ostream& out)
662 std::vector<std::string> fnams(fn.size());
663 for(
size_t i=0;i<fn.size();++i)
675 const std::vector<std::string>& fn,
const std::string& fn_out)
687 std::string buf,buf_last;
688 std::getline(in,buf_last);
693 while(std::getline(in,buf))
696 if(m_cmp(buf.c_str(),buf_last.c_str()))
703 while(std::getline(in,buf))
706 if(!m_cmp(buf_last.c_str(),buf.c_str()))
721 std::string buf,buf_last;
722 std::getline(in,buf_last);
729 while(std::getline(in,buf))
732 if(m_cmp(buf.c_str(),buf_last.c_str()))
740 while(std::getline(in,buf))
743 if(!m_cmp(buf_last.c_str(),buf.c_str()))
756 bool is_sorted(
const std::string& fn,
size_t*
const line=0)
769 const std::vector<std::string>& infiles,std::ostream& out)
775 if((m_maxopenfiles>1)&&(infiles.size()>m_maxopenfiles))
777 merge_files_rec(infiles,out);
784 in.use_mmap(m_mmapmerge);
787 std::multimap<const char*,size_t,StrictWeakOrdering> filemap(m_cmp);
789 std::vector<std::string> buffers(in.
size());
793 for(
size_t i=0;i<in.
size();++i)
795 if(std::getline(in[i],buffers[i]))
798 const char*
const ptr=buffers[i].c_str();
799 filemap.insert(std::make_pair(ptr,i));
805 bool assertinit=
true;
807 if(m_uniq&&(!filemap.empty()))
809 auxbuf=filemap.begin()->first;
810 out<<filemap.begin()->first;
814 while(!filemap.empty())
816 if(!m_uniq||m_cmp(auxbuf.c_str(),filemap.begin()->first))
818 out<<filemap.begin()->first;
820 else if(m_assertuniq)
825 throw std::runtime_error(
826 "merge_files: line is not unique {\n"
827 +auxbuf+filemap.begin()->first+
"}");
830 const size_t fileid=filemap.begin()->second;
832 filemap.erase(filemap.begin());
834 if(m_safemerge||m_uniq)
835 buffers[fileid].swap(auxbuf);
837 if(std::getline(in[fileid],buffers[fileid]))
839 buffers[fileid]+=
'\n';
840 const char*
const ptr=buffers[fileid].c_str();
843 if(m_safemerge&&m_cmp(ptr,auxbuf.c_str()))
844 throw std::runtime_error(
846 +
" is not sorted appropriately {"
847 +auxbuf.substr(0,80)+
" ...}");
849 filemap.insert(std::make_pair(ptr,fileid));
851 else if(!in[fileid].eof())
859 const std::vector<std::string>& infiles,
const std::string& fn_out)
871 const std::string& what,
872 const off_t rangemin=0l,
875 const std::string nlwhat=what+
'\n';
883 throw std::runtime_error(
"shore::upper_bound: seek failed");
888 throw std::runtime_error(
"shore::upper_bound: tell failed");
891 const off_t spos=rangemin+((rangemax-rangemin)>>1);
895 throw std::runtime_error(
"shore::upper_bound: seek failed");
897 std::getline(in,buf);
906 "line_sorter::upper_bound (1)");
911 throw std::runtime_error(
"shore::upper_bound: tell failed");
918 std::getline(in,buf);
927 "line_sorter::upper_bound (2)");
932 throw std::runtime_error(
"shore::upper_bound: tell failed");
934 if((p1>=rangemax)&&(p0!=rangemin))
939 std::getline(in,buf);
948 "line_sorter::upper_bound (3)");
953 throw std::runtime_error(
"shore::upper_bound: tell failed");
958 if(m_cmp(nlwhat.c_str(),buf.c_str()))
961 return std::make_pair(p0,buf.substr(0,buf.size()-1));
964 return std::make_pair(p1,
"");
968 if(m_cmp(nlwhat.c_str(),buf.c_str()))
983 const std::string& what,
const off_t rangemin=0l,off_t rangemax=-1l)
985 const std::string nlwhat=what+
'\n';
993 throw std::runtime_error(
"shore::lower_bound: seek failed");
998 throw std::runtime_error(
"shore::lower_bound: tell failed");
1001 const off_t spos=rangemin+((rangemax-rangemin)>>1);
1005 throw std::runtime_error(
"shore::lower_bound: seek failed");
1007 std::getline(in,buf);
1016 "line_sorter::lower_bound (1)");
1019 off_t p0=in.tellg();
1021 throw std::runtime_error(
"shore::lower_bound: tell failed");
1028 std::getline(in,buf);
1037 "line_sorter::lower_bound (2)");
1040 off_t p1=in.tellg();
1042 throw std::runtime_error(
"shore::lower_bound: tell failed");
1044 if((p1>=rangemax)&&(p0!=rangemin))
1049 std::getline(in,buf);
1058 "line_sorter::lower_bound (3)");
1063 throw std::runtime_error(
"shore::lower_bound: tell failed");
1068 if(!m_cmp(buf.c_str(),nlwhat.c_str()))
1071 return std::make_pair(p0,buf.substr(0,buf.size()-1));
1074 return std::make_pair(p1,
"");
1078 if(!m_cmp(buf.c_str(),nlwhat.c_str()))
1091 #endif // SHORE_ALGO_SORT_FILE_HPP__