> ...it takes time to read the index file...
C++ provides no facility to map portions (or all) of a file to virtual memory; the way to do this is platform dependent.
Reading/writing byte by byte from a file can be slow; you can usually get much better performance by doing unformatted I/O. Unformatted I/O transfers the internal binary representation of the data directly between memory and the stream buffer (std::filebuf).
I wrote the following test program some time ago. Note that you would need to repeat this on the implementation that you would be using; differences between compilers, C++ libraries and operating systems would be significant. Also note that the results depend on how good the file system buffer cache is.
Code:
#include <iostream>
#include <fstream>
#include <string>
#include <cstdlib>
#include <ctime>
#include <iterator>
#include <algorithm>
#include <vector>
#include <sys/types.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/stat.h>
// Generator functor: each call draws the next value from std::rand() and
// maps it to a character code in the range [1, 127], so a NUL byte is never
// produced.
struct rand_char
{
    char operator() () const
    {
        const int code = 1 + std::rand() % 127 ;
        return static_cast<char>( code ) ;
    }
};
// Creates a uniquely named file under /tmp, fills it with `sz` characters
// produced by rand_char, and returns the file's path. The caller is
// responsible for removing the file.
//
// Throws std::ios_base::failure if the file cannot be created or written.
std::string create_test_file( std::size_t sz )
{
    char name_template[] = "/tmp/.test-XXXXXXXX" ;

    // mkstemp() creates and opens the file in one step, so the name cannot be
    // claimed by someone else between choosing it and creating the file
    // (mktemp() only generates a name).
    const int fd = ::mkstemp( name_template ) ;
    if( fd == -1 )
        throw std::ios_base::failure( "cannot create a temporary file under /tmp" ) ;
    ::close( fd ) ; // the file now exists; it is reopened through iostreams below

    std::string fname = name_template ;
    std::ofstream file( fname.c_str(), std::ios_base::binary ) ;
    if( file )
    {
        std::generate_n( std::ostream_iterator<char>(file), sz, rand_char() ) ;
        file.flush() ; // surface write errors (e.g. disk full) before returning
    }
    if( !file )
    {
        ::unlink( fname.c_str() ) ; // do not leave a partial file behind
        throw std::ios_base::failure( "cannot write test data to " + fname ) ;
    }
    return fname ;
}
// Reads the whole of file `fname` into a vector one character at a time with
// istream::get() and returns the std::clock() ticks that took (opening the
// file included).
//
// Throws std::ios_base::failure if the file cannot be opened.
std::clock_t read_file_char_by_char( const std::string& fname )
{
    std::clock_t start = std::clock() ;

    // opened positioned at the end, so tellg() reports the file size
    std::ifstream file( fname.c_str(), std::ios_base::binary | std::ios_base::ate ) ;
    if( !file ) throw std::ios_base::failure( "cannot open " + fname ) ;

    std::vector<char> vec ;
    const std::streamoff sz = file.tellg() ;
    // tellg() yields -1 on failure; never hand that to reserve() as a size
    if( sz > 0 ) vec.reserve( static_cast< std::vector<char>::size_type >( sz ) ) ;

    file.seekg(0) ;
    char ch ;
    while( file.get(ch) ) vec.push_back(ch) ;

    return std::clock() - start ;
}
// Reads the whole of file `fname` into a vector through a pair of
// std::istream_iterator<char> and returns the std::clock() ticks that took
// (opening the file included).
//
// Throws std::ios_base::failure if the file cannot be opened.
std::clock_t read_file_using_iterators( const std::string& fname )
{
    std::clock_t start = std::clock() ;

    // opened positioned at the end, so tellg() reports the file size
    std::ifstream file( fname.c_str(), std::ios_base::binary | std::ios_base::ate ) ;
    if( !file ) throw std::ios_base::failure( "cannot open " + fname ) ;

    std::vector<char> vec ;
    const std::streamoff sz = file.tellg() ;
    // tellg() yields -1 on failure; never hand that to reserve() as a size
    if( sz > 0 ) vec.reserve( static_cast< std::vector<char>::size_type >( sz ) ) ;

    file.seekg(0) ;
    file >> std::noskipws ; // formatted extraction must not drop whitespace bytes
    std::istream_iterator<char> begin(file), end ;
    vec.insert( vec.end(), begin, end ) ;

    return std::clock() - start ;
}
// Reads the whole of file `fname` into a pre-sized vector with a single
// unformatted istream::read() and returns the std::clock() ticks that took
// (opening the file included).
//
// Throws std::ios_base::failure if the file cannot be opened, its size cannot
// be determined, or the read fails.
std::clock_t read_file_unformatted( const std::string& fname )
{
    std::clock_t start = std::clock() ;

    // opened positioned at the end, so tellg() reports the file size
    std::ifstream file( fname.c_str(), std::ios_base::binary | std::ios_base::ate ) ;
    if( !file ) throw std::ios_base::failure( "cannot open " + fname ) ;

    const std::streamoff sz = file.tellg() ;
    if( sz < 0 ) throw std::ios_base::failure( "cannot determine the size of " + fname ) ;

    std::vector<char> vec( static_cast< std::vector<char>::size_type >( sz ) ) ;
    file.seekg(0) ;

    // an empty vector has no element to take the address of, so skip the read
    if( !vec.empty() && !file.read( &vec[0], static_cast<std::streamsize>( sz ) ) )
        throw std::ios_base::failure( "cannot read " + fname ) ;

    return std::clock() - start ;
}
// Sink for the bytes touched by read_file_memory_map(). It is volatile so the
// store to it (and therefore the loads feeding it) cannot be optimized away
// even though nothing ever reads the value; unsigned so wrap-around is
// well defined.
namespace { volatile unsigned global = 0 ; }

// Maps file `fname` read-only into memory, touches one byte in every
// MIN_PAGE_SIZE-sized stride to force the contents to be paged in, unmaps it,
// and returns the std::clock() ticks that took (open/fstat included).
// An empty file is timed without mapping anything.
//
// Throws std::ios_base::failure if the file cannot be opened, stat'ed or mapped.
std::clock_t read_file_memory_map( const std::string& fname )
{
    std::clock_t start = std::clock() ;

    const int fd = ::open( fname.c_str(), O_RDONLY ) ;
    if( fd == -1 ) throw std::ios_base::failure( "cannot open " + fname ) ;

    struct stat sb ;
    if( ::fstat( fd, &sb ) == -1 )
    {
        ::close(fd) ;
        throw std::ios_base::failure( "cannot stat " + fname ) ;
    }

    // mmap() rejects a zero length, so there is nothing to map for an empty file
    if( sb.st_size > 0 )
    {
        const std::size_t length = static_cast<std::size_t>( sb.st_size ) ;

        // POSIX requires exactly one of MAP_SHARED / MAP_PRIVATE in the flags
        void* map = ::mmap( 0, length, PROT_READ, MAP_PRIVATE, fd, 0 ) ;
        if( map == MAP_FAILED )
        {
            ::close(fd) ;
            throw std::ios_base::failure( "cannot memory-map " + fname ) ;
        }

        // access every page to force a read; the stride is deliberately no
        // larger than any page size in use, so no page is skipped
        const char* pch = static_cast<const char*>( map ) ;
        enum { MIN_PAGE_SIZE = 1024 } ;
        unsigned sum = 0 ;
        for( std::size_t offset = 0 ; offset < length ; offset += MIN_PAGE_SIZE )
            sum += static_cast<unsigned char>( pch[offset] ) ;
        global = global + sum ;

        ::munmap( map, length ) ;
    }

    ::close(fd) ;
    return std::clock() - start ;
}
// Benchmark driver: creates a 128 MB test file, times each of the four read
// strategies against it in turn, prints the elapsed std::clock() time of each
// in milliseconds, and removes the test file.
//
// Returns EXIT_SUCCESS, or EXIT_FAILURE if any step threw; the test file is
// removed in either case.
int main()
{
    enum { NBYTES = 1024*1024*128 } ; // create a 128 MB test file
    const double clocks_per_millisec = CLOCKS_PER_SEC / double(1000) ;

    std::string fname ;
    int status = EXIT_SUCCESS ;

    try
    {
        fname = create_test_file( NBYTES ) ;

        std::cout << "read_file_char_by_char: "
                  << read_file_char_by_char( fname ) / clocks_per_millisec << " millisecs.\n" ;
        std::cout << "read_file_using_iterators: "
                  << read_file_using_iterators( fname ) / clocks_per_millisec << " millisecs.\n" ;
        std::cout << "read_file_unformatted: "
                  << read_file_unformatted( fname ) / clocks_per_millisec << " millisecs.\n" ;
        std::cout << "read_file_memory_map: "
                  << read_file_memory_map( fname ) / clocks_per_millisec << " millisecs.\n" ;
    }
    catch( const std::exception& error )
    {
        // e.g. an allocation failure while buffering the file; report it and
        // fall through so the large temp file is still removed
        std::cerr << "benchmark aborted: " << error.what() << '\n' ;
        status = EXIT_FAILURE ;
    }

    if( !fname.empty() ) ::unlink( fname.c_str() ) ; // cleanup the temp file
    return status ;
}
compiler: g++ (GCC) 4.3.4 20090212
compiled with: -Wall -std=c++98 -pedantic -Werror -O3 -fomit-frame-pointer
os: FreeBSD 7.1 i386
filesystem: Berkeley FFS (UFS II) with soft updates enabled, journaling disabled
cpu: Intel Core2 Duo T7100 @ 1.80GHz (1795.51-MHz 686-class CPU) thinkpad R61i
time taken to read a 128 MB file (cached):
Code:
read_file_char_by_char: 4960.94 millisecs.
read_file_using_iterators: 5814.06 millisecs.
read_file_unformatted: 323.938 millisecs.
read_file_memory_map: 46.875 millisecs.
Bookmarks