Last active
January 5, 2021 19:23
-
-
Save oschonrock/67fc870ba067ebf0f369897a9d52c2dd to your computer and use it in GitHub Desktop.
High speed parsing of floats in CSV format - C++
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Refers to this Stack Overflow question:
// https://stackoverflow.com/questions/17465061/how-to-parse-space-separated-floats-in-c-quickly/59013147
#include <cstdint>
#include <iomanip>
#include <iostream>
#include <stdexcept>
// for mmap:
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
// Read-only, private memory mapping of a whole file (POSIX mmap).
//
// The mapped bytes are [start(), end()) and stay valid for the lifetime of the
// object. An empty file is represented as an empty range with
// start() == end() == nullptr, because mmap() rejects zero-length mappings.
// The object owns the mapping: it can be moved but not copied, so the region is
// unmapped exactly once.
class MemoryMappedFile {
public:
  // Maps `filename` read-only.
  // Throws std::logic_error if the file cannot be opened, stat'ed or mapped.
  MemoryMappedFile(const char* filename) {
    const int fd = ::open(filename, O_RDONLY);
    if (fd == -1) throw std::logic_error("MemoryMappedFile: couldn't open file.");

    // obtain file size
    struct stat sb;
    if (::fstat(fd, &sb) == -1) {
      ::close(fd);
      throw std::logic_error("MemoryMappedFile: cannot stat file size");
    }
    m_filesize = static_cast<size_t>(sb.st_size);

    if (m_filesize == 0) {
      // Nothing to map; leave m_map as nullptr so the range is empty.
      ::close(fd);
      return;
    }

    void* const addr = ::mmap(nullptr, m_filesize, PROT_READ, MAP_PRIVATE, fd, 0);
    // The mapping does not depend on the descriptor once established, so the
    // descriptor is released on both the success and the failure path.
    ::close(fd);
    if (addr == MAP_FAILED) throw std::logic_error("MemoryMappedFile: cannot map file");
    m_map = static_cast<const char*>(addr);
  }

  ~MemoryMappedFile() { release(); }

  // Copying would make two objects unmap the same region.
  MemoryMappedFile(const MemoryMappedFile&) = delete;
  MemoryMappedFile& operator=(const MemoryMappedFile&) = delete;

  // Moving transfers the mapping and leaves `other` as an empty range.
  MemoryMappedFile(MemoryMappedFile&& other) noexcept
      : m_filesize(other.m_filesize), m_map(other.m_map) {
    other.m_filesize = 0;
    other.m_map = nullptr;
  }

  MemoryMappedFile& operator=(MemoryMappedFile&& other) noexcept {
    if (this != &other) {
      release();
      m_filesize = other.m_filesize;
      m_map = other.m_map;
      other.m_filesize = 0;
      other.m_map = nullptr;
    }
    return *this;
  }

  // First mapped byte (nullptr for an empty or moved-from mapping).
  const char* start() const { return m_map; }
  // One past the last mapped byte.
  const char* end() const { return m_map + m_filesize; }

private:
  // Unmaps the region if one is held; failures are reported but not thrown
  // because this runs from the destructor.
  void release() noexcept {
    if (m_map == nullptr) return;
    if (::munmap(const_cast<char*>(m_map), m_filesize) == -1)
      std::cerr << "Warnng: MemoryMappedFile: error in destructor during `munmap()`\n";
    m_map = nullptr;
    m_filesize = 0;
  }

  size_t m_filesize = 0;
  const char* m_map = nullptr;
};
// high speed str -> double parser

// Returns 10 raised to the integer power `n`, using binary exponentiation
// (one multiplication per bit of |n|). Negative `n` multiplies powers of 0.1.
// Results beyond the range of double saturate to infinity or 0.
double pow10(int n) {
  const bool negative = n < 0;
  // Take the magnitude in unsigned arithmetic: `-n` overflows for INT_MIN, and
  // right-shifting a negative value never reaches zero, so the loop would not end.
  unsigned int magnitude = static_cast<unsigned int>(n);
  if (negative) magnitude = 0u - magnitude;

  double factor = negative ? 0.1 : 10.0;
  double result = 1.0;
  while (magnitude != 0u) {
    if (magnitude & 1u) result *= factor;
    factor *= factor;
    magnitude >>= 1;
  }
  return result;
}

// True for the ASCII characters '0'..'9'.
static inline bool is_ascii_digit(char c) { return c >= '0' && c <= '9'; }

// Returns value * 10^exp10 for a non-negative `value` smaller than 2^64.
static double scale_by_pow10(double value, long long exp10) {
  // Zero stays zero whatever the exponent; this also avoids 0 * inf == NaN.
  if (value == 0.0) return value;

  // value < 2^64 (about 1.8e19), so beyond +/-400 the result is already inf or 0.
  const long long kSaturation = 400;
  if (exp10 > kSaturation) exp10 = kSaturation;
  if (exp10 < -kSaturation) exp10 = -kSaturation;

  if (exp10 >= 0) return value * pow10(static_cast<int>(exp10));

  // Divide by a positive power of ten rather than multiplying by a power of
  // 0.1: 10^k is exactly representable for k <= 22 while 0.1 is not, so the
  // common case is rounded only once ("0.3" parses to exactly 0.3).
  int shrink = static_cast<int>(-exp10);
  const int kLargestFinitePower = 308;  // 10^309 overflows double
  if (shrink > kLargestFinitePower) {
    // Split the division so the divisor stays finite and subnormal results
    // are still reachable.
    value /= pow10(shrink - kLargestFinitePower);
    shrink = kLargestFinitePower;
  }
  return value / pow10(shrink);
}

// Parses the decimal number in [start, end) and returns it as a double.
//
// Accepted form: optional '+'/'-', digits, optional '.' and digits, optional
// 'e'/'E' exponent with optional sign. Parsing stops at the first character
// that does not fit; what was read up to there is returned. A null or empty
// range returns 0. No locale handling, no "inf"/"nan", no error reporting.
//
// All digits are collected into one integer mantissa together with a decimal
// exponent, and the value is scaled once at the end.
double crack_atof(const char* start, const char* const end) {
  if (!start || !end || end <= start) {
    return 0;
  }

  // +/- sign
  double sign = 1.0;
  if (*start == '-') {
    sign = -1.0;
    ++start;
  } else if (*start == '+') {
    ++start;
  }

  // Largest mantissa that can still take one more decimal digit without
  // overflowing unsigned long long.
  const unsigned long long kAccumulateLimit = (~0ULL - 9ULL) / 10ULL;
  unsigned long long mantissa = 0;
  long long exp10 = 0;  // value == mantissa * 10^exp10

  // integer part
  for (; start != end && is_ascii_digit(*start); ++start) {
    if (mantissa <= kAccumulateLimit) {
      mantissa = mantissa * 10ULL + static_cast<unsigned long long>(*start - '0');
    } else {
      ++exp10;  // digit is beyond the precision kept, but still scales the value
    }
  }

  // fractional part
  if (start != end && *start == '.') {
    ++start;
    for (; start != end && is_ascii_digit(*start); ++start) {
      if (mantissa <= kAccumulateLimit) {
        mantissa = mantissa * 10ULL + static_cast<unsigned long long>(*start - '0');
        --exp10;
      }
      // otherwise the digit is below the precision kept and is dropped
    }
  }

  // exponent part
  if (start != end && (*start == 'e' || *start == 'E')) {
    ++start;
    bool exp_negative = false;
    if (start != end && (*start == '-' || *start == '+')) {
      exp_negative = (*start == '-');
      ++start;
    }
    // Stop accumulating long before overflow; such exponents saturate anyway.
    const long long kExponentLimit = 1000000000000LL;
    long long e = 0;
    for (; start != end && is_ascii_digit(*start); ++start) {
      if (e < kExponentLimit) e = e * 10 + (*start - '0');
    }
    exp10 += exp_negative ? -e : e;
  }

  return sign * scale_by_pow10(static_cast<double>(mantissa), exp10);
}
int main() { | |
MemoryMappedFile map = MemoryMappedFile("FloatDataset.csv"); | |
const char* curr = map.start(); | |
const char* start = map.start(); | |
const char* const end = map.end(); | |
uintmax_t lines_n = 0; | |
int cnt = 0; | |
double sum = 0.0; | |
while (curr && curr != end) { | |
if (*curr == ',' || *curr == '\n') { | |
// std::string fieldstr(start, curr); | |
// double field = std::stod(fieldstr); | |
// m_numLines = 11000000 cnt=33000000 sum=16498294753551.9 | |
// real 5.998s | |
double field = crack_atof(start, curr); | |
// m_numLines = 11000000 cnt=33000000 sum=16498294753551.9 | |
// real 1.327s | |
sum += field; | |
++cnt; | |
if (*curr == '\n') lines_n++; | |
curr++; | |
start = curr; | |
} else { | |
++curr; | |
} | |
} | |
std::cout << std::setprecision(15) << "m_numLines = " << lines_n << " cnt=" << cnt | |
<< " sum=" << sum << "\n"; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <random>
#include <string>

// Generates the benchmark input: writes FloatDataset.csv with rows_n lines of
// cols_n comma-separated doubles drawn uniformly from [1, 1e6), printed with
// 15 significant digits. Returns EXIT_FAILURE if the file cannot be written.
int main() {
  const std::string filename = "FloatDataset.csv";
  constexpr int cols_n = 3;
  constexpr int rows_n = 11'000'000;

  std::ofstream out(filename);
  if (!out.is_open()) {
    std::cerr << "failed to open " << filename << '\n';
    return EXIT_FAILURE;
  }

  // Fixed seed so every run produces the same dataset; seed with
  // std::random_device{}() instead to get different data each run.
  std::mt19937 gen{1};
  std::uniform_real_distribution<double> dist{1, 1e6};

  out << std::setprecision(15);
  for (int r = 0; r < rows_n; r++) {
    for (int c = 0; c < cols_n; c++) {
      out << dist(gen);
      if (c != cols_n - 1) out << ',';
    }
    out << "\n";
  }

  out.close();
  if (!out) {
    // e.g. disk full: report it instead of leaving a silently truncated file
    std::cerr << "failed to write " << filename << '\n';
    return EXIT_FAILURE;
  }
  return EXIT_SUCCESS;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment