Skip to content

Instantly share code, notes, and snippets.

@cdacamar
Created January 8, 2020 05:20
Show Gist options
  • Save cdacamar/e4644dad6f054fefa76c2286bd166bcb to your computer and use it in GitHub Desktop.
Save cdacamar/e4644dad6f054fefa76c2286bd166bcb to your computer and use it in GitHub Desktop.
Word frequencies
// cl /EHsc /std:c++latest /permissive- /O2 word_frequencies.cpp
#include <cctype>
#include <cstdio>
#include <algorithm>
#include <charconv>
#include <fstream>
#include <string>
#include <string_view>
#include <unordered_map>
#include <vector>
using WordFrequencyMap = std::unordered_map<std::string, int>;
constexpr auto isalpha_surrogate = [](auto c) { return !!std::isalpha(c); };
static void extract_words_from_line(const std::string& line, WordFrequencyMap& frequency_map) {
auto first = std::begin(line);
auto last = std::end(line);
while (first != last) {
first = std::find_if(first, last, isalpha_surrogate);
if (first == last) {
return;
}
auto word_end = std::find_if_not(first, last, isalpha_surrogate);
++frequency_map[std::string{ first, word_end }];
first = word_end;
}
}
static WordFrequencyMap bucket_word_frequencies(std::ifstream& file) {
WordFrequencyMap frequency_map;
std::string line;
while (std::getline(file, line)) {
extract_words_from_line(line, frequency_map);
}
return frequency_map;
}
using WordsAndFrequencies = std::vector<std::pair<std::string, int>>;
WordsAndFrequencies flatten(const WordFrequencyMap& frequency_map) {
WordsAndFrequencies flattened;
flattened.reserve(frequency_map.size());
for (auto& [word, frequency] : frequency_map) {
flattened.push_back({ word, frequency });
}
return flattened;
}
using TopNSorted = std::pair<WordsAndFrequencies::iterator, WordsAndFrequencies::iterator>;
TopNSorted top_n(WordsAndFrequencies& words, int top) {
auto partition_point = [&] {
if (top <= -1 || top >= words.size()) {
return std::end(words);
}
auto first = std::begin(words);
auto p = first + top;
std::nth_element(first, p, std::end(words), [](const auto& lhs, const auto& rhs) { return lhs.second > rhs.second; });
return p;
}();
auto first = std::begin(words);
std::sort(first, partition_point, [](const auto& lhs, const auto& rhs) { return lhs.first < rhs.first; });
return { first, partition_point };
}
int main(int argc, const char* argv[]) {
if (argc < 2) {
std::printf(".\\%s: not enough arguments\n", argv[0]);
return 1;
}
int top = -1; // means take all.
if (argc > 2) {
std::string_view top_n_str { argv[2] };
auto [_, ec] = std::from_chars(top_n_str.data(), top_n_str.data() + top_n_str.length(), top);
if (ec != std::errc{}) {
std::printf(".\\%s: failed to parse \"%s\"\n", argv[0], argv[2]);
return 1;
}
if (top < -1) {
std::printf(".\\%s: argument \"%s\" cannot be negative.\n", argv[0], argv[2]);
return 1;
}
}
std::ifstream file{argv[1]};
if (!file) {
std::printf(".\\%s: could not open file \"%s\" for reading.\n", argv[0], argv[1]);
return 1;
}
auto word_frequencies = bucket_word_frequencies(file);
auto flattened = flatten(word_frequencies);
auto [first, last] = top_n(flattened, top);
while (first != last) {
auto [word, frequency] = *first;
std::printf("word: \"%s\", appears: %d\n", word.c_str(), frequency);
++first;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment