Created
April 30, 2015 06:54
-
-
Save shoooe/1ea1ca26cc835b283db6 to your computer and use it in GitHub Desktop.
Some random corrector which, given a dictionary file with "<word> <frequency>" space separated pairs and fed with a word, returns the closest word of distance 1 with the highest frequency.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <fstream> | |
#include <map> | |
#include <stdexcept> | |
#include <vector> | |
namespace a8 { | |
// aliases | |
using word = std::string; | |
using dict = std::map<word, unsigned int>; | |
// generates all similar strings by removing 1 character | |
// from the original string in different places | |
template<typename OutIt> | |
OutIt generate_similar_by_deletion(a8::word const& word, OutIt first) { | |
std::cout << "Generating by deletetion...\n"; | |
for (std::size_t i = 0; i < word.size() - 1; i++) { | |
a8::word curr = word; | |
curr.erase(i, 1); | |
std::cout << "\tGenerated: " << curr << '\n'; | |
*first++ = curr; | |
} | |
return first; | |
} | |
// generates all similar strings by substituting 1 character | |
// from the alphabet in different positions | |
template<typename OutIt> | |
OutIt generate_similar_by_substitution(a8::word const& word, OutIt first) { | |
std::cout << "Generating by substitution...\n"; | |
for (std::size_t i = 0; i < word.size(); i++) { | |
for (char c = 'A'; c <= 'Z'; c++) { | |
a8::word curr = word; | |
curr.replace(i, 1, 1, c); // inserts for 1 time at position i the character c for 1 time | |
std::cout << "\tGenerated: " << curr << '\n'; | |
*first++ = curr; | |
} | |
} | |
return first; | |
} | |
// generates all similar strings by inserting 1 character | |
// from the alphabet in different positions | |
template<typename OutIt> | |
OutIt generate_similar_by_insertion(a8::word const& word, OutIt first) { | |
std::cout << "Generating by insertion...\n"; | |
for (std::size_t i = 0; i < word.size() + 1; i++) { | |
for (char c = 'A'; c <= 'Z'; c++) { | |
a8::word curr = word; | |
curr.insert(i, 1, c); // inserts at i | |
std::cout << "\tGenerated: " << curr << '\n'; | |
*first++ = curr; | |
} | |
} | |
return first; | |
} | |
// generates all similar string with distance of 1, | |
// by deletion, substitution and insertion | |
template<typename OutIt> | |
OutIt generate_similar(a8::word const& word, OutIt begin) { | |
auto begin1 = generate_similar_by_deletion(word, begin); | |
auto begin2 = generate_similar_by_substitution(word, begin1); | |
auto begin3 = generate_similar_by_insertion(word, begin2); | |
return begin3; | |
} | |
// returns a copy of the value with the given key or | |
// the passed in value if not found | |
template<typename Dict, typename Key, typename Value> | |
Value at_or(Dict const& map, Key const& key, Value other) { | |
try { return map.at(key); } | |
catch (std::out_of_range const&) { return other; } | |
} | |
// finds the iterator to the best match in the dictionary if it exists, | |
// otherwise it returns the end iterator | |
template<typename Dict> | |
typename Dict::iterator best_match(Dict& dictionary, a8::word const& word) { | |
auto it = dictionary.find(word); | |
if (it != dictionary.end()) return it; | |
std::vector<a8::word> similar; | |
generate_similar(word, std::back_inserter(similar)); | |
auto similar_it = std::max_element(similar.begin(), similar.end(), | |
[&dictionary](a8::word const& a, a8::word const& b) { | |
return at_or(dictionary, a, 0u) < at_or(dictionary, b, 0u); // @todo: probably not optimal | |
}); | |
if (similar_it == similar.end()) return dictionary.end(); | |
else return dictionary.find(*similar_it); | |
} | |
// takes an input stream with space separated pairs | |
// representing the word and the frequency of such word | |
a8::dict make_dictionary(std::istream& input) { | |
a8::word word; | |
unsigned int count; | |
a8::dict dictionary; | |
while (input >> word && input >> count) { | |
dictionary[word] = count; | |
} | |
return dictionary; | |
} | |
// runs the user interface for the program | |
// asking for words and replying to them one by one | |
void run_corrector(dict& dictionary) { | |
std::string word; | |
while (std::cin >> word) { | |
auto it = best_match(dictionary, word); | |
if (it != dictionary.end()) { | |
std::cout << it->first << ' ' << it->second << '\n'; | |
it->second++; | |
} else { | |
std::cout << "-\n"; | |
dictionary[word] = 1; | |
} | |
} | |
} | |
} | |
int main(int argc, char* argv[]) { | |
if (argc != 2) throw std::runtime_error("Filename not passed in"); | |
std::string filename = argv[1]; | |
std::ifstream filestream(filename); | |
a8::dict dictionary = a8::make_dictionary(filestream); | |
a8::run_corrector(dictionary); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment