Skip to content

Instantly share code, notes, and snippets.

Created March 4, 2013 22:23
Show Gist options
  • Save anonymous/5086232 to your computer and use it in GitHub Desktop.
Save anonymous/5086232 to your computer and use it in GitHub Desktop.
#include<iostream>
#include<unistd.h>
#include<algorithm>
#include<map>
#include<vector>
#include<string>
#include<cassert>
#include<sstream>
#include<fstream>
using namespace std;
char fix_character(char c)
{
if(c >= 'a' && c <= 'z')
return c;
if(c >= 'A' && c <= 'Z')
return c + 32;
return -1;
}
string fix_characters(string s)
{
string new_string;
for(size_t i = 0; i < s.size(); i++) {
char c = fix_character(s[i]);
if(c != -1) new_string += c;
}
return new_string;
}
#define TRIE_SIZE 26
struct Trie {
vector<Trie*> children;
bool endpoint;
Trie()
{
this->children.assign(TRIE_SIZE, NULL);
this->endpoint = false;
}
void add(string word)
{
word = fix_characters(word);
add(word, 0);
}
void find_and_add_to_map(string word, map<string, unsigned int> &hits)
{
find_and_add_to_map(word, hits, 0);
}
private:
void add(string &word, size_t index)
{
if(index == word.size()) {
this->endpoint = true;
} else {
size_t character = word[index] - 'a';
if(character > TRIE_SIZE) {
cerr << "TURBO: Refusing to add " << word << " to search trie." << endl;
return;
}
if(this->children[character] == NULL)
this->children[character] = new Trie;
this->children[character]->add(word, index + 1);
}
}
void find_and_add_to_map(string &word, map<string, unsigned int> &hits, size_t index)
{
if(this->endpoint)
hits[word.substr(0, index)]++;
if(index == word.size() + 1)
return;
size_t character = word[index] - 'a';
if(character > TRIE_SIZE) // error
return;
if(this->children[character] == NULL)
return;
this->children[character]->find_and_add_to_map(word, hits, index + 1);
}
};
Trie build_search_trie()
{
Trie root;
ifstream search("search.csv");
string line;
while(getline(search, line)) {
stringstream ss(line);
getline(ss, line, ';');
root.add(line);
}
cerr << "TURBO: Done building search trie." << endl;
return root;
}
map<string, unsigned int> corpus_search(Trie &root)
{
ifstream corpus("corpus.csv");
string line;
map<string, unsigned int> hits;
while(getline(corpus, line)) {
stringstream ss(line);
for(size_t i = 0; i <= 3 && getline(ss, line, '|'); i++) {
if(i == 0) continue;
line = fix_characters(line);
for(size_t k = 0; k < line.size(); k++)
root.find_and_add_to_map(line.substr(k, line.size() - k), hits);
}
}
return hits;
}
int main()
{
Trie root = build_search_trie();
auto hits = corpus_search(root);
// cout << "term;count" << endl;
// for(auto hit : hits)
// cout << hit.first << ";" << hit.second << endl;
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment