Created
March 14, 2019 14:32
-
-
Save alipha/a9bafec8a7d2a4fe5209eb3ea0c88413 to your computer and use it in GitHub Desktop.
Find all strings in a set which are not a substring of another string within the set
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <string> | |
#include <iostream> | |
#include <fstream> | |
#include <ctime> | |
#include <cstring> | |
#include <set> | |
#include <algorithm> | |
using namespace std; | |
class cstring { | |
public: | |
cstring(const char *s) : str(s) {} | |
const char *str; | |
}; | |
bool operator<(const cstring &left, const cstring &right) { | |
return strcmp(left.str, right.str) < 0; | |
} | |
cstring to_lower(char *str) { | |
transform(str, str + strlen(str), str, ::tolower); | |
return str; | |
} | |
void explode(set<cstring> &segments, const char *str) { | |
for(size_t i = 1, len = strlen(str); i < len; i++) | |
segments.insert(str + i); | |
} | |
void explode_all(set<cstring> &segments, set<cstring> &words) { | |
for(set<cstring>::iterator word = words.begin(); word != words.end(); word++) | |
explode(segments, word->str); | |
} | |
bool word_found(const set<cstring> &segments, const cstring &word, bool use_lower) { | |
set<cstring>::iterator it = use_lower ? segments.lower_bound(word) : segments.upper_bound(word); | |
if(it == segments.end()) | |
return false; | |
//cout << word.str << " == " << it->str << endl; | |
return strncmp(word.str, it->str, strlen(word.str)) == 0 && it->str != word.str; | |
} | |
bool slow_word_found(const set<cstring> &words, const cstring &word) { | |
for(set<cstring>::iterator it = words.begin(); it != words.end(); it++) { | |
if(strstr(it->str, word.str) && it->str != word.str) | |
return true; | |
} | |
return false; | |
} | |
void load_file(const char *filename, set<cstring> &words) { | |
string line; | |
ifstream file(filename); | |
while(getline(file, line)) { | |
char *word = new char[line.size() + 1]; | |
strcpy(word, line.c_str()); | |
words.insert(to_lower(word)); | |
} | |
file.close(); | |
} | |
void save(const char *filename, const set<cstring> &output) { | |
ofstream file(filename); | |
for(set<cstring>::const_iterator it = output.begin(); it != output.end(); it++) { | |
file << it->str << endl; | |
} | |
file.close(); | |
} | |
int main(int argc, char **argv) { | |
set<cstring> words; | |
set<cstring> segments; | |
set<cstring> output; | |
set<cstring> slow_output; | |
if(argc < 2) { | |
cout << "Must specify input filename" << endl; | |
return 1; | |
} | |
load_file(argv[1], words); | |
clock_t start = clock(); | |
explode_all(segments, words); | |
clock_t end = clock(); | |
cout << "Explode: " << ((double)end - start) / CLOCKS_PER_SEC << endl; | |
/* | |
for(set<cstring>::iterator it = segments.begin(); it != segments.end(); it++) { | |
cout << it->str << endl; | |
} | |
cout << endl; | |
*/ | |
start = clock(); | |
for(set<cstring>::iterator word = words.begin(); word != words.end(); word++) { | |
if(!word_found(words, *word, false) && !word_found(segments, *word, true)) | |
output.insert(*word); | |
} | |
end = clock(); | |
cout << " Filter: " << ((double)end - start) / CLOCKS_PER_SEC << endl; | |
save("output.txt", output); | |
time_t sec_start = time(0); | |
start = clock(); | |
for(set<cstring>::iterator word = words.begin(); word != words.end(); word++) { | |
if(!slow_word_found(words, *word)) | |
slow_output.insert(*word); | |
} | |
end = clock(); | |
time_t sec_end = time(0); | |
cout << " Slow: " << ((double)end - start) / CLOCKS_PER_SEC << endl; | |
cout << "in secs: " << sec_end - sec_start << endl; | |
save("slow_output.txt", slow_output); | |
cout << words.size() << " words" << endl; | |
cout << segments.size() << " segments" << endl; | |
cout << output.size() << " unique" << endl; | |
cout << slow_output.size() << " unique (double check)" << endl; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment