alipha · March 14, 2019 14:32
diff --git a/substr.cpp b/substr.cpp
 #include <string>
 #include <iostream>
 #include <fstream>
 #include <ctime>
 #include <cstring>
 #include <set>
 #include <algorithm>

 using namespace std;


 class cstring {
 public:
    cstring(const char *s) : str(s) {}

    const char *str;
 };


 bool operator<(const cstring &left, const cstring &right) {
    return strcmp(left.str, right.str) < 0;
 }


 cstring to_lower(char *str) {
    transform(str, str + strlen(str), str, ::tolower);
    return str;
 }


 void explode(set<cstring> &segments, const char *str) {
    for(size_t i = 1, len = strlen(str); i < len; i++)
       segments.insert(str + i);
 }


 void explode_all(set<cstring> &segments, set<cstring> &words) {
    for(set<cstring>::iterator word = words.begin(); word != words.end(); word++)
        explode(segments, word->str);
 }


 bool word_found(const set<cstring> &segments, const cstring &word, bool use_lower) {
    set<cstring>::iterator it = use_lower ? segments.lower_bound(word) : segments.upper_bound(word);
    if(it == segments.end())
        return false;
    //cout << word.str << " == " << it->str << endl;
    return strncmp(word.str, it->str, strlen(word.str)) == 0 && it->str != word.str;
 }


 bool slow_word_found(const set<cstring> &words, const cstring &word) {
    for(set<cstring>::iterator it = words.begin(); it != words.end(); it++) {
        if(strstr(it->str, word.str) && it->str != word.str)
            return true;
    }

    return false;
 }


 void load_file(const char *filename, set<cstring> &words) {
    string line;
    ifstream file(filename);

    while(getline(file, line)) {
        char *word = new char[line.size() + 1];
        strcpy(word, line.c_str());
        words.insert(to_lower(word));
    }

    file.close();
 }


 void save(const char *filename, const set<cstring> &output) {
    ofstream file(filename);

    for(set<cstring>::const_iterator it = output.begin(); it != output.end(); it++) {
        file << it->str << endl;
    }

    file.close();
 }


 int main(int argc, char **argv) {
    set<cstring> words;
    set<cstring> segments;
    set<cstring> output;
    set<cstring> slow_output;

    if(argc < 2) {
        cout << "Must specify input filename" << endl;
        return 1;
    }

    load_file(argv[1], words);

    clock_t start = clock();
    explode_all(segments, words);
    clock_t end = clock();
    cout << "Explode: " << ((double)end - start) / CLOCKS_PER_SEC << endl;
    /*
    for(set<cstring>::iterator it = segments.begin(); it != segments.end(); it++) {
        cout << it->str << endl;
    }
    cout << endl;
    */

    start = clock();

    for(set<cstring>::iterator word = words.begin(); word != words.end(); word++) {
        if(!word_found(words, *word, false) && !word_found(segments, *word, true))
            output.insert(*word);
    }

    end = clock();
    cout << " Filter: " << ((double)end - start) / CLOCKS_PER_SEC << endl;

    save("output.txt", output);


    time_t sec_start = time(0);
    start = clock();

    for(set<cstring>::iterator word = words.begin(); word != words.end(); word++) {
        if(!slow_word_found(words, *word))
            slow_output.insert(*word);
    }

    end = clock();
    time_t sec_end = time(0);
    cout << "   Slow: " << ((double)end - start) / CLOCKS_PER_SEC << endl;
    cout << "in secs: " << sec_end - sec_start << endl;

    save("slow_output.txt", slow_output);


    cout << words.size() << " words" << endl;
    cout << segments.size() << " segments" << endl;
    cout << output.size() << " unique" << endl;
    cout << slow_output.size() << " unique (double check)" << endl;
    return 0;
 }
	#include <string>
	#include <iostream>
	#include <fstream>
	#include <ctime>
	#include <cstring>
	#include <set>
	#include <algorithm>

	using namespace std;


	class cstring {
	public:
	cstring(const char *s) : str(s) {}

	const char *str;
	};


	bool operator<(const cstring &left, const cstring &right) {
	return strcmp(left.str, right.str) < 0;
	}


	cstring to_lower(char *str) {
	transform(str, str + strlen(str), str, ::tolower);
	return str;
	}


	void explode(set<cstring> &segments, const char *str) {
	for(size_t i = 1, len = strlen(str); i < len; i++)
	segments.insert(str + i);
	}


	void explode_all(set<cstring> &segments, set<cstring> &words) {
	for(set<cstring>::iterator word = words.begin(); word != words.end(); word++)
	explode(segments, word->str);
	}


	bool word_found(const set<cstring> &segments, const cstring &word, bool use_lower) {
	set<cstring>::iterator it = use_lower ? segments.lower_bound(word) : segments.upper_bound(word);
	if(it == segments.end())
	return false;
	//cout << word.str << " == " << it->str << endl;
	return strncmp(word.str, it->str, strlen(word.str)) == 0 && it->str != word.str;
	}


	bool slow_word_found(const set<cstring> &words, const cstring &word) {
	for(set<cstring>::iterator it = words.begin(); it != words.end(); it++) {
	if(strstr(it->str, word.str) && it->str != word.str)
	return true;
	}

	return false;
	}


	void load_file(const char *filename, set<cstring> &words) {
	string line;
	ifstream file(filename);

	while(getline(file, line)) {
	char *word = new char[line.size() + 1];
	strcpy(word, line.c_str());
	words.insert(to_lower(word));
	}

	file.close();
	}


	void save(const char *filename, const set<cstring> &output) {
	ofstream file(filename);

	for(set<cstring>::const_iterator it = output.begin(); it != output.end(); it++) {
	file << it->str << endl;
	}

	file.close();
	}


	int main(int argc, char **argv) {
	set<cstring> words;
	set<cstring> segments;
	set<cstring> output;
	set<cstring> slow_output;

	if(argc < 2) {
	cout << "Must specify input filename" << endl;
	return 1;
	}

	load_file(argv[1], words);

	clock_t start = clock();
	explode_all(segments, words);
	clock_t end = clock();
	cout << "Explode: " << ((double)end - start) / CLOCKS_PER_SEC << endl;
	/*
	for(set<cstring>::iterator it = segments.begin(); it != segments.end(); it++) {
	cout << it->str << endl;
	}
	cout << endl;
	*/

	start = clock();

	for(set<cstring>::iterator word = words.begin(); word != words.end(); word++) {
	if(!word_found(words, word, false) && !word_found(segments, word, true))
	output.insert(*word);
	}

	end = clock();
	cout << " Filter: " << ((double)end - start) / CLOCKS_PER_SEC << endl;

	save("output.txt", output);


	time_t sec_start = time(0);
	start = clock();

	for(set<cstring>::iterator word = words.begin(); word != words.end(); word++) {
	if(!slow_word_found(words, *word))
	slow_output.insert(*word);
	}

	end = clock();
	time_t sec_end = time(0);
	cout << " Slow: " << ((double)end - start) / CLOCKS_PER_SEC << endl;
	cout << "in secs: " << sec_end - sec_start << endl;

	save("slow_output.txt", slow_output);


	cout << words.size() << " words" << endl;
	cout << segments.size() << " segments" << endl;
	cout << output.size() << " unique" << endl;
	cout << slow_output.size() << " unique (double check)" << endl;
	return 0;
	}