Last active
December 30, 2015 04:31
-
-
Save simonsickle-old/1fbb65c1194aefb14ba5 to your computer and use it in GitHub Desktop.
A deduplicator for checking for duplicate emails in CPP
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* Copyright 2015 - Simon Sickle | |
* CSV DeDuplicator | |
* Finds duplicate emails and creates a list of duplicates | |
* usage is ./dedupe /path/to/whatever.csv /path/to/dupe.csv | |
*/ | |
#include <cstring>
#include <fstream>
#include <iostream>
#include <set>
#include <sstream>
#include <string>
#include <vector>
using namespace std; | |
/*
 * Returns the first comma-separated field of `line` that contains an '@',
 * or an empty string when no field does.
 *
 * A trailing carriage return is removed from the returned field so that an
 * email in the last column of a CRLF file compares equal to the same email
 * on a line that has no '\r' (e.g. an unterminated final line).
 *
 * Limitation: fields are split on every ',' -- quoted CSV fields that
 * themselves contain commas are not handled.
 */
static std::string extractEmail(const std::string & line) {
    std::stringstream fields(line);
    std::string field;
    while (std::getline(fields, field, ',')) {
        if (field.find('@') != std::string::npos) {
            if (!field.empty() && field[field.size() - 1] == '\r')
                field.erase(field.size() - 1);
            return field;
        }
    }
    return std::string();
}

/*
 * CSV de-duplicator entry point.
 *
 * argv[1]: path of the CSV file to scan.
 * argv[2]: path of the CSV file to write the duplicates to.
 *
 * The first input line is copied to the output as the header. Every later
 * line whose email (first field containing '@') was already seen on an
 * earlier line is written after it, in input order. Lines without any
 * email field are never treated as duplicates.
 *
 * Returns 0 on success, -1 on wrong usage, and 1 when the input cannot be
 * read or the output cannot be opened.
 */
int main (int argc, const char * argv[]) {
    // Check arguments
    if (argc != 3) {
        std::cout << "Usage ./dedupe /path/to/a.csv /path/for/dupe.csv" << std::endl;
        return -1;
    }

    // Open the input and make sure it is readable
    std::ifstream textfile(argv[1]);
    if (!textfile.good()) {
        std::cout << "Could not read the file provided" << std::endl;
        return 1;
    }

    std::set<std::string> seen;       // every email met so far
    std::vector<std::string> dupes;   // full lines whose email was already seen
    std::string header;               // first line of the input, copied verbatim
    bool haveHeader = false;
    int lineCount = 0;

    // Single pass over the file. The containers grow on demand, so a file
    // without a trailing newline can no longer overrun a pre-sized buffer.
    for (std::string line; std::getline(textfile, line); ++lineCount) {
        if (lineCount == 0) {
            header = line;
            haveHeader = true;
        }

        const std::string email = extractEmail(line);
        // Lines without an email (blank lines, typically the header) would
        // otherwise all "match" each other on the empty string.
        if (email.empty())
            continue;

        // insert() reports whether the email was new; if not, it is a dupe.
        if (!seen.insert(email).second)
            dupes.push_back(line);
    }

    std::cout << "There are " << lineCount << " lines." << std::endl;
    // The header line is deliberately not counted as a duplicate.
    std::cout << "We found " << dupes.size() << " duplicates." << std::endl;

    // Write the header and the dupes to a new csv file
    std::ofstream txtOut(argv[2]);
    if (!txtOut.good()) {
        std::cout << "File name to save duplicates under was not valid" << std::endl;
        return 1;
    }
    if (haveHeader)
        txtOut << header << '\n';
    for (std::vector<std::string>::size_type i = 0; i < dupes.size(); ++i)
        txtOut << dupes[i] << '\n';

    // Streams and containers release their resources when they go out of
    // scope, on every return path.
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment