Created
October 8, 2020 16:52
-
-
Save mklca/8fb5d19dc0c8aae9b9f5441127fbe7af to your computer and use it in GitHub Desktop.
Jaro and Jaro-Winkler String Distance Metrics
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
double jaro_string_similarity(boost::string_ref a, boost::string_ref b) | |
{ | |
using std::cbegin; | |
using std::cend; | |
using std::distance; | |
if(a.empty()) { | |
return b.empty() ? 1.0 : 0.0; | |
} | |
const std::size_t as = a.size(); | |
const std::size_t bs = b.size(); | |
// The bandwidth to consider for matches | |
const ptrdiff_t width = std::max(as, bs)/2 - 1; | |
// These vectors match the sizes of their corresponding strings | |
std::vector<bool> a_matches(as, false); | |
std::vector<bool> b_matches(bs, false); | |
std::size_t matches = 0; | |
// Count matches | |
using string_iterator = boost::string_ref::const_iterator; | |
for(string_iterator ap = cbegin(a); ap != cend(a); ++ap) | |
{ | |
const std::ptrdiff_t a_offset = distance(cbegin(a), ap); | |
assert(a_offset >= 0); | |
assert(static_cast<std::size_t>(a_offset) < a_matches.size()); | |
// Avoid undefined behavior by not incrementing past the end of string b | |
const string_iterator b_start = std::next(cbegin(b), | |
std::min(static_cast<std::size_t>(std::max(0l, a_offset - width)), bs)); | |
const string_iterator b_end = std::next(cbegin(b), | |
std::min(static_cast<std::size_t>(a_offset + width + 1), bs)); | |
for(string_iterator bp = b_start; bp != b_end; ++bp) | |
{ | |
const std::ptrdiff_t b_offset = distance(cbegin(b), bp); | |
assert(static_cast<std::size_t>(b_offset) < b_matches.size()); | |
if(!b_matches[b_offset] && *ap == *bp) { | |
a_matches[a_offset] = true; | |
b_matches[b_offset] = true; | |
++matches; | |
break; | |
} | |
} | |
} | |
if(matches == 0) { | |
return 0.0; | |
} | |
// Count transpositions | |
std::size_t transposes = 0; | |
using match_iterator = std::vector<bool>::const_iterator; | |
match_iterator bmp = cbegin(b_matches); | |
for(match_iterator amp = cbegin(a_matches); amp != cend(a_matches); ++amp) | |
{ | |
// Advance amp until until a match is encountered | |
if(!*amp) { | |
continue; | |
} | |
const std::ptrdiff_t a_offset = distance(cbegin(a_matches), amp); | |
// Similarly for b | |
while(bmp != cend(b_matches) && !*bmp) { | |
++bmp; | |
} | |
const std::ptrdiff_t b_offset = distance(cbegin(b_matches), bmp); | |
const string_iterator a_pos = std::next(cbegin(a), a_offset); | |
const string_iterator b_pos = std::next(cbegin(b), b_offset); | |
if(*a_pos != *b_pos) { | |
++transposes; | |
} | |
if(bmp != cend(b_matches)) { | |
++bmp; | |
} | |
} | |
// Compute the distance | |
const double term_a = static_cast<double>(matches)/as; | |
const double term_b = static_cast<double>(matches)/bs; | |
const double term_t = (static_cast<double>(matches) - static_cast<double>(transposes)/2.0)/matches; | |
return (term_a + term_b + term_t)/3.0; | |
} | |
double jaro_winkler_string_similarity(std::size_t max_prefix, double scale, boost::string_ref a, boost::string_ref b) | |
{ | |
const std::size_t as = a.size(); | |
const std::size_t bs = b.size(); | |
// Count up to max_prefix matches at the beginning of each string | |
const std::size_t prefix_bound = std::min({max_prefix, as, bs}); | |
std::size_t prefix_matches = 0; | |
for(unsigned int offset = 0; offset < prefix_bound; ++offset) | |
{ | |
if(a[offset] == b[offset]) { | |
++prefix_matches; | |
} | |
} | |
const double jaro_value = jaro_string_similarity(a, b); | |
return jaro_value + static_cast<double>(prefix_matches)*scale*(1.0 - jaro_value); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using namespace std; | |
BOOST_AUTO_TEST_CASE(jaro_both_empty_match) | |
{ | |
const double jaro_value = klustrcore::jaro_string_similarity("", ""); | |
BOOST_CHECK_EQUAL(jaro_value, 1.0); | |
} | |
BOOST_AUTO_TEST_CASE(jaro_left_empty_nomatch) | |
{ | |
const double jaro_value = klustrcore::jaro_string_similarity("", "quo"); | |
BOOST_CHECK_EQUAL(jaro_value, 0.0); | |
} | |
BOOST_AUTO_TEST_CASE(jaro_right_empty_nomatch) | |
{ | |
const double jaro_value = klustrcore::jaro_string_similarity("quux", ""); | |
BOOST_CHECK_EQUAL(jaro_value, 0.0); | |
} | |
BOOST_AUTO_TEST_CASE(jaro_sample_1, * boost::unit_test::tolerance(1e-2)) | |
{ | |
const double jaro_value = klustrcore::jaro_string_similarity("DWAYNE", "DUANE"); | |
BOOST_TEST(jaro_value == 0.822); | |
} | |
BOOST_AUTO_TEST_CASE(jaro_sample_2, * boost::unit_test::tolerance(1e-2)) | |
{ | |
const double jaro_value = klustrcore::jaro_string_similarity("MARTHA", "MARHTA"); | |
BOOST_TEST(jaro_value == 0.944); | |
} | |
BOOST_AUTO_TEST_CASE(jaro_sample_3, * boost::unit_test::tolerance(1e-2)) | |
{ | |
const double jaro_value = klustrcore::jaro_string_similarity("DIXON", "DICKSONX"); | |
BOOST_TEST(jaro_value == 0.767); | |
} | |
BOOST_AUTO_TEST_CASE(jaro_sample_4, * boost::unit_test::tolerance(1e-2)) | |
{ | |
const double jaro_value = klustrcore::jaro_string_similarity("JELLYFISH", "SMELLYFISH"); | |
BOOST_TEST(jaro_value == 0.896); | |
} | |
BOOST_AUTO_TEST_CASE(jaro_sample_5, * boost::unit_test::tolerance(1e-2)) | |
{ | |
const double jaro_value = klustrcore::jaro_string_similarity("CAT", "BIRD"); | |
BOOST_TEST(jaro_value == 0); | |
} | |
BOOST_AUTO_TEST_CASE(jaro_winkler_sample_1, * boost::unit_test::tolerance(1e-2)) | |
{ | |
const double jw_value = klustrcore::jaro_winkler_string_similarity("DWAYNE", "DUANE"); | |
BOOST_TEST(jw_value == 0.858); | |
} | |
BOOST_AUTO_TEST_CASE(jaro_winkler_sample_2, * boost::unit_test::tolerance(1e-2)) | |
{ | |
const double jw_value = klustrcore::jaro_winkler_string_similarity("MARTHA", "MARHTA"); | |
BOOST_TEST(jw_value == 0.961); | |
} | |
BOOST_AUTO_TEST_CASE(jaro_winkler_sample_3, * boost::unit_test::tolerance(1e-2)) | |
{ | |
const double jw_value = klustrcore::jaro_winkler_string_similarity("DIXON", "DICKSONX"); | |
BOOST_TEST(jw_value == 0.813); | |
} | |
BOOST_AUTO_TEST_CASE(jaro_winkler_sample_4, * boost::unit_test::tolerance(1e-2)) | |
{ | |
const double jw_value = klustrcore::jaro_winkler_string_similarity("JELLYFISH", "SMELLYFISH"); | |
BOOST_TEST(jw_value == 0.906); | |
} | |
BOOST_AUTO_TEST_CASE(jaro_winkler_sample_5, * boost::unit_test::tolerance(1e-2)) | |
{ | |
const double jw_value = klustrcore::jaro_winkler_string_similarity("CAT", "BIRD"); | |
BOOST_TEST(jw_value == 0); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment