Skip to content

Instantly share code, notes, and snippets.

@niccottrell
Last active December 21, 2017 11:13
Show Gist options
  • Save niccottrell/0b1065e56ea028c2986574eb8dce7578 to your computer and use it in GitHub Desktop.
Save niccottrell/0b1065e56ea028c2986574eb8dce7578 to your computer and use it in GitHub Desktop.
Experimenting with hashes for years in compound shard keys
/**
*
* This script is to experiment with a compound shard key.
*
* Goals:
* 1. the hash (h) to always be the same for the same date (down to the day level)
* 2. documents with the same exact date (dMy) go on the same shard but different day to different shards
* 3. will work well with 100m+ documents
* 4. small overhead related to a simple (non-compound) date shard key
* 5. still support targeted queries (as long as "h" field is included in the query by the application)
*
* Launch a cluster with 3 shards (no RS for simplicity)
* mlaunch --single --sharded 3
*
* Set the smallest chunksize to help visualize our sharding
* mongo config --eval 'db.settings.save( { _id:"chunksize", value: 1 } )'
*
* Connect via the mongos (on 27017)
* mongo
*/
use test;
db.shardedColl.insertMany([
{ h: 47, d: ISODate("2017-12-18-14:12:11"), payload: "just a test" },
{ h: 47, d: ISODate("2017-12-18-14:12:19"), payload: "another test" },
{ h: 47, d: ISODate("2017-12-18-14:12:32"), payload: "third test" }
]);
var doc = db.shardedColl.findOne();
Object.bsonsize(doc)
// 69
db.shardedColl2.insert({ h: "47-2017-12-18-14:12:11", d: ISODate("2017-12-18-14:12:11"), payload: "just a test" });
var doc2 = db.shardedColl2.findOne();
Object.bsonsize(doc2);
// 88
db.shardedColl3.insertMany([
{ h: 47, d: "2017-12-18-14:12:11", payload: "just a test" }
]);
var doc3 = db.shardedColl3.findOne();
Object.bsonsize(doc3);
// 85
// save one more byte
db.shardedColl3.insertMany([
{ h: '5f', d: "2017-12-18-14:16:41", payload: "just a test" }
]);
var doc4 = db.shardedColl3.findOne({h: '5f'});
Object.bsonsize(doc4);
// 84
use test;
/**
 * Minimal Base64 encoder (the mongo shell has no btoa).
 * Each charCodeAt() value is treated as one 8-bit byte; the callers here
 * only pass ASCII date strings, so that assumption holds.
 *
 * FIXES over the original:
 *  - the empty string now encodes to "" (previously "AA==");
 *  - a 0x00 byte mid-input is no longer mistaken for end-of-input (the old
 *    falsy check on the raw byte emitted bogus '=' padding and truncated).
 */
var Base64 = {
// 64 digits plus the '=' padding character at index 64.
characters: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=" ,
encode: function( string )
{
var chars = Base64.characters;
var result = '';
// Consume 3 input bytes per iteration, emitting 4 output digits.
for (var i = 0; i < string.length; i += 3) {
var a = string.charCodeAt(i);     // always a real byte
var b = string.charCodeAt(i + 1); // NaN when past the end
var c = string.charCodeAt(i + 2);
// Track presence explicitly instead of testing byte truthiness,
// so a legitimate 0x00 byte is not confused with "missing".
var haveB = !Number.isNaN(b);
var haveC = !Number.isNaN(c);
if (!haveB) b = 0;
if (!haveC) c = 0;
var b1 = ( a >> 2 ) & 0x3F;
var b2 = ( ( a & 0x3 ) << 4 ) | ( ( b >> 4 ) & 0xF );
// Index 64 is '=' padding when the corresponding byte is absent.
var b3 = haveB ? ( ( ( b & 0xF ) << 2 ) | ( ( c >> 6 ) & 0x3 ) ) : 64;
var b4 = haveC ? ( c & 0x3F ) : 64;
result += chars.charAt( b1 ) + chars.charAt( b2 ) + chars.charAt( b3 ) + chars.charAt( b4 );
}
return result;
}
};
/**
 * Java-style 32-bit string hash: hash = hash * 31 + charCode, truncated to
 * a signed 32-bit integer at each step. Not used elsewhere in this script;
 * kept as an alternative hashing experiment.
 *
 * FIX: `hashCode` and `char` were implicit globals; both are declared now.
 * @param {string} str
 * @returns {number} signed 32-bit hash (0 for the empty string)
 */
var hashCode = function(str) {
var hash = 0;
for (var i = 0; i < str.length; i++) {
var code = str.charCodeAt(i);
// (hash << 5) - hash === hash * 31, computed in 32-bit int range.
hash = ((hash<<5)-hash)+code;
hash = hash & hash; // Convert to 32bit integer
}
return hash;
}
/**
 * Derive a 2-character shard-key hash from a date string: the first two
 * Base64 digits of the encoded string, so identical date strings always
 * produce the same hash (goal 1).
 *
 * FIX: `makeHash` and `base64Full` were implicit globals; both declared now.
 * NOTE(review): two Base64 digits only cover the first 12 bits (1.5 chars)
 * of the input; Date.toString() starts with the weekday name, so the hash
 * space is much smaller than 64*64 — confirm this gives enough spread.
 * @param {string} dateStr - e.g. the output of Date.prototype.toString()
 * @returns {string} two characters from the Base64 alphabet
 */
var makeHash = function(dateStr) {
var base64Full = Base64.encode(dateStr);
return base64Full.substring(0,2);
}
// test our base64 algo
makeHash
// cleanup previous trials
use test;
db.shardedColl.drop();
db.shardedColl.createIndex({h:1, d:1});
sh.enableSharding("test");
sh.shardCollection("test.shardedColl", {h:1, d:1});
// alternatively remove docs and merge chunks
db.shardedColl.remove({});
// now merge empty chunks
// let's assume all chunks have been forced back onto the first shard
// load chunks from the first shard
ourChunks = db.chunks.find({ "ns" : "test.shardedColl", "shard": "shard01"}).sort({"min": 1}); // load in order
prev = null;
// Walk this shard's chunks in shard-key order; whenever a chunk appears
// empty, ask the server to merge the contiguous range [prev.min, item.max].
// On success `prev` is deliberately NOT advanced, so the merged range can
// keep growing across further empty chunks. Relies on `ourChunks` and
// `prev` being initialised just above.
ourChunks.forEach(function(item) {
// Whether to advance `prev` to this chunk at the end of the iteration.
relinkPrev = true;
print("item: ");
printjson( item);
if (prev == null) {
// first chunk we found
} else {
print("prev: ");
printjson(prev);
// is this chunk empty?
// NOTE(review): this counts with independent $gte/$lt ranges on h and d,
// but chunk ranges are ordered lexicographically on the compound key
// (h, d), so this "box" query can undercount — confirm it is acceptable
// for this experiment.
docCount = db.shardedColl.count({
h: {$gte: item.min.h, $lt: item.max.h },
d: {$gte: item.min.d, $lt: item.max.d }});
print("docCount: " + docCount);
if (docCount <= 0) { // found an empty chunk
// Merge bounds: from the start of the previous run to the end of
// this (empty) chunk.
bounds = [ { h: prev.min.h, d: prev.min.d },
{ h: item.max.h, d: item.max.d } ];
print("bounds: ");
printjson(bounds);
// let's see if we can merge this with the previous
// mergeChunks must run against the admin database via mongos.
res = db.getSiblingDB("admin").runCommand(
{ mergeChunks : "test.shardedColl", bounds : bounds } );
print("res: ");
printjson(res);
if (res && res.ok == 1) {
// merged successfully
print("Merged: ");
printjson(bounds);
// keep prev the same to preserve min boundary for next pass
relinkPrev= false;
}
}
}
if (relinkPrev) prev = item;
});
// check how many chunks remain
db.getSiblingDB("config").chunks.count({ "ns" : "test.shardedColl"});
// inspect the chunks in order
db.getSiblingDB("config").chunks.find({ "ns" : "test.shardedColl"}).sort({min: 1}).pretty()
// disable balancer (for insert performance but also to verify that we are distributing evenly)
sh.disableBalancing("test.shardedColl");
// Pre-create 4096 (64*64) chunks: one split point per 2-character Base64
// prefix, so inserts land in pre-split chunks instead of triggering splits.
// Only indexes 0-63 of `characters` are used; index 64 is the '=' padding.
// FIX: the loop variables were implicit globals; declared with var now.
for (var d0 = 0; d0 < 64; d0++) {
for (var d1 = 0; d1 < 64; d1++) {
var hash = Base64.characters[d0] + Base64.characters[d1];
// d: MinKey places the split exactly on the hash-prefix boundary.
sh.splitAt( "test.shardedColl", { h: hash, d: MinKey });
}
}
// Config for inserts
var padSize = 32; // make 32kb payloads
// Insert fresh data: 18 years x 12 months x 29 days x 100 docs per day.
for (var year = 2000; year < 2018; year++) {
for (var mon = 1; mon <= 12; mon++ ) {
// Days 30/31 are skipped so every month contributes the same doc count.
for (var day = 1; day < 30; day++) {
var objs = [];
var now = new Date();
for (var ms = 0; ms < 100; ms++) {
// Keep the current time-of-day but force year/month/day/ms.
// FIX: JS Date months are 0-indexed, so use mon - 1 — the original
// setMonth(mon) shifted every document one month forward and rolled
// December into January of the following year. Building the Date
// from components (instead of mutating "now" field by field) also
// avoids day-of-month rollover, and drops the deprecated setYear.
var date = new Date(year, mon - 1, day,
now.getHours(), now.getMinutes(), now.getSeconds(), ms);
var dateStr = date.toString();
print(dateStr);
// prepare a hash
var base64 = makeHash(dateStr);
print(base64);
// payload just over 32kb (String.prototype.pad is a mongo-shell helper)
objs.push({h: base64, d: date, payload: ("some test " + dateStr).pad(padSize*1024) });
}
// insert in bulk
db.shardedColl.insertMany(objs);
}
}
}
// Re-insert the three hand-made sample documents.
// FIX: ISODate() requires 'T' (or a space) between the date and time parts;
// the original "2017-12-18-14:12:11" form throws "invalid ISO date".
// NOTE(review): h is numeric here while the bulk load uses 2-char strings;
// mixed BSON types sort into different ranges of the shard key — confirm
// that is intended for this experiment.
db.shardedColl.insertMany([
{ h: 47, d: ISODate("2017-12-18T14:12:11"), payload: "just a test" },
{ h: 47, d: ISODate("2017-12-18T14:12:19"), payload: "another test" },
{ h: 47, d: ISODate("2017-12-18T14:12:32"), payload: "third test" }
]);
db.shardedColl.createIndex({h:1, d:1});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment