msjyoo · February 15, 2017 12:52 · msjyoo · Feb 15, 2017 · msjyoo · Feb 15, 2017
diff --git a/Counting Clown Emojis.php b/Counting Clown Emojis.php
 <?php

 // Clown Emoji
 // "🤡".length = 2 in Javascript (Firefox)
 // '🤡abc'.length = 5 in Javascript (Firefox)

 // . = Byte, () = the bytes of one code point (a surrogate pair in UTF-16)
 // The clown emoji is a single code point (U+1F921) outside the BMP.
 $x = '🤡'; // 4 bytes (UTF-8 (....))
 $y = '🤡abc'; // 4 + 3 = 7 bytes (UTF-8 (....) . . .)

 // PHP strings are plain byte sequences with no encoding attached. $x and $y hold UTF-8 here
 // only because the literals were pasted into this file as UTF-8.

 // strlen() counts raw bytes; mb_strlen() decodes with the named encoding and counts code points;
 // bin2hex() shows the stored bytes.
 echo '--- These are UTF-8 ---', "\n";
 echo '$x Bytes: ', strlen($x), "\n";
 echo '$x Unicode Codepoint Count ("characters"): ', mb_strlen($x, 'UTF-8'), "\n";
 echo '$x Hex Representation: ', bin2hex($x), "\n";
 echo '$y Bytes: ', strlen($y), "\n";
 echo '$y Unicode Codepoint Count ("characters"): ', mb_strlen($y, 'UTF-8'), "\n";
 echo '$y Hex Representation: ', bin2hex($y), "\n";
 echo '--- End ---', "\n";

 // Now, let's convert them to UTF-16, where each code unit is 2 bytes: a code point inside the BMP
 // takes one code unit (2 bytes) and any other code point takes a surrogate pair (two code units, 4 bytes).
 //
 // The byte order is spelled out as "UTF-16BE" so the result is big-endian with no byte order mark,
 // independent of how the bare "UTF-16" label is interpreted. A BOM would add 2 bytes to every
 // byte count below and break the arithmetic in the comments.

 $x1 = mb_convert_encoding($x, "UTF-16BE", "UTF-8"); // Still 4 bytes! (UTF-16 (.. ..))
 $y1 = mb_convert_encoding($y, "UTF-16BE", "UTF-8"); // 4 + 6 = 10 bytes (UTF-16 (.. ..) .. .. ..)


 // Same three measurements as for UTF-8; mb_strlen() must be told the new encoding, otherwise it
 // would decode the bytes with the wrong rules.
 echo "--- These are UTF-16 ---"."\n";
 echo "\$x1 Bytes: ".strlen($x1)."\n";
 echo "\$x1 Unicode Codepoint Count (\"characters\"): ".mb_strlen($x1, "UTF-16BE")."\n";
 echo "\$x1 Hex Representation: ".bin2hex($x1)."\n";
 echo "\$y1 Bytes: ".strlen($y1)."\n";
 echo "\$y1 Unicode Codepoint Count (\"characters\"): ".mb_strlen($y1, "UTF-16BE")."\n";
 echo "\$y1 Hex Representation: ".bin2hex($y1)."\n";
 echo "--- End ---"."\n";


 // Now, Javascript's String is sort of like PHP's raw string byte stream, except:
 // >>>>>>
 // JavaScript treats code units as individual characters, while humans generally think in terms of Unicode characters.
 // This has some unfortunate consequences for Unicode characters outside the BMP. Since surrogate pairs consist of
 // two code units, '𝌆'.length == 2, even though there’s only one Unicode character there. The individual surrogate
 // halves are being exposed as if they were characters: '𝌆' == '\uD834\uDF06'.
 // <<<<<< https://mathiasbynens.be/notes/javascript-encoding

 // What this basically means is that while counting Unicode code points would count a surrogate pair (.. ..) as
 // length 1, Javascript counts UTF-16 code units, so the same pair is counted as .. .. = length 2.

 // So, our characters $x1 and $y1 are counted in Javascript as:
 // $x1 | .. .. = 2
 // $y1 | .. .. .. .. .. = 5

 // Now it looks obvious that, to emulate Javascript's behaviour, we simply need to count the number of bytes
 // in the UTF-16 encoding and divide that by two.


 // Every UTF-16 code unit is 2 bytes, so half the byte length is the number of code units,
 // which is the value Javascript reports as String.length.
 echo '--- These are UTF-16 ---', "\n";
 echo '$x1 Javascript Emulated strlen/2: ', strlen($x1) / 2, "\n";
 echo '$y1 Javascript Emulated strlen/2: ', strlen($y1) / 2, "\n";
 echo '--- End ---', "\n";

 // And we can see that Javascript's length behaviour is emulated.
	<?php

	// Clown Emoji
	// "🤡".length = 2 in Javascript (Firefox)
	// '🤡abc'.length = 5 in Javascript (Firefox)

	// . = Byte, () = the bytes of one code point (a surrogate pair in UTF-16)
	// One code point outside the BMP on its own, and the same code point followed by three ASCII letters.
	$x = '🤡'; // 4 bytes (UTF-8 (....))
	$y = '🤡abc'; // 4 + 3 = 7 bytes (UTF-8 (....) . . .)

	// A PHP string is just a sequence of bytes. These two contain UTF-8 only because that is the
	// encoding the literals had when they were pasted into this file.

	// For each string: byte length (strlen), code point count (mb_strlen decoding as UTF-8),
	// and the raw bytes in hex (bin2hex). The lines are collected first and written in one go.
	echo implode("\n", [
		"--- These are UTF-8 ---",
		"\$x Bytes: ".strlen($x),
		"\$x Unicode Codepoint Count (\"characters\"): ".mb_strlen($x, "UTF-8"),
		"\$x Hex Representation: ".bin2hex($x),
		"\$y Bytes: ".strlen($y),
		"\$y Unicode Codepoint Count (\"characters\"): ".mb_strlen($y, "UTF-8"),
		"\$y Hex Representation: ".bin2hex($y),
		"--- End ---",
	])."\n";

	// Now, let's convert them to UTF-16, where each code unit is 2 bytes: a code point inside the BMP
	// takes one code unit (2 bytes) and any other code point takes a surrogate pair (two code units, 4 bytes).
	//
	// The byte order is spelled out as "UTF-16BE" so the result is big-endian with no byte order mark,
	// independent of how the bare "UTF-16" label is interpreted. A BOM would add 2 bytes to every
	// byte count below and break the arithmetic in the comments.

	$x1 = mb_convert_encoding($x, "UTF-16BE", "UTF-8"); // Still 4 bytes! (UTF-16 (.. ..))
	$y1 = mb_convert_encoding($y, "UTF-16BE", "UTF-8"); // 4 + 6 = 10 bytes (UTF-16 (.. ..) .. .. ..)


	// Same three measurements as for UTF-8; mb_strlen() must be told the new encoding, otherwise it
	// would decode the bytes with the wrong rules.
	echo "--- These are UTF-16 ---"."\n";
	echo "\$x1 Bytes: ".strlen($x1)."\n";
	echo "\$x1 Unicode Codepoint Count (\"characters\"): ".mb_strlen($x1, "UTF-16BE")."\n";
	echo "\$x1 Hex Representation: ".bin2hex($x1)."\n";
	echo "\$y1 Bytes: ".strlen($y1)."\n";
	echo "\$y1 Unicode Codepoint Count (\"characters\"): ".mb_strlen($y1, "UTF-16BE")."\n";
	echo "\$y1 Hex Representation: ".bin2hex($y1)."\n";
	echo "--- End ---"."\n";


	// Now, Javascript's String is sort of like PHP's raw string byte stream, except:
	// >>>>>>
	// JavaScript treats code units as individual characters, while humans generally think in terms of Unicode characters.
	// This has some unfortunate consequences for Unicode characters outside the BMP. Since surrogate pairs consist of
	// two code units, '𝌆'.length == 2, even though there’s only one Unicode character there. The individual surrogate
	// halves are being exposed as if they were characters: '𝌆' == '\uD834\uDF06'.
	// <<<<<< https://mathiasbynens.be/notes/javascript-encoding

	// What this basically means is that while counting Unicode code points would count a surrogate pair (.. ..) as
	// length 1, Javascript counts UTF-16 code units, so the same pair is counted as .. .. = length 2.

	// So, our characters $x1 and $y1 are counted in Javascript as:
	// $x1 | .. .. = 2
	// $y1 | .. .. .. .. .. = 5

	// Now it looks obvious that, to emulate Javascript's behaviour, we simply need to count the number of bytes
	// in the UTF-16 encoding and divide that by two.


	// Half the UTF-16 byte length is the number of 2-byte code units, which is what Javascript's
	// String.length reports. The lines are collected first and written in one go.
	echo implode("\n", [
		"--- These are UTF-16 ---",
		"\$x1 Javascript Emulated strlen/2: ".(strlen($x1)/2),
		"\$y1 Javascript Emulated strlen/2: ".(strlen($y1)/2),
		"--- End ---",
	])."\n";

	// And we can see that Javascript's length behaviour is emulated.