Created
February 11, 2020 07:33
-
-
Save zonesan/a7a079707e9992d52e011afb15f354e4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html>
<html>
<!-- Declare the encoding explicitly: the page text and the script's string
     literals contain CJK characters and must not depend on encoding sniffing. -->
<meta charset="utf-8">
<script>
/**
 * Report whether a string consists solely of 7-bit ASCII characters.
 *
 * @param {string} str - Text to test. The empty string counts as ASCII.
 * @returns {boolean} True when every code unit lies in U+0000..U+007F.
 */
function is_ascii(str) {
  return /^[\x00-\x7F]*$/.test(str);
}
/**
 * Report whether a character is one of the punctuation marks (or the space)
 * that this script treats as a separator.
 * The misspelled name is kept because other code calls it by this name.
 *
 * @param {string} c - A single character.
 * @returns {boolean} True when `c` is one of the recognised separators.
 */
function is_seperator(c) {
  // Non-ASCII entries are written as escapes so the list survives re-encoding
  // or Unicode normalisation of this file (which would otherwise fold the
  // fullwidth forms into duplicates of their ASCII counterparts).
  const separators = [
    " ", ",", ".", ";", "?",
    "\u3002", // ideographic full stop
    "\uFF0C", // fullwidth comma
    "\uFF1B", // fullwidth semicolon
    "\uFF1F", // fullwidth question mark
    "\u300A", // left double angle bracket
    "\u300B"  // right double angle bracket
  ];
  return separators.indexOf(c) > -1;
}
/**
 * Pick one element of an array at a random index.
 *
 * @param {Array} arr - Candidates to choose from.
 * @returns {*} The chosen element, or `undefined` when `arr` is empty.
 */
function random_choice(arr) {
  const index = Math.floor(Math.random() * arr.length);
  return arr[index];
}
/**
 * Shuffle an array in place (Fisher-Yates) and return it.
 *
 * @param {Array} array - Array to shuffle; it is modified, not copied.
 * @returns {Array} The same array instance, now in random order.
 */
function random_shuffle(array) {
  // Walk from the last slot down to the first; each slot receives an element
  // drawn from the not-yet-fixed prefix (which includes the slot itself).
  for (let end = array.length - 1; end >= 0; end -= 1) {
    const pick = Math.floor(Math.random() * (end + 1));
    [array[end], array[pick]] = [array[pick], array[end]];
  }
  return array;
}
/**
 * Split text into tokens: each maximal run of ASCII characters becomes one
 * token, and every non-ASCII character becomes a token of its own.
 *
 * @param {string} src_txt - Text to split.
 * @returns {string[]} Tokens in their original order; joining them with ""
 *     reproduces `src_txt`.
 */
function tokenize(src_txt) {
  const token_list = [];
  let ascii_run = "";
  // for...of walks the string by code point, so a surrogate pair stays whole.
  for (const c of src_txt) {
    if (is_ascii(c)) {
      ascii_run += c;
      continue;
    }
    // A non-ASCII character ends any ASCII run collected so far.
    if (ascii_run !== "") {
      token_list.push(ascii_run);
      ascii_run = "";
    }
    token_list.push(c);
  }
  if (ascii_run !== "") {
    token_list.push(ascii_run);
  }
  return token_list;
}
/**
 * Scramble a token list locally: cut it into consecutive chunks of 2 or 3
 * tokens (size chosen at random per chunk; the last chunk may be shorter)
 * and shuffle the tokens inside each chunk.
 *
 * @param {string[]} token_list - Tokens to scramble; not modified.
 * @returns {string[]} A new array holding the same tokens, where no token
 *     has left its chunk.
 */
function reorder(token_list) {
  const chunk_sizes = [2, 3];
  const token_list_reordered = [];
  let i = 0;
  while (i < token_list.length) {
    // Clamp so the final chunk stops at the end of the list.
    const j = Math.min(i + random_choice(chunk_sizes), token_list.length);
    // slice() copies, so shuffling the chunk leaves token_list untouched.
    const chunk = token_list.slice(i, j);
    random_shuffle(chunk);
    token_list_reordered.push(...chunk);
    i = j;
  }
  return token_list_reordered;
}
/**
 * Scramble a text: split it at boundary characters, tokenize and reorder
 * each stretch between boundaries, and glue everything back together with
 * the boundary characters left in their original positions.
 *
 * @param {string} src_txt - Text to scramble.
 * @returns {string} The scrambled text.
 */
function sentencize(src_txt) {
  const reordered_txt = [];
  let sentence = "";
  for (const c of src_txt) {
    // Boundary test: an explicit separator, or any character that unary plus
    // converts to a number. That covers ASCII digits and also whitespace
    // (a whitespace-only string converts to 0), so neither is ever shuffled.
    if (is_seperator(c) || !Number.isNaN(+c)) {
      reordered_txt.push(...reorder(tokenize(sentence)));
      reordered_txt.push(c);
      sentence = "";
    } else {
      sentence += c;
    }
  }
  // Flush the trailing stretch that was not closed by a boundary.
  if (sentence !== "") {
    reordered_txt.push(...reorder(tokenize(sentence)));
  }
  return reordered_txt.join("");
}
</script>
<body>
<h1>研表究明,汉字的序顺并不定一能影阅响读</h1>
<!-- Source text to scramble; the user may edit it before clicking the button. -->
<textarea id="src" name="message" rows="10" cols="30">
研究表明,汉字的顺序并不一定能影响阅读。
</textarea>
<!-- Read the textarea through .value so the user's current text is used
     (.innerHTML only reflects the initial markup, HTML-escaped), and write
     the result through .textContent so it is shown as text, not parsed as HTML. -->
<button type="button"
onclick="document.getElementById('demo').textContent = sentencize(document.getElementById('src').value)">
变!</button>
<p id="demo"></p>
</body>
</html>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment