Created
November 20, 2018 11:51
-
-
Save bumcru/729632c7587f16c69d40a878c0bde750 to your computer and use it in GitHub Desktop.
goで禁則処理
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
func splitByBlock(text string) (words []string) { | |
if text == "" { | |
return []string{} | |
} | |
rs := []rune(text) | |
var tmp string | |
for _, r := range rs { | |
// 空白文字の前後は無条件に分割 | |
if isEmptyString(string(r)) { | |
if tmp != "" { | |
words = append(words, tmp) | |
tmp = "" | |
} | |
words = append(words, string(r)) | |
continue | |
} | |
// 1バイト文字の場合、もしくは、行末禁止文字の場合は分割しない | |
if len(string(r)) == 1 || !isValidAsEnd(string(r)) { | |
tmp += string(r) | |
continue | |
} | |
tmp += string(r) | |
// 1文字目が行頭禁止文字の場合、1つ前の単語の末尾に付加する | |
if !isValidAsBullet(string([]rune(tmp)[0])) { | |
if len(words) == 0 { | |
words = []string{tmp} | |
} else { | |
words[len(words)-1] += tmp | |
} | |
} else { | |
words = append(words, tmp) | |
} | |
tmp = "" | |
} | |
if tmp != "" { | |
words = append(words, tmp) | |
} | |
return | |
} | |
// emptyStrings is the set of single-character strings treated as whitespace.
// isEmptyString only checks whether a key is present.
//
// The ideographic (full-width) space is written as an escape so it cannot be
// confused with, or folded into, the ASCII space; two identical keys in a map
// literal do not compile.
// NOTE(review): U+3000 was reconstructed from a duplicated " " entry; confirm
// it is the intended second key.
var emptyStrings = map[string]bool{
	" ":      true, // ASCII space
	"\u3000": true, // ideographic space
	"\t":     true,
	"\n":     true,
}
func isEmptyString(s string) (res bool) { | |
_, res = emptyStrings[s] | |
return | |
} | |
// invalidBullets is the set of characters that must not begin a line
// (line-start prohibited characters in Japanese line breaking).
// isValidAsBullet only checks whether a key is present.
//
// Half-width and full-width variants are written as \u escapes so they cannot
// be folded into their look-alikes; identical keys in a map literal do not
// compile.
// NOTE(review): the escaped variants were reconstructed from entries that
// appeared twice; confirm them against the intended character list.
var invalidBullets = map[string]bool{
	// Sentence punctuation.
	"、": true, "。": true, ".": true,
	// Small hiragana.
	"ぁ": true, "ぃ": true, "ぅ": true, "ぇ": true, "ぉ": true,
	"っ": true, "ゃ": true, "ゅ": true, "ょ": true,
	// Small katakana, full-width (U+30A1 ... U+30E7).
	"\u30A1": true, "\u30A3": true, "\u30A5": true, "\u30A7": true, "\u30A9": true,
	"\u30C3": true, "\u30E3": true, "\u30E5": true, "\u30E7": true,
	// Small katakana, half-width (U+FF67 ... U+FF6F).
	"\uFF67": true, "\uFF68": true, "\uFF69": true, "\uFF6A": true, "\uFF6B": true,
	"\uFF6F": true, "\uFF6C": true, "\uFF6D": true, "\uFF6E": true,
	// Closing brackets, ASCII.
	")": true, "}": true, "]": true, ">": true,
	// Closing brackets, CJK and full-width
	// (U+FF09 paren, U+FF5D brace, U+FF1E greater-than, U+FF3D square bracket).
	"」": true, "』": true, "\uFF09": true, "\uFF5D": true, "】": true, "\uFF1E": true, "≫": true, "\uFF3D": true,
	// Middle dot, long-vowel mark, dashes.
	"・": true, "ー": true, "―": true, "-": true,
	// Colon, semicolon, slash (ASCII) and full-width slash U+FF0F.
	":": true, ";": true, "/": true, "\uFF0F": true,
	// Iteration marks, then ASCII and full-width (U+FF01, U+FF1F) ! and ?.
	"ゝ": true, "々": true, "!": true, "?": true, "\uFF01": true, "\uFF1F": true,
}
func isValidAsBullet(s string) bool { | |
_, res := invalidBullets[s] | |
return !res | |
} | |
// invalidEndChars is the set of characters that must not end a line
// (line-end prohibited characters, i.e. opening brackets).
// isValidAsEnd only checks whether a key is present.
//
// Full-width variants are written as \u escapes so they cannot be folded into
// their ASCII look-alikes; identical keys in a map literal do not compile.
// NOTE(review): the escaped variants were reconstructed from entries that
// appeared twice; confirm them against the intended character list.
var invalidEndChars = map[string]bool{
	// Opening brackets, ASCII.
	"(": true, "{": true, "[": true, "<": true,
	// Opening brackets, CJK and full-width
	// (U+FF08 paren, U+FF5B brace, U+FF1C less-than, U+FF3B square bracket).
	"「": true, "『": true, "\uFF08": true, "\uFF5B": true, "【": true, "\uFF1C": true, "≪": true, "\uFF3B": true,
}
func isValidAsEnd(s string) bool { | |
_, res := invalidEndChars[s] | |
return !res | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment