Skip to content

Instantly share code, notes, and snippets.

Created August 28, 2022 06:00
Show Gist options
  • Save SergeiMeza/4eef702a14551c04c74de9b4020b7210 to your computer and use it in GitHub Desktop.
Save SergeiMeza/4eef702a14551c04c74de9b4020b7210 to your computer and use it in GitHub Desktop.
Elasticsearch Japanese index with autocomplete suggestions and full-text search
PUT ja
"settings": {
"analysis": {
// 1. custom character filters...
"char_filter": {
// 1-1. icu_normalizer (NFKC): normalizes full-width and half-width character variants to a single form
"ja_normalize": {
"type": "icu_normalizer",
"name": "nfkc",
"mode": "compose"
// 1-2. mapping of kana to romaji
"kana_to_romaji": {
"type": "mapping",
"mappings": [
// 2. custom tokenizers...
"tokenizer": {
// 2-1. kuromoji_tokenizer in normal mode to segment text
"ja_kuromoji_normal": {
"type": "kuromoji_tokenizer",
"mode": "normal"
// 2-2. kuromoji_tokenizer in search mode: splits compound words into smaller pieces for search and, with discard_compound_token, drops the original compound token
"ja_kuromoji_search": {
"type": "kuromoji_tokenizer",
"mode": "search",
"discard_compound_token": true
// 2-3. ngram tokenizer that emits fixed-length 2-character grams (bigrams) for search
"ja_ngram": {
"type": "ngram",
"min_gram": 2,
"max_gram": 2,
"token_chars": [
// 3. custom token filters...
"filter": {
// 3-1. Expand each token into its edge n-grams (1 to 10 characters long)
"edge_ngram": {
"type": "edge_ngram",
"min_gram": "1",
"max_gram": "10"
// 3-2. Convert Japanese tokens to their reading form, written in romaji
"readingform": {
"type": "kuromoji_readingform",
"use_romaji": "true"
// 4. custom analyzers...
"analyzer": {
// 4-1. standard autocomplete analyzer (used for usernames, etc.)
"standard_autocomplete_analyzer": {
"type": "custom",
"tokenizer": "standard", // standard tokenizer
"filter": [
"lowercase", // normalize alphabet to lowercase
"edge_ngram" // prefix search
// 4-2. Japanese suggestions index analyzer
"ja_suggest_index_analyzer": {
"type": "custom",
"char_filter": [
"tokenizer": "ja_kuromoji_normal",
"filter": [
"lowercase", // normalize alphabet to lowercase
"edge_ngram" // for prefix search
// 4-3. Japanese suggestions search analyzer
"ja_suggest_search_analyzer": {
"type": "custom",
"char_filter": [
"tokenizer": "ja_kuromoji_normal",
"filter": [
// 4-4. Japanese readingform index analyzer
"ja_readingform_index_analyzer": {
"type": "custom",
"char_filter": [
"tokenizer": "ja_kuromoji_normal",
"filter": [
// 4-5. Japanese readingform search analyzer
"ja_readingform_search_analyzer": {
"type": "custom",
"char_filter": [
"tokenizer": "ja_kuromoji_normal",
"filter": [
// 4-6. Full-text kuromoji Japanese index analyzer
"ja_kuromoji_index_analyzer": {
"type": "custom",
"char_filter": [
"tokenizer": "ja_kuromoji_search",
"filter": [
"kuromoji_baseform", // handle Japanese adjectives and verbs
"kuromoji_part_of_speech", // handle Japanese stoptags
"cjk_width", // normalizes half-width and full-width characters
"ja_stop", // handles Japanese stop words
"kuromoji_stemmer", // handles katakana variations
"lowercase" // lowercases alphabet
// 4-7. Full-text kuromoji Japanese search analyzer
"ja_kuromoji_search_analyzer": {
"type": "custom",
"char_filter": [
"tokenizer": "ja_kuromoji_search",
"filter": [
// 4-8. Full-text ngram Japanese index analyzer
"ja_ngram_index_analyzer": {
"type": "custom",
"char_filter": [
"tokenizer": "ja_ngram",
"filter": [
// 4-9. Full-text ngram Japanese search analyzer
"ja_ngram_search_analyzer": {
"type": "custom",
"char_filter": [
"tokenizer": "ja_ngram",
"filter": [
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment