Created
August 23, 2021 06:42
-
-
Save Tenderfeel/a5fb2f55de4ba57c96eae5f950b256ea to your computer and use it in GitHub Desktop.
Markdownファイルから検索用キーワードを生成する
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * /src/data/blog/ にある markdownファイルと
 * /src/data/json/author.json から キーワードを抽出した
 * flexsearch_index.json を生成する
 */
import { readdir, readFile, writeFile } from 'fs/promises' | |
import path from 'path' | |
import { tokenize } from 'kuromojin' | |
import removeMarkdown from 'remove-markdown' | |
import grayMatter from 'gray-matter' | |
import chalk from 'chalk' | |
import ora from 'ora' | |
// Input/output locations for the index build.
const MD_DIRECTORY_PATH = './src/data/blog'
const AUTHOR_JSON_PATH = './src/data/json/author.json'
const DIST_FILE_PATH = './src/data/flexsearch_index.json'
const DICT_DIRECTORY_PATH = './src/data/dict'

const spinner = ora(`${chalk.bold('flexsearch_index.json create')}`).start(
  `Read ${chalk.bold(MD_DIRECTORY_PATH)} directory`
)
try {
  // Keep only regular files; readdir also returns subdirectories.
  const files = await readdir(MD_DIRECTORY_PATH, { withFileTypes: true })
  const filterFiles = files.filter((file) => file.isFile())
  spinner
    // BUG FIX: previously reported files.length, which counted directories too.
    .succeed(`Read ${chalk.bold(filterFiles.length)} markdown files.`)
    .start(`Read ${chalk.bold('author.json')}`)
  const author = await readFile(path.resolve(AUTHOR_JSON_PATH))
  const authorJSON = JSON.parse(author.toString('utf8'))
  const documents = []
  spinner.succeed().start('Tokenize markdown files...')
  // Sequential on purpose: Promise.all would hide per-file progress.
  for (const file of filterFiles) {
    spinner.start(`Start ${chalk.bold(`${file.name} tokenize`)} process...`)
    documents.push(await parseMarkdownFile(file, authorJSON))
    spinner.succeed(`${chalk.bold(file.name)} tokenized`)
  }
  spinner
    .succeed(`complete ${chalk.bold('tokenize')} process!`)
    .start('JSON file export...')
  await writeFile(
    DIST_FILE_PATH,
    JSON.stringify({
      documents,
    })
  )
  spinner.succeed('JSON file export')
} catch (err) {
  spinner.fail()
  console.error(err)
}
/**
 * Extract the search-index fields from one markdown file.
 *
 * Reads the file, strips front matter and markdown syntax, resolves the
 * author record, and tokenizes "title\n\ncontent" into search keywords.
 *
 * @param {import('fs').Dirent} file - directory entry of the markdown file
 * @param {Array<{id: string, name: string, name_reading?: string}>} authorJSON - author master data
 * @returns {Promise<object>} document record for the flexsearch index
 */
async function parseMarkdownFile(file, authorJSON) {
  const fp = path.join(MD_DIRECTORY_PATH, file.name)
  const markdown = await readFile(fp)
  const matter = grayMatter(markdown)
  const content = removeMarkdown(matter.content)
  const author = authorJSON.find((auth) => auth.id === matter.data.author)
  // BUG FIX: `${author?.name}` put the literal string "undefined" into the
  // index when no author matched; fall back to an empty string instead.
  const authorLabel = author
    ? author.name + (author.name_reading ? `(${author.name_reading})` : '')
    : ''
  const str = `${matter.data.title}\n\n` + content
  const document = {
    id: matter.data.id,
    title: matter.data.title,
    date: matter.data.date,
    author: authorLabel,
    tag: matter.data.tags,
    keywords: '',
  }
  const tokens = await tokenize(str, {
    dicPath: path.resolve(DICT_DIRECTORY_PATH),
  })
  document.keywords = createKeywords(tokens)
  return document
}
/**
 * Build a space-separated, de-duplicated keyword string from kuromoji tokens.
 *
 * For every token that passes tokenFilter, collects the surface form, the
 * dictionary (basic) form when it differs and is known, and a hiragana
 * rendering of the reading when it differs from the surface form.
 *
 * @param {Array<object>} tokens - kuromoji token objects
 * @returns {string} keywords joined with single spaces, in first-seen order
 */
function createKeywords(tokens) {
  // Set preserves insertion order and makes dedupe O(1)
  // (was an O(n) Array#includes scan per candidate keyword).
  const seen = new Set()
  for (const token of tokens.filter(tokenFilter)) {
    // 表層形 (surface form)
    seen.add(token.surface_form)
    // 読み (reading) — kuromoji may omit it; fall back to the surface form
    const reading = token.reading || token.surface_form
    // カタカナ→ひらがな: the katakana block sits 0x60 (96) above hiragana
    const hira = reading.replace(/[\u30A2-\u30F3]/g, (m) =>
      String.fromCharCode(m.charCodeAt(0) - 96)
    )
    // 基本形 (basic form), skipping the '*' placeholder for unknown words
    if (token.surface_form !== token.basic_form && token.basic_form !== '*') {
      seen.add(token.basic_form)
    }
    // ひらがな reading, only when it adds something beyond the surface form
    if (token.surface_form !== hira) {
      seen.add(hira)
    }
  }
  return [...seen].join(' ')
}
/**
 * Decide whether a kuromoji token should contribute search keywords.
 *
 * Keeps nouns (名詞), adjectives (形容詞), and verbs (動詞) except plain
 * conjugations; drops symbol-only tokens, tokens shorter than 2 characters,
 * and known markdown-image placeholder words.
 *
 * @param {object} token - kuromoji token (pos, surface_form, conjugated_form)
 * @returns {boolean} true when the token should be indexed
 */
function tokenFilter(token) {
  // Placeholder alt-text words injected by the markdown image pipeline.
  const ignoreWords = ['GATSBYEMPTYALT', 'GATSBYEMPTYALTPresenter']
  // Tokens made only of ASCII / full-width / CJK punctuation and symbols.
  // (No /g flag: .test on a global regex advances lastIndex statefully.)
  const symbolOnly =
    /^[!-/:-@[-`{-~、-〜”’・.,_\s\u02B0-\u02FF\u2010-\u27FF\u3001-\u303F\uFF01-\uFF0F\uFF1A-\uFF1E\uFF3B-\uFF40\uFF5B-\uFF65]+$/
  if (
    !['名詞', '動詞', '形容詞'].includes(token.pos) ||
    symbolOnly.test(token.surface_form) ||
    token.surface_form.length < 2 ||
    ignoreWords.includes(token.surface_form)
  ) {
    return false
  }
  switch (token.pos) {
    case '名詞':
    case '形容詞':
      return true
    case '動詞':
      // Plain verb conjugations add little search value; keep the rest.
      return !['基本形', '連用形', '仮定形'].includes(token.conjugated_form)
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment