Created
August 23, 2021 06:42
-
-
Save Tenderfeel/a5fb2f55de4ba57c96eae5f950b256ea to your computer and use it in GitHub Desktop.
Markdownファイルから検索用キーワードを生成する
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * /src/data/blog/ にある markdownファイルと
 * /src/data/json/author.json から キーワードを抽出した
 * flexsearch_index.json を生成する
 */
import { readdir, readFile, writeFile } from 'fs/promises' | |
import path from 'path' | |
import { tokenize } from 'kuromojin' | |
import removeMarkdown from 'remove-markdown' | |
import grayMatter from 'gray-matter' | |
import chalk from 'chalk' | |
import ora from 'ora' | |
// Input/output locations for the index build.
const MD_DIRECTORY_PATH = './src/data/blog'
const AUTHOR_JSON_PATH = './src/data/json/author.json'
const DIST_FILE_PATH = './src/data/flexsearch_index.json'
const DICT_DIRECTORY_PATH = './src/data/dict'

const spinner = ora(`${chalk.bold('flexsearch_index.json create')}`).start(
  `Read ${chalk.bold(MD_DIRECTORY_PATH)} directory`
)
try {
  // Keep only regular files; readdir also returns subdirectories.
  const files = await readdir(MD_DIRECTORY_PATH, { withFileTypes: true })
  const filterFiles = files.filter((file) => file.isFile())
  spinner
    // BUG FIX: previously reported files.length, which counted directories too.
    .succeed(`Read ${chalk.bold(filterFiles.length)} markdown files.`)
    .start(`Read ${chalk.bold('author.json')}`)
  const author = await readFile(path.resolve(AUTHOR_JSON_PATH))
  const authorJSON = JSON.parse(author.toString('utf8'))
  const documents = []
  spinner.succeed().start('Tokenize markdown files...')
  // Sequential on purpose: Promise.all would hide per-file progress.
  for (const file of filterFiles) {
    spinner.start(`Start ${chalk.bold(`${file.name} tokenize`)} process...`)
    documents.push(await parseMarkdownFile(file, authorJSON))
    spinner.succeed(`${chalk.bold(file.name)} tokenized`)
  }
  spinner
    .succeed(`complete ${chalk.bold('tokenize')} process!`)
    .start('JSON file export...')
  await writeFile(
    DIST_FILE_PATH,
    JSON.stringify({
      documents,
    })
  )
  spinner.succeed('JSON file export')
} catch (err) {
  spinner.fail()
  console.error(err)
}
/**
 * Extract the search-index fields from one markdown file.
 *
 * Reads the file, strips front matter and markdown syntax, resolves the
 * author record, and tokenizes "title\n\ncontent" into search keywords.
 *
 * @param {import('fs').Dirent} file - directory entry of the markdown file
 * @param {Array<{id: string, name: string, name_reading?: string}>} authorJSON - author master data
 * @returns {Promise<object>} document record for the flexsearch index
 */
async function parseMarkdownFile(file, authorJSON) {
  const fp = path.join(MD_DIRECTORY_PATH, file.name)
  const markdown = await readFile(fp)
  const matter = grayMatter(markdown)
  const content = removeMarkdown(matter.content)
  const author = authorJSON.find((auth) => auth.id === matter.data.author)
  // BUG FIX: `${author?.name}` put the literal string "undefined" into the
  // index when no author matched; fall back to an empty string instead.
  const authorLabel = author
    ? author.name + (author.name_reading ? `(${author.name_reading})` : '')
    : ''
  const str = `${matter.data.title}\n\n` + content
  const document = {
    id: matter.data.id,
    title: matter.data.title,
    date: matter.data.date,
    author: authorLabel,
    tag: matter.data.tags,
    keywords: '',
  }
  const tokens = await tokenize(str, {
    dicPath: path.resolve(DICT_DIRECTORY_PATH),
  })
  document.keywords = createKeywords(tokens)
  return document
}
/**
 * Build a space-separated, de-duplicated keyword string from kuromoji tokens.
 *
 * For every token that passes tokenFilter, collects the surface form, the
 * dictionary (basic) form when it differs and is known, and a hiragana
 * rendering of the reading when it differs from the surface form.
 *
 * @param {Array<object>} tokens - kuromoji token objects
 * @returns {string} keywords joined with single spaces, in first-seen order
 */
function createKeywords(tokens) {
  // Set preserves insertion order and makes dedupe O(1)
  // (was an O(n) Array#includes scan per candidate keyword).
  const seen = new Set()
  for (const token of tokens.filter(tokenFilter)) {
    // 表層形 (surface form)
    seen.add(token.surface_form)
    // 読み (reading) — kuromoji may omit it; fall back to the surface form
    const reading = token.reading || token.surface_form
    // カタカナ→ひらがな: the katakana block sits 0x60 (96) above hiragana
    const hira = reading.replace(/[\u30A2-\u30F3]/g, (m) =>
      String.fromCharCode(m.charCodeAt(0) - 96)
    )
    // 基本形 (basic form), skipping the '*' placeholder for unknown words
    if (token.surface_form !== token.basic_form && token.basic_form !== '*') {
      seen.add(token.basic_form)
    }
    // ひらがな reading, only when it adds something beyond the surface form
    if (token.surface_form !== hira) {
      seen.add(hira)
    }
  }
  return [...seen].join(' ')
}
/**
 * Decide whether a kuromoji token should contribute search keywords.
 *
 * Keeps nouns (名詞), adjectives (形容詞), and verbs (動詞) except plain
 * conjugations; drops symbol-only tokens, tokens shorter than 2 characters,
 * and known markdown-image placeholder words.
 *
 * @param {object} token - kuromoji token (pos, surface_form, conjugated_form)
 * @returns {boolean} true when the token should be indexed
 */
function tokenFilter(token) {
  // Placeholder alt-text words injected by the markdown image pipeline.
  const ignoreWords = ['GATSBYEMPTYALT', 'GATSBYEMPTYALTPresenter']
  // Tokens made only of ASCII / full-width / CJK punctuation and symbols.
  // (No /g flag: .test on a global regex advances lastIndex statefully.)
  const symbolOnly =
    /^[!-/:-@[-`{-~、-〜”’・.,_\s\u02B0-\u02FF\u2010-\u27FF\u3001-\u303F\uFF01-\uFF0F\uFF1A-\uFF1E\uFF3B-\uFF40\uFF5B-\uFF65]+$/
  if (
    !['名詞', '動詞', '形容詞'].includes(token.pos) ||
    symbolOnly.test(token.surface_form) ||
    token.surface_form.length < 2 ||
    ignoreWords.includes(token.surface_form)
  ) {
    return false
  }
  switch (token.pos) {
    case '名詞':
    case '形容詞':
      return true
    case '動詞':
      // Plain verb conjugations add little search value; keep the rest.
      return !['基本形', '連用形', '仮定形'].includes(token.conjugated_form)
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment