Skip to content

Instantly share code, notes, and snippets.

@lam0819
Forked from hanxiao/testRegex.js
Created August 15, 2024 04:59
Show Gist options
  • Save lam0819/e28f468c4ae205aaabd1fb2457fb4546 to your computer and use it in GitHub Desktop.
Save lam0819/e28f468c4ae205aaabd1fb2457fb4546 to your computer and use it in GitHub Desktop.
Use regex to do chunking by using all semantic cues
// Used in https://jina.ai/tokenizer (Aug. 14th version)
// Define variables for magic numbers
const MAX_HEADING_LENGTH = 6;
const MAX_HEADING_CONTENT_LENGTH = 200;
const MAX_HEADING_UNDERLINE_LENGTH = 200;
const MAX_HTML_HEADING_ATTRIBUTES_LENGTH = 100;
const MAX_LIST_ITEM_LENGTH = 200;
const MAX_NESTED_LIST_ITEMS = 5;
const MAX_LIST_INDENT_SPACES = 7;
const MAX_BLOCKQUOTE_LINE_LENGTH = 200;
const MAX_BLOCKQUOTE_LINES = 10;
const MAX_CODE_BLOCK_LENGTH = 1000;
const MAX_CODE_LANGUAGE_LENGTH = 20;
const MAX_INDENTED_CODE_LINES = 20;
const MAX_TABLE_CELL_LENGTH = 200;
const MAX_TABLE_ROWS = 20;
const MAX_HTML_TABLE_LENGTH = 2000;
const MIN_HORIZONTAL_RULE_LENGTH = 3;
const MAX_SENTENCE_LENGTH = 300;
const MAX_QUOTED_TEXT_LENGTH = 300;
const MAX_PARENTHETICAL_CONTENT_LENGTH = 200;
const MAX_NESTED_PARENTHESES = 5;
const MAX_MATH_INLINE_LENGTH = 100;
const MAX_MATH_BLOCK_LENGTH = 500;
const MAX_PARAGRAPH_LENGTH = 1000;
const MAX_STANDALONE_LINE_LENGTH = 400;
const MAX_HTML_TAG_ATTRIBUTES_LENGTH = 100;
const MAX_HTML_TAG_CONTENT_LENGTH = 1000;
// Read the regex and test text from files
export const chunkRegex = new RegExp(
"(" +
// 1. Headings (Setext-style, Markdown, and HTML-style, with length constraints)
`(?:^(?:[#*=-]{1,${MAX_HEADING_LENGTH}}|\\w[^\\r\\n]{0,${MAX_HEADING_CONTENT_LENGTH}}\\r?\\n[-=]{2,${MAX_HEADING_UNDERLINE_LENGTH}}|<h[1-6][^>]{0,${MAX_HTML_HEADING_ATTRIBUTES_LENGTH}}>)[^\\r\\n]{1,${MAX_HEADING_CONTENT_LENGTH}}(?:</h[1-6]>)?(?:\\r?\\n|$))` +
"|" +
// 2. List items (bulleted, numbered, lettered, or task lists, including nested, up to three levels, with length constraints)
`(?:(?:^|\\r?\\n)[ \\t]{0,3}(?:[-*+•]|\\d{1,3}\\.\\w\\.|\\[[ xX]\\])[ \\t]+[^\\r\\n]{1,${MAX_LIST_ITEM_LENGTH}}` +
`(?:(?:\\r?\\n[ \\t]{2,5}(?:[-*+•]|\\d{1,3}\\.\\w\\.|\\[[ xX]\\])[ \\t]+[^\\r\\n]{1,${MAX_LIST_ITEM_LENGTH}}){0,${MAX_NESTED_LIST_ITEMS}}` +
`(?:\\r?\\n[ \\t]{4,${MAX_LIST_INDENT_SPACES}}(?:[-*+•]|\\d{1,3}\\.\\w\\.|\\[[ xX]\\])[ \\t]+[^\\r\\n]{1,${MAX_LIST_ITEM_LENGTH}}){0,${MAX_NESTED_LIST_ITEMS}})?)` +
"|" +
// 3. Block quotes (including nested quotes and citations, up to three levels, with length constraints)
`(?:(?:^>(?:>|\\s{2,}){0,2}[^\\r\\n]{0,${MAX_BLOCKQUOTE_LINE_LENGTH}}\\r?\\n?){1,${MAX_BLOCKQUOTE_LINES}})` +
"|" +
// 4. Code blocks (fenced, indented, or HTML pre/code tags, with length constraints)
`(?:(?:^|\\r?\\n)(?:\`\`\`|~~~)(?:\\w{0,${MAX_CODE_LANGUAGE_LENGTH}})?\\r?\\n[\\s\\S]{0,${MAX_CODE_BLOCK_LENGTH}}?(?:\`\`\`|~~~)\\r?\\n?` +
`|(?:(?:^|\\r?\\n)(?: {4}|\\t)[^\\r\\n]{0,${MAX_LIST_ITEM_LENGTH}}(?:\\r?\\n(?: {4}|\\t)[^\\r\\n]{0,${MAX_LIST_ITEM_LENGTH}}){0,${MAX_INDENTED_CODE_LINES}}\\r?\\n?)` +
`|(?:<pre>(?:<code>)?[\\s\\S]{0,${MAX_CODE_BLOCK_LENGTH}}?(?:</code>)?</pre>))` +
"|" +
// 5. Tables (Markdown, grid tables, and HTML tables, with length constraints)
`(?:(?:^|\\r?\\n)(?:\\|[^\\r\\n]{0,${MAX_TABLE_CELL_LENGTH}}\\|(?:\\r?\\n\\|[-:]{1,${MAX_TABLE_CELL_LENGTH}}\\|){0,1}(?:\\r?\\n\\|[^\\r\\n]{0,${MAX_TABLE_CELL_LENGTH}}\\|){0,${MAX_TABLE_ROWS}}` +
`|<table>[\\s\\S]{0,${MAX_HTML_TABLE_LENGTH}}?</table>))` +
"|" +
// 6. Horizontal rules (Markdown and HTML hr tag)
`(?:^(?:[-*_]){${MIN_HORIZONTAL_RULE_LENGTH},}\\s*$|<hr\\s*/?>)` +
"|" +
// 10. Standalone lines or phrases (including single-line blocks and HTML elements, with length constraints)
`(?:^(?:<[a-zA-Z][^>]{0,${MAX_HTML_TAG_ATTRIBUTES_LENGTH}}>)?[^\\r\\n]{1,${MAX_STANDALONE_LINE_LENGTH}}(?:</[a-zA-Z]+>)?(?:\\r?\\n|$))` +
"|" +
// 7. Sentences or phrases ending with punctuation (including ellipsis and Unicode punctuation)
`(?:[^\\r\\n]{1,${MAX_SENTENCE_LENGTH}}(?:[.!?…]|\\.{3}|[\\u2026\\u2047-\\u2049]|[\\p{Emoji_Presentation}\\p{Extended_Pictographic}])(?=(?=\\s)(?=\\s*[A-Z])|(?=\\r?\\n)|(?=\\Z)))` +
"|" +
// 8. Quoted text, parenthetical phrases, or bracketed content (with length constraints)
"(?:" +
`\"\"\"[^\\r\\n]{0,${MAX_QUOTED_TEXT_LENGTH}}\"\"\"` +
`|['\"\`'"][^\\r\\n]{0,${MAX_QUOTED_TEXT_LENGTH}}['\"\`'"]` +
`|\\([^\\r\\n()]{0,${MAX_PARENTHETICAL_CONTENT_LENGTH}}(?:\\([^\\r\\n()]{0,${MAX_PARENTHETICAL_CONTENT_LENGTH}}\\)[^\\r\\n()]{0,${MAX_PARENTHETICAL_CONTENT_LENGTH}}){0,${MAX_NESTED_PARENTHESES}}\\)` +
`|\\[[^\\r\\n\\[\\]]{0,${MAX_PARENTHETICAL_CONTENT_LENGTH}}(?:\\[[^\\r\\n\\[\\]]{0,${MAX_PARENTHETICAL_CONTENT_LENGTH}}\\][^\\r\\n\\[\\]]{0,${MAX_PARENTHETICAL_CONTENT_LENGTH}}){0,${MAX_NESTED_PARENTHESES}}\\]` +
`|\\$[^\\r\\n$]{0,${MAX_MATH_INLINE_LENGTH}}\\$` +
`|\`[^\`\\r\\n]{0,${MAX_MATH_INLINE_LENGTH}}\`` +
")" +
"|" +
// 9. Paragraphs (with length constraints)
`(?:(?<=\\r?\\n\\r?\\n|\\A)(?:<p>)?(?:(?!\\r?\\n\\r?\\n|\\z).){1,${MAX_PARAGRAPH_LENGTH}}(?:</p>)?(?=\\r?\\n\\r?\\n|\\z))` +
"|" +
// 11. HTML-like tags and their content (including self-closing tags and attributes, with length constraints)
`(?:<[a-zA-Z][^>]{0,${MAX_HTML_TAG_ATTRIBUTES_LENGTH}}(?:>[\\s\\S]{0,${MAX_HTML_TAG_CONTENT_LENGTH}}?</[a-zA-Z]+>|\\s*/>))` +
"|" +
// 12. LaTeX-style math expressions (inline and block, with length constraints)
`(?:(?:\\$\\$[\\s\\S]{0,${MAX_MATH_BLOCK_LENGTH}}?\\$\\$)|(?:\\$[^\\$\\r\\n]{0,${MAX_MATH_INLINE_LENGTH}}\\$))` +
"|" +
// 14. Fallback for any remaining content (with length constraints)
`(?:[^\\r\\n]{1,${MAX_STANDALONE_LINE_LENGTH}})` +
")",
"gm"
);
@lam0819
Copy link
Author

lam0819 commented Aug 15, 2024

<?php

// Define variables for magic numbers
const MAX_HEADING_LENGTH = 6;
const MAX_HEADING_CONTENT_LENGTH = 200;
const MAX_HEADING_UNDERLINE_LENGTH = 200;
const MAX_HTML_HEADING_ATTRIBUTES_LENGTH = 100;
const MAX_LIST_ITEM_LENGTH = 200;
const MAX_NESTED_LIST_ITEMS = 5;
const MAX_LIST_INDENT_SPACES = 7;
const MAX_BLOCKQUOTE_LINE_LENGTH = 200;
const MAX_BLOCKQUOTE_LINES = 10;
const MAX_CODE_BLOCK_LENGTH = 1000;
const MAX_CODE_LANGUAGE_LENGTH = 20;
const MAX_INDENTED_CODE_LINES = 20;
const MAX_TABLE_CELL_LENGTH = 200;
const MAX_TABLE_ROWS = 20;
const MAX_HTML_TABLE_LENGTH = 2000;
const MIN_HORIZONTAL_RULE_LENGTH = 3;
const MAX_SENTENCE_LENGTH = 300;
const MAX_QUOTED_TEXT_LENGTH = 300;
const MAX_PARENTHETICAL_CONTENT_LENGTH = 200;
const MAX_NESTED_PARENTHESES = 5;
const MAX_MATH_INLINE_LENGTH = 100;
const MAX_MATH_BLOCK_LENGTH = 500;
const MAX_PARAGRAPH_LENGTH = 1000;
const MAX_STANDALONE_LINE_LENGTH = 400;
const MAX_HTML_TAG_ATTRIBUTES_LENGTH = 100;
const MAX_HTML_TAG_CONTENT_LENGTH = 1000;

// Construct the regex pattern
$chunkRegex = '/(' .
    // 1. Headings
    '(?:^(?:[#*=-]{1,' . MAX_HEADING_LENGTH . '}|\w[^\r\n]{0,' . MAX_HEADING_CONTENT_LENGTH . '}\r?\n[-=]{2,' . MAX_HEADING_UNDERLINE_LENGTH . '}|<h[1-6][^>]{0,' . MAX_HTML_HEADING_ATTRIBUTES_LENGTH . '}>)[^\r\n]{1,' . MAX_HEADING_CONTENT_LENGTH . '}(?:<\/h[1-6]>)?(?:\r?\n|$))' .
    '|' .
    // 2. List items
    '(?:(?:^|\r?\n)[ \t]{0,3}(?:[-*+•]|\d{1,3}\.\w\.|\[[ xX]\])[ \t]+[^\r\n]{1,' . MAX_LIST_ITEM_LENGTH . '}' .
    '(?:(?:\r?\n[ \t]{2,5}(?:[-*+•]|\d{1,3}\.\w\.|\[[ xX]\])[ \t]+[^\r\n]{1,' . MAX_LIST_ITEM_LENGTH . '}){0,' . MAX_NESTED_LIST_ITEMS . '}' .
    '(?:\r?\n[ \t]{4,' . MAX_LIST_INDENT_SPACES . '}(?:[-*+•]|\d{1,3}\.\w\.|\[[ xX]\])[ \t]+[^\r\n]{1,' . MAX_LIST_ITEM_LENGTH . '}){0,' . MAX_NESTED_LIST_ITEMS . '})?)' .
    '|' .
    // 3. Block quotes
    '(?:(?:^>(?:>|\s{2,}){0,2}[^\r\n]{0,' . MAX_BLOCKQUOTE_LINE_LENGTH . '}\r?\n?){1,' . MAX_BLOCKQUOTE_LINES . '})' .
    '|' .
    // 4. Code blocks
    '(?:(?:^|\r?\n)(?:```|~~~)(?:\w{0,' . MAX_CODE_LANGUAGE_LENGTH . '})?\r?\n[\s\S]{0,' . MAX_CODE_BLOCK_LENGTH . '}?(?:```|~~~)\r?\n?' .
    '|(?:(?:^|\r?\n)(?: {4}|\t)[^\r\n]{0,' . MAX_LIST_ITEM_LENGTH . '}(?:\r?\n(?: {4}|\t)[^\r\n]{0,' . MAX_LIST_ITEM_LENGTH . '}){0,' . MAX_INDENTED_CODE_LINES . '}\r?\n?)' .
    '|(?:<pre>(?:<code>)?[\s\S]{0,' . MAX_CODE_BLOCK_LENGTH . '}?(?:<\/code>)?<\/pre>))' .
    '|' .
    // 5. Tables
    '(?:(?:^|\r?\n)(?:\|[^\r\n]{0,' . MAX_TABLE_CELL_LENGTH . '}\|(?:\r?\n\|[-:]{1,' . MAX_TABLE_CELL_LENGTH . '}\|){0,1}(?:\r?\n\|[^\r\n]{0,' . MAX_TABLE_CELL_LENGTH . '}\|){0,' . MAX_TABLE_ROWS . '}' .
    '|<table>[\s\S]{0,' . MAX_HTML_TABLE_LENGTH . '}?<\/table>))' .
    '|' .
    // 6. Horizontal rules
    '(?:^(?:[-*_]){' . MIN_HORIZONTAL_RULE_LENGTH . ',}\s*$|<hr\s*\/?>)' .
    '|' .
    // 10. Standalone lines or phrases
    '(?:^(?:<[a-zA-Z][^>]{0,' . MAX_HTML_TAG_ATTRIBUTES_LENGTH . '}>)?[^\r\n]{1,' . MAX_STANDALONE_LINE_LENGTH . '}(?:<\/[a-zA-Z]+>)?(?:\r?\n|$))' .
    '|' .
    // 7. Sentences or phrases ending with punctuation
    '(?:[^\r\n]{1,' . MAX_SENTENCE_LENGTH . '}(?:[.!?…]|\.{3}|[\x{2026}\x{2047}-\x{2049}]|[\p{Emoji_Presentation}\p{Extended_Pictographic}])(?=(?=\s)(?=\s*[A-Z])|(?=\r?\n)|(?=\Z)))' .
    '|' .
    // 8. Quoted text, parenthetical phrases, or bracketed content
    '(?:' .
    '"""[^\r\n]{0,' . MAX_QUOTED_TEXT_LENGTH . '}"""' .
    '|[\'"`'"][^\r\n]{0,' . MAX_QUOTED_TEXT_LENGTH . '}[\'"`'"]' .
    '|\([^\r\n()]{0,' . MAX_PARENTHETICAL_CONTENT_LENGTH . '}(?:\([^\r\n()]{0,' . MAX_PARENTHETICAL_CONTENT_LENGTH . '}\)[^\r\n()]{0,' . MAX_PARENTHETICAL_CONTENT_LENGTH . '}){0,' . MAX_NESTED_PARENTHESES . '}\)' .
    '|\[[^\r\n\[\]]{0,' . MAX_PARENTHETICAL_CONTENT_LENGTH . '}(?:\[[^\r\n\[\]]{0,' . MAX_PARENTHETICAL_CONTENT_LENGTH . '}\][^\r\n\[\]]{0,' . MAX_PARENTHETICAL_CONTENT_LENGTH . '}){0,' . MAX_NESTED_PARENTHESES . '}\]' .
    '|\$[^\r\n$]{0,' . MAX_MATH_INLINE_LENGTH . '}\$' .
    '|`[^`\r\n]{0,' . MAX_MATH_INLINE_LENGTH . '}`' .
    ')' .
    '|' .
    // 9. Paragraphs
    '(?:(?<=\r?\n\r?\n|\A)(?:<p>)?(?:(?!\r?\n\r?\n|\z).){1,' . MAX_PARAGRAPH_LENGTH . '}(?:<\/p>)?(?=\r?\n\r?\n|\z))' .
    '|' .
    // 11. HTML-like tags and their content
    '(?:<[a-zA-Z][^>]{0,' . MAX_HTML_TAG_ATTRIBUTES_LENGTH . '}(?:>[\s\S]{0,' . MAX_HTML_TAG_CONTENT_LENGTH . '}?<\/[a-zA-Z]+>|\s*\/>))' .
    '|' .
    // 12. LaTeX-style math expressions
    '(?:(?:\$\$[\s\S]{0,' . MAX_MATH_BLOCK_LENGTH . '}?\$\$)|(?:\$[^\$\r\n]{0,' . MAX_MATH_INLINE_LENGTH . '}\$))' .
    '|' .
    // 14. Fallback for any remaining content
    '(?:[^\r\n]{1,' . MAX_STANDALONE_LINE_LENGTH . '})' .
')/mu';

// Function to use the regex for chunking
function chunkText($text) {
    global $chunkRegex;
    preg_match_all($chunkRegex, $text, $matches);
    return $matches[0];
}

// Example usage
$text = "Your input text here...";
$chunks = chunkText($text);

// Print chunks
foreach ($chunks as $chunk) {
    echo $chunk . "\n---\n";
}

?>

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment