ChristianOellers · July 26, 2023 10:04
diff --git a/RegEx-Snippets - Log analysis + Filtering.sh b/RegEx-Snippets - Log analysis + Filtering.sh
 # Exemplary snippets I've been using in my own projects. Use for inspiration and take what you need.
 # 'sh' file format is for syntax highlighting only: The RegEx parts should work in many scenarios.
 #
 # Run all snippets to normalize varying strings by replacing or removing characters (as you need).
 # Once all strings are aligned, duplicates can be filtered and lines sorted.
 # This leaves the log with a few distinct, unique errors that are to be considered for development.
 # RegEx design aims to leave the log syntax intact, e.g., delimiters (,"').
 # This might be relevant for proper syntax highlighting or use with advanced analyzers.

 # ---

 # Date + Time formats
 # = 2000-01-01 00:00:00
 # = [2000-01-01 00:00:00]
 \[?(\d+-?){3}\s(\d+:?){3}\]?

 # Serial numbers (word characters: letters, digits, underscore)
 # = XXXX-XXXX-XXXX
 \w{4}-\w{4}-\w{4}

 # Request URI from JSON part
 # = "request_uri":"http://..."
 ("request_uri.*),

 # API errors - Domain part
 # = Authentication unsuccessful [xxx.xxx.xxx.xxx.COM] ". [] []
 (unsuccessful\s\[.+\])(?=\s")

 # SQL errors - Number part
 # = ", 111]:  SQLSTATE[23000]
 (?<=",)\s\d+(?=\])

 # Symfony framework views - Number part
 # = ::viewExample","id":"1"},"method"
 (?<=viewExample","id":").*\d+

 # Filters / Severities
 # e.g. for highlighting, prioritizing
 \.(debug|info)
 \.warn(ing)?
 \.critical
 \.error(?=.*XXX)(?!.*404a) # - Only 'XXX', ignore '404s'
 \.error(?!.*404) # - Ignore '404s'

diff --git a/Text - Duplicate word finder (variants).sh b/Text - Duplicate word finder (variants).sh
 # \1 matches the 1st group (again) finding all duplicate results.

 # Simple (yet ambiguous for group numbers > 9, or when a literal digit follows)
 (\w+\s)\1

 # Better (unambiguous for any group number)
 (\w+\s)\g{1}

 # Named group (in this case: only matches 3 consecutive occurrences of the same word)
 (?<NAME>\w+\s)\g{1}\k<NAME>

 # ---

 # Modifiers
 gmi

 # ---

 # Test strings

 # = This iS is some text TEXT text with a lot lot of double douBLE words. These should should be removed be BE BE be removed.
 # = Testing <B><I>bold italic</I></B> text
 # = Testing <B attr="test"><I>bold italic</I></B> text

 # \1 matches the 1st group (again) finding all duplicate results.
 # VARIANTS
 (?<NAME>\w+\s)\g{1}\k<NAME>
 (?<CAP>\w+\s)\g{1}

diff --git a/Text - Token matching algorithms.sh b/Text - Token matching algorithms.sh
 # Token matching regular expressions - Variants.
 # - $ anchors the match at the end of the string (last input, caret position).

 # Finds 'A->BB': 'Lorem ipsum A->BB dolor'.
 (\w+)->(\w+)

 # Finds 'AA->BB': 'Lorem AA->BB'.
 [\w]{1,}\-\>([\-+\w]*)$

 # \B asserts a non-word boundary: ':' must not directly follow a word character (e.g. it follows whitespace).
 # - Finds 'BB': 'Lorem :BB'.
 \B:([\-+\w]*)$

 # 1. Find any non-whitespace characters at end.
 # 2. Find whitespace chars at end (check after word ends).
 /\S+$/
 /\s+$/
	# Exemplary snippets I've been using in my own projects. Use for inspiration and take what you need.
	# 'sh' file format is for syntax highlighting only: The RegEx parts should work in many scenarios.
	#
	# Run all snippets to normalize varying strings by replacing or removing characters (as you need).
	# Once all strings are aligned, duplicates can be filtered and lines sorted.
	# This leaves the log with a few distinct, unique errors that are to be considered for development.
	# RegEx design aims to leave the log syntax intact, e.g., delimiters (,"').
	# This might be relevant for proper syntax highlighting or use with advanced analyzers.

	# ---

	# Date + Time formats
	# = 2000-01-01 00:00:00
	# = [2000-01-01 00:00:00]
	\[?(\d+-?){3}\s(\d+:?){3}\]?

	# Serial numbers (word characters: letters, digits, underscore)
	# = XXXX-XXXX-XXXX
	\w{4}-\w{4}-\w{4}

	# Request URI from JSON part
	# = "request_uri":"http://..."
	("request_uri.*),

	# API errors - Domain part
	# = Authentication unsuccessful [xxx.xxx.xxx.xxx.COM] ". [] []
	(unsuccessful\s\[.+\])(?=\s")

	# SQL errors - Number part
	# = ", 111]: SQLSTATE[23000]
	(?<=",)\s\d+(?=\])

	# Symfony framework views - Number part
	# = ::viewExample","id":"1"},"method"
	(?<=viewExample","id":").*\d+

	# Filters / Severities
	# e.g. for highlighting, prioritizing
	\.(debug|info)
	\.warn(ing)?
	\.critical
	\.error(?=.*XXX)(?!.*404a) # - Only 'XXX', ignore '404s'
	\.error(?!.*404) # - Ignore '404s'
	# \1 matches the 1st group (again) finding all duplicate results.

	# Simple (yet ambiguous for group numbers > 9, or when a literal digit follows)
	(\w+\s)\1

	# Better (unambiguous for any group number)
	(\w+\s)\g{1}

	# Named group (in this case: only matches 3 consecutive occurrences of the same word)
	(?<NAME>\w+\s)\g{1}\k<NAME>

	# ---

	# Modifiers
	gmi

	# ---

	# Test strings

	# = This iS is some text TEXT text with a lot lot of double douBLE words. These should should be removed be BE BE be removed.
	# = Testing <B><I>bold italic</I></B> text
	# = Testing <B attr="test"><I>bold italic</I></B> text

	# \1 matches the 1st group (again) finding all duplicate results.
	# VARIANTS
	(?<NAME>\w+\s)\g{1}\k<NAME>
	(?<CAP>\w+\s)\g{1}
	# Token matching regular expressions - Variants.
	# - $ anchors the match at the end of the string (last input, caret position).

	# Finds 'A->BB': 'Lorem ipsum A->BB dolor'.
	(\w+)->(\w+)

	# Finds 'AA->BB': 'Lorem AA->BB'.
	[\w]{1,}\-\>([\-+\w]*)$

	# \B asserts a non-word boundary: ':' must not directly follow a word character (e.g. it follows whitespace).
	# - Finds 'BB': 'Lorem :BB'.
	\B:([\-+\w]*)$

	# 1. Find any non-whitespace characters at end.
	# 2. Find whitespace chars at end (check after word ends).
	/\S+$/
	/\s+$/