Skip to content

Instantly share code, notes, and snippets.

Last active September 22, 2022 11:02
Show Gist options
  • Save agemooij/15a0eaebc2c1ddd5ddf4 to your computer and use it in GitHub Desktop.
Save agemooij/15a0eaebc2c1ddd5ddf4 to your computer and use it in GitHub Desktop.
Scala text normalization
package rfs.rebb
package common
/**
 * Performs standard Java/unicode normalization on the trimmed and lowercased form
 * of the input String and then adds a few extra tricks for dealing with special
 * characters.
 *
 * JVM/Unicode normalization references (warning: learning curve black hole, beware!):
 *
 * - the Javadoc of `java.text.Normalizer` (and its `Form` enum)
 * - the "Unicode support" section of the `java.util.regex.Pattern` Javadoc
 * - Unicode Standard Annex #15, "Unicode Normalization Forms"
 *
 * Some special cases, like "ø" and "ß" are not being stripped/replaced by the
 * Java/Unicode normalizer so we have to replace them ourselves.
 */
trait NormalizeSupport {
  // The JDK method is renamed on import so it does not clash with `normalize` below;
  // the wildcard brings `Form` into scope.
  import java.text.Normalizer.{ normalize => jnormalize, _ }
  import java.util.Locale

  /**
   * Normalizes `in` to a lowercase, dash-separated ASCII string.
   *
   * Steps, in order:
   *  1. trim and lowercase the input;
   *  2. decompose it (NFD) and drop combining marks, modifier letters and modifier symbols;
   *  3. drop every literal `'s`;
   *  4. replace "ß" with "ss" and "ø" with "o";
   *  5. replace every run of characters other than ASCII letters, digits and '-' with a single '-';
   *  6. collapse runs of '-' into a single '-'.
   *
   * Leading and trailing dashes are not removed, so `"(aa)"` becomes `"-aa-"`.
   *
   * @param in the text to normalize; must not be null
   * @return the normalized text
   */
  def normalize(in: String): String = {
    // Locale.ROOT keeps lowercasing independent of the JVM default locale. Under a
    // Turkish or Azeri default, "I" would otherwise become the dotless "ı", which has
    // no decomposition and would end up replaced by "-".
    val cleaned = in.trim.toLowerCase(Locale.ROOT)
    val normalized = jnormalize(cleaned, Form.NFD).replaceAll("[\\p{InCombiningDiacriticalMarks}\\p{IsM}\\p{IsLm}\\p{IsSk}]+", "")

    normalized.replaceAll("'s", "")
      .replaceAll("ß", "ss")
      .replaceAll("ø", "o")
      .replaceAll("[^a-zA-Z0-9-]+", "-")
      .replaceAll("-+", "-")
  }
}

/** Stand-alone access to [[NormalizeSupport]] for callers that prefer not to mix in the trait. */
object NormalizeSupport extends NormalizeSupport
package rfs.rebb
package common
import org.scalatest._
import Matchers._
import common._
/**
 * Specification for [[NormalizeSupport]]: a set of hand-written examples plus one
 * generated test per row of the `normalization-checks.csv` resource.
 *
 * NOTE(review): `UnitSpec` is defined elsewhere in the project; the `should`/`in`
 * syntax used here suggests a WordSpec-style base class -- confirm.
 */
class NormalizeSupportSpec extends UnitSpec with NormalizeSupport {
  "NormalizeSupport" should {
    "correctly normalize non -ASCII characters" in {
      normalize("ÀÁÂÃĀĂȦÄẢÅǍȀȂĄẠḀẦẤàáâä") shouldBe "aaaaaaaaaaaaaaaaaaaaaa"
      normalize("ÉÊẼĒĔËȆȄȨĖèéêẽēȅë") shouldBe "eeeeeeeeeeeeeeeee"
      normalize("ÌÍÏïØøÒÖÔöÜüŇñÇçß") shouldBe "iiiioooooouunnccss"
    }

    "normalize 's to nothing" in {
      normalize("aa'sbba") shouldBe "aabba"
    }

    "normalize & for -" in {
      normalize("aa & bb") shouldBe "aa-bb"
      normalize("aa&& & &&& bb") shouldBe "aa-bb"
    }

    "normalize brackets to -" in {
      normalize("aa(bb)cc") shouldBe "aa-bb-cc"
      normalize("aa((((bb)))cc") shouldBe "aa-bb-cc"
    }

    "normalize multiples of '-' to a single '-'" in {
      normalize("a----a--b-b-------a") shouldBe "a-a-b-b-a"
    }

    "normalize to lowercase" in {
      normalize("AAbAbbB") shouldBe "aababbb"
    }

    "normalize a string with several diacritical marks" in {
      normalize("a'sa((%%$ & b___--BB a") shouldBe "aa-b-bb-a"
    }

    // One generated test per (input, expected) row of the shared CSV resource.
    normalizationTestCasesSharedWithNl.foreach {
      case (input, expectedOutput) =>
        s"""normalize "${input}" to "${expectedOutput}".""" in {
          normalize(input) shouldBe expectedOutput
        }
    }
  }

  /**
   * Reads `normalization-checks.csv` from the classpath and parses every line as
   * `input||expectedOutput`. Fields after the second one are ignored.
   *
   * @return the (input, expected output) pairs, in file order
   * @throws IllegalArgumentException if a line does not contain the `||` separator
   */
  private def normalizationTestCasesSharedWithNl: List[(String, String)] = {
    import org.parboiled.common._

    val data = FileUtils.readAllTextFromResource("normalization-checks.csv")
    val lines = data.trim.split("""\r?\n""").toList

    lines.map { line =>
      // Limit -1 keeps trailing empty fields, so a row whose expected output is the
      // empty string ("input||") still yields two parts instead of one.
      line.split("""\|\|""", -1) match {
        case Array(input, expectedOutput, _*) => (input, expectedOutput)
        case _ =>
          throw new IllegalArgumentException(
            s"""Malformed line in normalization-checks.csv (expected "input||expectedOutput"): $line"""
          )
      }
    }
  }
}
Copy link

nasazh commented Apr 11, 2022

thanks for sharing this! a lifesaver :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment