jonocarroll · March 13, 2024 05:24
diff --git a/fuzzygroup.md b/fuzzygroup.md
diff --git a/fuzzygroup.R b/fuzzygroup.R
 ## Medical Term Fuzzy Grouping
 ## J. Carroll 2024
 ## 
 ## Uses the {zoomerjoin} package: https://github.com/beniaminogreen/zoomerjoin

 ## fetch the medical vocabulary file line by line and lowercase every entry
 terms <- readLines(
   "https://raw.githubusercontent.com/socd06/medical-nlp/master/data/vocab.txt"
 ) |>
   tolower()

 ## sample phrases: each vector holds variants of one term, including
 ## misspellings and an extra inserted word
 gi <- c(
   "gastrointestinal disorders",
   "gastrointestinal tract disorders",
   "gastreinstestinal disorder"
 )
 hep <- c(
   "hepatic encephalopathy",
   "hepatic encephalapathy",
   "hepatic encefalopathy"
 )
 co <- c(
   "myocarditis",
   "myocardits",
   "myocardites"
 )

 ## Find the entry of wordlist that best matches a single word.
 ##
 ## word:     a single string; it is lowercased before any comparison
 ## wordlist: character vector of candidate words, compared exactly as given
 ##           (so it should already be lowercase)
 ##
 ## Returns the lowercased word itself when it is present in wordlist,
 ## otherwise the wordlist entry with the lowest Levenshtein distance
 ## (utils::adist); ties go to the earliest entry. When there is nothing to
 ## compare against (empty wordlist, NA word or empty word) the lowercased
 ## word is returned unchanged, rather than a zero-length result or, for an
 ## empty word, simply the shortest vocabulary entry.
 match_word <- function(word, wordlist) {
   word <- tolower(word)

   ## nothing meaningful to match: hand the input back untouched
   if (length(wordlist) == 0L || is.na(word) || !nzchar(word)) {
     return(word)
   }

   ## exact hit: no distance computation needed
   if (word %in% wordlist) {
     return(word)
   }

   distances <- utils::adist(word, wordlist)[1, ]
   best <- which.min(distances)

   ## which.min() is empty when every distance is NA (e.g. an all-NA wordlist)
   if (length(best) == 0L) {
     return(word)
   }
   wordlist[best]
 }

 ## Spellcheck every word of one or more phrases.
 ##
 ## phrase:   character vector of phrases; each is split on single spaces
 ## wordlist: character vector of accepted words, handed to match_word()
 ##
 ## Returns an unnamed character vector with one element per phrase, each
 ## being the corrected words joined back together with single spaces.
 ##
 ## vapply() is used instead of sapply() so that the result is always a
 ## character vector (character(0) for zero-length input, not list()), and so
 ## that a match_word() result that is not exactly one string raises an error
 ## instead of being pasted into the phrase.
 spellcheck_phrase <- function(phrase, wordlist) {
   correct_one <- function(p) {
     words <- strsplit(p, " ", fixed = TRUE)[[1]]
     corrected <- vapply(
       words,
       \(word) match_word(word, wordlist),
       character(1),
       USE.NAMES = FALSE
     )
     paste(corrected, collapse = " ")
   }
   vapply(phrase, correct_one, character(1), USE.NAMES = FALSE)
 }

 ## quick check: spellcheck the gi phrases
 spellcheck_phrase(gi, terms)

 ## build a small example table: the (misspelled) terms, each tagged with a letter
 meddata <- data.frame(
   term = c(gi, hep, co),
   value = LETTERS[seq_len(9)]
 )

 ## shuffle the row order with a fixed permutation
 meddata <- meddata[match(meddata[["value"]], unlist(strsplit("FIABDEHCG", ""))), ]
 meddata

 ## attach the spellchecked phrase for every term
 ## (spellcheck_phrase() already works on a whole vector of phrases)
 meddata[["corrected"]] <- spellcheck_phrase(meddata[["term"]], terms)
 meddata

 ## group the corrected phrases, giving every row the 'canonical' value of its group
 ## threshold = 0.1 works for this example data; other data may need different parameters
 meddata[["group"]] <- zoomerjoin::jaccard_string_group(meddata[["corrected"]], threshold = 0.1)
 meddata

 ## with a group label in place, an ordinary grouped summary works
 library(dplyr)
 summarise(
   group_by(meddata, group),
   res = toString(sort(value))
 )
	## Medical Term Fuzzy Grouping
	## J. Carroll 2024
	##
	## Uses the {zoomerjoin} package: https://github.com/beniaminogreen/zoomerjoin

	## fetch the medical vocabulary file line by line and lowercase every entry
	terms <- readLines(
	  "https://raw.githubusercontent.com/socd06/medical-nlp/master/data/vocab.txt"
	) |>
	  tolower()

	## sample phrases: each vector holds variants of one term, including
	## misspellings and an extra inserted word
	gi <- c(
	  "gastrointestinal disorders",
	  "gastrointestinal tract disorders",
	  "gastreinstestinal disorder"
	)
	hep <- c(
	  "hepatic encephalopathy",
	  "hepatic encephalapathy",
	  "hepatic encefalopathy"
	)
	co <- c(
	  "myocarditis",
	  "myocardits",
	  "myocardites"
	)

	## Find the entry of wordlist that best matches a single word.
	##
	## word:     a single string; it is lowercased before any comparison
	## wordlist: character vector of candidate words, compared exactly as given
	##           (so it should already be lowercase)
	##
	## Returns the lowercased word itself when it is present in wordlist,
	## otherwise the wordlist entry with the lowest Levenshtein distance
	## (utils::adist); ties go to the earliest entry. When there is nothing to
	## compare against (empty wordlist, NA word or empty word) the lowercased
	## word is returned unchanged, rather than a zero-length result or, for an
	## empty word, simply the shortest vocabulary entry.
	match_word <- function(word, wordlist) {
	  word <- tolower(word)

	  ## nothing meaningful to match: hand the input back untouched
	  if (length(wordlist) == 0L || is.na(word) || !nzchar(word)) {
	    return(word)
	  }

	  ## exact hit: no distance computation needed
	  if (word %in% wordlist) {
	    return(word)
	  }

	  distances <- utils::adist(word, wordlist)[1, ]
	  best <- which.min(distances)

	  ## which.min() is empty when every distance is NA (e.g. an all-NA wordlist)
	  if (length(best) == 0L) {
	    return(word)
	  }
	  wordlist[best]
	}

	## Spellcheck every word of one or more phrases.
	##
	## phrase:   character vector of phrases; each is split on single spaces
	## wordlist: character vector of accepted words, handed to match_word()
	##
	## Returns an unnamed character vector with one element per phrase, each
	## being the corrected words joined back together with single spaces.
	##
	## vapply() is used instead of sapply() so that the result is always a
	## character vector (character(0) for zero-length input, not list()), and so
	## that a match_word() result that is not exactly one string raises an error
	## instead of being pasted into the phrase.
	spellcheck_phrase <- function(phrase, wordlist) {
	  correct_one <- function(p) {
	    words <- strsplit(p, " ", fixed = TRUE)[[1]]
	    corrected <- vapply(
	      words,
	      \(word) match_word(word, wordlist),
	      character(1),
	      USE.NAMES = FALSE
	    )
	    paste(corrected, collapse = " ")
	  }
	  vapply(phrase, correct_one, character(1), USE.NAMES = FALSE)
	}

	## quick check: spellcheck the gi phrases
	spellcheck_phrase(gi, terms)

	## build a small example table: the (misspelled) terms, each tagged with a letter
	meddata <- data.frame(
	  term = c(gi, hep, co),
	  value = LETTERS[seq_len(9)]
	)

	## shuffle the row order with a fixed permutation
	meddata <- meddata[match(meddata[["value"]], unlist(strsplit("FIABDEHCG", ""))), ]
	meddata

	## attach the spellchecked phrase for every term
	## (spellcheck_phrase() already works on a whole vector of phrases)
	meddata[["corrected"]] <- spellcheck_phrase(meddata[["term"]], terms)
	meddata

	## group the corrected phrases, giving every row the 'canonical' value of its group
	## threshold = 0.1 works for this example data; other data may need different parameters
	meddata[["group"]] <- zoomerjoin::jaccard_string_group(meddata[["corrected"]], threshold = 0.1)
	meddata

	## with the group label in place, an ordinary dplyr grouped summary works:
	## one row per group, listing that group's values in sorted order
	library(dplyr)
	meddata |>
	  group_by(group) |>
	  summarise(res = toString(sort(value)))