## Medical Term Fuzzy Grouping
## J. Carroll 2024
##
## Uses the {zoomerjoin} package: https://github.com/beniaminogreen/zoomerjoin
## read in a set of medical terms, lowercased
terms <- tolower(readLines("https://raw.githubusercontent.com/socd06/medical-nlp/master/data/vocab.txt"))
## example data with typos and inserted words
gi <- c("gastrointestinal disorders", "gastrointestinal tract disorders", "gastreinstestinal disorder")
hep <- c("hepatic encephalopathy", "hepatic encephalapathy", "hepatic encefalopathy")
co <- c("myocarditis", "myocardits", "myocardites")
## find the closest matching word in wordlist, either as a direct string match
## or the lowest Levenshtein distance of all the words in wordlist
match_word <- function(word, wordlist) {
word <- tolower(word)
if (word %in% wordlist) return(word)
wordlist[which.min(adist(word, wordlist)[1, ])]
}
## apply spellchecking to each word of a phrase of words
## joining back into a space-delimited phrase afterwards
spellcheck_phrase <- function(phrase, wordlist) {
sapply(phrase, \(w) paste(sapply(strsplit(w, " ")[[1]], \(word) match_word(word, wordlist)), collapse = " "), USE.NAMES = FALSE)
}
## e.g. spellcheck the gi terms
spellcheck_phrase(gi, terms)
#> [1] "gastrointestinal disorders" "gastrointestinal tract disorders"
#> [3] "gastrointestinal disorder"
## create an example dataset containing the (misspelled) terms and some values
meddata <- data.frame(term = c(gi, hep, co), value = LETTERS[1:9])
## stir to ensure randomness works
meddata <- meddata[match(meddata$value, strsplit("FIABDEHCG", "")[[1]]), ]
meddata
#> term value
#> 3 gastreinstestinal disorder C
#> 4 hepatic encephalopathy D
#> 8 myocardits H
#> 5 hepatic encephalapathy E
#> 6 hepatic encefalopathy F
#> 1 gastrointestinal disorders A
#> 9 myocardites I
#> 7 myocarditis G
#> 2 gastrointestinal tract disorders B
## add the corrected phrases to the data
meddata$corrected <- sapply(meddata$term, \(x) spellcheck_phrase(x, terms), USE.NAMES = FALSE)
meddata
#> term value corrected
#> 3 gastreinstestinal disorder C gastrointestinal disorder
#> 4 hepatic encephalopathy D hepatic encephalopathy
#> 8 myocardits H myocarditis
#> 5 hepatic encephalapathy E hepatic encephalopathy
#> 6 hepatic encefalopathy F hepatic encephalopathy
#> 1 gastrointestinal disorders A gastrointestinal disorders
#> 9 myocardites I myocarditis
#> 7 myocarditis G myocarditis
#> 2 gastrointestinal tract disorders B gastrointestinal tract disorders
## perform a grouping of the corrected terms, assigning a 'canonical' value to each group
## the parameters here may need to be adjusted, but seem to work for this example data
meddata$group <- zoomerjoin::jaccard_string_group(meddata$corrected, threshold = 0.1)
#> Loading required namespace: igraph
meddata
#> term value corrected
#> 3 gastreinstestinal disorder C gastrointestinal disorder
#> 4 hepatic encephalopathy D hepatic encephalopathy
#> 8 myocardits H myocarditis
#> 5 hepatic encephalapathy E hepatic encephalopathy
#> 6 hepatic encefalopathy F hepatic encephalopathy
#> 1 gastrointestinal disorders A gastrointestinal disorders
#> 9 myocardites I myocarditis
#> 7 myocarditis G myocarditis
#> 2 gastrointestinal tract disorders B gastrointestinal tract disorders
#> group
#> 3 gastrointestinal disorder
#> 4 hepatic encephalopathy
#> 8 myocarditis
#> 5 hepatic encephalopathy
#> 6 hepatic encephalopathy
#> 1 gastrointestinal disorder
#> 9 myocarditis
#> 7 myocarditis
#> 2 gastrointestinal disorder
## grouping can now be done as usual
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
meddata |>
group_by(group) |>
summarise(res = toString(sort(value)))
#> # A tibble: 3 × 2
#> group res
#> <chr> <chr>
#> 1 gastrointestinal disorder A, B, C
#> 2 hepatic encephalopathy D, E, F
#> 3 myocarditis G, H, I
Created on 2024-03-13 with reprex v2.0.2