amdevine · March 9, 2021 05:04
diff --git a/consolidate-rows.R b/consolidate-rows.R
 # Use dplyr package
 library(dplyr)

 # Create example data frame
 data <- data.frame(personal_id = c(1, 2, 3, 4, 5, 6),
                   name = c('Albert', 'Beth', 'Cindy', 'David', 'Eugene', 'Fred'),
                   household_id = c('1H', '1H', '2H', '2H', '3H', '3H'),
                   score1 = c(3, NA, 1, NA, 1, NA),
                   score2 = c(5, NA, NA, 2, 2, NA),
                   score3 = c(NA, 'banana', 'cherry', NA, 'eggplant', NA),
                   score4 = c(NA, 'bonobo', NA, 'dog', NA, 'ferret'),
                   stringsAsFactors = FALSE)
                   
 # data:
 #    personal_id    name household_id score1 score2   score3 score4
 # 1            1  Albert           1H      3      5     <NA>   <NA>
 # 2            2    Beth           1H     NA     NA   banana bonobo
 # 3            3   Cindy           2H      1     NA   cherry   <NA>
 # 4            4   David           2H     NA      2     <NA>    dog
 # 5            5  Eugene           3H      1      2 eggplant   <NA>
 # 6            6    Fred           3H     NA     NA     <NA> ferret


 # Custom function that takes all the values for a particular score column and a particular household,
 # drops any NA values, sorts them in ascending order, and returns the first value
 household_score <- function(scores) {
    sort(scores, na.last = NA)[1]
 }


 # Create summary data frame containg household ID and household score for each score column
 # across() function accepts a range of columns and a function to apply to each column
 household_data <- data %>%
    select(household_id:score4) %>%
    group_by(household_id) %>%
    summarize(across(score1:score4, household_score),
              .groups = 'drop')     # Removes message about ungrouping
              
 # household_data:
 #    household_id  score1 score2  score3    score4
 # 1  1H                 3      5  banana    bonobo
 # 2  2H                 1      2  cherry    dog   
 # 3  3H                 1      2  eggplant  ferret
	# Use dplyr package
	library(dplyr)

	# Create example data frame
	data <- data.frame(personal_id = c(1, 2, 3, 4, 5, 6),
	name = c('Albert', 'Beth', 'Cindy', 'David', 'Eugene', 'Fred'),
	household_id = c('1H', '1H', '2H', '2H', '3H', '3H'),
	score1 = c(3, NA, 1, NA, 1, NA),
	score2 = c(5, NA, NA, 2, 2, NA),
	score3 = c(NA, 'banana', 'cherry', NA, 'eggplant', NA),
	score4 = c(NA, 'bonobo', NA, 'dog', NA, 'ferret'),
	stringsAsFactors = FALSE)

	# data:
	# personal_id name household_id score1 score2 score3 score4
	# 1 1 Albert 1H 3 5 <NA> <NA>
	# 2 2 Beth 1H NA NA banana bonobo
	# 3 3 Cindy 2H 1 NA cherry <NA>
	# 4 4 David 2H NA 2 <NA> dog
	# 5 5 Eugene 3H 1 2 eggplant <NA>
	# 6 6 Fred 3H NA NA <NA> ferret


	# Custom function that takes all the values for a particular score column and a particular household,
	# drops any NA values, sorts them in ascending order, and returns the first value
	household_score <- function(scores) {
	sort(scores, na.last = NA)[1]
	}


	# Create summary data frame containg household ID and household score for each score column
	# across() function accepts a range of columns and a function to apply to each column
	household_data <- data %>%
	select(household_id:score4) %>%
	group_by(household_id) %>%
	summarize(across(score1:score4, household_score),
	.groups = 'drop') # Removes message about ungrouping

	# household_data:
	# household_id score1 score2 score3 score4
	# 1 1H 3 5 banana bonobo
	# 2 2H 1 2 cherry dog
	# 3 3H 1 2 eggplant ferret