Last active
July 12, 2021 05:54
-
-
Save vikjam/7073f7debca077a03819f184be57789c to your computer and use it in GitHub Desktop.
Joining datasets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) # Helps manipulate data | |
# Let's create example datasets | |
records <- data.frame( | |
user_id = c(1, 2, 3, 4, 5), | |
location = c("A", "B", "C", "C", "D") | |
) | |
# user_id location | |
# 1 1 A | |
# 2 2 B | |
# 3 3 C | |
# 4 4 C | |
# 5 5 D | |
transactions <- data.frame( | |
user_id = c(1, 1, 1, 2, 3, 3, 3, 4, 5, 5), | |
amt = c(50, 30, 20, 25, 42, 67, 43, 82, 90, 9) | |
) | |
# user_id amt | |
# 1 1 50 | |
# 2 1 30 | |
# 3 1 20 | |
# 4 2 25 | |
# 5 3 42 | |
# 6 3 67 | |
# 7 3 43 | |
# 8 4 82 | |
# 9 5 90 | |
# 10 5 9 | |
census <- data.frame( | |
locale = c("A", "B", "E"), | |
poverty_rate = c("0.19", "0.15", "0.25") | |
) | |
# locale poverty_rate | |
# 1 A 0.19 | |
# 2 B 0.15 | |
# 3 E 0.25 | |
# A few types of matches | |
# - inner join | |
# - full join (also know as outer join) | |
# - left join | |
# - right join | |
# inner join: | |
# - Return only matched in both datasets | |
# - Notice in "by" we use the name of merging variable in the | |
# first data.frame first and the name of the second data.frame | |
inner_join( | |
records, | |
census, | |
by = c("location" = "locale") | |
) | |
# user_id location poverty_rate | |
# 1 1 A 0.19 | |
# 2 2 B 0.15 | |
# full join: | |
# - Return matched and unmatched cases | |
full_join( | |
records, | |
census, | |
by = c("location" = "locale") | |
) | |
# user_id location poverty_rate | |
# 1 1 A 0.19 | |
# 2 2 B 0.15 | |
# 3 3 C <NA> | |
# 4 4 C <NA> | |
# 5 5 D <NA> | |
# 6 NA E 0.25 | |
# left join: | |
# - Return matched cases and return unmatched cases from the first data.frame | |
left_join( | |
records, | |
census, | |
by = c("location" = "locale") | |
) | |
# user_id location poverty_rate | |
# 1 1 A 0.19 | |
# 2 2 B 0.15 | |
# 3 3 C <NA> | |
# 4 4 C <NA> | |
# 5 5 D <NA> | |
# right join: | |
# - Return matched cases and unmatched cases from the second data.frame | |
right_join( | |
records, | |
census, | |
by = c("location" = "locale") | |
) | |
# user_id location poverty_rate | |
# 1 1 A 0.19 | |
# 2 2 B 0.15 | |
# 3 NA E 0.25 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment