Created
November 19, 2012 01:56
-
-
Save tomschenkjr/4108562 to your computer and use it in GitHub Desktop.
Chicago Data Visualization reshape Tutorial Script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Install the reshape package, plus ggplot2, which we are also going to use.
# Only packages that are not installed yet are downloaded, so re-running the
# script does not reinstall anything. You may be asked to pick a server.
required_packages <- c("reshape", "ggplot2") # In R, c() concatenates inputs as a vector
missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
# You will always need to load the libraries after installing the package.
library(reshape)
library(ggplot2)
# First, we're going to work with the "mtcars" data set.
str(mtcars) # Explore the data structure.
rownames(mtcars) # Each row does have a car make/model.
## Reshape was inspired by other base R functions that summarize data.
## Let's explore these first.
# summary() is the most basic data summary
summary(mtcars)
# The apply() function will summarize columns (margin 2) or rows (margin 1).
# Average by column; other common named functions are median, sd, length.
apply(mtcars, 2, mean)
apply(mtcars, 2, stats::quantile)
apply(mtcars, 2, function(x) x^2) # Can define your own function.
# Average by row, but this doesn't make sense -- why not? Each row mixes
# different variables (mpg, cyl, disp, ...), so their mean has no meaning.
apply(mtcars, 1, mean)
# The by() function allows you to create summary data by subgroup.
# What is the average mpg by the number of car cylinders?
by(mtcars$mpg, mtcars$cyl, mean)
# Average mpg, disp, hp, and drat by number of cylinders. Each group is a
# data frame, so use colMeans(); mean() on a data frame returns NA in
# current versions of R.
by(mtcars[, c(1, 3:5)], mtcars$cyl, colMeans)
# The by() function is a wrapper around tapply() for data frames.
tapply(mtcars$mpg, mtcars$cyl, mean)
# The table() function provides a simple count of elements.
table(mtcars$cyl)
table(mtcars$cyl, mtcars$gear) # Two-way table: cylinders (rows) by gears (columns).
## Reshape involves two steps: first "melt" the data, then "cast" it into the
## shape you want. Let's ask the question, how does the car shape and
## performance vary with cylinders?
# Melt the data, keeping "cyl" as the identifying variable.
mtcars.melt <- melt(mtcars, id.vars = "cyl")
# Inspect molten data
str(mtcars.melt)
head(mtcars.melt)
tail(mtcars.melt)
## Each row of data is transformed into multiple rows, each row represents a
## variable for each observed car cylinder.
## Now we can cast the data into a reshaped table.
cast(mtcars.melt, cyl ~ variable, mean) # Average for all variables for each cylinder.
cast(mtcars.melt, cyl ~ variable, sd) # Standard deviation
cast(mtcars.melt, cyl ~ variable, stats::quantile) # Quantiles
## Let's ask a slightly different question: What is the relationship between
## the cylinders and horsepower?
# All of these are equivalent: "hp" is the only measured variable and every
# other column is kept as an id variable.
mtcars.hp.melt <- melt(mtcars, measure.vars = "hp")
mtcars.hp.melt <- melt(mtcars, id.vars = -4) # Column 4 is hp.
# "hp" must NOT be listed among the id variables, otherwise nothing is melted.
mtcars.hp.melt <- melt(
  mtcars,
  id.vars = c("mpg", "cyl", "disp", "drat", "wt", "qsec", "vs", "am", "gear", "carb")
)
# Again cast the data to reshape
cast(mtcars.hp.melt, cyl ~ variable, mean) # hp is our melted variable
# Sometimes it's handy to store the casted data.
mtcars.hp.cast.mean <- cast(mtcars.hp.melt, cyl ~ variable, mean) # Mean
mtcars.hp.cast.sd <- cast(mtcars.hp.melt, cyl ~ variable, sd) # Standard Deviation
mtcars.hp.cast.length <- cast(mtcars.hp.melt, cyl ~ variable, length) # Length
# Now we can combine them into one summary table. Each cast result has two
# columns (cyl, hp), so the statistic always sits in column 2.
mtcars.hp.cyl.descriptives <- data.frame(
  mtcars.hp.cast.mean,
  mtcars.hp.cast.sd[, 2],
  mtcars.hp.cast.length[, 2]
)
names(mtcars.hp.cyl.descriptives) <- c(
  "Cylinders",
  "Average Horsepower",
  "Horsepower Standard Deviation",
  "Number of cars"
)
# An easier way to do this is in the casting
cast(mtcars.hp.melt, cyl ~ variable, c(mean, sd, length))
# Now we can plot it as well.
# Better to just use ggplot, but it shows how to do it.
plot(mtcars.hp.cyl.descriptives$Cylinders, mtcars.hp.cyl.descriptives$"Number of cars")
## Which organization had the most FOIA (Freedom of Information Act) requests
## from the Law department?
# Read data from City of Chicago
foia <- read.csv("http://data.cityofchicago.org/api/views/44bx-ncpi/rows.csv")
# Quickest way to answer our question is the table function
table(foia$ORGANIZATION) # Quick answer, but the cast table below is easier to plot.
# Cast to make into a proper table
foia.melt <- melt(foia, id.vars = "ORGANIZATION")
# Casting against "variable" gives one count column per melted column;
# using "." instead would collapse them into a single column.
foia.cast <- cast(foia.melt, ORGANIZATION ~ variable, length)
# Now we can graph it.
names(foia.cast)[2] <- "Requests"
# stat = "identity" plots the Requests values as bar heights instead of
# asking geom_bar() to count rows itself.
ggplot(foia.cast) +
  geom_bar(aes(x = ORGANIZATION, y = Requests), stat = "identity")
# A little messy, so let's order them
foia.cast[order(foia.cast$Requests), ] # Oops, reverse order
foia.cast.order <- foia.cast[order(-foia.cast$Requests), ]
foia.cast.order.topfive <- foia.cast.order[1:5, ]
# Plot the top 5 requesting agencies. Sorting the rows alone does not change
# the bar order, so reorder() the x axis by the number of requests as well.
ggplot(foia.cast.order.topfive) +
  geom_bar(aes(x = reorder(ORGANIZATION, -Requests), y = Requests), stat = "identity") +
  xlab("ORGANIZATION")
## But what are the total number of requests?
# Quickest answer is:
nrow(foia) # But doesn't show any of the organizations.
cast(foia.melt, ORGANIZATION ~ variable, length, margins = "grand_row") # Same answer
## So, I'm lazy and I hate all of these columns. Every one of them displays
## the exact same data. Let's fix that.
# Each request contributes one molten row per melted column, so dividing by
# the number of melted columns recovers the number of requests.
foia.n.variables <- length(unique(foia.melt$variable))
cast(
  foia.melt,
  ORGANIZATION ~ .,
  function(x) length(x) / foia.n.variables,
  margins = "grand_row"
)
## What aspects of student and school district characteristics correlate to
## test scores?
# Load data
# NOTE(review): this host may have moved since the tutorial was written;
# confirm the URL still resolves before running.
ed <- read.csv("http://www.ats.ucla.edu/stat/r/faq/hsb2.csv")
# Inspect data
head(ed)
str(ed)
# Melt data: the five test scores are the measured variables, every other
# column stays as an id variable.
ed.melt <- melt(ed, measure.vars = c("read", "write", "math", "science", "socst"))
# Cast data showing relationship between females and test scores
ed.cast <- cast(ed.melt, female ~ variable, mean)
cast(ed.melt, female + race ~ variable, mean) # Add race variable
cast(ed.melt, female + race ~ ses + variable, mean) # Add socio-economic status
# This is getting to be pretty high dimension, so let's make a list
cast(ed.melt, female ~ race | variable, mean)
# We can make a heatmap showing test scores for each student.
# fill is mapped to the score inside aes(); the white tile border is a fixed
# setting, so it goes outside aes().
ggplot(ed.melt, aes(x = variable, y = id)) +
  geom_tile(aes(fill = value), color = "white") +
  scale_fill_gradient(low = "white", high = "steelblue")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Tom, I thought reshape is a base R command so why do I need to install a package?
Or is this an old requirement which is no longer valid? Thank you.