tomschenkjr · November 19, 2012 01:56 · Pegasus3 · Jul 3, 2023
diff --git a/reshape-tutorial.R b/reshape-tutorial.R
 # Install reshape package. You will be asked to pick a server.
 install.packages("reshape")

 # We're going to also use ggplot2, so let's install that as well:
 install.packages(c("reshape","ggplot2")) # In R, c() concatenates inputs as a vector

 # You will always need to load the libraries after installing the package.
 library(reshape)
 library(ggplot2)

 # First, we're going to work with the built-in "mtcars" data set.
 str(mtcars) # Explore the data structure.
 rownames(mtcars) # The car make/model lives in the row names, not in a column.

 ## Reshape was inspired by other base R functions that summarize data. Let's explore these first.
 # summary() is the most basic data summary
 summary(mtcars)

 # The apply() function will summarize columns (MARGIN = 2) or rows (MARGIN = 1).
 apply(mtcars, 2, mean) # Average by column, other common named functions are median, sd, length.
 apply(mtcars, 2, stats::quantile)
 apply(mtcars, 2, function(x) x^2) # Can define your own function.
 apply(mtcars, 1, mean) # Average by row, but this doesn't make sense -- why not?

 # The by() function allows you to create summary data by subgroup.
 by(mtcars$mpg, mtcars$cyl, mean) # What is the average mpg by the number of car cylinders?
 # Each group handed to the function is a data frame here. mean() does not
 # average a data frame (it returns NA with a warning), so use colMeans().
 by(mtcars[ ,c(1,3:5)], mtcars$cyl, colMeans) # Average mpg, disp, hp, and drat by number of cylinders.

 # The by() function is a wrapper around tapply() for data frames.
 tapply(mtcars$mpg, mtcars$cyl, mean)

 # The table() function provides a simple count of elements.
 table(mtcars$cyl)
 table(mtcars$cyl, mtcars$gear) # Two-way table: cylinders (rows) by gears (columns).

 ## Reshaping takes two steps: first "melt" the data, then "cast" it. Let's ask the question: how do car shape and performance vary with cylinders?

 # Step 1: melt, keeping cyl as the identifier and stacking every other column.
 mtcars.melt <- melt(mtcars, id.vars = "cyl")

 # Take a look at the molten data.
 str(mtcars.melt)
 head(mtcars.melt)
 tail(mtcars.melt)

 ## Every original row has become several rows: one per measured variable, each tagged with that car's cylinder count.

 ## Step 2: cast the molten data into a reshaped table.
 cast(mtcars.melt, formula = cyl ~ variable, fun.aggregate = mean) # Average of every variable for each cylinder count.
 cast(mtcars.melt, formula = cyl ~ variable, fun.aggregate = sd) # Standard deviation
 cast(mtcars.melt, formula = cyl ~ variable, fun.aggregate = stats::quantile) # Quantiles

 ## Let's ask a slightly different question: What is the relationship between the cylinders and horsepower?
 # All of these are equivalent: hp is the only measured variable and every other column is an identifier.
 mtcars.hp.melt <- melt(mtcars, measure.vars = "hp")
 mtcars.hp.melt <- melt(mtcars, id.vars = -4) # hp is column 4, so -4 means "every column except hp".
 mtcars.hp.melt <- melt(mtcars, id.vars = c("mpg", "cyl", "disp", "drat", "wt", "qsec", "vs", "am", "gear", "carb")) # hp must not be listed as an id, or nothing is left to melt.

 # Again cast the data to reshape
 cast(mtcars.hp.melt, cyl ~ variable, mean) # hp is our melted variable

 # Sometimes it's handy to store the cast data.
 mtcars.hp.cast.mean <- cast(mtcars.hp.melt, cyl ~ variable, mean) # Mean
 mtcars.hp.cast.sd <- cast(mtcars.hp.melt, cyl ~ variable, sd) # Standard Deviation
 mtcars.hp.cast.length <- cast(mtcars.hp.melt, cyl ~ variable, length) # Length

 # Now we can combine them into one summary table.
 # Each cast result has two columns (cyl, hp), so the statistic is always column 2.
 mtcars.hp.cyl.descriptives <- data.frame(mtcars.hp.cast.mean, mtcars.hp.cast.sd[ ,2], mtcars.hp.cast.length[ ,2])
 names(mtcars.hp.cyl.descriptives) <- c("Cylinders", "Average Horsepower", "Horsepower Standard Deviation", "Number of cars")

 # An easier way to do this is in the casting
 cast(mtcars.hp.melt, cyl ~ variable, c(mean, sd, length))

 # Now we can plot it as well
 plot(mtcars.hp.cyl.descriptives$Cylinders, mtcars.hp.cyl.descriptives$"Number of cars") # Better to just use ggplot, but it shows how to do it.

 ## Which organization had the most FOIA (Freedom of Information Act) requests from the Law department?
 # Read data from City of Chicago (requires network access).
 foia <- read.csv("http://data.cityofchicago.org/api/views/44bx-ncpi/rows.csv")

 # Quickest way to answer our question is the table function
 table(foia$ORGANIZATION) # But this can't easily be used for plotting.

 # Melt and cast to make a proper table: one row per organization, one count column per melted variable.
 foia.melt <- melt(foia, id.vars = "ORGANIZATION")
 foia.cast <- cast(foia.melt, ORGANIZATION ~ variable, length)

 # Now we can graph it, treating the first count column as the number of requests.
 names(foia.cast)[2] <- "Requests"
 # The counts are already computed, so tell geom_bar() to plot y as-is
 # (stat = "identity") instead of counting rows itself.
 ggplot(foia.cast) + geom_bar(aes(x = ORGANIZATION, y = Requests), stat = "identity")

 # A little messy, so let's order them
 foia.cast[order(foia.cast$Requests), ] # Oops, reverse order
 foia.cast.order <- foia.cast[order(-foia.cast$Requests), ]
 foia.cast.order.topfive <- foia.cast.order[1:5, ]
 # Sorting the rows does not change the bar order on a discrete axis, so
 # reorder() the organizations by descending request count for the plot.
 ggplot(foia.cast.order.topfive) + geom_bar(aes(x = reorder(ORGANIZATION, -Requests), y = Requests), stat = "identity") + xlab("ORGANIZATION") # Plot the top 5 requesting agencies.

 ## But what are the total number of requests?

 # Quickest answer is:
 nrow(foia) # But doesn't show any of the organizations.

 cast(foia.melt, ORGANIZATION ~ variable, length, margins="grand_row") # Same answer, as a grand-total row

 ## So, I'm lazy and I hate all of these columns. Four of them display the exact same data. Let's fix that.
 # The "." collapses all variables into a single column instead of listing each one;
 # dividing by 4 undoes the four-fold repetition of each request in the molten data.
 cast(foia.melt, ORGANIZATION ~ ., function(x) length(x)/4, margins="grand_row")

 ## What aspects of student and school district characteristics correlate to test scores?
 # Load data (requires network access).
 ed <- read.csv("http://www.ats.ucla.edu/stat/r/faq/hsb2.csv")

 # Inspect data
 head(ed)
 str(ed)

 # Melt data: the five test scores are the measured variables; every other column stays as an identifier.
 ed.melt <- melt(ed, measure.vars = c("read", "write", "math", "science", "socst"))

 # Cast data showing relationship between females and test scores
 ed.cast <- cast(ed.melt, female ~ variable, mean)
 cast(ed.melt, female + race ~ variable, mean) # Add race variable
 cast(ed.melt, female + race ~ ses + variable, mean) # Add socio-economic status

 # This is getting to be pretty high dimension, so let's make a list
 # (the "|" in the formula produces one table per test).
 cast(ed.melt, female ~ race | variable, mean)

 # We can make a heatmap showing test scores for each student.
 # fill is mapped to the score inside aes(); the white tile border is a fixed setting, so it goes outside aes().
 ggplot(ed.melt, aes(x = variable, y = id)) + geom_tile(aes(fill = value), color = "white") + scale_fill_gradient(low = "white", high = "steelblue")
	# Install reshape package. You will be asked to pick a server.
	install.packages("reshape")

	# We're going to also use ggplot2, so let's install that as well:
	install.packages(c("reshape","ggplot2")) # In R, c() concatenates inputs as a vector

	# You will always need to load the libraries after installing the package.
	library(reshape)
	library(ggplot2)

	# First, we're going to work with the built-in "mtcars" data set.
	str(mtcars) # Explore the data structure.
	rownames(mtcars) # The car make/model lives in the row names, not in a column.

	## Reshape was inspired by other base R functions that summarize data. Let's explore these first.
	# summary() is the most basic data summary
	summary(mtcars)

	# The apply() function will summarize columns (MARGIN = 2) or rows (MARGIN = 1).
	apply(mtcars, 2, mean) # Average by column, other common named functions are median, sd, length.
	apply(mtcars, 2, stats::quantile)
	apply(mtcars, 2, function(x) x^2) # Can define your own function.
	apply(mtcars, 1, mean) # Average by row, but this doesn't make sense -- why not?

	# The by() function allows you to create summary data by subgroup.
	by(mtcars$mpg, mtcars$cyl, mean) # What is the average mpg by the number of car cylinders?
	# Each group handed to the function is a data frame here. mean() does not
	# average a data frame (it returns NA with a warning), so use colMeans().
	by(mtcars[ ,c(1,3:5)], mtcars$cyl, colMeans) # Average mpg, disp, hp, and drat by number of cylinders.

	# The by() function is a wrapper around tapply() for data frames.
	tapply(mtcars$mpg, mtcars$cyl, mean)

	# The table() function provides a simple count of elements.
	table(mtcars$cyl)
	table(mtcars$cyl, mtcars$gear) # Two-way table: cylinders (rows) by gears (columns).

	## Reshaping takes two steps: first "melt" the data, then "cast" it. Let's ask the question: how do car shape and performance vary with cylinders?

	# Step 1: melt, keeping cyl as the identifier and stacking every other column.
	mtcars.melt <- melt(mtcars, id.vars = "cyl")

	# Take a look at the molten data.
	str(mtcars.melt)
	head(mtcars.melt)
	tail(mtcars.melt)

	## Every original row has become several rows: one per measured variable, each tagged with that car's cylinder count.

	## Step 2: cast the molten data into a reshaped table.
	cast(mtcars.melt, formula = cyl ~ variable, fun.aggregate = mean) # Average of every variable for each cylinder count.
	cast(mtcars.melt, formula = cyl ~ variable, fun.aggregate = sd) # Standard deviation
	cast(mtcars.melt, formula = cyl ~ variable, fun.aggregate = stats::quantile) # Quantiles

	## Let's ask a slightly different question: What is the relationship between the cylinders and horsepower?
	# All of these are equivalent: hp is the only measured variable and every other column is an identifier.
	mtcars.hp.melt <- melt(mtcars, measure.vars = "hp")
	mtcars.hp.melt <- melt(mtcars, id.vars = -4) # hp is column 4, so -4 means "every column except hp".
	mtcars.hp.melt <- melt(mtcars, id.vars = c("mpg", "cyl", "disp", "drat", "wt", "qsec", "vs", "am", "gear", "carb")) # hp must not be listed as an id, or nothing is left to melt.

	# Again cast the data to reshape
	cast(mtcars.hp.melt, cyl ~ variable, mean) # hp is our melted variable

	# Sometimes it's handy to store the cast data.
	mtcars.hp.cast.mean <- cast(mtcars.hp.melt, cyl ~ variable, mean) # Mean
	mtcars.hp.cast.sd <- cast(mtcars.hp.melt, cyl ~ variable, sd) # Standard Deviation
	mtcars.hp.cast.length <- cast(mtcars.hp.melt, cyl ~ variable, length) # Length

	# Now we can combine them into one summary table.
	# Each cast result has two columns (cyl, hp), so the statistic is always column 2.
	mtcars.hp.cyl.descriptives <- data.frame(mtcars.hp.cast.mean, mtcars.hp.cast.sd[ ,2], mtcars.hp.cast.length[ ,2])
	names(mtcars.hp.cyl.descriptives) <- c("Cylinders", "Average Horsepower", "Horsepower Standard Deviation", "Number of cars")

	# An easier way to do this is in the casting
	cast(mtcars.hp.melt, cyl ~ variable, c(mean, sd, length))

	# Now we can plot it as well
	plot(mtcars.hp.cyl.descriptives$Cylinders, mtcars.hp.cyl.descriptives$"Number of cars") # Better to just use ggplot, but it shows how to do it.

	## Which organization had the most FOIA (Freedom of Information Act) requests from the Law department?
	# Read data from City of Chicago (requires network access).
	foia <- read.csv("http://data.cityofchicago.org/api/views/44bx-ncpi/rows.csv")

	# Quickest way to answer our question is the table function
	table(foia$ORGANIZATION) # But this can't easily be used for plotting.

	# Melt and cast to make a proper table: one row per organization, one count column per melted variable.
	foia.melt <- melt(foia, id.vars = "ORGANIZATION")
	foia.cast <- cast(foia.melt, ORGANIZATION ~ variable, length)

	# Now we can graph it, treating the first count column as the number of requests.
	names(foia.cast)[2] <- "Requests"
	# The counts are already computed, so tell geom_bar() to plot y as-is
	# (stat = "identity") instead of counting rows itself.
	ggplot(foia.cast) + geom_bar(aes(x = ORGANIZATION, y = Requests), stat = "identity")

	# A little messy, so let's order them
	foia.cast[order(foia.cast$Requests), ] # Oops, reverse order
	foia.cast.order <- foia.cast[order(-foia.cast$Requests), ]
	foia.cast.order.topfive <- foia.cast.order[1:5, ]
	# Sorting the rows does not change the bar order on a discrete axis, so
	# reorder() the organizations by descending request count for the plot.
	ggplot(foia.cast.order.topfive) + geom_bar(aes(x = reorder(ORGANIZATION, -Requests), y = Requests), stat = "identity") + xlab("ORGANIZATION") # Plot the top 5 requesting agencies.

	## But what are the total number of requests?

	# Quickest answer is:
	nrow(foia) # But doesn't show any of the organizations.

	cast(foia.melt, ORGANIZATION ~ variable, length, margins="grand_row") # Same answer, as a grand-total row

	## So, I'm lazy and I hate all of these columns. Four of them display the exact same data. Let's fix that.
	# The "." collapses all variables into a single column instead of listing each one;
	# dividing by 4 undoes the four-fold repetition of each request in the molten data.
	cast(foia.melt, ORGANIZATION ~ ., function(x) length(x)/4, margins="grand_row")

	## What aspects of student and school district characteristics correlate to test scores?
	# Load data (requires network access).
	ed <- read.csv("http://www.ats.ucla.edu/stat/r/faq/hsb2.csv")

	# Inspect data
	head(ed)
	str(ed)

	# Melt data: the five test scores are the measured variables; every other column stays as an identifier.
	ed.melt <- melt(ed, measure.vars = c("read", "write", "math", "science", "socst"))

	# Cast data showing relationship between females and test scores
	ed.cast <- cast(ed.melt, female ~ variable, mean)
	cast(ed.melt, female + race ~ variable, mean) # Add race variable
	cast(ed.melt, female + race ~ ses + variable, mean) # Add socio-economic status

	# This is getting to be pretty high dimension, so let's make a list
	# (the "|" in the formula produces one table per test).
	cast(ed.melt, female ~ race | variable, mean)

	# We can make a heatmap showing test scores for each student.
	# fill is mapped to the score inside aes(); the white tile border is a fixed setting, so it goes outside aes().
	ggplot(ed.melt, aes(x = variable, y = id)) + geom_tile(aes(fill = value), color = "white") + scale_fill_gradient(low = "white", high = "steelblue")