- Ubuntu Install
- General observations
- Basics
- Data Types
- Subsetting
- regular expression
- Tidyverse library
- Descriptive statistics
- Statistical model
- Plot
sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9
sudo add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/'
sudo apt update
sudo apt install r-base
- get help by prepending
?
to a function name, data set, symbol... - implicit printing, therefore, usually you don't need to write
print()
- counting starts at 1; so to get the first item in a vector
x[1]
; - in function calls, you can specify the arguments by name or order, e.g.
plot(iris$Species, iris$Petal.Width)
,plot(x = iris$Species, y = iris$Petal.Width)
- vectorized language, therefore there's much less need to iterate over objects, e.g.
- to get double of each element of this vector
x <- c(3, 5, 8)
, you only needx * 2
- to get double of each element of this vector
- AND, OR comparisons can be made with single or double signs (
&, &&, |, ||
), they behave in different ways
?
prepend a question mark to a function, data set, library name and get info about it
#
= single-line comments- R doesn’t support multi-lined comments
cat("\014")
is the code to send "CTRL + L" to the console
data()
= lists all available data sets, including from libraries (if they're loaded)
View(data set)
NOTE: it's also permissible to declare variable and functions with equal sign
- variables:
x <- 20
- functions:
myF <- function() {...}
- vectors:
x <- c(5, 8, 12)
# Add up any number of values passed through `...`.
# The fold starts from 0 and combines the arguments one by one with `+`,
# so calling add() without arguments yields 0.
add <- function(...) {
  Reduce(`+`, list(...), 0)
}
add(2, 3, 5, 4)
class()
ortypeof()
= the only difference is that class calls double 'numeric' and typeof 'double'str()
= short for structure, displays the internal structure of the given object
%%, %/%
= remainder and quotient
:
= creates the series of numbers in sequence for a vector
%in%
= if element belongs to a vector
& |
, && ||
=
- single operators examine the vector element by element and return a vector filled with logical values (TRUE or FALSE)
- double operators expect single (length-one) values and return a single logical value (TRUE or FALSE); before R 4.3 they silently examined only the first element of each vector, since R 4.3 a longer vector is an error
# Two logical vectors used to contrast element-wise and scalar operators.
x <- c(TRUE, FALSE, TRUE)
y <- c(TRUE, FALSE, FALSE)
# `&` is element-wise: one result per pair of elements
print(x & y) # TRUE FALSE FALSE
# `&&` only accepts length-one operands (R >= 4.3 raises an error for longer
# vectors), so the elements to compare are picked explicitly
print(x[1] && y[1]) # TRUE
ifelse()
is a vector equivalent form of theif...else
statement
x <- c(3, 5, 8, 12)
# ifelse(test, yes, no)
# returns a value with the same shape as test, usually a vector
# filled with elements selected from either yes or no
# depending on whether the element of test is TRUE or FALSE
ifelse(x %% 2 == 0, "even", "odd")
# loop 1 through 10 (inclusive)
# the range now starts at 1 so the code matches the description above
for (n in 1:10) {
  print(n)
}
# loop vector elements
x <- c(5, 8, 12, 15)
for (n in x) {
print(n)
}
# while loop: prints 1 through 10; the counter has to be advanced by hand,
# otherwise the condition never becomes FALSE
x <- 1
while (x <= 10) {
  print(x)
  x <- x + 1
}
color <- "b"
switch(color, "r" = "red", "b" = "blue", "unknown")
x <- scan()
iris$Petal.Width [iris$Species == "setosa"]
plot(iris$Petal.Width [iris$Species == "setosa"| iris$Species == "virginica" ])
# Return twice the given value; arithmetic is vectorised, so `x` may be a
# whole vector (apply() below hands it one column at a time).
double <- function(x) {
  2 * x
}
x <- matrix(c(3, 5, 8, 12), nrow = 2)
# apply(X, MARGIN, FUN, …)
# X = matrix; MARGIN = 1 for rows, 2 for cols; FUN = function to apply
apply(x, 2, double)
- In R, everything is an object
R doesn't have primitive data types in the way that other languages do. In R even the simplest numeric value is an example of a vector.
- used often:
- logical =
TRUE
,FALSE
- numeric/double = can be an integer or contain a decimal value
- character = enclosed with quotes (single or double)
- logical =
- not used often:
- integer = declare explicitly with
x <- 10L
- complex = numbers, e.g.
3 + 2i
- raw = created with
charToRaw()
- integer = declare explicitly with
- must contain only one data type
- created with
c()
- e.g. numeric vectors
x <- c(5, 8, 12)
- index starts at 1
- can contain many different types of elements
- like vectors, functions, lists...
l <- list(c(3, 5, 8), "my string...", TRUE, list("a", "b"), myFunction)
x <- list(a = "aaa", b = "bbb")
= can have named elements
matrix(data = NA, nrow = 1, ncol = 1, byrow = FALSE, dimnames = NULL)
- like vectors, matrices store information of the same data type
- two-dimensional
- e.g.
m <- matrix(c(3, 5, 8, 12, 15, 18), 2, 3)
m[1,2]
= access specific element
m[1,]
= access all of the row 1,
m[,2]
= access all of the column 2
- while matrices are two-dimensional, arrays can be any number of dimensions
- store only one data type
array(c('green','yellow'), dim = c(2,3,2))
= creates 2 matrices with 2 rows and 3 columns each
- created using a vector (of categorical values), it stores the vector along with its distinct values
v <- c("pinapple", "banana", "banana", "apple", "pinapple", "banana")
f <- factor(v)
print(f) # print vector and levels (each distinct vector value)
print(nlevels(f)) # print how many distinct vector value
print(levels(f)) # print each distinct vector value
- is a form of matrix, which is tabular and can contain different data types
- columns are variables and rows are observations
df <- data.frame(
Name = c("John", "Matt"),
Age = c(25, 27),
City = c("Boston", "NY")
)
print(nrow(df))
print(ncol(df))
print(dim(df)) # get both nrow and ncol
- Subsetting in R is a useful indexing feature for accessing object elements,
- it can be used to select and filter variables and observations.
[]
= get a subset of length 1 or more- usually, object and its subset are of the same type; therefore, subset of vector will be a vector, subset of a data frame will be a data frame...
- however, there's one inconsistency - if the subset contains only one value, R will reduce the result to the lowest dimension and then subset and container may have different type
- both names and indices can be used
- negative integers indicate exclusion
- variables are interpolated
- usually, object and its subset are of the same type; therefore, subset of vector will be a vector, subset of a data frame will be a data frame...
[[]]
= extract only one element (not necessarily just one value); i.e. vectors yield single value, data frames yield column vector- names or indices can be used
- variables are interpolated
- usually, not the same type as the object container
- dimension of returned value isn't necessarily 1
$
= special case of[[
in which you access a single item by a name- therefore,
iris$Species
andiris[["Species"]]
are equivalent - cannot use integer indices
- if name contain special characters, name must be enclose in backticks
- therefore,
a <- c(3, 5, 8,12)
# accessing with numbers
a[1]
a[c(1, 3)] # positive get multiple specified elements
a[-c(2, 4)] # negative exclude elements
# accessing with logical values
a[c(TRUE,FALSE,TRUE,FALSE)] # select elements where the value is TRUE
a[a > 5] # therefore, this is possible
- if two vectors are of unequal length, the shorter one will be recycled in order to match the longer vector
- if longer object length is not a multiple of shorter object length, the program will throw a warning but it'll still return a result
a <- c(2, 3, 5, 8)
b <- c(1, 2)
a * b # result: 2, 6, 5, 16
x <- list("a", "b", "c")
# single bracket returns a object of class 'list'
class(x[2]) # list
# double brackets returns a single element (not of class 'list')
class(x[[2]]) # character
# named lists
y <- list(f = 1:3, s = "a", t = 4:6)
y$f
y[["f"]]
# Build a 3 x 3 matrix filled row by row, then pull pieces out of it with [row, column].
m <- matrix(c(3, 5, 8, 12, 15, 18, 21, 25, 30), nrow = 3, byrow = TRUE)
m[1,] # entire first row
m[, 1] # a blank index selects every row/column; here: the entire first column
m[2, 1] # element at second row, first column
m[1:2, 2:3] # rows 1 to 2, restricted to columns 2 to 3
m[c(1, 3), c(1, 3)] # rows 1 and 3, restricted to columns 1 and 3
# using a 2 column matrix to subset a matrix
# each row of the index matrix specifies one (row, column) pair
select <- matrix(c(1, 1, 1, 3, 3, 1, 3, 3), ncol = 2, byrow = TRUE)
m[select] # result: 3 8 21 30
# Data frames can be indexed like lists (by column) or like matrices (row, column).
mtcars[3] # single index will return specified column(s)
mtcars[3, 1] # two indices will behave like matrices, first is row and second is column
mtcars$mpg # access a column by name
mtcars[["mpg"]] # access a column by name, equivalent to the `$` form
mtcars[3, "mpg"] # access by both, index and name; third row, column named "mpg"
mtcars$mpg[3] # access by both, name and index
# filtering by column
# column (second argument) is left blank, to return all columns
iris[iris$Species == "setosa", ]
iris[iris$Petal.Width > 0.5 & iris$Species == "setosa", ] # multiple filters
# grepl returns a vector of logical values
g <- grepl("Toyota", rownames(mtcars))
mtcars[g, ]
# grep returns a vector with the indices that contain a match
g <- grep("Toyota", rownames(mtcars))
mtcars[g, ]
# using grepl together with dplyr
library(tidyverse)
iris %>%
filter(grepl("setosa", Species))
- tidyverse is a set of packages that make it easier to perform everyday data analyses and that work in harmony (packages share a common API)
sudo apt install libcurl4-openssl-dev libssl-dev libxml2-dev
= ubuntu packages needed- or
install.packages("tidyverse")
= to install from the r script
- or
library(tidyverse)
= to load a library
library(tidyverse)
- from now on, any tidyverse function (like
dplyr::filter
) can be called withoutdplyr::
- you only need to prepend
dplyr::
if there're name collisions and you need to call the function that was overwritten
- from now on, any tidyverse function (like
%>%
= simplifies chaining, that is, passing a single data set through several functions
library(tidyverse)
# without pipe ('.data')
f <- filter(.data = mpg, model == "a4")
s <- select(.data = f, manufacturer, model, year)
s
# using pipe
mpg %>%
filter(model == "a4") %>%
select(manufacturer, model, year)
- manipulate data sets
library(tidyverse)
mtcars %>%
filter(
mpg > 20,
cyl == 4,
wt < 2.5,
grepl("Toyota", rownames(mtcars))
) %>%
arrange(mpg) %>%
select (mpg, cyl, wt)
- helps create tidy data, that is:
- every column is a variable
- every row is an observation
- every cell is a single value
- lengthens data, increasing the number of rows and decreasing the number of columns
gather()
is retired, recommendation is to use insteadpivot_longer()
library(tidyverse)
df <- data.frame(
name = c("John", "Mary", "Jake"),
a = c(7, 9, 18),
b = c(18, 5, 3),
c = c(32, 17, 35)
)
# 'key' and 'value' will be the names of the new cols
# 'key' will be a categorical variable holding the 'multiple columns' names
# and 'value' will hold the 'multiple columns' values
df %>%
# gather(key, value, ...multiple columns)
gather("drug", "volume", a, b, c)
df %>%
# pivot_longer(columns vector, names_to, values_to)
pivot_longer(
cols = c(a, b, c),
names_to = "drug",
values_to = "volume")
- widens data, increasing the number of columns and decreasing the number of rows
spread()
is retired, recommendation is to use insteadpivot_wider()
library(tidyverse)
df <- data.frame(
name = c("John", "John", "Mary", "Mary"),
drug = c("a", "b", "a", "b"),
volume = c(7, 18, 9, 5)
)
# spread
df %>%
# each individual value in key
# will be converted to a column
spread(key = "drug", value = "volume")
# pivot_wider()
df %>%
# each individual value in names_from
# will be converted to a column
pivot_wider(names_from = "drug", values_from = "volume")
- splits a single column into multiple columns
library(tidyverse)
df <- data.frame(
Name = c("John", "Mary"),
Job = c("Teacher, Designer", "Manager, Developer")
)
df %>%
separate(
col = "Job",
into = c("Job 1", "Job 2"), # names of the new columns to be created
sep = ", "
)
- combines multiple columns into one
library(tidyverse)
df <- data.frame(
Name = c("John", "Mary"),
Job1 = c("Teacher", "Manager"),
Job2 = c("Designer", "Developer")
)
df %>%
# unite(col = name of new column, ...columns to unite, sep = separator)
unite(
col = "Jobs",
"Job1",
"Job2",
sep = ", "
)
- given a regular expression with capturing groups, extract() turns each group into a new column
library(tidyverse)
# Sample data: full names with a varying number of middle names.
df <- data.frame(
  Name = c(
    "John Edwards Smith",
    "Mary Kate Miller Brown",
    "Matt Richards"
  )
)
# Each capturing group of `regex` becomes one of the columns named in `into`:
# group 1 = leading run of letters, group 2 = run of letters after the last
# whitespace character.
# [A-Za-z] is used instead of [A-z]: the latter range also covers the
# punctuation characters that sit between "Z" and "a" ([ \ ] ^ _ `).
df %>%
  extract(
    col = "Name",
    into = c("First name", "Last name"),
    regex = "([A-Za-z]*).*\\s([A-Za-z]*)"
  )
- in the bottom-right panel, click in the file name and select 'Import', 'From text(readr)'
- as you configure your data, the corresponding line of code is shown
# read_csv is from readr (included in tidyverse)
library(tidyverse)
setwd("~/Dev/r/")
hp <- read_csv("hp.csv")
hp
# write_csv is from readr (included in tidyverse)
library(tidyverse)
setwd("~/Dev/r/")
write_csv(iris, "iris.csv")
- summarize and describe a given data set
- create a frequency table from a categorical variable (column)
table(iris$Species)
min(mtcars$cyl)
median(mtcars$cyl)
mean(mtcars$cyl)
max(mtcars$cyl)
quantile(mtcars$cyl)
# get all at once
summary(mtcars$cyl)
summary(iris)
,summary(iris$Petal.Width)
= details about an object- if variable is categorical, result is a frequency table
- if variable is quantitative, result is a table containing measures of center (mean, median) and measures of spread (min, 1st qu., 3rd qu., max)
- correlation
# correlation between weight and miles per gallon
# (cor() computes the Pearson coefficient by default; a value near -1 means
# heavier cars tend to have lower mpg)
cor(mtcars$wt, mtcars$mpg) # result: -0.87 (rounded)
- is a set of mathematical equations based on probabilities and used to describe the relationship between two or more variables
- purpose: description, inference (estimates the parameters of a larger population), comparison (compare if two sets of data are different in a statistically significant way) and prediction (about new, unknown observations)
- describes the relationship between two variables, how changes in one variable affect the other variable
- it is a linear model because it assumes a straight line
- both variables must be a continuous numeric value
- the variable in the x axis is called 'explanatory variable', and the one in the y axis is called 'outcome variable'
- linear predictor function -
y = m * x + b
- m is the slope of the line (for each unit increase in x, how much does y increase)
- b is the y intercept (the y value when x is equal to 0)
# Scatter plot of the two variables the model relates.
plot(iris$Petal.Length, iris$Petal.Width)
# lm is the R function to create linear models
# formula reads: Petal.Width (outcome) explained by Petal.Length (explanatory)
model <- lm(
  formula = Petal.Width ~ Petal.Length,
  data = iris
)
# draw straight line on top of the plot
# fitted() is the accessor for the fitted values; `model$fitted` only worked
# through partial matching of the `fitted.values` component
lines(
  x = iris$Petal.Length,
  y = fitted(model),
  col = "red",
  lwd = 3
)
# predict new values from model
predict(
  object = model,
  newdata = data.frame(
    Petal.Length = c(2, 5, 7) # arbitrary values
  )
)
- plot is a graphical technique for representing a data set
- usually a graph showing the relationship between one or more variables
- in R, plot is usually done
- with base R, that is, without any third-party library
- with a library called ggplot2 (included with tidyverse)
- base R mostly use the
plot(x, y)
function- but there're also the
barplot(), hist()
functions
- but there're also the
- ggplot always use the
ggplot(data = data, mapping = aes())
function,
- followed by the operator
+
(it adds components to the plot; it is not a pipe like %>%)
- and then layers, scales, facets and/or coordinates
library(tidyverse)
# save plot to variable
# only save it, don't display it
p <- ggplot(mtcars, aes(x = cyl)) +
geom_bar()
# wont save flipped plot into the variable
# only displays it
p + coord_flip()
library(tidyverse)
# needed for third variable in aes()
f <- factor(mtcars$am)
levels(f) <- c("Automatic", "Manual")
ggplot(mtcars, aes( x = wt, y = mpg, shape = f, color = f )) +
geom_point() +
labs(
title = "WT VS MPG",
x = "weight",
y = "miles per gallon",
# change legend title with the aes names
shape = "Transmission",
color = "Transmission"
) +
theme( # theme() customize non-data components
plot.title =
element_text( face = "bold",
hjust = 0.5,
margin = margin(8, 0, 16, 0)),
axis.title =
element_text( face = "italic"),
axis.title.x =
element_text( margin = margin(8, 0, 4, 0) ),
axis.title.y =
element_text( margin = margin(0, 8, 0, 4) ),
axis.ticks = element_blank() # remove ticks
)
library(tidyverse)
ggplot(ChickWeight, aes(x = weight)) +
geom_histogram() +
coord_cartesian(xlim = c(200, 300)) # zoom
df <- data.frame(
Month = 1:12,
Num = as.vector(AirPassengers)[1:12]
)
plot(df$Num, type = "l")
polygon(c(min(df$Month), df$Month, max(df$Month)), c(0, df$Num, 0), col = "steelblue")
df <- data.frame(
Month = 1:12,
Num = as.vector(AirPassengers)[1:12]
)
ggplot(df, aes(x = Month, y = Num)) +
# geom_area() + # ymin fixed to 0, which would make plot very high
geom_ribbon(aes(ymin = 100, ymax = Num)) +
geom_line()
- x axis: categorical variable
- y axis: frequency/count
# plot()
plot(iris$Species)
# barplot()
t <- table(iris$Species) # creates frequency table
barplot(t)
ggplot(iris, aes(x = Species)) +
geom_bar()
dotchart(table(mtcars$cyl))
ggplot(mtcars, aes(x = cyl)) +
# stat = the statistical transformation to use on the data for this layer
geom_point(stat = "count") +
coord_flip()
pie(table(mtcars$cyl))
ggplot(
mtcars, aes(x = "", fill = as.factor(cyl))) +
geom_bar() +
coord_polar(theta = "y")
hist(mtcars$mpg)
ggplot(mtcars, aes(x = mpg)) +
geom_histogram(binwidth = 5) # binwidth = bar widths
plot(density(mtcars$mpg))
ggplot(mtcars, aes(x = mpg)) +
geom_density()
# Contingency table: rows are the cyl values, columns are the am values.
t <- table(mtcars$cyl, mtcars$am)
barplot(t, beside = TRUE) # grouped
barplot(t) # stacked
# percent: rescale every column so that it sums to 100
# (TRUE is spelled out because T is an ordinary variable that can be reassigned)
percentage <- apply(t, 2, function(x) x * 100 / sum(x, na.rm = TRUE))
barplot(percentage)
# grouped
ggplot(
data = mtcars,
aes(x = factor(am), fill = factor(cyl))) +
geom_bar(position = "dodge")
# stacked
ggplot(
data = mtcars,
aes(x = factor(am), fill = factor(cyl))) +
geom_bar()
# percent
ggplot(
data = mtcars,
aes(x = factor(am), fill = factor(cyl))) +
geom_bar(position = "fill")
plot(ChickWeight$Diet, ChickWeight$weight)
ggplot(ChickWeight, aes( x = Diet, y = weight)) +
geom_boxplot()
plot(mtcars$wt, mtcars$mpg)
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point()
library(tidyverse)
ggplot(mtcars, aes(x = wt, y = mpg, size = hp)) +
geom_point()
# both col and shape will need a categorical variable
f <- as.factor(mtcars$am)
levels(f) <- c("Automatic", "Manual") # rename levels
ggplot(mtcars, aes(x = wt, y = mpg, col = f)) +
geom_point()
ggplot(mtcars, aes(x = wt, y = mpg, shape = f)) +
geom_point()
# both col and shape
ggplot(mtcars, aes(x = wt, y = mpg, col = f, shape = f)) +
geom_point()
# multi-panel
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point() +
facet_grid(. ~ cyl)