# Read the gapminder CSV; here::here() builds the path data/gapminder5.csv
# relative to the project root.
gapminder <- read.csv(here::here("data", "gapminder5.csv"))
# Preview the first rows of the data frame
head(gapminder)
## country year pop continent lifeExp gdpPercap
## 1 Afghanistan 1952 8425333 Asia 28.801 779.4453
## 2 Afghanistan 1957 9240934 Asia 30.332 820.8530
## 3 Afghanistan 1962 10267083 Asia 31.997 853.1007
## 4 Afghanistan 1967 11537966 Asia 34.020 836.1971
## 5 Afghanistan 1972 13079460 Asia 36.088 739.9811
## 6 Afghanistan 1977 14880372 Asia 38.438 786.1134
# Same load-and-preview step as at the top of this file, repeated here.
gapminder <- read.csv(here::here("data", "gapminder5.csv"))
head(gapminder)
## country year pop continent lifeExp gdpPercap
## 1 Afghanistan 1952 8425333 Asia 28.801 779.4453
## 2 Afghanistan 1957 9240934 Asia 30.332 820.8530
## 3 Afghanistan 1962 10267083 Asia 31.997 853.1007
## 4 Afghanistan 1967 11537966 Asia 34.020 836.1971
## 5 Afghanistan 1972 13079460 Asia 36.088 739.9811
## 6 Afghanistan 1977 14880372 Asia 38.438 786.1134
gapminder dataset
str(gapminder)
## 'data.frame': 1704 obs. of 6 variables:
## $ country : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ year : int 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ pop : num 8425333 9240934 10267083 11537966 13079460 ...
## $ continent: chr "Asia" "Asia" "Asia" "Asia" ...
## $ lifeExp : num 28.8 30.3 32 34 36.1 ...
## $ gdpPercap: num 779 821 853 836 740 ...
# gapminder dataset
# Store the two label columns as character vectors, then re-inspect the
# structure of the data frame.
gapminder[c("country", "continent")] <-
  lapply(gapminder[c("country", "continent")], as.character)
str(gapminder)
## 'data.frame': 1704 obs. of 6 variables:
## $ country : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ year : int 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ pop : num 8425333 9240934 10267083 11537966 13079460 ...
## $ continent: chr "Asia" "Asia" "Asia" "Asia" ...
## $ lifeExp : num 28.8 30.3 32 34 36.1 ...
## $ gdpPercap: num 779 821 853 836 740 ...
gapminder dataset
mean(gapminder$lifeExp[gapminder$country == "Afghanistan"])
## [1] 37.47883
mean(gapminder$lifeExp[gapminder$country == "Albania"])
## [1] 68.43292
#...
gapminder dataset
Or we could save ourselves a lot of typing and time
Loops!
for, while, and the apply family
- for
- while
- apply family (often preferred to loops because the code is cleaner)
- dplyr and data.table offer better approaches to loops
for loop
for loops repeat a function for all values in a vector – don’t cut and paste!
for (i in vector) { function(i) }
- i is the iterator variable (could be any letter!)
- i for each iteration
for loop
# create a vector of values that you want to repeat the function for
# Row indices to iterate over. seq_len() is used instead of 1:nrow() because
# 1:0 would yield c(1, 0) for an empty data frame, whereas seq_len(0) is empty.
obs <- seq_len(nrow(gapminder))
# initialize the for loop with `for (i in vector)`
for (i in obs) { # the function to repeat is enclosed in braces {}
  # total GDP for row i = population * GDP per capita
  gapminder[i, "gdp"] <- gapminder[i, "pop"] * gapminder[i, "gdpPercap"]
}
Create a new variable that finds the natural log (log)
of the GDP per capita and of population - call them
log_gdpPercap and log_pop
# initialize the for loop with `for (i in vector)`
# `obs` holds the row indices of gapminder (created earlier in this file)
for (i in obs) { # the function to repeat is enclosed in braces {}
# store the natural log of GDP per capita and of population for row i
gapminder[i, "log_gdpPercap"] <- log(gapminder[i, "gdpPercap"])
gapminder[i, "log_pop"] <- log(gapminder[i, "pop"])
}
# show the first rows, now including the newly added columns
head(gapminder)
## country year pop continent lifeExp gdpPercap gdp
## 1 Afghanistan 1952 8425333 Asia 28.801 779.4453 6567086330
## 2 Afghanistan 1957 9240934 Asia 30.332 820.8530 7585448670
## 3 Afghanistan 1962 10267083 Asia 31.997 853.1007 8758855797
## 4 Afghanistan 1967 11537966 Asia 34.020 836.1971 9648014150
## 5 Afghanistan 1972 13079460 Asia 36.088 739.9811 9678553274
## 6 Afghanistan 1977 14880372 Asia 38.438 786.1134 11697659231
## log_gdpPercap log_pop
## 1 6.658583 15.94675
## 2 6.710344 16.03915
## 3 6.748878 16.14445
## 4 6.728864 16.26115
## 5 6.606625 16.38655
## 6 6.667101 16.51555
# Vectorized alternative: log() works on the whole column at once, so no
# loop is needed.
gapminder$vec_log_gdpPercap <- log(gapminder$gdpPercap)
# Confirm the loop result and the vectorized result agree. all.equal()
# compares with a numeric tolerance, which is safer for doubles than `==`;
# isTRUE() turns its result into a single TRUE/FALSE.
isTRUE(all.equal(gapminder$vec_log_gdpPercap, gapminder$log_gdpPercap))
## [1] TRUE
# Mean life expectancy for each year in the data, printed as "<year>: <mean>"
years <- unique(gapminder$year)
for (i in years) {
  # na.rm = TRUE is spelled out: `T` is an ordinary variable that can be
  # reassigned, whereas TRUE is a reserved word
  mean_le <- mean(gapminder$lifeExp[gapminder$year == i],
                  na.rm = TRUE)
  print(paste0(i, ": ", mean_le))
}
## [1] "1952: 49.0576197183099"
## [1] "1957: 51.5074011267606"
## [1] "1962: 53.6092490140845"
## [1] "1967: 55.6782895774648"
## [1] "1972: 57.6473864788732"
## [1] "1977: 59.5701574647887"
## [1] "1982: 61.5331971830986"
## [1] "1987: 63.2126126760563"
## [1] "1992: 64.160338028169"
## [1] "1997: 65.014676056338"
## [1] "2002: 65.6949225352113"
## [1] "2007: 67.0074225352113"
# Mean life expectancy for each continent, printed as "<continent>: <mean>"
conts <- unique(gapminder$continent)
for (i in conts) {
  # na.rm = TRUE is spelled out: `T` is an ordinary variable that can be
  # reassigned, whereas TRUE is a reserved word
  mean_le <- mean(gapminder$lifeExp[gapminder$continent == i],
                  na.rm = TRUE)
  print(paste0(i, ": ", mean_le))
}
## [1] "Asia: 60.0649032323232"
## [1] "Europe: 71.9036861111111"
## [1] "Africa: 48.8653301282051"
## [1] "Americas: 64.6587366666667"
## [1] "Oceania: 74.3262083333333"
# for loops by defining different iterators
# for loop
# Outer loop walks the continents (i); inner loop walks the years (j).
for (i in conts) {
  print(paste0("Continent: ", i))
  for (j in years) {
    # mean life expectancy for continent i in year j
    # (na.rm = TRUE spelled out; `T` can be reassigned)
    mean_le <- mean(gapminder$lifeExp[gapminder$continent == i &
                                        gapminder$year == j],
                    na.rm = TRUE)
    print(paste0(j, ": ", mean_le))
  }
}
## [1] "Continent: Asia"
## [1] "1952: 46.3143939393939"
## [1] "1957: 49.3185442424242"
## [1] "1962: 51.563223030303"
## [1] "1967: 54.66364"
## [1] "1972: 57.3192690909091"
## [1] "1977: 59.6105563636364"
## [1] "1982: 62.6179393939394"
## [1] "1987: 64.8511818181818"
## [1] "1992: 66.5372121212121"
## [1] "1997: 68.0205151515152"
## [1] "2002: 69.2338787878788"
## [1] "2007: 70.7284848484849"
## [1] "Continent: Europe"
## [1] "1952: 64.4085"
## [1] "1957: 66.7030666666667"
## [1] "1962: 68.5392333333333"
## [1] "1967: 69.7376"
## [1] "1972: 70.7750333333333"
## [1] "1977: 71.9377666666667"
## [1] "1982: 72.8064"
## [1] "1987: 73.6421666666667"
## [1] "1992: 74.4401"
## [1] "1997: 75.5051666666667"
## [1] "2002: 76.7006"
## [1] "2007: 77.6486"
## [1] "Continent: Africa"
## [1] "1952: 39.1355"
## [1] "1957: 41.2663461538462"
## [1] "1962: 43.3194423076923"
## [1] "1967: 45.3345384615385"
## [1] "1972: 47.4509423076923"
## [1] "1977: 49.5804230769231"
## [1] "1982: 51.5928653846154"
## [1] "1987: 53.3447884615385"
## [1] "1992: 53.6295769230769"
## [1] "1997: 53.5982692307692"
## [1] "2002: 53.3252307692308"
## [1] "2007: 54.8060384615385"
## [1] "Continent: Americas"
## [1] "1952: 53.27984"
## [1] "1957: 55.96028"
## [1] "1962: 58.39876"
## [1] "1967: 60.41092"
## [1] "1972: 62.39492"
## [1] "1977: 64.39156"
## [1] "1982: 66.22884"
## [1] "1987: 68.09072"
## [1] "1992: 69.56836"
## [1] "1997: 71.15048"
## [1] "2002: 72.42204"
## [1] "2007: 73.60812"
## [1] "Continent: Oceania"
## [1] "1952: 69.255"
## [1] "1957: 70.295"
## [1] "1962: 71.085"
## [1] "1967: 71.31"
## [1] "1972: 71.91"
## [1] "1977: 72.855"
## [1] "1982: 74.29"
## [1] "1987: 75.32"
## [1] "1992: 76.945"
## [1] "1997: 78.19"
## [1] "2002: 79.74"
## [1] "2007: 80.7195"
# for loop exercise!
# What is the standard deviation (sd) for life expectancy
# for each continent for each year?
for (i in conts) {
  print(paste0("Continent: ", i))
  for (j in years) {
    # standard deviation of life expectancy for continent i in year j
    # (na.rm = TRUE spelled out; `T` can be reassigned)
    sd_le <- sd(gapminder$lifeExp[gapminder$continent == i &
                                    gapminder$year == j],
                na.rm = TRUE)
    print(paste0(j, ": ", sd_le))
  }
}
## [1] "Continent: Asia"
## [1] "1952: 9.29175069597824"
## [1] "1957: 9.63542861940215"
## [1] "1962: 9.82063194066467"
## [1] "1967: 9.65096458232544"
## [1] "1972: 9.72270004073083"
## [1] "1977: 10.0221969818167"
## [1] "1982: 8.53522140873991"
## [1] "1987: 8.20379188414779"
## [1] "1992: 8.07554897033932"
## [1] "1997: 8.09117060876087"
## [1] "2002: 8.37459538857541"
## [1] "2007: 7.96372447069057"
## [1] "Continent: Europe"
## [1] "1952: 6.36108825405387"
## [1] "1957: 5.29580539238584"
## [1] "1962: 4.30249955966524"
## [1] "1967: 3.79972849846788"
## [1] "1972: 3.2405763693743"
## [1] "1977: 3.12102997680124"
## [1] "1982: 3.21826029893856"
## [1] "1987: 3.16968033940696"
## [1] "1992: 3.20978108986074"
## [1] "1997: 3.10467655135052"
## [1] "2002: 2.92217957861169"
## [1] "2007: 2.9798126601609"
## [1] "Continent: Africa"
## [1] "1952: 5.1515814343277"
## [1] "1957: 5.62012285430095"
## [1] "1962: 5.87536393337021"
## [1] "1967: 6.08267262744012"
## [1] "1972: 6.41625832389558"
## [1] "1977: 6.80819741006083"
## [1] "1982: 7.37594008904693"
## [1] "1987: 7.86408910830706"
## [1] "1992: 9.46107098639753"
## [1] "1997: 9.10338657543333"
## [1] "2002: 9.58649585045544"
## [1] "2007: 9.63078067196179"
## [1] "Continent: Americas"
## [1] "1952: 9.32608188397822"
## [1] "1957: 9.03319227681997"
## [1] "1962: 8.50354373815215"
## [1] "1967: 7.90917103705144"
## [1] "1972: 7.32301680161029"
## [1] "1977: 7.06949561543585"
## [1] "1982: 6.72083381905351"
## [1] "1987: 5.80192884249138"
## [1] "1992: 5.16710380580843"
## [1] "1997: 4.88758389629614"
## [1] "2002: 4.7997054986044"
## [1] "2007: 4.44094763085538"
## [1] "Continent: Oceania"
## [1] "1952: 0.190918830920365"
## [1] "1957: 0.0494974746830535"
## [1] "1962: 0.219203102167821"
## [1] "1967: 0.296984848098351"
## [1] "1972: 0.0282842712474663"
## [1] "1977: 0.898025612106913"
## [1] "1982: 0.636396103067887"
## [1] "1987: 1.4142135623731"
## [1] "1992: 0.869741340859456"
## [1] "1997: 0.905096679918782"
## [1] "2002: 0.890954544295053"
## [1] "2007: 0.729027091403335"
for loops can be slow…very slow
apply family of functions as a faster alternative
apply family is “loop-hiding”
apply and its relatives help you write cleaner code,
but do not expect much of a speed boost
apply, lapply, sapply
apply
apply(matrix, 1 = row or 2 = column, function) - Let’s
say we want to find the mean for each stat in gapminder
# Keep only the three numeric measures, then take the mean of every column
# (MARGIN = 2 applies the function over columns).
vars <- gapminder[c("lifeExp", "pop", "gdpPercap")]
apply(X = vars, MARGIN = 2, FUN = mean)
## lifeExp pop gdpPercap
## 5.947444e+01 2.960121e+07 7.215327e+03
apply versus for
apply(vars, 2, mean)
## lifeExp pop gdpPercap
## 5.947444e+01 2.960121e+07 7.215327e+03
# Looping over a data frame with `for` visits one column at a time,
# so this prints the mean of each column of `vars` in turn.
for (i in vars) {
print(mean(i))
}
## [1] 59.47444
## [1] 29601212
## [1] 7215.327
lapply and sapply
lapply and sapply iterate over values in a vector or list, rather than rows or columns
- lapply returns a list
- sapply returns a simplified list (i.e., a vector)
- sapply returns results, so always check
lapply and sapply
lapply(vector, function)
lapply(gapminder, mean)
## $country
## [1] NA
##
## $year
## [1] 1979.5
##
## $pop
## [1] 29601212
##
## $continent
## [1] NA
##
## $lifeExp
## [1] 59.47444
##
## $gdpPercap
## [1] 7215.327
##
## $gdp
## [1] 186809560507
##
## $log_gdpPercap
## [1] 8.158791
##
## $log_pop
## [1] 15.76611
##
## $vec_log_gdpPercap
## [1] 8.158791
# Same computation with sapply(), which simplifies the result to a named
# vector; the character columns (country, continent) come back as NA.
sapply(gapminder, mean)
## country year pop continent
## NA 1.979500e+03 2.960121e+07 NA
## lifeExp gdpPercap gdp log_gdpPercap
## 5.947444e+01 7.215327e+03 1.868096e+11 8.158791e+00
## log_pop vec_log_gdpPercap
## 1.576611e+01 8.158791e+00
apply
apply call
function(x) [function] to the call – x becomes the iterator
sapply(years, function(x) mean(gapminder$lifeExp[gapminder$year == x]))
## [1] 49.05762 51.50740 53.60925 55.67829 57.64739 59.57016 61.53320 63.21261
## [9] 64.16034 65.01468 65.69492 67.00742
# while loop
# for loop repeats a function for all values in vector
# while loop!
# while loop syntax: for loop -> while (condition) { function }
i <- 1952 # define the iterator
while (i < 1987) {
  # Assign to `sd_le`, the name that is printed below. The earlier version
  # assigned `sd_lf` but printed `sd_le`, so a stale value left over from a
  # previous loop was printed for every year.
  sd_le <- sd(gapminder$lifeExp[gapminder$year == i])
  print(paste0(i, ": ", sd_le))
  i <- i + 5 # increase the iterator by the interval between years
}
# NOTE(review): the recorded output that follows in this file (the same value
# for every year) was produced before this fix and should be regenerated.
## [1] "1952: 0.729027091403335"
## [1] "1957: 0.729027091403335"
## [1] "1962: 0.729027091403335"
## [1] "1967: 0.729027091403335"
## [1] "1972: 0.729027091403335"
## [1] "1977: 0.729027091403335"
## [1] "1982: 0.729027091403335"
# while loop
i <- 1987 # define the iterator
while (i <= 2002) {
  # Assign to `sd_le`, the name that is printed below. The earlier version
  # assigned `sd_lf` but printed `sd_le`, so a stale value left over from a
  # previous loop was printed for every year.
  sd_le <- sd(gapminder$lifeExp[gapminder$year == i])
  print(paste0(i, ": ", sd_le))
  i <- i + 5 # increase the iterator by the interval between years
}
# NOTE(review): the recorded output that follows in this file (the same value
# for every year) was produced before this fix and should be regenerated.
## [1] "1987: 0.729027091403335"
## [1] "1992: 0.729027091403335"
## [1] "1997: 0.729027091403335"
## [1] "2002: 0.729027091403335"
# while loop cautionary tale
# while loop will continually run if the logical condition is always satisfied!
# while loop without increasing the iterator
i <- 1987 # define the iterator
while (i <= 2002) {
  # assign to `sd_le`, the name printed below (the earlier version assigned
  # `sd_lf`, so the printed value never reflected this computation)
  sd_le <- sd(gapminder$lifeExp[gapminder$year == i])
  print(paste0(i, ": ", sd_le))
  # oops! forgot to increase i by 5! ESC! ESC! ESC!
  # (deliberately left out for the demonstration: `i` never changes, so this
  # loop never ends and has to be interrupted by hand)
}
if/else conditional!
if/else - executes a function when a condition is met
if
for and while, initialize with if and then detail condition in parentheses
random_year <- sample(years, 1)
random_year
## [1] 1992
# Print the sampled year only when it is later than 1977; there is no else
# branch, so nothing happens otherwise.
if (random_year > 1977) {
print(random_year)
}
## [1] 1992
# if statements
# Fix the random seed with set.seed() so the draw from `years` is reproducible,
# then print the drawn year only when it is later than 1977.
set.seed(10)
random_year <- sample(years, size = 1)
if (random_year > 1977) print(random_year)
## [1] 2002
else clause
random_year <= 1977? NOTHING!
else statement, telling R what to do when the if condition isn’t met
set.seed(1)
random_year <- sample(years, 1)
random_year
## [1] 1992
# Handle the "not later than 1977" case first; otherwise print the year.
if (random_year <= 1977) {
  print("sorry, random year is less than 1977")
} else {
  print(random_year)
}
## [1] 1992
# if and else together
# Draw a year at random; for years after 1977 report the mean life expectancy
# of that year, otherwise print an apology.
random_year <- sample(years, 1)
if (random_year <= 1977) {
  print("sorry, random year is less than 1977")
} else {
  print(paste0(random_year, ": ",
               mean(gapminder$lifeExp[gapminder$year == random_year])))
}
## [1] "sorry, random year is less than 1977"
for and if/else together
if…else clause to a for loop
# initialize the `for` loop
# Skeleton only: the empty `for ()` and `if ()` are placeholders to fill in
# (iterator and condition), so this is not runnable as written.
for () {
if () { #initialize if statement
# function if condition is met
} else {
# function if condition is not met
}
}
for and if/else together
Which continents have a mean life expectancy greater than 70 years?
# Compare each continent's mean life expectancy with a fixed cut-off and
# report which side of the cut-off it falls on.
threshold <- 70
for (i in unique(gapminder$continent)) {
  tmp <- mean(gapminder$lifeExp[gapminder$continent == i])
  if (tmp >= threshold) {
    print(paste("Mean Life Expectancy in", i, "is greater than", threshold))
  } else {
    print(paste("Mean Life Expectancy in", i, "is less than", threshold))
  }
}
## [1] "Mean Life Expectancy in Asia is less than 70"
## [1] "Mean Life Expectancy in Europe is greater than 70"
## [1] "Mean Life Expectancy in Africa is less than 70"
## [1] "Mean Life Expectancy in Americas is less than 70"
## [1] "Mean Life Expectancy in Oceania is greater than 70"
for and if/else together
Write a for loop that reports the mean population for
years greater than or equal to 1987. Make sure the loop prints a message
if the condition is not met!
# for and if/else together
# Report the mean population for years >= 1987; print a message for earlier
# years.
for (i in years) {
  if (i >= 1987) {
    # The exercise asks for mean *population*; the earlier version averaged
    # `lifeExp` here by mistake.
    mean_pop <- mean(gapminder$pop[gapminder$year == i])
    print(paste0(i, ": ", mean_pop))
  } else {
    print("Sorry, year is less than 1987")
  }
}
# NOTE(review): the recorded output that follows in this file shows life
# expectancy means from before this fix and should be regenerated.
## [1] "Sorry, year is less than 1987"
## [1] "Sorry, year is less than 1987"
## [1] "Sorry, year is less than 1987"
## [1] "Sorry, year is less than 1987"
## [1] "Sorry, year is less than 1987"
## [1] "Sorry, year is less than 1987"
## [1] "Sorry, year is less than 1987"
## [1] "1987: 63.2126126760563"
## [1] "1992: 64.160338028169"
## [1] "1997: 65.014676056338"
## [1] "2002: 65.6949225352113"
## [1] "2007: 67.0074225352113"
[
gapminder$lifeExp[gapminder$country == "Germany"]
## [1] 67.500 69.100 70.300 70.800 71.000 72.500 73.800 74.847 76.070 77.340
## [11] 78.670 79.406
# `[` is itself a function: this prefix-form call performs the same subsetting
# as the bracket syntax and gives the same values.
`[`(gapminder$lifeExp, gapminder$country == "Germany")
## [1] 67.500 69.100 70.300 70.800 71.000 72.500 73.800 74.847 76.070 77.340
## [11] 78.670 79.406
# function
# Template showing the parts of a function definition. The slide title had
# been fused onto the name, which bound `functionmy_function` instead of
# `my_function`.
my_function <- # give the function a name
  function(x, y) { # arguments for the function go inside the parentheses
    # the expressions go in the braces
  }
# Print the distinct values of one column, each prefixed with the column name.
#
# df:       a data frame
# variable: name of the column to inspect (defaults to "continent")
# Returns (invisibly, via print) the character vector that was printed.
get_values <- function(df, variable = "continent") {
  distinct_vals <- unique(df[[variable]])
  labelled <- paste0(variable, ": ", distinct_vals)
  print(labelled)
}
# gapminder dataset
# Print the mean and standard deviation of one variable for one country.
#
# df:       a data frame with a `country` column
# variable: name of the numeric column to summarise
# country:  value of `country` whose rows are summarised
# Output is written with cat(); the labels always read "Life Expectancy".
#
# The slide title had been fused onto this definition, leaving a stray
# `gapminder` expression and binding the function to `datasetreport_mean_sd`;
# the intended name `report_mean_sd` (used by the example call) is restored.
report_mean_sd <-
  function(df, variable, country) {
    # rows for the requested country (named `values` to avoid masking
    # stats::var)
    values <- df[[variable]][df$country == country]
    m_le <- mean(values)
    sd_le <- sd(values)
    cat("Country:", country,
        "\nMean Life Expectancy:", m_le,
        "\nSD Life Expectancy:", sd_le)
  }
# Example: summarise the lifeExp column for the rows where country is "Bulgaria"
report_mean_sd(gapminder, "lifeExp", "Bulgaria")
## Country: Bulgaria
## Mean Life Expectancy: 69.74375
## SD Life Expectancy: 3.55268
# gapminder: min, max
# Print the minimum and maximum of one variable for one continent.
#
# df:        a data frame with a `continent` column
# variable:  name of the numeric column to summarise
# continent: value of `continent` whose rows are summarised
# Output is written with cat(); the labels always read "Life expectancy".
#
# The slide text had been fused onto this definition ("gapmindermin,
# maxreport_stats"), which made it a parse error; the intended name
# `report_stats` (used by the example call) is restored.
report_stats <-
  function(df, variable, continent) {
    # rows for the requested continent
    values <- df[[variable]][df$continent == continent]
    min_le <- min(values)
    max_le <- max(values)
    cat("Continent:", continent,
        "\nMinimum Life expectancy:", min_le,
        "\nMaximum Life expectancy:", max_le)
  }
# Example: minimum and maximum of the lifeExp column for the rows where continent is "Asia"
report_stats(gapminder, "lifeExp", "Asia")
## Continent: Asia
## Minimum Life expectancy: 28.801
## Maximum Life expectancy: 82.603
# Plot log(dv) against log(iv) for a single year and overlay the fitted
# linear-model line in blue.
#
# df:   a data frame with a `year` column
# dv:   name of the dependent-variable column (y axis label)
# iv:   name of the independent-variable column (x axis label)
# year: value of `year` to keep; also used as the plot title
viz_lm <- function(df, dv, iv, year) {
  in_year <- df[["year"]] == year
  year_rows <- df[in_year, ]
  log_dv <- log(year_rows[[dv]])
  log_iv <- log(year_rows[[iv]])
  fit <- lm(log_dv ~ log_iv)
  plot(log_dv ~ log_iv, main = year, xlab = iv, ylab = dv)
  # fitted values at the observed x positions, drawn as a line
  lines(log_iv, predict(fit), col = "blue")
}
# Draw the log-log plot with its fitted line for a single year
viz_lm(gapminder, "lifeExp", "gdpPercap", 1977)
# Repeat for every value in `years`, producing one plot per year
for (i in years) {
viz_lm(gapminder, "lifeExp", "gdpPercap", i)
}
rmd_exercise_template.Rmd
R4RExercise_LastnameFirstname.Rmd.