13  Writing Efficient R Code

NoteLearning Objectives

By the end of this chapter, you will be able to:

  • Use control structures: if-else, for loops, while loops
  • Apply the apply family of functions for vectorised operations
  • Use purrr::map() for functional programming
  • Profile code performance and identify bottlenecks
  • Write faster R code using vectorisation

13.1 Control Structures

13.1.1 If-Else Statements

Code
# Basic if-else
classify_bmi <- function(bmi) {
  if (bmi < 18.5) {
    "Underweight"
  } else if (bmi < 25.0) {
    "Normal"
  } else if (bmi < 30.0) {
    "Overweight"
  } else {
    "Obese"
  }
}

classify_bmi(22.3)
#> [1] "Normal"
classify_bmi(31.5)
#> [1] "Obese"

# Vectorised version using dplyr::case_when
bmi_values <- c(17.2, 22.3, 27.8, 33.1, 19.5)

dplyr::case_when(
  bmi_values < 18.5 ~ "Underweight",
  bmi_values < 25.0 ~ "Normal",
  bmi_values < 30.0 ~ "Overweight",
  TRUE              ~ "Obese"
)
#> [1] "Underweight" "Normal"      "Overweight"  "Obese"       "Normal"
TipPrefer Vectorised Operations

In R, applying a function to each element of a vector using a loop is almost always slower than using vectorised operations. Use ifelse() or case_when() instead of if inside a loop.

13.1.2 For Loops

Code
# A for loop to read and process multiple files
# (demonstrated conceptually)

results <- list()
for (i in 1:5) {
  results[[i]] <- tibble(
    simulation = i,
    mean_val   = mean(rnorm(100, mean = i * 10)),
    sd_val     = sd(rnorm(100))
  )
}

bind_rows(results)
#> # A tibble: 5 × 3
#>   simulation mean_val sd_val
#>        <int>    <dbl>  <dbl>
#> 1          1     9.97  1.01 
#> 2          2    19.9   1.02 
#> 3          3    30.1   0.911
#> 4          4    39.9   0.846
#> 5          5    50.1   1.05

13.1.3 While Loops

Code
# Newton-Raphson: find root of f(x) = x^2 - 2 (i.e., sqrt(2))
x    <- 1.0
tol  <- 1e-10
iter <- 0

while (abs(x^2 - 2) > tol) {
  x    <- x - (x^2 - 2) / (2 * x)
  iter <- iter + 1
}

cat(sprintf("sqrt(2) ≈ %.10f (converged in %d iterations)\n", x, iter))
#> sqrt(2) ≈ 1.4142135624 (converged in 4 iterations)

13.2 The Apply Family

The apply family avoids explicit loops and is often more readable (though not always faster):

Code
# apply: applies a function over rows (MARGIN=1) or columns (MARGIN=2)
mat <- matrix(rnorm(20), nrow = 4)
apply(mat, MARGIN = 1, mean)   # Row means
#> [1] -0.07874748  0.49836352 -0.05551444  0.23110316
apply(mat, MARGIN = 2, sd)     # Column SDs
#> [1] 0.9746192 1.0676863 1.2159076 1.5039742 0.5234661

# lapply: applies to each element of a list, returns a list
datasets <- list(mtcars = mtcars, iris = iris, airquality = airquality)
lapply(datasets, nrow)
#> $mtcars
#> [1] 32
#> 
#> $iris
#> [1] 150
#> 
#> $airquality
#> [1] 153

# sapply: like lapply but simplifies the result
sapply(datasets, nrow)          # Returns named vector
#>     mtcars       iris airquality 
#>         32        150        153
sapply(datasets, function(d) c(rows = nrow(d), cols = ncol(d)))
#>      mtcars iris airquality
#> rows     32  150        153
#> cols     11    5          6

# vapply: like sapply but type-safe (recommended for production code)
vapply(datasets, nrow, FUN.VALUE = integer(1))
#>     mtcars       iris airquality 
#>         32        150        153

# tapply: apply by group
tapply(iris$Sepal.Length, iris$Species, mean)
#>     setosa versicolor  virginica 
#>      5.006      5.936      6.588

13.3 Functional Programming with purrr

purrr provides a consistent, type-safe interface for functional programming:

Code
# map: always returns a list
map(1:5, ~ .x^2)
#> [[1]]
#> [1] 1
#> 
#> [[2]]
#> [1] 4
#> 
#> [[3]]
#> [1] 9
#> 
#> [[4]]
#> [1] 16
#> 
#> [[5]]
#> [1] 25

# map_dbl: returns a double vector
map_dbl(mtcars, mean)
#>        mpg        cyl       disp         hp       drat         wt       qsec 
#>  20.090625   6.187500 230.721875 146.687500   3.596563   3.217250  17.848750 
#>         vs         am       gear       carb 
#>   0.437500   0.406250   3.687500   2.812500

# map_df: returns a data frame
iris |>
  group_split(Species) |>
  map_dfr(~ tibble(
    species    = unique(.x$Species),
    mean_sepal = mean(.x$Sepal.Length),
    n          = nrow(.x)
  ))
#> # A tibble: 3 × 3
#>   species    mean_sepal     n
#>   <fct>           <dbl> <int>
#> 1 setosa           5.01    50
#> 2 versicolor       5.94    50
#> 3 virginica        6.59    50
Code
# map2: iterate over two vectors simultaneously
x_vals <- 1:5
y_vals <- 6:10
map2_dbl(x_vals, y_vals, ~ .x + .y)
#> [1]  7  9 11 13 15

# pmap: iterate over any number of inputs
params <- list(
  mean = c(0, 5, 10),
  sd   = c(1, 2, 3),
  n    = c(100, 100, 100)
)
pmap(params, rnorm) |> map_dbl(mean)
#> [1]  0.06778104  5.19024883 10.31970541
Code
# Practical: fit a model for each species, extract R-squared
iris |>
  group_split(Species) |>
  map_dfr(function(df) {
    m <- lm(Sepal.Length ~ Sepal.Width + Petal.Length, data = df)
    tibble(
      species   = unique(df$Species),
      r_squared = summary(m)$r.squared,
      n         = nrow(df)
    )
  })
#> # A tibble: 3 × 3
#>   species    r_squared     n
#>   <fct>          <dbl> <int>
#> 1 setosa         0.570    50
#> 2 versicolor     0.584    50
#> 3 virginica      0.761    50

13.4 Vectorisation and Performance

Code
# Slow: loop over elements
sum_loop <- function(x) {
  total <- 0
  for (val in x) total <- total + val
  total
}

# Fast: vectorised
n <- 1e6
x <- rnorm(n)

bench::mark(
  loop   = sum_loop(x),
  base_r = sum(x),
  check  = FALSE
)
#> # A tibble: 2 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 loop        11.21ms   11.4ms      87.6    11.1KB        0
#> 2 base_r       1.43ms   1.44ms     689.         0B        0
Code
# Anti-pattern: growing a vector in a loop
slow_grow <- function(n) {
  x <- c()
  for (i in 1:n) x <- c(x, i^2)
  x
}

# Better: pre-allocate
fast_prealloc <- function(n) {
  x <- numeric(n)
  for (i in 1:n) x[i] <- i^2
  x
}

# Best: fully vectorised
fastest <- function(n) (1:n)^2

bench::mark(
  slow   = slow_grow(1000),
  better = fast_prealloc(1000),
  best   = fastest(1000),
  check  = TRUE
)
#> # A tibble: 3 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 slow         1.07ms   1.12ms      833.    3.88MB    92.9 
#> 2 better      36.08µs  37.39µs    26302.    26.2KB     5.26
#> 3 best         1.92µs   2.52µs   355998.   11.81KB   107.

13.5 Exercises

  1. Write a function using a for loop that takes a data frame and returns the number of missing values in each column. Then write the same function using sapply(). Then using purrr::map_int().

  2. Use purrr::map_dfr() to fit a linear regression of mpg ~ wt separately for each number of cylinders in mtcars. Return a data frame with the intercept, slope, and R-squared for each group.

  3. Benchmark three approaches to computing row means of a 1000×100 matrix: (a) a for loop, (b) apply(), (c) rowMeans(). Which is fastest?

  4. Use purrr::possibly() to handle errors gracefully when applying a function that might fail on some inputs.

  5. Challenge: Read 12 monthly data files (simulate them using write_csv()), process each one, and combine into a single data frame using purrr::map_dfr() and here::here().