14  Creating Your Own Functions

Note: Learning Objectives

By the end of this chapter, you will be able to:

  • Write functions with flexible argument handling
  • Use default arguments and ... (dots) to pass arguments
  • Understand R’s lexical scoping rules
  • Use stop(), stopifnot() and tryCatch() for input validation and error handling
  • Debug functions using browser() and debug()

14.1 Why Write Functions?

The rule of thumb: if you copy-paste a block of code more than twice, it belongs in a function. Functions:

  1. Reduce repetition — change the logic in one place, not many
  2. Name operations — a good function name is self-documenting
  3. Enable testing — isolated functions can be tested systematically
  4. Enable reuse — in other scripts or packages

14.2 Function Syntax

Code
# General form
# function_name <- function(arg1, arg2 = default) {
#   body
#   return(value)   # or just the last expression
# }

# A simple example
# Convert a numeric vector to z-scores: subtract the mean, then divide by the
# standard deviation. Missing values are ignored when computing both
# statistics and remain NA in the result.
standardise <- function(x) {
  centred <- x - mean(x, na.rm = TRUE)
  centred / sd(x, na.rm = TRUE)
}

standardise(c(10, 20, 30, 40, 50))
#> [1] -1.2649111 -0.6324555  0.0000000  0.6324555  1.2649111

# With default arguments
# Describe a vector in a one-row tibble.
#   x      - the vector to describe
#   digits - decimal places used to round the mean and standard deviation
#   na.rm  - drop missing values before computing mean, sd, min and max?
# Returns the columns n (length), missing (number of NAs), mean, sd, min, max.
describe_variable <- function(x, digits = 2, na.rm = TRUE) {
  n_total   <- length(x)
  n_missing <- sum(is.na(x))
  avg       <- round(mean(x, na.rm = na.rm), digits)
  spread    <- round(sd(x, na.rm = na.rm), digits)
  lowest    <- min(x, na.rm = na.rm)
  highest   <- max(x, na.rm = na.rm)

  tibble(
    n       = n_total,
    missing = n_missing,
    mean    = avg,
    sd      = spread,
    min     = lowest,
    max     = highest
  )
}

describe_variable(airquality$Ozone)
#> # A tibble: 1 × 6
#>       n missing  mean    sd   min   max
#>   <int>   <int> <dbl> <dbl> <int> <int>
#> 1   153      37  42.1  33.0     1   168
describe_variable(airquality$Ozone, digits = 1)
#> # A tibble: 1 × 6
#>       n missing  mean    sd   min   max
#>   <int>   <int> <dbl> <dbl> <int> <int>
#> 1   153      37  42.1    33     1   168

14.3 The Dots (...) Argument

... allows a function to accept arbitrary additional arguments and pass them on:

Code
# Pass ... to an inner function
# Scatter plot with house-style defaults (solid blue points, horizontal axis
# tick labels).
#   x, y          - coordinates, passed straight to plot()
#   ...           - any further plot() arguments (xlab, ylab, main, ...)
#   pch, col, las - styling defaults. They are real parameters rather than
#                   hard-coded values so that a caller can override them;
#                   hard-coding them next to `...` makes e.g. `col = "red"`
#                   fail with "formal argument matched by multiple actual
#                   arguments".
nice_plot <- function(x, y, ..., pch = 19, col = "#3498db", las = 1) {
  plot(x, y,
       pch = pch,
       col = col,
       las = las,
       ...)   # Any additional plot() arguments are passed through
}

nice_plot(mtcars$wt, mtcars$mpg,
          xlab = "Weight", ylab = "MPG",
          main = "Weight vs. Fuel Efficiency")

14.4 Input Validation

Good functions check their inputs:

Code
# Compute a t-based confidence interval for the mean of a numeric vector.
#   x     - numeric vector
#   conf  - confidence level, a single number strictly between 0 and 1
#   na.rm - drop missing values before computing the interval?
# Returns a list with elements mean, lower, upper and conf.
# Stops with an error if `x` is not numeric or `conf` is not a valid level.
compute_ci <- function(x, conf = 0.95, na.rm = TRUE) {

  # Input validation
  if (!is.numeric(x))
    stop("`x` must be a numeric vector", call. = FALSE)

  # Check type and missingness before the range comparison: an NA `conf`
  # would make the condition itself NA, and a character `conf` would be
  # compared as text and slip through
  if (length(conf) != 1 || !is.numeric(conf) || is.na(conf) ||
      conf <= 0 || conf >= 1)
    stop("`conf` must be a single number between 0 and 1", call. = FALSE)

  if (na.rm) x <- x[!is.na(x)]

  n      <- length(x)
  alpha  <- 1 - conf
  centre <- mean(x)
  se     <- sd(x) / sqrt(n)
  t_val  <- qt(1 - alpha / 2, df = n - 1)   # two-sided critical value

  list(
    mean  = centre,
    lower = centre - t_val * se,
    upper = centre + t_val * se,
    conf  = conf
  )
}

compute_ci(airquality$Temp)
#> $mean
#> [1] 77.88235
#> 
#> $lower
#> [1] 76.37051
#> 
#> $upper
#> [1] 79.3942
#> 
#> $conf
#> [1] 0.95
# compute_ci("not numeric")   # Error: `x` must be a numeric vector

14.5 Scope and Environments

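R uses lexical scoping: when a function uses a variable that is neither one of its arguments nor created inside its body, R looks for that variable in the environment where the function was defined, not in the environment it was called from.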

Code
x <- 10   # Global variable

# Adds `y` to the global `x`. `x` is neither an argument nor a local variable
# of this function, so it is looked up outside the function body.
add_to_x <- function(y) {
  x + y   # Uses x from the global environment
}

add_to_x(5)   # 15
#> [1] 15

# Local variables don't leak out
# my_func() creates `local_var` inside its own body and returns its value;
# the name itself exists only while the function is running.
my_func <- function() {
  local_var <- 42
  local_var
}

my_func()
#> [1] 42
# local_var   # Error: object 'local_var' not found

14.6 Error Handling

Code
# tryCatch: handle errors, warnings, and messages gracefully
# Logarithm that never stops the script: if log() raises a warning or an
# error, the condition text is reported with message() and NA is returned
# in place of the result.
safe_log <- function(x) {
  # Build a handler that prints `label` plus the condition text, then
  # yields NA
  report_as_na <- function(label) {
    function(cond) {
      message(label, conditionMessage(cond))
      NA_real_
    }
  }

  tryCatch(
    log(x),
    warning = report_as_na("Warning: "),
    error   = report_as_na("Error: ")
  )
}

safe_log(10)        # Works
#> [1] 2.302585
safe_log(-1)        # Returns NA; the warning is reported via message()
#> [1] NA
safe_log("hello")   # Returns NA; the error is reported via message()
#> [1] NA

# purrr::safely() wraps any function so that a call returns
# list(result, error) instead of stopping: on success `error` is NULL,
# on failure `result` is NULL and `error` holds the captured error
safe_sqrt <- purrr::safely(sqrt)
safe_sqrt(4)
#> $result
#> [1] 2
#> 
#> $error
#> NULL
safe_sqrt("abc")
#> $result
#> NULL
#> 
#> $error
#> <simpleError in .f(...): non-numeric argument to mathematical function>

14.7 Debugging

Code
# browser(): pauses execution inside a function
# buggy_function() is deliberately broken for demonstration: it doubles `x`,
# pauses at browser(), and then fails on the last line because it refers to
# `some_typo`, which this function never defines.
buggy_function <- function(x) {
  result <- x * 2
  browser()           # Pauses here — you can inspect the environment
  result + some_typo  # This will error
}

# debug(): flag a function so that every call to it enters browser mode
# (call undebug(buggy_function) to switch this off again)
debug(buggy_function)
buggy_function(5)   # Enters browser mode

# traceback(): after an error, shows the call stack
# Outer function: does nothing except hand its argument to g()
f <- function(x) {
  g(x)
}

# Inner function: the log() call is where a non-numeric input fails
g <- function(x) {
  log(x)
}
# f("a")       # Error
# traceback()  # Shows which functions were in the call stack

14.8 A Case Study: A Reusable EDA Function

Code
#' Compute a tidy summary of a numeric variable by group
#'
#' Checks that `df` is a data frame and that both `var` and `group` name
#' existing columns, then computes the count, number of missing values, mean,
#' median, standard deviation and quartiles of `var` within each group.
#' Missing values are dropped before every statistic except `n` and
#' `missing`.
#'
#' @param df A data frame
#' @param var The numeric variable to summarise (unquoted column name)
#' @param group The grouping variable (unquoted column name)
#' @param digits Number of decimal places used to round the statistics
#' @return A tibble with one row per group and the columns `n`, `missing`,
#'   `mean`, `median`, `sd`, `q25` and `q75`
group_summary <- function(df, var, group, digits = 2) {

  # Validate
  if (!is.data.frame(df)) stop("`df` must be a data frame", call. = FALSE)

  # Turn the unquoted column arguments into strings so they can be checked
  # against the column names before dplyr is involved
  var_name   <- deparse(substitute(var))
  group_name <- deparse(substitute(group))

  # Check both columns, so that a mistyped grouping column gets the same
  # clear error as a mistyped summary column
  for (col_name in c(var_name, group_name)) {
    if (!col_name %in% names(df))
      stop(paste0("Column '", col_name, "' not found in data frame"),
           call. = FALSE)
  }

  df |>
    group_by({{ group }}) |>
    summarise(
      n        = n(),
      missing  = sum(is.na({{ var }})),
      mean     = round(mean({{ var }}, na.rm = TRUE), digits),
      median   = round(median({{ var }}, na.rm = TRUE), digits),
      sd       = round(sd({{ var }}, na.rm = TRUE), digits),
      # names = FALSE drops the "25%" / "75%" labels quantile() attaches
      q25      = round(quantile({{ var }}, 0.25, na.rm = TRUE, names = FALSE),
                       digits),
      q75      = round(quantile({{ var }}, 0.75, na.rm = TRUE, names = FALSE),
                       digits),
      .groups  = "drop"
    )
}

# Usage
group_summary(gapminder::gapminder |> filter(year == 2007),
              var   = lifeExp,
              group = continent)
#> # A tibble: 5 × 8
#>   continent     n missing  mean median    sd   q25   q75
#>   <fct>     <int>   <int> <dbl>  <dbl> <dbl> <dbl> <dbl>
#> 1 Africa       52       0  54.8   52.9  9.63  47.8  59.4
#> 2 Americas     25       0  73.6   72.9  4.44  71.8  76.4
#> 3 Asia         33       0  70.7   72.4  7.96  65.5  75.6
#> 4 Europe       30       0  77.6   78.6  2.98  75.0  79.8
#> 5 Oceania       2       0  80.7   80.7  0.73  80.5  81.0

14.9 Exercises

  1. Write a function winsorise(x, lower = 0.05, upper = 0.95) that replaces extreme values with the 5th and 95th percentiles. Test it on a vector with obvious outliers.

  2. Write a function multiple_regression_summary(df, outcome, predictors) that fits a linear model and returns a tidy coefficient table with confidence intervals.

  3. Add input validation to the function in Exercise 2: check that the outcome and all predictors exist as columns, that the outcome is numeric, and that there are enough observations.

  4. Demonstrate R’s lexical scoping: write two functions with the same local variable name. Show they don’t interfere with each other.

  5. Challenge: Write a function batch_report(data_dir, output_dir) that reads all .csv files in a directory, applies your group_summary() function, and writes one summary CSV per input file to the output directory. Use purrr::walk().