multiple times addition of two columns with dplyr

Asked 23/7, 2024 at 11:47 Answered 23/7, 2024 at 14:21

Assume I have a tibble (in my example here only with four columns). In real I have two hourly time series therefore 2 times 24 columns:

s1.x,...,s24.x, s1.y,...,s24.y

Now I want to sum up column s1.x with s1.y, s2.x with s2.y to s24.x with s24.y.

a <- tibble(s1.x=2:7, s2.x=3:8, s1.y=4:9, s2.y=5:10)

a %>% 
  mutate(s1.tot=s1.x+s1.y, s2.tot=s2.x+s2.y)

How can I do this in a short (and elegant way) for all 24 hours?

Intoxicated answered 23/7, 2024 at 11:47 Comment(1)

I think your best option would be to pivot your data to long format. That will make this operation - and, most likely, future steps in your process - far simpler. – Chicory 23/7, 2024 at 12:11

You can use Map function:

n = 2      # for your data, use n = 24
a[paste0("s", 1:n, ".tot")] = Map(`+`, a[paste0("s", 1:n, ".x")], a[paste0("s", 1:n, ".y")])

# A tibble: 6 × 6
   s1.x  s2.x  s1.y  s2.y s1.tot s2.tot
  <int> <int> <int> <int>  <int>  <int>
1     2     3     4     5      6      8
2     3     4     5     6      8     10
3     4     5     6     7     10     12
4     5     6     7     8     12     14
5     6     7     8     9     14     16
6     7     8     9    10     16     18

Megaton answered 23/7, 2024 at 13:21 Comment(1)

Suggest automatically setting the value of n as n <- sum(endsWith(names(a), ".x")) – Lifesaver 23/7, 2024 at 14:21

A base R solution

cbind(
    a,
    lapply(
        split.default(a, sub("\\..*", ".tot", names(a))),
        rowSums
    )
)

or even shorter

cbind(a, t(rowsum(t(a), sub("\\..*", ".tot", names(a)))))

which gives

  s1.x s2.x s1.y s2.y s1.tot s2.tot
1    2    3    4    5      6      8
2    3    4    5    6      8     10
3    4    5    6    7     10     12
4    5    6    7    8     12     14
5    6    7    8    9     14     16
6    7    8    9   10     16     18

Benchmarking

If you are care about the speed

edward <- function() {
    a %>%
        mutate(id = 1:nrow(a)) %>%
        pivot_longer(-id,
            names_to = c("s", ".value"),
            names_pattern = "(s\\d+).(.)"
        ) |>
        mutate(tot = x + y, .by = id) |>
        pivot_wider(names_from = s, values_from = c(x, y, tot), names_glue = "{s}.{.value}") %>%
        select(-id)
}


mael <- function() {
    a |>
        mutate(across(matches(".x$"), .names = "{gsub('x', '', .col)}tot") +
            across(matches(".y$")))
}

tic1 <- function() {
    cbind(
        a,
        lapply(
            split.default(a, sub("\\..*", ".tot", names(a))),
            rowSums
        )
    )
}

tic2 <- function() {
    cbind(a, t(rowsum(t(a), sub("\\..*", ".tot", names(a)))))
}

microbenchmark(
    edward(),
    mael(),
    tic1(),
    tic2(),
    unit = "relative",
    check = "equivalent"
)

shows

Unit: relative
     expr       min        lq      mean    median        uq      max neval
 edward() 43.286275 33.290854 29.937941 32.085324 30.763830 8.997682   100
   mael()  8.465686  7.081354  6.652635  6.930750  6.867252 2.572953   100
   tic1()  1.673039  1.469692  1.527453  1.417756  1.315485 1.425672   100
   tic2()  1.000000  1.000000  1.000000  1.000000  1.000000 1.000000   100

Eudoxia answered 23/7, 2024 at 12:22 Comment(1)

Maybe sub("\\..+", ".tot", names(a)) instead of paste0(sub("\\..*", "", names(a)), ".tot") ? – Phenobarbital 23/7, 2024 at 12:25

I would do it with two across in this case, although pivoting might be a better option for more complex operations.

library(dplyr)
a |> 
  mutate(across(matches(".x$"), .names = "{gsub('x', '', .col)}tot") + 
           across(matches(".y$")))

#   s1.x s2.x s1.y s2.y s1.tot s2.tot
# 1    2    3    4    5      6      8
# 2    3    4    5    6      8     10
# 3    4    5    6    7     10     12
# 4    5    6    7    8     12     14
# 5    6    7    8    9     14     16
# 6    7    8    9   10     16     18

Halsy answered 23/7, 2024 at 12:11 Comment(0)

One way is to pivot. But we need to add an id column to do the sum.

a %>% 
  mutate(id=1:nrow(a)) %>%
  pivot_longer(-id,
               names_to=c("s", ".value"),
               names_pattern="(s\\d+).(.)") |>
  mutate(tot=x+y, .by=id) |>
  pivot_wider(names_from=s, values_from=c(x,y,tot), names_glue="{s}.{.value}")

# A tibble: 6 × 7
     id  s1.x  s2.x  s1.y  s2.y s1.tot s2.tot
  <int> <int> <int> <int> <int>  <int>  <int>
1     1     2     3     4     5      6      8
2     2     3     4     5     6      8     10
3     3     4     5     6     7     10     12
4     4     5     6     7     8     12     14
5     5     6     7     8     9     14     16
6     6     7     8     9    10     16     18

Breastsummer answered 23/7, 2024 at 12:16 Comment(0)

You can use Map function:

n = 2      # for your data, use n = 24
a[paste0("s", 1:n, ".tot")] = Map(`+`, a[paste0("s", 1:n, ".x")], a[paste0("s", 1:n, ".y")])

# A tibble: 6 × 6
   s1.x  s2.x  s1.y  s2.y s1.tot s2.tot
  <int> <int> <int> <int>  <int>  <int>
1     2     3     4     5      6      8
2     3     4     5     6      8     10
3     4     5     6     7     10     12
4     5     6     7     8     12     14
5     6     7     8     9     14     16
6     7     8     9    10     16     18

Megaton answered 23/7, 2024 at 13:21 Comment(1)

Suggest automatically setting the value of n as n <- sum(endsWith(names(a), ".x")) – Lifesaver 23/7, 2024 at 14:21

Here's a general tidyverse solution - 24 hours, 7 days (series), 5 observations example.

# Pkgs and seed -----------------------------------------------------------
library(tidyverse)
set.seed(100)

# Toy data - 24 hours a day, 7 days, 5 observations -----------------------
my_df <- paste0(rep(paste0("s", str_pad(1:24, width = 2, pad = "0")), 7), ".", rep(letters[19:25], each = 24))
my_df <- as_tibble(matrix(sample(0:10, (7*24)* 5, replace = TRUE), nrow = 5, dimnames = list(rep(NA, 5), my_df)))

# code - Creates a list and reduces it ------------------------------------
new_df <- reduce(map(
  str_unique(str_extract(colnames(my_df), ".$")),
  \(i) rename_with(select(my_df, ends_with(i)), \(j) str_replace(j, paste0(i, "$"), "tot"))), 
  `+`)

# Output ------------------------------------------------------------------
select(my_df,contains("01"))
#> # A tibble: 5 × 7
#>   s01.s s01.t s01.u s01.v s01.w s01.x s01.y
#>   <int> <int> <int> <int> <int> <int> <int>
#> 1     9     9    10     0     4     2     2
#> 2     6     1     4     2     3     3     3
#> 3     5     4     4     9    10     5     8
#> 4     2     9     4     4     9     0     3
#> 5     8     3     9     2     8    10     2

select(new_df,contains("01"))
#>   s01.tot
#> 1      36
#> 2      22
#> 3      45
#> 4      31
#> 5      42

^{Created on 2024-07-23 with reprex v2.1.0}

Mcgann answered 23/7, 2024 at 14:21 Comment(0)

Benchmarking

Recommended topics

Hot tags