How to separate the units in column name into another column in R

Asked 11/1, 2024 at 8:6 Answered 11/1, 2024 at 12:53

The table has units in column. How can I separate the units in column name into another column?

Example:

SUBJID	Dose (mg)	Tmax (h)
2001	200	5
2002	200	5

This is what i want.

SUBJID	Dose	Dose_unit	Tmax	Tmax_unit
2001	200	mg	200	h
2002	200	mg	200	h

Atwood answered 11/1, 2024 at 8:6 Comment(0)

Here is an approach using pivot_longer/wider and separate_wider_regex:

dat <- data.frame(
  SUBJID = c(2001L, 2002L),
  `Dose (mg)` = c(200L, 200L),
  `Tmax (h)` = c(5L, 5L),
  check.names = FALSE
)

library(tidyr)

dat |>
  pivot_longer(-SUBJID,
    names_to = "name_unit"
  ) |>
  separate_wider_regex(name_unit,
    patterns = c(name = "^.*?", " \\(", unit = ".*", "\\)")
  ) |>
  pivot_wider(
    names_from = name,
    values_from = c(unit, value),
    names_glue = "{name}_{.value}",
    names_vary = "slowest"
  )
#> # A tibble: 2 × 5
#>   SUBJID Dose_unit Dose_value Tmax_unit Tmax_value
#>    <int> <chr>          <int> <chr>          <int>
#> 1   2001 mg               200 h                  5
#> 2   2002 mg               200 h                  5

Halla answered 11/1, 2024 at 8:36 Comment(1)

Great answer! The pipeline looks clear and comprehensive, easy to understand the logic behind, +1! – Fieldsman 11/1, 2024 at 13:32

Here is a base R option, using lapply + cbind

cbind(
    dat[1],
    do.call(
        cbind,
        lapply(
            seq_along(dat)[-1],
            \(k) {
                d <- dat[k]
                nms <- paste0(gsub("\\s+\\(.*$", "", names(d)), c("_val", "_unit"))
                setNames(cbind(d, gsub(".*\\((.*)\\)", "\\1", names(d))), nms)
            }
        )
    )
)

which gives

  SUBJID Dose_val Dose_unit Tmax_val Tmax_unit
1   2001      200        mg        5         h
2   2002      200        mg        5         h

benchmark

Given list of solution


stefan <- function() {
    dat |>
        pivot_longer(-SUBJID,
            names_to = "name_unit"
        ) |>
        separate_wider_regex(name_unit,
            patterns = c(name = "^.*?", " \\(", unit = ".*", "\\)")
        ) |>
        pivot_wider(
            names_from = name,
            values_from = c(unit, value),
            names_glue = "{name}_{.value}",
            names_vary = "slowest"
        )
}

issac <- function() {
    dat |>
        mutate(
            Dose_value = sub("\\D", "", `Dose (mg)`),
            Dose_unit = "mg",
            Tmax_value = sub("\\D", "", `Tmax (h)`),
            Tmax_unit = "h"
        ) |>
        select(SUBJID, Dose_value, Dose_unit, Tmax_value, Tmax_unit)
}

mael <- function() {
    dat |>
        mutate(across(-SUBJID, \(x) str_extract(cur_column(), "(?<=\\().+?(?=\\))"),
            .names = "{gsub(' .*', '', .col)}_unit"
        )) |>
        rename_with(.fn = \(x) word(x, 1), .col = matches("\\(")) %>%
        relocate(SUBJID, order(colnames(.)))
}

tic <- function() {
    cbind(
        dat[1],
        do.call(
            cbind,
            lapply(
                seq_along(dat)[-1],
                \(k) {
                    d <- dat[k]
                    nms <- paste0(gsub("\\s+\\(.*$", "", names(d)), c("_val", "_unit"))
                    setNames(cbind(d, gsub(".*\\((.*)\\)", "\\1", names(d))), nms)
                }
            )
        )
    )
}

and the benchmarking template

dat <- data.frame(
    SUBJID = c(2001L, 2002L),
    `Dose (mg)` = c(200L, 200L),
    `Tmax (h)` = c(5L, 5L),
    check.names = FALSE
)

microbenchmark(
    stefan = stefan(),
    issac = issac(),
    mael = mael(),
    tic = tic(),
    unit = "relative"
)

we see that

Unit: relative
   expr      min        lq      mean    median        uq      max neval
 stefan 40.50826 31.763595 28.900405 32.131738 32.396251 4.452021   100
  issac  8.80123  7.150539  6.919996  7.108832  7.719263 1.362840   100
   mael 12.34542 10.347826 10.030679 10.533903 11.225731 2.257146   100
    tic  1.00000  1.000000  1.000000  1.000000  1.000000 1.000000   100

Fieldsman answered 11/1, 2024 at 12:53 Comment(1)

Really nice and useful to compare all approaches. – Halla 12/1, 2024 at 9:32

A more compact solution is to apply regex patterns to retrieve the unit and assign to a new column, within an across call to apply over multiple columns. You can then use rename_with to remove parentheses and units in the column names, and use relocate to reorder columns.

library(stringr)
library(dplyr)

df |> 
  mutate(across(-SUBJID, \(x) str_extract(cur_column(), "(?<=\\().+?(?=\\))"),
         .names = "{gsub(' .*', '', .col)}_unit")) |> 
  rename_with(.fn = \(x) word(x, 1), .col = matches("\\(")) %>%
  relocate(SUBJID, order(colnames(.)))

#   SUBJID Dose Dose_unit Tmax Tmax_unit
# 1   2001  200        mg    5         h
# 2   2002  200        mg    5         h

Polymerism answered 11/1, 2024 at 10:4 Comment(1)

cool! you made it without pivoting the dataframe, which saves a lot overhead from dataframe reshaping, +1! – Fieldsman 11/1, 2024 at 13:34

This can be another option:

> tibble::tibble(
+   SUBJID = c(2001, 2002),
+   `Dose (mg)` = c(200, 200),
+   `Tmax (h)` = c(5, 5)
+ ) |> 
+   mutate(
+     Dose_value = sub("\\D", "", `Dose (mg)`),
+     Dose_unit = "mg",
+     Tmax_value = sub("\\D", "", `Tmax (h)`),
+     Tmax_unit = "h"
+   ) |> 
+   select(SUBJID, Dose_value, Dose_unit, Tmax_value, Tmax_unit)

# A tibble: 2 × 5
  SUBJID Dose_value Dose_unit Tmax_value Tmax_unit
   <dbl> <chr>      <chr>     <chr>      <chr>    
1   2001 200        mg        5          h        
2   2002 200        mg        5          h

Pillsbury answered 11/1, 2024 at 8:41 Comment(1)

Imagine 100 columns. – Interplead 11/1, 2024 at 8:46

benchmark

Recommended topics

Hot tags