How to separate the units in column name into another column in R
Asked Answered
A

4

7

The table has units in column. How can I separate the units in column name into another column?

Example:

SUBJID Dose (mg) Tmax (h)
2001 200 5
2002 200 5

This is what i want.

SUBJID Dose Dose_unit Tmax Tmax_unit
2001 200 mg 200 h
2002 200 mg 200 h
Atwood answered 11/1, 2024 at 8:6 Comment(0)
H
10

Here is an approach using pivot_longer/wider and separate_wider_regex:

dat <- data.frame(
  SUBJID = c(2001L, 2002L),
  `Dose (mg)` = c(200L, 200L),
  `Tmax (h)` = c(5L, 5L),
  check.names = FALSE
)

library(tidyr)

dat |>
  pivot_longer(-SUBJID,
    names_to = "name_unit"
  ) |>
  separate_wider_regex(name_unit,
    patterns = c(name = "^.*?", " \\(", unit = ".*", "\\)")
  ) |>
  pivot_wider(
    names_from = name,
    values_from = c(unit, value),
    names_glue = "{name}_{.value}",
    names_vary = "slowest"
  )
#> # A tibble: 2 × 5
#>   SUBJID Dose_unit Dose_value Tmax_unit Tmax_value
#>    <int> <chr>          <int> <chr>          <int>
#> 1   2001 mg               200 h                  5
#> 2   2002 mg               200 h                  5
Halla answered 11/1, 2024 at 8:36 Comment(1)
Great answer! The pipeline looks clear and comprehensive, easy to understand the logic behind, +1!Fieldsman
F
4

Here is a base R option, using lapply + cbind

cbind(
    dat[1],
    do.call(
        cbind,
        lapply(
            seq_along(dat)[-1],
            \(k) {
                d <- dat[k]
                nms <- paste0(gsub("\\s+\\(.*$", "", names(d)), c("_val", "_unit"))
                setNames(cbind(d, gsub(".*\\((.*)\\)", "\\1", names(d))), nms)
            }
        )
    )
)

which gives

  SUBJID Dose_val Dose_unit Tmax_val Tmax_unit
1   2001      200        mg        5         h
2   2002      200        mg        5         h

benchmark

Given list of solution


stefan <- function() {
    dat |>
        pivot_longer(-SUBJID,
            names_to = "name_unit"
        ) |>
        separate_wider_regex(name_unit,
            patterns = c(name = "^.*?", " \\(", unit = ".*", "\\)")
        ) |>
        pivot_wider(
            names_from = name,
            values_from = c(unit, value),
            names_glue = "{name}_{.value}",
            names_vary = "slowest"
        )
}

issac <- function() {
    dat |>
        mutate(
            Dose_value = sub("\\D", "", `Dose (mg)`),
            Dose_unit = "mg",
            Tmax_value = sub("\\D", "", `Tmax (h)`),
            Tmax_unit = "h"
        ) |>
        select(SUBJID, Dose_value, Dose_unit, Tmax_value, Tmax_unit)
}

mael <- function() {
    dat |>
        mutate(across(-SUBJID, \(x) str_extract(cur_column(), "(?<=\\().+?(?=\\))"),
            .names = "{gsub(' .*', '', .col)}_unit"
        )) |>
        rename_with(.fn = \(x) word(x, 1), .col = matches("\\(")) %>%
        relocate(SUBJID, order(colnames(.)))
}

tic <- function() {
    cbind(
        dat[1],
        do.call(
            cbind,
            lapply(
                seq_along(dat)[-1],
                \(k) {
                    d <- dat[k]
                    nms <- paste0(gsub("\\s+\\(.*$", "", names(d)), c("_val", "_unit"))
                    setNames(cbind(d, gsub(".*\\((.*)\\)", "\\1", names(d))), nms)
                }
            )
        )
    )
}

and the benchmarking template

dat <- data.frame(
    SUBJID = c(2001L, 2002L),
    `Dose (mg)` = c(200L, 200L),
    `Tmax (h)` = c(5L, 5L),
    check.names = FALSE
)

microbenchmark(
    stefan = stefan(),
    issac = issac(),
    mael = mael(),
    tic = tic(),
    unit = "relative"
)

we see that

Unit: relative
   expr      min        lq      mean    median        uq      max neval
 stefan 40.50826 31.763595 28.900405 32.131738 32.396251 4.452021   100
  issac  8.80123  7.150539  6.919996  7.108832  7.719263 1.362840   100
   mael 12.34542 10.347826 10.030679 10.533903 11.225731 2.257146   100
    tic  1.00000  1.000000  1.000000  1.000000  1.000000 1.000000   100
Fieldsman answered 11/1, 2024 at 12:53 Comment(1)
Really nice and useful to compare all approaches.Halla
P
3

A more compact solution is to apply regex patterns to retrieve the unit and assign to a new column, within an across call to apply over multiple columns. You can then use rename_with to remove parentheses and units in the column names, and use relocate to reorder columns.

library(stringr)
library(dplyr)

df |> 
  mutate(across(-SUBJID, \(x) str_extract(cur_column(), "(?<=\\().+?(?=\\))"),
         .names = "{gsub(' .*', '', .col)}_unit")) |> 
  rename_with(.fn = \(x) word(x, 1), .col = matches("\\(")) %>%
  relocate(SUBJID, order(colnames(.)))

#   SUBJID Dose Dose_unit Tmax Tmax_unit
# 1   2001  200        mg    5         h
# 2   2002  200        mg    5         h
Polymerism answered 11/1, 2024 at 10:4 Comment(1)
cool! you made it without pivoting the dataframe, which saves a lot overhead from dataframe reshaping, +1!Fieldsman
P
1

This can be another option:

> tibble::tibble(
+   SUBJID = c(2001, 2002),
+   `Dose (mg)` = c(200, 200),
+   `Tmax (h)` = c(5, 5)
+ ) |> 
+   mutate(
+     Dose_value = sub("\\D", "", `Dose (mg)`),
+     Dose_unit = "mg",
+     Tmax_value = sub("\\D", "", `Tmax (h)`),
+     Tmax_unit = "h"
+   ) |> 
+   select(SUBJID, Dose_value, Dose_unit, Tmax_value, Tmax_unit)

# A tibble: 2 × 5
  SUBJID Dose_value Dose_unit Tmax_value Tmax_unit
   <dbl> <chr>      <chr>     <chr>      <chr>    
1   2001 200        mg        5          h        
2   2002 200        mg        5          h  
Pillsbury answered 11/1, 2024 at 8:41 Comment(1)
Imagine 100 columns.Interplead

© 2022 - 2025 — McMap. All rights reserved.