pivot_wider issue "Values in `values_from` are not uniquely identified; output will contain list-cols"

My data looks like this:

# A tibble: 6 x 4
  name          val time          x1
  <chr>       <dbl> <date>     <dbl>
1 C Farolillo     7 2016-04-20  51.5
2 C Farolillo     3 2016-04-21  56.3
3 C Farolillo     7 2016-04-22  56.3
4 C Farolillo    13 2016-04-23  57.9
5 C Farolillo     7 2016-04-24  58.7
6 C Farolillo     9 2016-04-25  59.0

I am trying to use the pivot_wider function to expand out the data based on the name column. I use the following code:

yy <- d %>% 
  pivot_wider(., names_from = name, values_from = val)

This gives me the following warning message:

Warning message:
Values in `val` are not uniquely identified; output will contain list-cols.
* Use `values_fn = list(val = list)` to suppress this warning.
* Use `values_fn = list(val = length)` to identify where the duplicates arise
* Use `values_fn = list(val = summary_fun)` to summarise duplicates

The output looks like:

    time           x1      out1    out2
1   2016-04-20   51.50000   <dbl>   <dbl>
2   2016-04-21   56.34615   <dbl>   <dbl>
3   2016-04-22   56.30000   <dbl>   <dbl>
4   2016-04-23   57.85714   <dbl>   <dbl>
5   2016-04-24   58.70968   <dbl>   <dbl>
6   2016-04-25   58.96774   <dbl>   <dbl>

I know that the issue is mentioned here, and the suggested resolution is to use summary statistics. However, I have time series data and thus do not want to use summary statistics, since each day has a single value (not multiple values).
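For reference, that summary-statistics route would look roughly like the sketch below (mean is an arbitrary choice of summary function); collapsing values like this is what I want to avoid:

library(dplyr)
library(tidyr)

# sketch of the values_fn route suggested by the warning (not what I want)
d %>%
  pivot_wider(names_from = name, values_from = val,
              values_fn = list(val = mean))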

I know the problem is that the val column has duplicates (e.g. in the above example, 7 occurs 3 times).

Any suggestions on how to use pivot_wider and overcome this issue?

Data:

    d <- structure(list(name = c("C Farolillo", "C Farolillo", "C Farolillo", 
"C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", 
"C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", 
"C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", 
"C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", 
"C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", 
"C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", 
"C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", 
"C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", 
"C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", 
"C Farolillo", "C Farolillo", "C Farolillo", "Plaza Eliptica", 
"Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", 
"Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", 
"Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", 
"Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", 
"Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", 
"Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", 
"Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", 
"Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", 
"Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", 
"Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", 
"Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", 
"Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", 
"Plaza Eliptica", "Plaza Eliptica"), val = c(7, 3, 7, 13, 7, 
9, 20, 19, 4, 5, 5, 2, 6, 6, 16, 13, 7, 6, 3, 3, 6, 10, 5, 3, 
5, 3, 4, 4, 10, 11, 4, 13, 8, 2, 8, 10, 3, 10, 14, 4, 2, 4, 6, 
6, 8, 8, 3, 3, 13, 10, 13, 32, 25, 31, 34, 26, 33, 35, 43, 22, 
22, 21, 10, 33, 33, 48, 47, 27, 23, 11, 13, 25, 31, 20, 16, 10, 
9, 23, 11, 23, 26, 16, 34, 17, 4, 24, 21, 10, 26, 32, 10, 5, 
9, 19, 14, 27, 27, 10, 8, 28, 32, 25), time = structure(c(16911, 
16912, 16913, 16914, 16915, 16916, 16917, 16918, 16919, 16920, 
16921, 16922, 16923, 16923, 16924, 16925, 16926, 16927, 16928, 
16929, 16930, 16931, 16932, 16933, 16934, 16935, 16936, 16937, 
16938, 16939, 16940, 16941, 16942, 16943, 16944, 16945, 16946, 
16947, 16948, 16949, 16950, 16951, 16952, 16953, 16954, 16955, 
16956, 16957, 16958, 16959, 16960, 16911, 16912, 16913, 16914, 
16915, 16916, 16917, 16918, 16919, 16920, 16921, 16922, 16923, 
16923, 16924, 16925, 16926, 16927, 16928, 16929, 16930, 16931, 
16932, 16933, 16934, 16935, 16936, 16937, 16938, 16939, 16940, 
16941, 16942, 16943, 16944, 16945, 16946, 16947, 16948, 16949, 
16950, 16951, 16952, 16953, 16954, 16955, 16956, 16957, 16958, 
16959, 16960), class = "Date"), x1 = c(51.5, 56.3461538461538, 
56.3, 57.8571428571429, 58.7096774193548, 58.9677419354839, 64.4615384615385, 
61.9310344827586, 60.3214285714286, 59.4137931034483, 59.5806451612903, 
57.3448275862069, 64.0333333333333, 64.0333333333333, 70.15625, 
71.3636363636364, 62.8125, 56.4375, 56.4516129032258, 51.741935483871, 
52.84375, 53.09375, 52.969696969697, 54, 54.3870967741936, 60.3870967741936, 
64.4516129032258, 66.2903225806452, 68.2333333333333, 69.7741935483871, 
70.5806451612903, 73.8275862068966, 72.8181818181818, 64.6764705882353, 
64.4838709677419, 68.7741935483871, 62.1764705882353, 68.969696969697, 
70.1935483870968, 59.6774193548387, 59.9677419354839, 63.125, 
67.5882352941177, 71.4705882352941, 73.8529411764706, 76.1935483870968, 
72.6451612903226, 76.0645161290323, 76.4193548387097, 81.7741935483871, 
85.0645161290323, 51.5, 56.3461538461538, 56.3, 57.8571428571429, 
58.7096774193548, 58.9677419354839, 64.4615384615385, 61.9310344827586, 
60.3214285714286, 59.4137931034483, 59.5806451612903, 57.3448275862069, 
64.0333333333333, 64.0333333333333, 70.15625, 71.3636363636364, 
62.8125, 56.4375, 56.4516129032258, 51.741935483871, 52.84375, 
53.09375, 52.969696969697, 54, 54.3870967741936, 60.3870967741936, 
64.4516129032258, 66.2903225806452, 68.2333333333333, 69.7741935483871, 
70.5806451612903, 73.8275862068966, 72.8181818181818, 64.6764705882353, 
64.4838709677419, 68.7741935483871, 62.1764705882353, 68.969696969697, 
70.1935483870968, 59.6774193548387, 59.9677419354839, 63.125, 
67.5882352941177, 71.4705882352941, 73.8529411764706, 76.1935483870968, 
72.6451612903226, 76.0645161290323, 76.4193548387097, 81.7741935483871, 
85.0645161290323)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-102L))
Judo answered 13/11, 2019 at 13:3 Comment(0)

Create a unique row identifier for each name and then use pivot_wider:

library(dplyr)

d %>%
  group_by(name) %>%
  mutate(row = row_number()) %>%
  tidyr::pivot_wider(names_from = name, values_from = val) %>%
  select(-row)

# A tibble: 51 x 4
#   time          x1 `C Farolillo` `Plaza Eliptica`
#   <date>     <dbl>         <dbl>            <dbl>
# 1 2016-04-20  51.5             7               32
# 2 2016-04-21  56.3             3               25
# 3 2016-04-22  56.3             7               31
# 4 2016-04-23  57.9            13               34
# 5 2016-04-24  58.7             7               26
# 6 2016-04-25  59.0             9               33
# 7 2016-04-26  64.5            20               35
# 8 2016-04-27  61.9            19               43
# 9 2016-04-28  60.3             4               22
#10 2016-04-29  59.4             5               22
# … with 41 more rows
Panties answered 13/11, 2019 at 13:7 Comment(2)
this gives me an error: Error in select(): ! Can't subset columns that don't exist. ✖ Column `row` doesn't exist. – Bournemouth
This solution contains a duplicated row. Is this intended? – Castanon

Typically the warning

Warning message:
Values in `val` are not uniquely identified; output will contain list-cols.

is most often caused by duplicate rows in the data (after excluding the val column), and not duplicates in the val column.

which(duplicated(d))
# [1] 14 65

OP's data seems to have two duplicated rows, which is what causes this issue. Removing the duplicate rows also gets rid of the warning.

yy <- d %>% distinct() %>% pivot_wider(., names_from = name, values_from = val)
yy
# A tibble: 50 x 4
   time          x1 `C Farolillo` `Plaza Eliptica`
   <date>     <dbl>         <dbl>            <dbl>
 1 2016-04-20  51.5             7               32
 2 2016-04-21  56.3             3               25
 3 2016-04-22  56.3             7               31
 4 2016-04-23  57.9            13               34
 5 2016-04-24  58.7             7               26
 6 2016-04-25  59.0             9               33
 7 2016-04-26  64.5            20               35
 8 2016-04-27  61.9            19               43
 9 2016-04-28  60.3             4               22
10 2016-04-29  59.4             5               22
# ... with 40 more rows
Lehr answered 15/2, 2020 at 5:55 Comment(5)
I wouldn't call the other solution a quick/dirty fix, since there are many valid cases where it is the right way to do it if multiple values per timepoint are allowed; but since the OP said that each timepoint should only have one value, your solution solves the problem of duplicate entries. – Otherdirected
Agreed, I can see how it could be useful if there are rows which differ only in the value column. – Lehr
Removing the duplicate rows in the dataset will cause me to lose time series information. The data contains two different time series, C Farolillo and Plaza Eliptica, which just so happen to have the same value on the same day. This isn't a true duplicate, just a coincidence. – Judo
Trying d[c(13,14),] gives the following two rows: [1] 13 C Farolillo 6 2016-05-02 64.03333 [2] 14 C Farolillo 6 2016-05-02 64.03333. These are two identical observations on the same day for C Farolillo, so they looked like duplicates to me. Do d[c(64,65),] for another pair. – Lehr
I find this to be the right answer. One can aggregate the duplicated rows before pivoting. For example, we can do group_by(name, time, x1) %>% summarise(val = sum(val)), or with mean in place of sum. At least, this is the use case I usually encounter in practice. – Actor

The problem is caused by the fact that the data you want to spread / pivot wider has duplicate identifiers. Both suggestions above, i.e. creating an artificial unique id from row numbers with mutate(row = row_number()), or filtering to distinct rows, will allow you to pivot wider, but they change the structure of your table, which likely has a logical, organizational problem that will come back the next time you try to join anything to it.

It is much better practice to use the id_cols parameter explicitly, to state which columns you actually want to be unique after pivoting wider, and if you run into problems, to re-organize the original table first. Of course, you may find a reason for filtering to distinct rows or adding a new ID, but most likely you will want to avoid the duplication earlier in your code.
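For illustration, a minimal sketch with the OP's d; here id_cols happens to match the default (every column not used in names_from / values_from), but writing it out states the intended unique key explicitly:

library(dplyr)
library(tidyr)

# name the columns that should uniquely identify a row after pivoting;
# with the duplicated rows still in `d` this still warns about list-cols,
# which is the point: the warning flags the organizational problem
d %>%
  pivot_wider(id_cols = c(time, x1),
              names_from = name,
              values_from = val)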

Maurreen answered 18/2, 2020 at 9:36 Comment(5)
I'm having similar problems to the ones above, but none of these solutions seems applicable to me. I have duplicate values most likely because my data involves different ratings at different time points. I tried using id_cols but this doesn't work either. – Wellestablished
In this case obviously your observations must be unique, inter alia, in time. So the id_cols must take account of all possible time observations. One way to achieve this is to unite <your_id>_<time> into a single ID, or to explicitly create a unique row id (see the sketch after these comments). – Maurreen
I have tried this but am not sure how to do it in the long form first, before using pivot_wider. For some reason, the same id number was assigned to two observations a couple of times. – Wellestablished
So I don't want to get rid of the duplicates; instead, I'd like to change the duplicated id number. – Wellestablished
@ConDes did you try something like: df_wide %>% group_by(old_ID, time_point) %>% mutate(new_ID = paste0(old_ID, "_", 1:n())) ? – Tillietillinger
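A sketch of the "<your_id>_<time>" idea from the comments above; the data frame and all column names (ratings, rater_id, time_point, item, rating) are hypothetical and only illustrate the pattern:

library(dplyr)
library(tidyr)

# toy data: two raters rate two items at two time points
ratings <- tibble(
  rater_id   = c(1, 1, 1, 1, 2, 2, 2, 2),
  time_point = rep(c("t1", "t2"), each = 2, times = 2),
  item       = rep(c("itemA", "itemB"), times = 4),
  rating     = c(3, 4, 5, 2, 4, 4, 1, 5)
)

# combine rater and time point into one ID so each row of the wide table is unique
ratings %>%
  unite("obs_id", rater_id, time_point, remove = FALSE) %>%
  pivot_wider(id_cols = obs_id, names_from = item, values_from = rating)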

Though not visible in the OP's example, the accepted answer will duplicate rows in some cases when it is not necessary. The following approach avoids that in some cases:

library(tidyr)

d |>
  pivot_wider(names_from = name, values_from = val,
              values_fn = list) |>
  unnest(cols = everything())

If list-columns are present, keeping values_fn = list avoids the warnings and errors.

Valina answered 28/9, 2021 at 13:17 Comment(0)

Another option, in base R.

Approach

# split by name, drop duplicate rows, and remove the name column
l = lapply(split(d, d$name), \(x) {x = unique(x); x$name = NULL; x})
# rename the "val" column of each piece to that piece's name
l = lapply(names(l), \(x) {
  y = colnames(l[[x]])
  y[which(y == "val")] = names(l[x])
  setNames(l[[x]], y)})
# full-join the pieces on the shared time and x1 columns
l = Reduce(\(...) merge(..., all = TRUE), l)

The column-renaming part could certainly be written more concisely; one possibility is sketched below.
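A sketch of a shorter version, assumed to give the same result as above:

# drop the name column and duplicate rows, then rename "val" to the
# list element's name in a single Map() call before merging
l = lapply(split(d, d$name), \(x) unique(x[setdiff(names(x), "name")]))
l = Map(\(df, nm) setNames(df, sub("^val$", nm, names(df))), l, names(l))
l = Reduce(\(...) merge(..., all = TRUE), l)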

Comparison

> identical(x = {
+   # Ronak Shah 
+   library(dplyr)
+   x = 
+     d %>%
+     group_by(name) %>%
+     mutate(row = row_number()) %>%
+     tidyr::pivot_wider(names_from = name, values_from = val) %>%
+     select(-row) 
+   as.data.frame(x[!duplicated(x), ])
+   }, 
+   # Friede 
+   y = {
+     l = lapply(split(d, d$name), \(x) {x = unique(x); x$name = NULL; x}) 
+     l = lapply(names(l), \(x) {
+       y = colnames(l[[x]])
+       y[which(y == "val")] = names(l[x])
+       setNames(l[[x]], y)})
+     Reduce(\(...) merge(..., all = TRUE), l)
+   })
[1] TRUE

@Ronak Shah's approach returns one extra row, which is a duplicate.

Castanon answered 15/2, 2024 at 11:42 Comment(0)

This is kind of late in the game, but here is an option that keeps the non-unique observations while still pivoting:

table(d$name) # get the unique names_from and frequencies
# 
#    C Farolillo Plaza Eliptica 
#             51             51  

(d2 <- d %>%
   mutate(rno = rep(1:51, 2)) %>%  # repeat 1:51 two times: a unique id within each names_from group
   pivot_wider(names_from = name, values_from = val))
    # # A tibble: 51 × 5
    #    time          x1   rno `C Farolillo` `Plaza Eliptica`
    #    <date>     <dbl> <int>         <dbl>            <dbl>
    #  1 2016-04-20  51.5     1             7               32
    #  2 2016-04-21  56.3     2             3               25
    #  3 2016-04-22  56.3     3             7               31
    #  4 2016-04-23  57.9     4            13               34
    #  5 2016-04-24  58.7     5             7               26
    #  6 2016-04-25  59.0     6             9               33
    #  7 2016-04-26  64.5     7            20               35
    #  8 2016-04-27  61.9     8            19               43
    #  9 2016-04-28  60.3     9             4               22
    # 10 2016-04-29  59.4    10             5               22
    # # … with 41 more rows 
Deist answered 22/9, 2021 at 13:14 Comment(0)
