How to Join Two Unequal-Length Columns by Matching Identical Strings

Asked 20/8 at 18:5 Answered 21/8 at 5:53

I have two tibbles with unique values, with unequal length like:

df1 <- structure(list(col1 = c("A", "T", "C", "D", "X", "F")), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -6L))
df2 <- structure(list(col2 = c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -10L))

> df1
# A tibble: 6 × 1
  col1 
  <chr>
1 A    
2 T    
3 C    
4 D    
5 X    
6 F

> df2
# A tibble: 9 × 1
  col2 
  <chr>
1 A    
2 B    
3 C    
4 D    
5 F    
6 G    
7 H    
8 I    
9 J

I would like to get:

> df3
# A tibble: 11 × 2
   col1  col2 
   <chr> <chr>
 1 A     A    
 2 NA    B    
 3 T     NA   
 4 C     C    
 5 D     D    
 6 X     NA   
 7 F     F    
 8 NA    G    
 9 NA    H    
10 NA    I    
11 NA    J

Each identical string in col1 of df1 and col2 in df2 should be in the same row side by side. For example, if a string in col1 and col2 is the same, they should be in the same row (e.g., string A). If a string exists in col1 but not in col2, it should be NA in col2, and vice versa.

I would be grateful for any further advice.

Thermo answered 20/8 at 18:5 Comment(1)

Does the row order matter? – Peach 20/8 at 18:17

library(dplyr)

full_join(df1, df2, by = join_by(col1 == col2), keep = TRUE)
#> # A tibble: 12 × 2
#>    col1  col2 
#>    <chr> <chr>
#>  1 A     A    
#>  2 T     <NA> 
#>  3 C     C    
#>  4 D     D    
#>  5 X     <NA> 
#>  6 F     F    
#>  7 <NA>  B    
#>  8 <NA>  E    
#>  9 <NA>  G    
#> 10 <NA>  H    
#> 11 <NA>  I    
#> 12 <NA>  J

Concent answered 20/8 at 18:14 Comment(1)

Perfect. Many thanks. I just forgot the keep argument! – Thermo 20/8 at 18:48

A base R solution

lst <- c(df1, df2)
v <- unique(unlist(lst))
list2DF(lapply(lst, \(x) x[match(v, x)]))

gives

   col1 col2
1     A    A
2     T <NA>
3     C    C
4     D    D
5     X <NA>
6     F    F
7  <NA>    B
8  <NA>    E
9  <NA>    G
10 <NA>    H
11 <NA>    I
12 <NA>    J

Another base R solution is using stack + reshape

reshape(
  transform(
    stack(c(df1, df2)),
    r = as.integer(factor(values))
  ),
  direction = "wide",
  idvar = "r",
  timevar = "ind"
)[-1]

which gives

   values.col1 values.col2
1            A           A
2            T        <NA>
3            C           C
4            D           D
5            X        <NA>
6            F           F
8         <NA>           B
11        <NA>           E
13        <NA>           G
14        <NA>           H
15        <NA>           I
16        <NA>           J

The third base R option is using table

d <- table(stack(c(df1, df2)))
as.data.frame(ifelse(d > 0, row.names(d), NA))

which gives

  col1 col2
A    A    A
B <NA>    B
C    C    C
D    D    D
E <NA>    E
F    F    F
G <NA>    G
H <NA>    H
I <NA>    I
J <NA>    J
T    T <NA>
X    X <NA>

Benchmark

tic1 <- \() {
  lst <- c(df1, df2)
  v <- unique(unlist(lst))
  list2DF(lapply(lst, \(x) x[match(v, x)]))
}

tic2 <- \() {
  reshape(
    transform(
      stack(c(df1, df2)),
      r = as.integer(factor(values))
    ),
    direction = "wide",
    idvar = "r",
    timevar = "ind"
  )[-1]
}

tic3 <- \() {
  d <- table(stack(c(df1, df2)))
  as.data.frame(ifelse(d > 0, row.names(d), NA))
}

microbenchmark(
  tic1(),
  tic2(),
  tic3(),
  unit = "relative"
)

shows

Unit: relative
   expr      min        lq      mean   median        uq      max neval
 tic1()  1.00000  1.000000  1.000000  1.00000  1.000000 1.000000   100
 tic2() 40.30049 30.210059 16.153643 30.60000 24.474138 1.423185   100
 tic3() 13.77833  9.902367  6.136188 10.15569  9.180251 1.553357   100

Invariable answered 20/8 at 20:13 Comment(0)

A base solution using merge, by adding a new column to use for merging and removing it afterwards.

merge(cbind(x=df1[[1]], df1), cbind(x=df2[[1]], df2), all=TRUE)[-1]
#   col1 col2
#1     A    A
#2  <NA>    B
#3     C    C
#4     D    D
#5  <NA>    E
#6     F    F
#7  <NA>    G
#8  <NA>    H
#9  <NA>    I
#10 <NA>    J
#11    T <NA>
#12    X <NA>

Circumcision answered 21/8 at 5:53 Comment(0)

Benchmark

Recommended topics

Hot tags