A base option using grepl
and merge
(Variant1).
i <- lapply(strsplit(df2$Items, ", "), function(s) {
Reduce(`&`, lapply(s, grepl, df1$Items, fixed=TRUE)) })
s <- do.call(rbind, Map(function(i, j) cbind(ID = df1$ID[i], Coupons = j),
i, df2$Coupons))
merge(df1, s, all.x = TRUE)
# ID Items Coupons
#1 1 item-6, item-5, item-3 <NA>
#2 2 item-6, item-4, item-9, item-7 coupon-1
#3 2 item-6, item-4, item-9, item-7 coupon-2
#4 3 item-6, item-4, item-5 coupon-1
#5 4 item-1, item-7, item-2, item-9 coupon-2
#6 5 item-5, item-8, item-7, item-2 <NA>
#7 6 item-10, item-1, item-6, item-4 coupon-1
#8 7 item-6, item-7, item-9, item-4, item-5 coupon-1
#9 7 item-6, item-7, item-9, item-4, item-5 coupon-2
#10 8 item-6, item-9, item-1, item-3, item-5 <NA>
#11 9 item-6, item-8, item-7, item-3, item-9 coupon-2
#12 10 item-4, item-7, item-5, item-9 coupon-2
Instead of using merge
subsetting the matches (Variant 2).
i <- lapply(strsplit(df2$Items, ", "), function(s) {
Reduce(`&`, lapply(s, grepl, df1$Items, fixed=TRUE)) })
m <- rbind(cbind(df1[!Reduce(`|`, i),], Coupons = NA), do.call(rbind,
Map(function(i, j) cbind(df1[i,], Coupons = j), i, df2$Coupons)) )
m[order(m$ID),]
# ID Items Coupons
#1 1 item-6, item-5, item-3 <NA>
#4 2 item-6, item-4, item-9, item-7 coupon-1
#8 2 item-6, item-4, item-9, item-7 coupon-2
#5 3 item-6, item-4, item-5 coupon-1
#9 4 item-1, item-7, item-2, item-9 coupon-2
#2 5 item-5, item-8, item-7, item-2 <NA>
#6 6 item-10, item-1, item-6, item-4 coupon-1
#7 7 item-6, item-7, item-9, item-4, item-5 coupon-1
#10 7 item-6, item-7, item-9, item-4, item-5 coupon-2
#3 8 item-6, item-9, item-1, item-3, item-5 <NA>
#11 9 item-6, item-8, item-7, item-3, item-9 coupon-2
#12 10 item-4, item-7, item-5, item-9 coupon-2
Another variant (3)
i <- lapply(strsplit(df2$Items, ", "), function(s) {
Reduce(`&`, lapply(s, grepl, df1$Items, fixed=TRUE)) })
i <- c(i, list(!Reduce(`|`, i)))
cbind(df1[unlist(lapply(i, which)),], Coupons = rep(c(df2$Coupons, NA),
sapply(i, sum)))
Another variant, testing for the string only in cases where the first string had a hit (Variant4).
i <- lapply(strsplit(df2$Items, ", ", TRUE), function(s) {
Reduce(function(a, b) a[grep(b, df1$Items[a], fixed=TRUE)],
s[-1], grep(s[[1]], df1$Items, fixed=TRUE)) })
j <- unique(unlist(i))
i <- if(length(j>0)) c(list(seq_len(nrow(df1))[-j]), i) else c(list(seq_len(nrow(df1))), i)
cbind(df1[unlist(i),], Coupons = rep(c(NA, df2$Coupons), lengths(i)))
Benchmark
bench::mark(check=FALSE,
varaint1 = {i <- lapply(strsplit(df2$Items, ", "), function(s) {
Reduce(`&`, lapply(s, grepl, df1$Items, fixed=TRUE)) })
s <- do.call(rbind, Map(function(i, j) cbind(ID = df1$ID[i], Coupons = j),
i, df2$Coupons))
merge(df1, s, all.x = TRUE)},
variant2 = {i <- lapply(strsplit(df2$Items, ", "), function(s) {
Reduce(`&`, lapply(s, grepl, df1$Items, fixed=TRUE)) })
rbind(cbind(df1[!Reduce(`|`, i),], Coupons = NA), do.call(rbind,
Map(function(i, j) cbind(df1[i,], Coupons = j), i, df2$Coupons)) )},
variant3 = {i <- lapply(strsplit(df2$Items, ", "), function(s) {
Reduce(`&`, lapply(s, grepl, df1$Items, fixed=TRUE)) })
i <- c(i, list(!Reduce(`|`, i)))
cbind(df1[unlist(lapply(i, which)),], Coupons = rep(c(df2$Coupons, NA), sapply(i, sum))) },
variant4 = {i <- lapply(strsplit(df2$Items, ", ", TRUE), function(s) {
Reduce(function(a, b) a[grep(b, df1$Items[a], fixed=TRUE)], s[-1], grep(s[[1]], df1$Items, fixed=TRUE)) })
j <- unique(unlist(i))
i <- if(length(j>0)) c(list(seq_len(nrow(df1))[-j]), i) else c(list(seq_len(nrow(df1))), i)
cbind(df1[unlist(i),], Coupons = rep(c(NA, df2$Coupons), lengths(i))) }
)
Result
expression min median itr/s…¹ mem_a…² gc/se…³ n_itr n_gc total…⁴ result
<bch:expr> <bch:tm> <bch:t> <dbl> <bch:b> <dbl> <int> <dbl> <bch:t> <list>
1 varaint1 435µs 465µs 2131. 17.55KB 21.0 1016 10 477ms <NULL>
2 variant2 703µs 758µs 1322. 16.09KB 21.2 625 10 473ms <NULL>
3 variant3 223µs 241µs 4015. 9.87KB 23.3 1895 11 472ms <NULL>
4 variant4 208µs 224µs 4323. 24.57KB 20.9 2066 10 478ms <NULL>
In this case variant4 is the faster and variant3 uses lowest amount of memory.
Comparing with other methods.
set.seed(99)
df1 <- tibble::tibble(
ID = 1:10,
Items = replicate(10, paste0('item-', sample(1:10, sample(3:5)[1]), collapse = ', '))
)
df2 <- tibble::tibble(
Items = c("item-4, item-6", "item-7, item-9"),
Coupons = c("coupon-1", "coupon-2")
)
library(dplyr)
library(fuzzyjoin)
library(stringr)
library(data.table)
bench::mark(check=FALSE,
Darren1 = {fuzzy_left_join(df1, rename(df2, key = Items), by = c("Items" = "key"),
match_fun = Vectorize(\(x, y) all(strsplit(y, ', ')[[1]] %in% strsplit(x, ', ')[[1]]))) %>%
select(-key)},
Darren2 = {df2_pattern <- df2 %>%
mutate(key = sapply(str_split(Items, ', '), \(x) str_c("(?=.*", x, ")", collapse = "")), .keep = "unused")
fuzzy_left_join(df1, df2_pattern, by = c("Items" = "key"),
match_fun = str_detect) %>%
select(-key) },
arg0naut91A = {df1 %>%
left_join(
full_join(df1, df2 %>% rename(CouponItems = Items), by = character()) %>%
rowwise %>%
filter(all(unlist(strsplit(CouponItems, ', ')) %in% unlist(strsplit(Items, ', ')))) %>%
select(-CouponItems), multiple = "all"
)},
arg0naut91B = {df1 %>%
left_join(
cross_join(df1, df2 %>% rename(CouponItems = Items)) %>%
rowwise %>%
filter(all(unlist(strsplit(CouponItems, ', ')) %in% unlist(strsplit(Items, ', ')))) %>%
select(-CouponItems), multiple = "all"
)},
Thomas = {unique(
na.omit(
setDT(df2)[, .(Items = unlist(strsplit(Items, ", "))), Coupons][
setDT(df1)[, .(Items = unlist(strsplit(Items, ", "))), ID],
on = "Items"
]
)[
,
.SD[uniqueN(Items) > 1], .(ID, Coupons)
][, Items := NULL]
)[df1,
on = "ID",
allow.cartesian = TRUE
][
,
.(ID, Items, Coupons)
]},
GKi3 = {i <- lapply(strsplit(df2$Items, ", "), function(s) {
Reduce(`&`, lapply(s, grepl, df1$Items, fixed=TRUE)) })
i <- c(i, list(!Reduce(`|`, i)))
cbind(df1[unlist(lapply(i, which)),], Coupons = rep(c(df2$Coupons, NA), sapply(i, sum))) },
GKi4 = {i <- lapply(strsplit(df2$Items, ", ", TRUE), function(s) {
Reduce(function(a, b) a[grep(b, df1$Items[a], fixed=TRUE)], s[-1], grep(s[[1]], df1$Items, fixed=TRUE)) })
j <- unique(unlist(i))
i <- if(length(j>0)) c(list(seq_len(nrow(df1))[-j]), i) else c(list(seq_len(nrow(df1))), i)
cbind(df1[unlist(i),], Coupons = rep(c(NA, df2$Coupons), lengths(i))) }
)
Result
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total…¹
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:t>
1 Darren1 34.6ms 34.61ms 28.5 223.7KB 85.4 3 9 105ms
2 Darren2 32.1ms 33.66ms 29.7 173.8KB 53.5 5 9 168ms
3 arg0naut91A 18.6ms 21.15ms 47.6 95.3KB 36.6 13 10 273ms
4 arg0naut91B 14.8ms 16.73ms 60.9 76.3KB 37.2 18 11 296ms
5 Thomas 3.6ms 4.13ms 222. 688.7KB 18.3 97 8 437ms
6 GKi3 289.1µs 322.51µs 2857. 42.4KB 24.0 1308 11 458ms
7 GKi4 284.4µs 312.12µs 3035. 46.5KB 23.6 1412 11 465ms
In this case GKi3 and GKi4 are the fastest and uses the lowest amount of memory.
data.table
solution is likely going to be faster than existing answers. I have added thedata.table
tag to your question. You should make clear in your question that speed is important to you. – Homy