How to create new dataset based on existing dataset - r

I have the following dataset:
individual sequence_special_drug all_drugs
A NA A
A NA B
A 1 C
A 2 D
A NA B
A NA Z
A 2 D
A NA Z
A 2 D
A NA A
A 3 E
I would like to create the following dataset in R:
Individual sequence_special_drug special_drug prior_special_drug prior_traditional_drug during_special_drug
A 1 C none A, B none
A 2 D C none B, Z, Z
A 3 E C, D A, B, B, Z, Z, A none
Is there a quick way to do this? I have many individuals, but these are all the possible scenarios. A special_drug is identified by the sequence number; those with 'NA' are a traditional_drug.
prior_special_drug will contain any special_drug previously identified, so for the first special_drug C there is no previous special_drug, for the second special_drug D, there is one previous special_drug that is C, and for the third special_drug there are two previous special_drugs C and D.
prior_traditional_drug is the same but will contain anything that has been identified in sequence_special_drug as NA. So for the first special_drug (C), the two prior_traditional_drugs are A and B. For the third special_drug, the prior_traditional_drugs are A, B, B, Z, Z, A.
during_special_drug will contain every traditional_drug that have been referenced during the administration of special_drug. This can be identified in the dataset through the repetition of sequence_special_drug (e.g. 2 -> NA NA -> 2 -> NA -> 2) therefore B, Z, Z.
EDIT - For 2 individuals:
dat <- read.table(
text = "
individual sequence_special_drug all_drugs
A NA A
A NA B
A 1 C
A 2 D
A NA B
A NA Z
A 2 D
A NA Z
A 2 D
A NA A
A 3 E
B 1 D
B NA B
B NA Z
B 1 D
B NA Z
B 1 D
B NA A
B 2 E",
header = TRUE)
I would expect:
- WRONG "none" line 3 under prior_traditional_drug -
Individual sequence_special_drug special_drug prior_special_drug prior_traditional_drug during_special_drug
A 1 C none A, B none
A 2 D C none B, Z, Z
A 3 E C, D A, B, B, Z, Z, A none
B 1 D none none B, Z, Z
B 2 E D B, Z, Z, A none
- RIGHT "A, B" line 3 under prior_traditional_drug -
Individual sequence_special_drug special_drug prior_special_drug prior_traditional_drug during_special_drug
A 1 C none A, B none
A 2 D C A, B B, Z, Z
A 3 E C, D A, B, B, Z, Z, A none
B 1 D none none B, Z, Z
B 2 E D B, Z, Z, A none
But I obtained:
Error message with my own dataset
> special_drug <- example_data %>%
+ nest_by(individual) %>%
+ mutate(
+ spec_drug = list(get_all_drugs(data))
+ ) %>%
+ unnest(spec_drug) %>%
+ select(-data) %>%
+ ungroup()
`summarise()` has grouped output by 'sequence_special_drug'. You can override using the `.groups` argument.
Error: Problem with `mutate()` input `spec_drug`.
x Problem with `mutate()` input `flag3`.
x `false` must be a list, not a character vector.
ℹ Input `flag3` is `if_else(flag1 == 1, list(character(0)), flag3)`.
ℹ Input `spec_drug` is `list(get_all_drugs(data))`.
Run `rlang::last_error()` to see where the error occurred.
> rlang::last_error()
Error in is_rlang_error(parent) :
argument "parent" is missing, with no default
My own dataset is more like this:
example_data <- read.table(
text = "
individual sequence_special_drug all_drugs
77779 NA Name1
77779 1 Name2
77779 1 Name2
77779 1 Name2
77779 2 Name3
4444 NA Name1
4444 1 Name4
4444 2 Name3
4444 3 Name7",
header = TRUE)
But the dataset below also generates the same error message:
example_data <- read.table(
text = "
individual sequence_special_drug all_drugs
A NA A
A 1 C
A 2 D
A 2 D
A 2 D
A 3 E
B NA B
B 1 D
B 2 E
B 3 F",
header = TRUE)

Here is my suggestion using {tidyverse}. I wrote a function to get each column and then put them together in get_all_drugs(). Then, I ran the function through the nested data by individual, as in the example below.
library(tidyverse)
example_data <- read.table(
text = "
individual sequence_special_drug all_drugs
A NA A
A NA B
A 1 C
A 2 D
A NA B
A NA Z
A 2 D
A NA Z
A 2 D
A NA A
A 3 E
B 1 D
B NA B
B NA Z
B 1 D
B NA Z
B 1 D
B NA A
B 2 E",
header = TRUE)
get_special_drugs <- function(.data) {
.data %>%
filter(sequence_special_drug != 0) %>%
distinct() %>%
select(sequence_special_drug, special_drug = all_drugs) %>%
mutate(prior_special_drug = as.list(accumulate(special_drug, c))) %>%
rowwise() %>%
mutate(prior_special_drug = list(
prior_special_drug[prior_special_drug != special_drug]
)) %>%
ungroup()
}
fix_drug_sequence <- function(.data) {
.data %>%
mutate(
seq_drug = replace_na(sequence_special_drug, 0),
flag = if_else(seq_drug == 0 & seq_drug != lead(seq_drug),
lead(seq_drug),
seq_drug),
flag = if_else(flag == 0 & flag != lead(flag),
lead(flag),
flag)
) %>%
select(-sequence_special_drug) %>%
rename(sequence_special_drug = flag)
}
get_prior_traditional_drug <- function(...) {
fix_drug_sequence(...) %>%
group_by(sequence_special_drug) %>%
mutate(
flag1 = max(seq_drug == sequence_special_drug & row_number() == 1),
) %>%
group_by(sequence_special_drug, flag1) %>%
summarise(
flag2 = list(all_drugs[seq_drug == 0])
) %>%
ungroup() %>%
mutate(
flag3 = as.list(accumulate(flag2, append)),
flag3 = if_else(flag1 == 1, lag(flag3), flag3)
) %>%
select(sequence_special_drug, prior_traditional_drug = flag3)
}
get_during_special_drugs <- function(...) {
fix_drug_sequence(...) %>%
group_by(sequence_special_drug) %>%
mutate(
flag = cumsum(seq_drug == sequence_special_drug)
) %>%
filter(flag > 0) %>%
summarise(
during_special_drug = list(all_drugs[seq_drug == 0])
)
}
get_all_drugs <- function(.data) {
spec_drug <- get_special_drugs(.data)
prior_traditional <- get_prior_traditional_drug(.data)
during_spec <- get_during_special_drugs(.data)
list(spec_drug, prior_traditional, during_spec) %>%
reduce(left_join, by = "sequence_special_drug")
}
special_drug <- example_data %>%
nest_by(individual) %>%
mutate(
spec_drug = list(get_all_drugs(data))
) %>%
unnest(spec_drug) %>%
select(-data) %>%
ungroup()
special_drug

Here is my inelegant solution only for this specific problem, but it maybe useful to give you a hint.
library(data.table)
dt <- fread(
"
individual sequence_special_drug all_drugs
A NA A
A NA B
A 1 C
A 2 D
A NA B
A NA Z
A 2 D
A NA Z
A 2 D
A NA A
A 3 E
"
)
df <- unique(na.omit(dt))
setnames(df,"all_drugs","special_drug")
df
#> individual sequence_special_drug special_drug
#> 1: A 1 C
#> 2: A 2 D
#> 3: A 3 E
## add row ideantifier in dt
dt[,rd:=rowid(individual)]
## create prior_special_drug
df[,prior_special_drug:=shift(special_drug)]
df[3,4] <- df[special_drug < "E", paste(special_drug,collapse = ", ")]
df
#> individual sequence_special_drug special_drug prior_special_drug
#> 1: A 1 C <NA>
#> 2: A 2 D C
#> 3: A 3 E C, D
special.drug = df$special_drug
special.drug
#> [1] "C" "D" "E"
posi <- c(
dt[,first(.I[all_drugs==special.drug[1]])], #first position of C
dt[,first(.I[all_drugs==special.drug[2]])], #first position of D
dt[,last(.I[all_drugs==special.drug[2]])], #last position of D
dt[,last(.I[all_drugs==special.drug[3]])] #last position of E
)
posi
#> [1] 3 4 9 11
# dt[is.na(sequence_special_drug) & rd < posi[1], all_drugs]
# dt[is.na(sequence_special_drug) & rd %between% posi[2:3], all_drugs]
# dt[is.na(sequence_special_drug) & rd < posi[4], all_drugs]
drug <- c(
paste(dt[is.na(sequence_special_drug) & rd < posi[1], all_drugs],collapse = ", "),
paste(dt[is.na(sequence_special_drug) & rd %between% posi[2:3], all_drugs],collapse = ", "),
paste(dt[is.na(sequence_special_drug) & rd < posi[4], all_drugs],collapse = ", ")
)
drug
#> [1] "A, B" "B, Z, Z" "A, B, B, Z, Z, A"
## create prior_traditional_drug and during_special_drug
df[,prior_traditional_drug := drug]
df[,prior_traditional_drug := ifelse(special_drug == "D",NA,prior_traditional_drug)]
df[,during_special_drug := drug]
df[,during_special_drug := ifelse(special_drug %in% c("C","E"),NA,during_special_drug)]
## replace NA with "none" in df
for (jj in 1:ncol(df))
set(df,
i = which(is.na(df[[jj]])),
j = jj,
v = "none"
)
df
#> individual sequence_special_drug special_drug prior_special_drug
#> 1: A 1 C none
#> 2: A 2 D C
#> 3: A 3 E C, D
#> prior_traditional_drug during_special_drug
#> 1: A, B none
#> 2: none B, Z, Z
#> 3: A, B, B, Z, Z, A none
Created on 2021-06-06 by the reprex package (v2.0.0)

Related

Defining indices for row sequences more succintly

I have a dataframe like this:
set.seed(123)
df <- data.frame(A = sample(LETTERS[1:5], 50, replace = TRUE),
B = sample(LETTERS[1:5], 50, replace = TRUE))
I want to filter the dataframe on two parameters: (i) the target rows that match a certain criterion and (ii) a certain number of rows that precede the target rows. Specifically, I want to filter rows where A == "A" & B == "A" as well as the five rows preceding the target row. I can do this with a two-step operation: first by defining a function, and second by using the function as input for slice:
Sequ <- function(col1, col2) {
# get row indices of target row with function `which`
inds <- which(col1 == "A" & col2 == "A")
# sort row indices of the rows before target row AND target row itself
sort(unique(c(inds-5, inds-4, inds-3,inds-2, inds-1, inds)))
}
library(dplyr)
df %>%
slice(Sequ(col1 = A, col2 = B))
A B
1 D C
2 D B
3 C B
4 C D
5 B B
6 A A
7 E B
8 E D
9 D C
10 D D
11 A A
12 C C
13 D E
14 B E
15 B E
16 B A
17 A A
18 C D
19 C B
20 B D
21 A B
22 A A
But surely there must be a more efficient replacement for this part: sort(unique(c(inds-5, inds-4, inds-3,inds-2, inds-1, inds))). In case I want to filter not just the preceding 5 but, say, 10 or 100 rows this way of defining each index individually becomes quickly impractical. How can this part be coded more economically?
1) Define bothA which takes a matrix and returns TRUE if any row is all A's. Then use rollapply to apply it as a moving window.
library(zoo)
bothA <- function(x) any(rowSums(rbind(x) == "A") == 2)
ok <- rollapply(df, 6, bothA, align = "left", partial = TRUE, by.column = FALSE)
df[ok, ]
2) or in a pipe
df %>%
filter(rollapply(., 6, bothA, align = "left", partial = TRUE, by.column = FALSE))
3) This also works:
ok <- rollapply(rowSums(df == "A") == 2, 6, any, align = "left", partial = TRUE)
df[ok, ]
Here is a dplyr solution that can be directly used in a pipe, with no need for filter.
Sequ <- function(x, col1, col2, value = "A"){
x %>%
mutate(grp = lag(cumsum({{col1}} == value & {{col2}} == value), default = 0)) %>%
group_by(grp) %>%
slice_tail(n = 5) %>%
ungroup() %>%
select(-grp)
}
df %>% Sequ(A, B)
## A tibble: 23 x 2
# A B
# <chr> <chr>
# 1 B D
# 2 C C
# 3 E A
# 4 D B
# 5 A A
# 6 C D
# 7 E E
# 8 C E
# 9 C C
#10 A A
## … with 13 more rows
One dplyr and purrr solution could be:
df %>%
filter(!row_number() %in% unlist(map(which(A == "A" & B == "A"), ~ (.x-5):.x)))

Count common sets of items between different customers

I have data on customers and the different products they have purchased:
Customer Product
1 A
1 B
1 C
2 D
2 E
2 F
3 A
3 B
3 D
4 A
4 B
I would like to check which sets of products that occur together across different customers. I want to get the count for product combinations of different lengths. For example, the product combination A and B together occurs in three different customers; the product group A, B and C occurs in one customer. And so on for all different sets of 2 or more products in the data. Something like:
Product Group Number
A, B, C 1
D, E, F 1
A, B, D 1
A, B 3
Thus, I'm counting the A, B combination in customers who only have product A and B (e.g. customer 4), and in customers who have A and B, but also any other product (e.g. customer 1, who has A, B and C).
Does anyone have any ideas how to do that with either a tidyverse or base R approach? I feel like it ought to be pretty trivial - maybe pivot_wider first, then count?
I have found this question and answer that can do what I need for pairs of products, but I need to count combinations also for more products than two.
If you have the possibility to use a non-base package, you can use a tool dedicated for the task of finding item sets: arules::apriori. It is much faster on larger data sets.
library(arules)
# coerce data frame to binary incidence matrix
# use apriori to get "frequent itemsets"
r = apriori(data = as.matrix(table(dat) > 0),
# set: type of association mined, minimal support needed of an item set,
# minimal number of items per item set
par = list(target = "frequent itemsets",
support = 0,
minlen = 2))
# coerce itemset to data.frame, select relevant rows and columns
d = as(r, "data.frame")
d[d$count > 0, c("items", "count")]
# items count
# 4 {B,C} 1
# 5 {A,C} 1
# 6 {E,F} 1
# 7 {D,E} 1
# 10 {D,F} 1
# 13 {B,D} 1
# 14 {A,D} 1
# 15 {A,B} 3
# 25 {A,B,C} 1
# 26 {D,E,F} 1
# 35 {A,B,D} 1
Timing on larger data set: 10000 customers with up to 6 products each. apriori is quite a lot faster.
# Unit: milliseconds
# expr min lq mean median uq max neval
# f_henrik(dat) 38.95475 39.8621 41.44454 40.67313 41.05565 57.64655 20
# f_allan(dat) 4578.20595 4622.2363 4664.57187 4654.58713 4679.78119 4924.22537 20
# f_jay(dat) 2799.10516 2939.9727 2995.90038 2971.24127 2999.82019 3444.70819 20
# f_uwe_dt(dat) 2943.26219 3007.1212 3028.37550 3027.46511 3060.38380 3076.25664 20
# f_uwe_dplyr(dat) 6339.03141 6375.7727 6478.77979 6448.56399 6521.54196 6816.09911 20
10000 customers with up to 10 products each. apriori is several hundred times faster.
# Unit: milliseconds
# expr min lq mean median uq max neval
# f_henrik(dat) 58.40093 58.95241 59.71129 59.63988 60.43591 61.21082 20
# f_jay(dat) 52824.67760 53369.78899 53760.43652 53555.69881 54049.91600 55605.47980 20
# f_uwe_dt(dat) 22612.87954 22820.12012 22998.85072 22974.32710 23220.00390 23337.22815 20
# f_uwe_dplyr(dat) 26083.20240 26255.88861 26445.49295 26402.67887 26659.81195 27046.83491 20
On the larger data set, Allan's code gave warnings (In rawToBits(as.raw(x)) : out-of-range values treated as 0 in coercion to raw) on the toy data, which seemed to affect the result. Thus, it is not included in the second benchmark.
Data and benchmark code:
set.seed(3)
n_cust = 10000
n_product = sample(2:6, n_cust, replace = TRUE) # 2:10 in second run
dat = data.frame(
Customer = rep(1:n_cust, n_product),
Product = unlist(lapply(n_product, function(n) sample(letters[1:6], n)))) # 1:10 in 2nd run
library(microbenchmark)
res = microbenchmark(f_henrik(dat),
f_allan(dat),
f_jay(dat),
f_uwe_dt(dat),
f_uwe_dplyr(dat),
times = 20L)
Check for equality:
henrik = f_henrik(dat)
allan = f_allan(dat)
jay = f_jay(dat)
uwe_dt = f_uwe_dt(dat)
uwe_dplyr = f_uwe_dplyr(dat)
# change outputs to common format for comparison
# e.g. string format, column names, order
henrik$items = substr(henrik$items, 2, nchar(henrik$items) - 1)
henrik$items = gsub(",", ", ", henrik$items)
l = list(
henrik = henrik, allan = allan, jay = jay, uwe_dt = uwe_dt, uwe_dplyr = uwe_dplyr)
l = lapply(l, function(d){
d = setNames(as.data.frame(d), c("items", "count"))
d = d[order(d$items), ]
row.names(d) = NULL
d
})
all.equal(l[["henrik"]], l[["allan"]])
# TRUE
all.equal(l[["henrik"]], l[["jay"]])
# TRUE
all.equal(l[["henrik"]], l[["uwe_dt"]])
# TRUE
all.equal(l[["henrik"]], l[["uwe_dplyr"]])
# TRUE
Functions:
f_henrik = function(dat){
r = apriori(data = as.matrix(table(dat) > 0),
par = list(target = "frequent itemsets",
support = 0,
minlen = 2))
d = as(r, "data.frame")
d[d$count > 0, c("items", "count")]
}
f_allan = function(dat){
all_multiples <- function(strings)
{
n <- length(strings)
do.call("c", sapply(1:2^n, function(x) {
mystrings <- strings[as.character(rawToBits(as.raw(x))[seq(n)]) == "01"]
if (length(mystrings) > 1) paste(mystrings, collapse = ", ") else NULL
}))
}
dat %>%
group_by(Customer) %>%
arrange(Product) %>%
summarize(Product_group = all_multiples(Product)) %>%
group_by(Product_group) %>%
count(Product_group)
}
f_jay = function(dat){
a <- split(dat$Product, dat$Customer) ## thx to #Henrik
r <- range(lengths(a))
pr <- unlist(lapply(r[1]:r[2], function(x)
combn(unique(dat$Product), x, list)), recursive=F)
or <- rowSums(outer(pr, a, Vectorize(function(x, y) all(x %in% y))))
res <- data.frame(p.group=sapply(pr, toString), number=or)
res[res$number > 0, ]
}
f_uwe_dt = function(dat){
setorder(setDT(dat), Customer, Product)
dat[, .(Product.Group = unlist(lapply(tail(seq(.N), -1L),
function(m) combn(unique(Product), m, toString, FALSE)))),
by = Customer][
, .N, by = Product.Group]
}
f_uwe_dplyr = function(dat){
dat %>%
arrange(Customer, Product) %>%
group_by(Customer) %>%
summarise(Product.Group = n() %>%
seq() %>%
tail(-1L) %>%
lapply(function(m) combn(unique(Product), m, toString, FALSE)) %>%
unlist()) %>%
ungroup() %>%
count(Product.Group)
}
If you define a little helper function that gets all multiple groupings:
all_multiples <- function(strings)
{
n <- length(strings)
do.call("c", sapply(1:2^n, function(x) {
mystrings <- strings[as.character(rawToBits(as.raw(x))[seq(n)]) == "01"]
if (length(mystrings) > 1) paste(mystrings, collapse = ", ") else NULL
}))
}
then you can do this nicely in a tidyverse pipe:
dat %>%
group_by(Customer) %>%
arrange(Product) %>%
summarize(Product_group = all_multiples(Product)) %>%
group_by(Product_group) %>%
count(Product_group)
#> # A tibble: 11 x 2
#> # Groups: Product_group [11]
#> Product_group n
#> <chr> <int>
#> 1 A, B 3
#> 2 A, B, C 1
#> 3 A, B, D 1
#> 4 A, C 1
#> 5 A, D 1
#> 6 B, C 1
#> 7 B, D 1
#> 8 D, E 1
#> 9 D, E, F 1
#> 10 D, F 1
#> 11 E, F 1
For the sake of completeness, here is a solution in data.table syntax which can be translated to dplyr syntax as well.
For both implementations, the core idea is the same:
sort by Product (which is an important step which has been neglected by the other answers posted so far)
For each Customer, create the product groups by using combn() with varying lengths m. Product.Group is a kind of natural key created by concatenating the included products using the toString() function.
Here, we can see why sorting Product is important : products B, A as well as A, B should appear in the same product group A, B.
Finally, count the number of occurrences by Product.Group
data.table version
library(data.table)
setorder(setDT(df), Customer, Product)
df[, .(Product.Group = unlist(lapply(tail(seq(.N), -1L),
function(m) combn(unique(Product), m, toString, FALSE)))),
by = Customer][
, .N, by = Product.Group]
Product.Group N
1: A, B 3
2: A, C 1
3: B, C 1
4: A, B, C 1
5: D, E 1
6: D, F 1
7: E, F 1
8: D, E, F 1
9: A, D 1
10: B, D 1
11: A, B, D 1
dplyr version
library(dplyr)
df %>%
arrange(Customer, Product) %>%
group_by(Customer) %>%
summarise(Product.Group = n() %>%
seq() %>%
tail(-1L) %>%
lapply(function(m) combn(unique(Product), m, toString, FALSE)) %>%
unlist()) %>%
ungroup() %>%
count(Product.Group)
Product.Group n
<chr> <int>
1 A, B 3
2 A, B, C 1
3 A, B, D 1
4 A, C 1
5 A, D 1
6 B, C 1
7 B, D 1
8 D, E 1
9 D, E, F 1
10 D, F 1
11 E, F 1
Data
library(data.table)
df <- fread("
Customer Product
1 A
1 B
1 C
2 D
2 E
2 F
3 A
3 B
3 D
4 A
4 B")
You could split the data along customers, then get all combinations of product-pairs and triples using combn. Then find matches using %in% with outer, create data frame by collapsing products using toString and finally discard elements which are zero.
# a <- aggregate(Product ~ Customer, dat, I)$Product ## old solution
# if (is.matrix(a)) a <- as.data.frame(t(a)) ## old solution
a <- split(dat$Product, dat$Customer) ## thx to #Henrik
r <- range(lengths(a))
pr <- unlist(lapply(r[1]:r[2], function(x)
combn(unique(dat$Product), x, list)), recursive=F)
or <- rowSums(outer(pr, a, Vectorize(function(x, y) all(x %in% y))))
res <- data.frame(p.group=sapply(pr, toString), number=or)
res[res$number > 0, ]
# p.group number
# 1 A, B 3
# 2 A, C 1
# 3 A, D 1
# 6 B, C 1
# 7 B, D 1
# 13 D, E 1
# 14 D, F 1
# 15 E, F 1
# 16 A, B, C 1
# 17 A, B, D 1
# 35 D, E, F 1
Data
dat <- read.table(header=TRUE, text="Customer Product
1 A
1 B
1 C
2 D
2 E
2 F
3 A
3 B
3 D
4 A
4 B")

check a data.frame column (character) against 4 lists

I want to check a word (in a column in a data-frame) against 4 lists (a, b, c, d):
if df$word is in a then df$code <- 1
if df$word is in b then df$code <- 2
if df$word is in c then df$code <- 3
if df$word is in d then df$code <- 4
if df$word is in a & b then df$code <- 1 2
if df$word is in a & c then df$code <- 1 3
if df$word is in a & d then df$code <- 1 4
if df$word is in b & c then df$code <- 2 3
if df$word is in b & d then df$code <- 2 4
if df$word is in c & d then df$code <- 3 4
etc.
What is the most efficient way to do so?
Example
df <- data.frame(word = c("book", "worm", "digital", "context"))
a <- c("book", "context")
b <- c("book", "worm", "context")
c <- c("digital", "worm", "context")
d <- c("context")
Expected output:
book 1 2
worm 2 3
digital 3
context 1 2 3 4
We can use a double sapply loop where for every element in the data frame we check in which list element it is present and get the corresponding list number.
lst <- list(a, b, c, d)
df$output <- sapply(df$V1, function(x) paste0(which(sapply(lst,
function(y) any(grepl(x,y)))), collapse = ","))
df
# V1 output
#1 book 1,2
#2 worm 2,3
#3 digital 3
#4 context 1,2,3,4
data
df <- read.table(text = "book
worm
digital
context")
Try this:
df <- data.frame(x =c("book", "worm","digital", "context"))
a <- c("book", "context")
b<- c("book", "worm", "context")
c <- c("digital", "worm", "context")
d <- c("context")
anno <- function(x){
rslt = ""
if (x %in% a) rslt =paste0(rslt," 1")
if (x %in% b) rslt =paste0(rslt," 2")
if (x %in% c) rslt =paste0(rslt," 3")
if (x %in% d) rslt =paste0(rslt," 4")
return(stringr::str_trim(rslt))
}
df$code <- sapply(df$x, anno)
df
#> x code
#> 1 book 1 2
#> 2 worm 2 3
#> 3 digital 3
#> 4 context 1 2 3 4
Created on 2018-08-17 by the reprex package (v0.2.0.9000).
This can also be accomplished in two steps:
Combine the four lists and reshape into long format
Aggregate while joing with df
using data.table:
library(data.table)
long <-setDT(melt(list(a, b, c, d), value.name = "word"))
long[setDT(df), on = "word", by = .EACHI, .(code = toString(L1))][]
word code
1: book 1, 2
2: worm 2, 3
3: digital 3
4: context 1, 2, 3, 4

Replace values of multiple columns from one dataframe using another dataframe with conditions

Hi I have two data frames as followed:
df1:
ID x y z
1 a b c
2 a b c
3 a b c
4 a b c
and df2:
ID x y
2 d NA
3 NA e
and I am after a result like this:
df1:
ID x y z
1 a b c
2 d b c
3 a e c
4 a b c
I have been trying to use the match function as suggested by some other posts but I keep getting the issue where my df1 dataframe being replaced with NA values from df2.
This is the code I have been using without luck
for (i in names(df2)[2:length(names(df2))]) {
df1[i] <- df2[match(df1$ID, df2$ID)]
}
Thanks
Your code didn't work for me so I change it a little but it works. If you are reading data from an external file use the stringAsFactor = FALSE when you read it so you don't run into problems.
df1 = data.frame("ID" = 1:4,"x" = rep("a",4), "y" =rep("b",4),"z" = rep("c",4),
stringsAsFactors=FALSE)
df2 = data.frame("ID" = 2:3,"x" = c("d",NA), "y" = c(NA,"e"),stringsAsFactors=FALSE)
for(i in 1:nrow(df2)){
new_data = df2[i,-which(apply(df2[i,],2,is.na))]
pos = as.numeric(new_data[1])
col_replace = intersect(colnames(new_data),colnames(df1))
df1[pos,col_replace] = new_data
}
A solution using dplyr. The idea is to convert both data frames to long format, conduct join and replace the values, and convert the format back to wide format. df5 is the final output.
library(dplyr)
library(tidyr)
df3 <- df1 %>% gather(Col, Value, -ID)
df4 <- df2 %>% gather(Col, Value, -ID, na.rm = TRUE)
df5 <- df3 %>%
left_join(df4, by = c("ID", "Col")) %>%
mutate(Value.x = ifelse(!is.na(Value.y), Value.y, Value.x)) %>%
select(ID, Col, Value.x) %>%
spread(Col, Value.x)
df5
# ID x y z
# 1 1 a b c
# 2 2 d b c
# 3 3 a e c
# 4 4 a b c
DATA
df1 <- read.table(text = "ID x y z
1 a b c
2 a b c
3 a b c
4 a b c",
header = TRUE, stringsAsFactors = FALSE)
df2 <- read.table(text = "ID x y
2 d NA
3 NA e",
header = TRUE, stringsAsFactors = FALSE)
As mentioned by alistaire this is an update join. It is available with the data.table package:
library(data.table)
setDT(df1)
setDT(df2)
df1[df2, on = "ID", x := ifelse(is.na(i.x), x, i.x)]
df1[df2, on = "ID", y := ifelse(is.na(i.y), y, i.y)]
df1
ID x y z
1: 1 a b c
2: 2 d b c
3: 3 a e c
4: 4 a b c
If there are many columns with replacement values, it might be worthwhile to follow www's suggestion to do the replacement after reshaping to long format where column names are treated as data:
library(data.table)
melt(setDT(df1), "ID")[
melt(setDT(df2), "ID", na.rm = TRUE), on = .(ID, variable), value := i.value][
, dcast(.SD, ID ~ variable)]
ID x y z
1: 1 a b c
2: 2 d b c
3: 3 a e c
4: 4 a b c
Data
df1 <- fread(
"ID x y z
1 a b c
2 a b c
3 a b c
4 a b c")
df2 <- fread(
"ID x y
2 d NA
3 NA e")

parent child structure in R dataframe

I have a csv that contains an org structure as follows plus some additional columns. I use R to create charts and it works great !.
The challenge is when trying to create the charts for a subset manager and its children/grandchildren.
Is there any filtering that is possible in dplr or any alternative package?
Sample format:
emp_id mgr_id nest_id
A A 0
B A 1
C B 2
D C 3
D1 D 4
D2 D 4
E C 3
E1 E 4
F C 3
G B 2
H G 3
The subset I need is for manager "C"
Scenario 1:emp_id==C should contain all nodes of 'D','D1','D2','E','E1','F'
expected structure:
manager,all_children
C D
C D1
C D2
C E
C E1
C F
Scenario 2:emp_id==C should contain all above nodes but retain mgr_id structure for 'D','E'
expected structure:
manager,all_children
C D
C E
C F
D D1
D D2
E E1
Consider the base package with by which creates a df list for every level of mgr_id (not just C):
SCENARIO 1
dfList <- by(df, df$mgr_id, function(i){
names(i) <- paste0(names(i), "_") # SUFFIX UNDERSCORE (TO AVOID DUP COLUMNS)
child <- merge(i, df, by.x="mgr_id_", by.y="emp_id")[,1:2]
grandchild <- merge(child, df, by.x="emp_id_", by.y="mgr_id")[c("mgr_id_", "emp_id")]
names(child) <- gsub("*_$", "", names(child)) # REMOVE LAST UNDERSCORE
names(grandchild) <- gsub("*_$", "", names(grandchild)) # REMOVE LAST UNDERSCORE
rbind(child, grandchild)
})
dfList$C
# mgr_id emp_id
# 1 C D
# 2 C E
# 3 C F
# 4 C D1
# 5 C D2
# 6 C E1
SCENARIO 2 (where the selected columns change in grandchild and then first column rename)
dfList <- by(df, df$mgr_id, function(i){
names(i) <- paste0(names(i), "_") # SUFFIX UNDERSCORE (TO AVOID DUP COLUMNS)
child <- merge(i, df, by.x="mgr_id_", by.y="emp_id")[,1:2]
grandchild <- merge(child, df, by.x="emp_id_", by.y="mgr_id")[c("emp_id_", "emp_id")]
names(child) <- gsub("*_$", "", names(child)) # REMOVE LAST UNDERSCORE
names(grandchild) <- gsub(".*_$", "", names(grandchild)) # REMOVE LAST UNDERSCORE
names(grandchild)[1] <- "mgr_id"
rbind(child, grandchild)
})
dfList$C
# mgr_id emp_id
# 1 C D
# 2 C E
# 3 C F
# 4 D D1
# 5 D D2
# 6 E E1
Here is one solution using functions from dplyr and data.table. dt3 is the output for scenario 1, while dt4 is the output for scenario 2.
# Load packages
library(dplyr)
library(data.table)
# Create example data frame
dt <- read.table(text = "emp_id mgr_id nest_id
A A 0
B A 1
C B 2
D C 3
D1 D 4
D2 D 4
E C 3
E1 E 4
F C 3
G B 2
H G 3",
header = TRUE, stringsAsFactors = FALSE)
# Process the data
dt2 <- dt %>%
# Filter levels lower than 1
filter(nest_id > 1) %>%
mutate(group_id = ifelse(nest_id > 2, 0, 1)) %>%
# Create "run_id", which will be used to fill manager label
mutate(run_id = rleid(group_id)) %>%
mutate(run_id = ifelse(run_id %% 2 == 0, run_id - 1, run_id)) %>%
group_by(run_id) %>%
mutate(manager = first(emp_id)) %>%
# Select for manager C
filter(manager %in% "C") %>%
ungroup() %>%
# Remove rows if manager == emp_id
filter(manager != emp_id) %>%
rename(all_children = emp_id)
# Scenario 1
dt3 <- dt2 %>% select(manager, all_children)
# Scenario 2
dt4 <- dt2 %>%
select(manager = mgr_id, all_children) %>%
arrange(manager, all_children)

Resources