Find variables that occur only in ONE row in R - r

Using BASE R, I wonder how to answer the following question:
Are there any value on X or Y that occurs only in one row but not others? If yes, produce my desired output below.
f <- data.frame(id = c(rep("AA",4), rep("BB",2), rep("CC",2)), X = c(1,2,2,3,1,4,3,3),
Y = c(99,7,8,7,6,7,7,7))
Desired output:
list(BB = c(X = 4, Y = 6), AA = c(Y = c(99, 8)))
# $BB
# X Y
# 4 6
# $AA
# Y1 Y2 # Would be a plus if shows `Y Y` instead of `Y1 Y2`
# 99 8

There are two big ideas with this base approach:
Since we need to compare all the values, we should just recombine everything into one data.frame.
Making the unsplit data.frame long will save us some extra steps.
#https://stackoverflow.com/questions/58786052/find-variables-that-occur-only-once-across-a-split-data-frame-in-r/58788854#58788854
f <- data.frame(id = c(rep("AA",4), rep("BB",2), rep("CC",2)), X = c(1,2,2,3,1,4,3,3),
Y = c(99,7,8,7,6,7,7,7))
m <- split(f, f$id) # Here is `m`
unsplit <- do.call(rbind, c(m, make.row.names = F))
molten <- data.frame(unsplit[, 1, drop = F], stack(unsplit[, -1]))
# res <- subset(molten, !duplicated(values) & !duplicated(values, fromLast = T))
res <- molten[as.logical(ave(molten[['values']], molten[['ind']], FUN = function(x) !duplicated(x) & !duplicated(x, fromLast = T))), ]
#I would stop here
res
#> id values ind
#> 6 BB 4 X
#> 9 AA 99 Y
#> 11 AA 8 Y
#> 13 BB 6 Y
#to get exact output
res_vector <- res$values
names(res_vector) <- res$ind
split(res_vector, as.character(res$id))
#> $AA
#> Y Y
#> 99 8
#>
#> $BB
#> X Y
#> 4 6
Created on 2019-11-10 by the reprex package (v0.3.0)
Here's another base approach that may be less complicated:
####Way 1 with rapply
vec <- rapply(lapply(m, '[', mods), I)
unique_vec <- vec[!duplicated(vec) & !duplicated(vec, fromLast = T)]
vec_names <- do.call(rbind, strsplit(names(unique_vec), '.', fixed = T))
names(unique_vec) <- substr(vec_names[, 2], 1, 1) #turns Y1 into Y
split(unique_vec, vec_names[, 1])
###Way 2 with data.frame already do.call(rbind, m)
vec <- unlist(
lapply(f[, -1],
function(x){
ind <- !duplicated(x) & !duplicated(x, fromLast = T)
ret <- x[ind]
names(ret) <- f[ind, 1]
ret
}
)
)
#this is likely overly simplified:
split(vec, sub('.*\\.', '', names(vec)))
#this leads to exact result
vec_names <- do.call(rbind, strsplit(names(vec), '.', fixed = T))
names(vec) <- vec_names[, 1]
split(vec, vec_names[, 2])
$AA
Y Y
99 8
$BB
X Y
4 6
OP brings up using table() in a hint. duplicated() is very performant:
unlist(lapply(f[mods], function(y) names(which(table(y) == 1))))
# X Y1 Y2 Y3
# "4" "6" "8" "99"
vec
#X.BB Y.AA Y.AA Y.BB
# 4 99 8 6
# A tibble: 2 x 13
expression min median `itr/sec` mem_alloc
<bch:expr> <bch> <bch:> <dbl> <bch:byt>
1 table_meth 321us 336us 2794. 10.3KB
2 dup_meth 132us 136us 7105. 31.7KB
bench::mark(
table_meth = {unlist(lapply(f[mods], function(y) names(which(table(y) == 1))))},
dup_meth = {
#could get slight performance boost with
#f_id <- f[['id']]
unlist(
lapply(f[, -1],
function(x){
ind <- !duplicated(x) & !duplicated(x, fromLast = T)
ret <- x[ind]
names(ret) <- f[ind, 1]
#names(ret) <- f_id[ind]
ret
}
)
)}
, check = F
)
And similar idea in data.table:
library(data.table)
molten_dt <- melt(rbindlist(m), id.vars = 'id')
molten_dt[!duplicated(value, by = variable) &
!duplicated(value, by = variable, fromLast = T)]
And similar idea in dplyr:
library(dplyr)
library(tidyr)
m%>%
bind_rows()%>%
pivot_longer(cols = -id)%>%
group_by(name)%>%
filter(!duplicated(value) & !duplicated(value, fromLast = T))%>%
group_by(id)%>%
group_split()

It's not pure functional programming but it is base R:
lapply(split(df, df$id), function(z){
X <- z$X[which(!(z$X %in% df$X[duplicated(df$X)]))]
Y <- z$Y[which(!(z$Y %in% df$Y[duplicated(df$Y)]))]
cbind(X, Y)
}
)
Data:
df <-
structure(list(
id = structure(
c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L),
.Label = c("AA", "BB", "CC"),
class = "factor"
),
X = c(1,
2, 2, 3, 1, 4, 3, 3),
Y = c(99, 7, 8, 7, 6, 7, 7, 7)
),
class = "data.frame",
row.names = c(NA,-8L))

Related

Create loop with dynamic column names and repeating values based on defined i

I have the following dataframe:
id <- c("A", "B", "C")
col1 <- c(1, 3, 5)
col2 <- c(6, 12, 9)
col3 <- c(2, 4, 30)
df <- data.frame(id, col1, col2, col3)
Essentially, I want every i to be replaced by 20, 25, 30, 35, 40. This loop works but it works very, very slowly.
library(dplyr)
library(tibble)
library(foreach)
library(tidyverse)
library(purrr)
id <- c("A", "B", "C")
col1 <- c(1, 3, 5)
col2 <- c(6, 12, 9)
col3 <- c(2, 4, 30)
df <- data.frame(id, col1, col2, col3)
vals <- c(seq(from=20, to=40, by=5))
final <- foreach(i = vals, .combine='cbind') %do% {
# if cell is greater than i, then code 0
df_2 <- df %>% mutate(across(starts_with("col"), ~ +(. < i)))
# transpose the dataset
rownames(df_2) <- df_2$id
df_2$id <- NULL
df_2_t <- as.data.frame(t(df_2))
# sum the rows
df_2_t <- cbind(id = rownames(df_2_t), df_2_t)
rownames(df_2_t) <- 1:nrow(df_2_t)
df_2_t <- df_2_t %>%
mutate(sum = rowSums(.[2:ncol(.)]))
# merge a new column
id2 <- c("col1", "col2", "col3")
D <- c(3, 4, 5)
id_d <- data.frame(id2, D)
df_2_t_d <- left_join(df_2_t, id_d, by = c("id" = "id2"))
# divide D by the number of letters (there are 3 letter columns -- A, B, C)
df_2_t_d$letters <- rep(3)
df_2_t_d <- df_2_t_d %>%
mutate(frac = D/letters)
# recode all 1s to the frac
letters <- grep("^A|^B|^C", names(df_2_t_d))
df_2_t_d[letters] <- apply(df_2_t_d[letters], 2, function(x) ifelse(x == 1, df_2_t_d$frac, 0))
# drop two columns
df_2_t_d <- select(df_2_t_d, -c(D, letters))
# transpose again
rownames(df_2_t_d) <- df_2_t_d$id
df_2_t_d$id <- NULL
df_2_t_d2 <- as.data.frame(t(df_2_t_d))
df_2_t_d2_sum <- df_2_t_d2 %>%
mutate(rowSums(.[1:3])) %>%
transmute(!!paste0('sum_', i) := rowSums(select(., starts_with('col'))))
}
df_2_t_d2 <- cbind(list_name = rownames(df_2_t_d2), df_2_t_d2)
rownames(df_2_t_d2) <- 1:nrow(df_2_t_d2)
df_2_t_d2 <- select(df_2_t_d2, list_name)
abc <- cbind(df_2_t_d2, df_2_t_d2_sum)
View(abc)
If there's any way to speed it up, suggestions are welcome!
Here's a way to do this map_dfc :
library(dplyr)
library(purrr)
vals <- seq(from=20, to=40, by=5)
bind_cols(
df, map_dfc(vals, function(x) df %>%
mutate(across(starts_with("col"), ~ +(. < x))) %>%
transmute(!!paste0('sum_', x) := rowSums(select(., starts_with('col'))))))
Or in base R :
cols <- grep('col', names(df))
df[paste0('sum_', vals)] <- lapply(vals, function(x) rowSums(+(df[cols] < x)))
df
# id col1 col2 col3 sum_20 sum_25 sum_30 sum_35 sum_40
#1 A 1 6 2 3 3 3 3 3
#2 B 3 12 4 3 3 3 3 3
#3 C 5 9 30 2 2 2 3 3

R: How to output a column based on the same formula for each input column?

In the latest DataFramesMeta.jl package for Julia one can do
x = [:x1, :x2]
n = [:n1, :n2]
#transform(df, cols(x) = :y .+ cols(n))
and it will output the columns x1 = y + x1 and x2 = y + x2.
The question, how best to do that in R elegantly? I can do it like this
library(dplyr)
df = data.frame(y = 1:3, n1 = 1:3, n2 = 1:3)
x = c("m1", "m2")
n = c("n1", "n2")
code = glue::glue_data(list(x=x, n=n), "{x}=y+{n}")
code = glue::glue("vars({paste(code, collapse=',')})")
mutate(df, !!!eval(parse(text=code)))
or in Base
res = lapply(n, function(n) {
df$y + df[, n]
}) %>% data.frame
names(res) <- x
df = cbind(df, res)
But it feels hacky. data.table and Base solutions welcome.
You can do :
df[x] <- lapply(df[n], function(p) df$y + p)
df
# y n1 n2 m1 m2
#1 1 1 1 2 2
#2 2 2 2 4 4
#3 3 3 3 6 6
Similarly with purrr::map :
df[x] <- purrr::map(df[n], ~df$y + .x)
and data.table :
library(data.table)
setDT(df)[, (x):= lapply(.SD, function(p) p + y), .SDcols = n]
Late realisation that we don't need any apply family of functions, we can directly do.
df[x] <- df[n] + df$y
We can use sweep
df[x] <- sweep(df[n], 1, df$y, FUN = `+`)

Access all the columns with a particular name in nested lists in R

I wonder how I can access all the columns with a particular name in nested lists. Below there is a reproducible example. How can I call all the "mean" columns and collate all in a single data.frame where the data.frame as two other columns which specify associated classes and Output1/Output2 (Example 1). Example 2 is a little more complicated where the nested "mean" list is a data.frame. I need to access both "ts" and "value" columns. In other words, I need to know the ts corresponding to each value (in addition to classes and Output1/Output2).
Example 1
classes <- c("F", "G", "M", "O")
classes <- structure(unique(classes), names = unique(classes))
S1 = data.frame(X1 = rnorm(100), X2 = rnorm(100), X3 = rnorm(100), X4 = rep(classes, 25))
S2 = data.frame(X1 = rnorm(100), X2 = rnorm(100), X3 = rnorm(100), X4 = rep(classes, 25))
P <- lapply(classes, function(c){
Output1 <- list ("model" = lm(X3~ X1+X2, data = S1),"mean" = apply(S1[S1$X4 == c, 1:3], 2, mean), "sum" = apply(S1[S1$X4 == c, 1:3], 2, sum))
Output2 <- list ("model" = lm(X3~ X1+X2, data = S2), "mean" = apply(S2[S2$X4 == c, 1:3], 2, mean), "sum" = apply(S2[S2$X4 == c, 1:3], 2, sum))
output <- list ("Output1" = Output1, "Output2" = Output2)
return(output)
})
Example 2
classes <- c("F", "G", "M", "O")
classes <- structure(unique(classes), names = unique(classes))
S1 = data.frame( X1 = rnorm(100), X2 = rnorm(100), X3 = rnorm(100), X4 = rep(classes, 25), ts = seq(from = ISOdate(1910,1,1), by = "30 min", length.out = 100 ))
S2 = data.frame( X1 = rnorm(100), X2 = rnorm(100), X3 = rnorm(100), X4 = rep(classes, 25),ts = seq(from = ISOdate(1910,1,1), by = "30 min", length.out = 100 ))
P <- lapply(classes, function(c){
Output1 <- list ("model" = lm(X3~ X1+X2, data = S1),"mean" = data.frame(ts = S1[S1$X4 == c, "ts"],
value = S1[S1$X4 == c, "X1"]) ,
"sum" = apply(S1[S1$X4 == c, 1:3], 2, sum))
Output2 <- list ("model" = lm(X3~ X1+X2, data = S2),
"mean" = data.frame(ts = S2[S2$X4 == c, "ts"],
value = S2[S2$X4 == c, "X1"]),
"sum" = apply(S2[S2$X4 == c, 1:3], 2, sum))
output <- list ("Output1" = Output1, "Output2" = Output2)
return(output)
})
We can get "mean" columns from P using rvest's pluck and bind them together with map_df.
purrr::map_df(P, ~rvest::pluck(.x, "mean"), .id = "Class")
# A tibble: 12 x 3
# Class Output1 Output2
# <chr> <dbl> <dbl>
# 1 F 0.0315 -0.0946
# 2 F 0.0935 0.219
# 3 F 0.155 0.172
# 4 G 0.123 0.182
# 5 G -0.114 -0.128
# 6 G -0.0654 -0.0990
# 7 M 0.111 0.0794
# 8 M -0.176 0.405
# 9 M 0.265 -0.0747
#10 O 0.0207 -0.250
#11 O -0.0407 0.0117
#12 O -0.162 -0.195
In base R, you can do :
temp <- lapply(P, function(x) sapply(x, `[[`, "mean"))
do.call(rbind, Map(cbind.data.frame, temp, Class = names(temp)))
EDIT
For the dataframe example, we can use bind_rows after pluck.
map_df(P, ~rvest::pluck(.x, "mean") %>% bind_rows(.id= "output"), .id = "Class")

R: Change objects inside loop

a <- data.frame(x = c(1, 2))
b <- data.frame(x = c(3, 4))
for (df in list(a, b)) {
print(df)
df$y <- c(5, 6)
}
Each of the data frames gets printed out correctly, but adding an additional column fails.
Extensive web search suggested something like
lapply(list(a, b), function(df){
df$y <- c(5, 6)
})
but this didn't help me.
What I'd be also very interested is, why the print statement in the for loop works, but the addition of the y column fails.
This is surprising for me.
You have to return the df's with that additional column.
Try:
lapply(list(a, b), function(df){
df$y <- c(5, 6); return(df)
})
The output is:
[[1]]
x y
1 1 5
2 2 6
[[2]]
x y
1 3 5
2 4 6
As #dash2 supposes you may want to assign those changed df's to your list of df's. So the full code could look like:
a <- data.frame(x = c(1, 2))
b <- data.frame(x = c(3, 4))
l <- list(a, b)
l <- lapply(l, function(df){ df$y <- c(5, 6); return(df) })
> a <- data.frame(x = c(1, 2))
> b <- data.frame(x = c(3, 4))
> l <- list(a = a, b = b)
> list2env(lapply(l, function(x) {x$y <- c(5, 6);x}), envir = .GlobalEnv)
<environment: R_GlobalEnv>
> a
x y
1 1 5
2 2 6
> b
x y
1 3 5
2 4 6

paste grid -- expand.grid for string concatenation

If we want to get all combinations of two vectors, we can use rep/recycling rules:
x <- 1:4
y <- 1:2
cbind(rep(x, each = length(y)), rep(y, length(x)))
# [,1] [,2]
# [1,] 1 1
# [2,] 1 2
# [3,] 2 1
# [4,] 2 2
# [5,] 3 1
# [6,] 3 2
# [7,] 4 1
# [8,] 4 2
But expand.grid is much nicer -- it handles all the repetition for us.
expand.grid(x, y)
# Var1 Var2
# 1 1 1
# 2 2 1
# 3 3 1
# 4 4 1
# 5 1 2
# 6 2 2
# 7 3 2
# 8 4 2
Is there a simple version of this for concatenating strings? Like paste.grid? I have a named object where a lot of the objects have names like x_y_z where x, y, and z vary like x and y above.
For example, suppose x can be "avg" or "median", y can be "male" or "female", and z can be "height" or "weight". How can we concisely get all 8 combinations of the three?
Using rep is a pain:
x <- c("avg", "median")
y <- c("male", "female")
z <- c("height", "weight")
paste(rep(x, each = length(y) * length(z)),
rep(rep(y, each = length(z)), length(x)),
rep(z, length(x) * length(y)), sep = "_")
And repurposing expand.grid is a bit clunky (and probably inefficient):
apply(expand.grid(x, y, z), 1, paste, collapse = "_")
Am I missing something? Is there a better way to do this?
Yes, this is what interaction does
levels(interaction(x,y,z,sep='_'))
The implementation is pretty much the same as your rep code.
Outputs:
[1] "avg_female_height" "median_female_height" "avg_male_height" "median_male_height" "avg_female_weight"
[6] "median_female_weight" "avg_male_weight" "median_male_weight"
Using data.table's CJ cross-joining function:
library(data.table)
CJ(x,y,z)[, paste(V1,V2,V3, sep = "_")]
#[1] "avg_female_height" "avg_female_weight" "avg_male_height" "avg_male_weight"
#[5] "median_female_height" "median_female_weight" "median_male_height" "median_male_weight"
Or a variation of your apply approach would be:
do.call(paste, c(expand.grid(x, y, z), sep = "_"))
#[1] "avg_male_height" "median_male_height" "avg_female_height" "median_female_height"
#[5] "avg_male_weight" "median_male_weight" "avg_female_weight" "median_female_weight"
Rudimentary (microbenchmark::microbenchmark) benchmarking shows a pretty significant speed-up by using:
library(tidyr)
library(magrittr)
df <- data.frame(x, y, z)
df %>%
complete(x, y, z) %>%
unite("combo", x, y, z, sep = "_")
A bit slower, but perhaps more straight forward and vectorized variant the apply technique:
df <- expand.grid(x, y, z)
df$combo <- paste(df$Var1, df$Var1, df$Var3, sep = "_")
Someone should chime in with a data.table approach...
Benchmarking: Small Grid (256 elements)
set.seed(21034)
x <- sample(letters, 4, TRUE)
y <- sample(letters, 4, TRUE)
z <- sample(letters, 4, TRUE)
a <- sample(letters, 4, TRUE)
library(data.table)
library(microbenchmark)
library(magrittr)
library(tidyr)
microbenchmark(times = 25L,
DT1 = CJ(x, y, z, a)[ , paste(V1, V2, V3, V4, sep = "_")],
DT2 = CJ(x, y, z, a)[ , do.call(paste, c(.SD, sep = "_"))],
app1 = do.call(paste, c(expand.grid(x, y, z, a), sep = "_")),
app2 = paste((df <- expand.grid(x, y, z, a))$Var1,
df$Var2, df$Var3, sep = "_"),
magg_outer = outer(x, y, paste, sep = "_") %>%
outer(z, paste, sep = "_") %>%
outer(a, paste, sep = "_") %>% as.vector,
magg_tidy = data.frame(x, y, z, a) %>%
complete(x, y, z, a) %>%
unite("combo", x, y, z, a, sep = "_"),
interaction = levels(interaction(x, y, z, a, sep = "_")),
original = apply(expand.grid(x, y, z, a), 1, paste, collapse = "_"),
rep = paste(rep(x, each = (ny <- length(y)) * (nz <- length(z)) *
(na <- length(a))),
rep(rep(y, each = nz * na), (nx <- length(x))),
rep(rep(z, each = na), nx * ny), sep = "_"),
Reduce = Reduce(function(x, y) paste(rep(x, each = length(y)),
rep(y, length(x)), sep = "_"),
list(x, y, z, a)))
# Unit: microseconds
# expr min lq mean median uq max neval cld
# DT1 529.578 576.6400 624.00002 589.8270 604.9845 5449.287 1000 d
# DT2 561.028 606.4220 639.94659 620.4335 636.2735 5484.514 1000 d
# app1 201.043 225.4475 240.36960 233.4795 243.7090 4244.687 1000 b
# app2 196.692 225.6130 244.33543 234.0455 243.7925 4110.605 1000 b
# magg_outer 164.352 194.1395 205.30300 204.4220 211.1990 456.122 1000 b
# magg_tidy 1872.228 2038.1560 2150.98234 2067.8770 2126.1025 21891.884 1000 f
# interaction 254.885 295.1935 313.54392 306.6680 316.8095 4196.465 1000 c
# original 852.018 935.4960 976.24388 954.5115 972.5550 4973.724 1000 e
# rep 50.737 54.1515 60.22671 55.3660 56.9220 3823.655 1000 a
# Reduce 58.395 65.3860 68.46049 66.8920 68.5640 158.184 1000 a
Benchmarking: Large Grid (1,000,000 elements)
set.seed(21034)
x <- sprintf("%03d", sample(100))
y <- sprintf("%03d", sample(100))
z <- sprintf("%02d", sample(10))
a <- sprintf("%02d", sample(10))
library(data.table)
library(microbenchmark)
library(magrittr)
library(tidyr)
microbenchmark(times = 25L,
DT1 = CJ(x, y, z, a)[ , paste(V1, V2, V3, V4, sep = "_")],
DT2 = CJ(x, y, z, a)[ , do.call(paste, c(.SD, sep = "_"))],
app1 = do.call(paste, c(expand.grid(x, y, z, a), sep = "_")),
app2 = paste((df <- expand.grid(x, y, z, a))$Var1,
df$Var2, df$Var3, sep = "_"),
magg_outer = outer(x, y, paste, sep = "_") %>%
outer(z, paste, sep = "_") %>%
outer(a, paste, sep = "_") %>% as.vector,
magg_tidy = data.frame(x, y, z, a) %>%
complete(x, y, z, a) %>%
unite("combo", x, y, z, a, sep = "_"),
interaction = levels(interaction(x, y, z, a, sep = "_")),
original = apply(expand.grid(x, y, z, a), 1, paste, collapse = "_"),
rep = paste(rep(x, each = (ny <- length(y)) * (nz <- length(z)) *
(na <- length(a))),
rep(rep(y, each = nz * na), (nx <- length(x))),
rep(rep(z, each = na), nx * ny), sep = "_"),
Reduce = Reduce(function(x, y) paste(rep(x, each = length(y)),
rep(y, length(x)), sep = "_"),
list(x, y, z, a)))
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# DT1 360.6528 467.8408 517.4579 520.1484 549.1756 861.1567 25 ab
# DT2 355.0438 504.9642 572.0732 551.9106 615.6621 927.3210 25 b
# app1 727.4513 766.3053 926.1888 910.3998 957.7610 1690.1540 25 c
# app2 472.5724 567.1121 633.5304 600.3779 634.3158 1135.7535 25 b
# magg_outer 384.0112 475.5070 600.6317 525.8936 676.7134 927.6736 25 b
# magg_tidy 520.6428 602.5028 695.5500 680.8821 748.8746 1180.1107 25 bc
# interaction 353.7317 481.4732 531.0035 518.7084 585.0872 693.5171 25 ab
# original 4965.1156 5358.8704 5914.3560 5780.6609 6074.7470 9024.6476 25 d
# rep 206.0964 236.5811 273.1093 252.8179 285.0910 455.1776 25 a
# Reduce 322.0695 390.2595 446.3948 424.9185 508.5235 621.1878 25 ab
What about using outer()? Your two examples become
x <- 1:4
y <- 1:2
as.vector(outer(x, y, paste, sep = "_"))
## [1] "1_1" "2_1" "3_1" "4_1" "1_2" "2_2" "3_2" "4_2"
library(magrittr)
x <- c("avg", "median")
y <- c("male", "female")
z <- c("height", "weight")
outer(x, y, paste, sep = "_") %>% outer(z, paste, sep = "_") %>% as.vector
## [1] "avg_male_height" "median_male_height" "avg_female_height" "median_female_height" "avg_male_weight"
## [6] "median_male_weight" "avg_female_weight" "median_female_weight"
The second example can be simplified a little with Reduce():
Reduce(function(a, b) outer(a, b, paste, sep = "_"), list(x, y, z)) %>% as.vector
It's not efficient, however. Using microbenchmark, I find that your solution using rep() is about 10 times faster.

Resources