I have this reproducible dataframe:
df <- data.frame(ID = c("A", "A", "B", "B", "B","C", "C", "D"), cost = c("0.5", "0.4", "0.7", "0.8", "0.5", "1.3", "1.3", "2.6"))
I'm trying to group by ID, test whether there are differences in the cost column, and update a new column called Testdiff.
Intermediate Output
ID cost Testdiff
1 A 0.5 Y
2 A 0.4 Y
3 B 0.7 Y
4 B 0.8 Y
5 B 0.5 Y
6 C 1.3 N
7 C 1.3 N
8 D 2.6 N
I'm looking at using a dplyr approach to do this, but I'm unsure if match is the correct function.
df %>% group_by(ID) %>% mutate(Testdiff = ifelse(match(cost) == T, "Y", "N"))
Once that is completed, I want to keep the 1st row of each unique ID, giving me this output:
ID cost Testdiff
1 A 0.5 Y
2 B 0.7 Y
3 C 1.3 N
4 D 2.6 N
We could use n_distinct and then slice
library(dplyr)
df %>%
  group_by(ID) %>%
  mutate(Testdiff = n_distinct(cost) > 1) %>%
  slice(1)
# ID cost Testdiff
# <fct> <fct> <lgl>
#1 A 0.5 TRUE
#2 B 0.7 TRUE
#3 C 1.3 FALSE
#4 D 2.6 FALSE
If you want the output to be "Y"/"N" instead of TRUE/FALSE:
df %>%
  group_by(ID) %>%
  mutate(Testdiff = ifelse(n_distinct(cost) > 1, "Y", "N")) %>%
  slice(1)
We could use ave and aggregate to solve it using base R
df$Testdiff <- ifelse(with(df, ave(cost, ID, FUN = function(x)
length(unique(x)))) > 1, "Y", "N")
aggregate(.~ID, df, head, n = 1)
# ID cost Testdiff
#1 A 0.5 Y
#2 B 0.7 Y
#3 C 1.3 N
#4 D 2.6 N
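If all you need afterwards is the first row per ID, duplicated() is a lighter-weight base R alternative to the aggregate() call; a minimal sketch, run after Testdiff has been added:
# keep only the first occurrence of each ID
df[!duplicated(df$ID), ]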
Since we have dplyr and base R already, why not add in data.table:
library(data.table)
setDT(df)[, .(cost = cost[1], testdiff = uniqueN(cost) > 1), by = ID]
ID cost testdiff
1: A 0.5 TRUE
2: B 0.7 TRUE
3: C 1.3 FALSE
4: D 2.6 FALSE
A different tidyverse possibility could be:
df %>%
  group_by(ID) %>%
  mutate(Testdiff = ifelse(all(cost == first(cost)), "N", "Y")) %>%
  filter(row_number() == 1)
ID cost Testdiff
<fct> <fct> <chr>
1 A 0.5 Y
2 B 0.7 Y
3 C 1.3 N
4 D 2.6 N
Or:
df %>%
  group_by(ID) %>%
  mutate(Testdiff = ifelse(all(cost == first(cost)), "N", "Y")) %>%
  top_n(1, wt = desc(row_number()))
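Along the same lines, distinct() with .keep_all = TRUE keeps the first row per ID without touching row numbers; a minimal sketch:
df %>%
  group_by(ID) %>%
  mutate(Testdiff = ifelse(n_distinct(cost) > 1, "Y", "N")) %>%
  ungroup() %>%
  distinct(ID, .keep_all = TRUE)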
Given the dataframe below:
A B C
A 0 3 1.1
B 3.1 0 .9
C 2 2.1 0
And a key of values below
Label Num
w 0
x 1
y 2
z 3
how do I generate the output below?
A B C
A w z x
B z w x
C y y w
data.table approach
library(data.table)
#make them data.tables, keep rownames
setDT(df, keep.rownames = "id")
setDT(df2)
# melt to long format
df.melt <- melt(df, id.vars = "id", variable.name = "id2", variable.factor = FALSE)
# perform rolling join
df.melt[, new_value := df2[df.melt, Label, on = .(Num = value), roll = Inf]]
# cast to wide again
dcast(df.melt, id ~ id2, value.var = "new_value")
id A B C
1: A w z x
2: B z w w
3: C y y w
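Note that roll = Inf carries the last observation forward, so 0.9 is matched to Num = 0 ("w") rather than the "x" shown in the desired output. If the intent is to match each value to the closest Num, roll = "nearest" should reproduce the desired table exactly:
# match each value to the nearest Num instead of rolling forward,
# so 0.9 maps to 1 ("x")
df.melt[, new_value := df2[df.melt, Label, on = .(Num = value), roll = "nearest"]]
dcast(df.melt, id ~ id2, value.var = "new_value")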
sample data
df <- read.table(text = " A B C
A 0 3 1.1
B 3.1 0 .9
C 2 2.1 0", header = TRUE)
df2 <- read.table(text = "Label Num
w 0
x 1
y 2
z 3", header = TRUE)
dplyr approach:
library(dplyr)
library(tidyr)
df <- tibble(rownames = c("A", "B", "C"),
             A = c(0, 3.1, 2),
             B = c(3, 0, 2.1),
             C = c(1.1, 0.9, 0))
df2 <- tibble(label = c("w", "x", "y", "z"),
              values = c(0, 1, 2, 3))
df %>%
  mutate(across(A:C, ~ round(.x))) %>%
  pivot_longer(-1, names_to = "colnames", values_to = "values") %>%
  left_join(df2, by = "values") %>%
  select(-values) %>%
  pivot_wider(names_from = "colnames", values_from = "label")
# A tibble: 3 x 4
rownames A B C
<chr> <chr> <chr> <chr>
1 A w z x
2 B z w x
3 C y y w
First use across() to round all values, then pivot_longer() to get a "long" format where you can left_join() the two tables by values. select() removes the column containing the numbers, and a final pivot_wider() returns to the original format.
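If you prefer to skip the long format entirely, the rounding and lookup can be collapsed into one mutate(); a minimal sketch, assuming every rounded value has a match in df2$values:
library(dplyr)
# match() returns each rounded cell's row position in df2,
# which then indexes the label column
df %>% mutate(across(A:C, ~ df2$label[match(round(.x), df2$values)]))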
I am trying to fill a new column with appropriate values from a list using dplyr. I tried to come up with a simple reproducible example, which can be found below. In short, I want to add a column "Param" to a dataframe, based on the values of the existing columns. The matching values are found in a separate list. I've tried functions such as ifelse() and switch() but I cannot make it work. Any tips on how this can be achieved?
Thank you in advance!
library(dplyr)
# Dataframe to start with
df <- as.data.frame(matrix(data = c(rep("A", times = 3),
                                    rep("B", times = 3),
                                    rep(1:3, times = 2)), ncol = 2))
colnames(df) <- c("Method", "Type")
df
#> Method Type
#> 1 A 1
#> 2 A 2
#> 3 A 3
#> 4 B 1
#> 5 B 2
#> 6 B 3
# Desired dataframe
desired <- cbind(df, Param = c(0.9, 0.8, 0.7, 0.6, 0.5, 0.4))
desired
#> Method Type Param
#> 1 A 1 0.9
#> 2 A 2 0.8
#> 3 A 3 0.7
#> 4 B 1 0.6
#> 5 B 2 0.5
#> 6 B 3 0.4
# Failed attempt
param <- list("A" = c("1" = 0.9, "2" = 0.8, "3" = 0.7),
              "B" = c("1" = 0.6, "2" = 0.5, "3" = 0.4))
param
#> $A
#> 1 2 3
#> 0.9 0.8 0.7
#>
#> $B
#> 1 2 3
#> 0.6 0.5 0.4
df %>%
mutate(Param = ifelse(.$Method == "A", param$A[[.$Type]],
ifelse(.$Method == "B", param$B[[.$Type]], NA)))
#> Error: Problem with `mutate()` column `Param`.
#> ℹ `Param = ifelse(...)`.
#> x attempt to select more than one element in vectorIndex
You can unlist your list and just add it to your df.
df$Param <- unlist(param)
Method Type Param
1 A 1 0.9
2 A 2 0.8
3 A 3 0.7
4 B 1 0.6
5 B 2 0.5
6 B 3 0.4
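A caveat: unlist(param) only works here because the rows of df happen to be in the same order as the flattened list. A small sketch of an order-independent lookup that indexes the nested list directly (as.character() guards against factor columns):
# outer name = Method, inner name = Type
df$Param <- mapply(function(m, t) param[[as.character(m)]][[as.character(t)]],
                   df$Method, df$Type)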
As mentioned by @dario, including the matching data in a dataframe would be easier.
library(dplyr)
library(tidyr)
df %>%
nest(data = Type) %>%
left_join(stack(param) %>% nest(data1 = values), by = c('Method' = 'ind')) %>%
unnest(c(data, data1))
# Method Type values
# <chr> <chr> <dbl>
#1 A 1 0.9
#2 A 2 0.8
#3 A 3 0.7
#4 B 1 0.6
#5 B 2 0.5
#6 B 3 0.4
Sure, this could be cleaner, but it will get the job done. Option 1:
df %>%
  mutate(Param = unlist(param)[match(
    paste0(Method, Type),
    names(do.call(c, lapply(param, names)))
  )])
Option 2 (cleaner version):
df %>%
  type.convert(as.is = TRUE) %>%
  left_join(
    do.call(cbind, param) %>%
      data.frame() %>%
      mutate(Type = as.integer(row.names(.))) %>%
      pivot_longer(!Type, names_to = "Method", values_to = "Param"),
    by = c("Type", "Method")
  )
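Another way is to flatten param into a small lookup table once and left-join it; a minimal sketch, assuming the outer names hold Method and the inner names hold Type:
library(dplyr)
# build a Method/Type/Param lookup from the nested list
lookup <- do.call(rbind, lapply(names(param), function(m)
  data.frame(Method = m, Type = names(param[[m]]), Param = unname(param[[m]]))))
df %>% left_join(lookup, by = c("Method", "Type"))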
I'm using the group by function on a dataset in R, but the target groups share some IDs. Here is the sample dataset:
ID Var1
A 1
A 3
B 2
C 3
C 1
D 2
With a traditional group-by on each ID, I can do
DT<- data.table(dataset )
DT[,sum(Var1),by = ID]
and get the result:
ID V1
A 4
B 2
C 4
D 2
However, I have to group the IDs as A+B, B+C, and D
(say F = A+B and G = B+C)
and get the target result dataset below:
ID V1
F 6
G 6
D 2
If I use a recoding technique on ID, the shared B can only be assigned to one group, not counted in both. Does anyone have a solution?
Many thanks!
library(dplyr)
library(tidyr)
df <- df %>% mutate(F = ifelse(ID %in% c("A", "B"), 1, 0),
                    G = ifelse(ID %in% c("B", "C"), 1, 0),
                    D = ifelse(ID == "D", 1, 0))
df %>%
  gather(var, val, F:D) %>%
  filter(val == 1) %>%
  group_by(var) %>%
  summarise(V1 = sum(Var1))
# # A tibble: 3 x 2
# var V1
# <chr> <dbl>
# 1 D 2
# 2 F 6
# 3 G 6
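If the overlapping group definitions live in a named list, a direct base R sketch avoids the indicator columns entirely (the list below is an assumption about how the groups are specified):
# B is deliberately listed in both F and G
groups <- list(F = c("A", "B"), G = c("B", "C"), D = "D")
sapply(groups, function(ids) sum(df$Var1[df$ID %in% ids]))
# F G D
# 6 6 2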
I have a table df that looks like this:
a <- c(10,20, 20, 20, 30)
b <- c("u", "u", "u", "r", "r")
c <- c("a", "a", "b", "b", "b")
df <- data.frame(a,b,c)
I would like to create a new table that contains the mean of column a, grouped by variable c, along with a column counting the occurrences of each b type within each group of c.
I would therefore like the result table to look like df2:
a_m <- c(15, 23.3)
c <- c("a", "b")
counts_b <- c("2 u", "1 u, 2 r")
df2 <- data.frame(a_m, c, counts_b)
What I have so far is:
df2 <- df %>% group_by(c) %>% summarise(a_m = mean(a, na.rm = TRUE))
I do not know how to add the column counts_b in the example df2.
Here's a way using a little table magic:
df %>%
group_by(c) %>%
summarise(a_mean = mean(a),
b_list = paste(names(table(b)), table(b), collapse = ', '))
# A tibble: 2 x 3
c a_mean b_list
<fct> <dbl> <chr>
1 a 15.0 r 0, u 2
2 b 23.3 r 2, u 1
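One caveat: because b is a factor, table(b) keeps zero-count levels inside each group, which is where the r 0 for group a comes from. Dropping unused levels first gives only the observed counts; a small sketch:
df %>%
  group_by(c) %>%
  summarise(a_mean = mean(a),
            b_list = paste(names(table(droplevels(b))), table(droplevels(b)),
                           collapse = ', '))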
Here is another solution using reshape2. The output format may be more convenient to work with, each value of b has its own column with the number of occurrences.
library(reshape2)
out1 <- dcast(df, c ~ b, value.var = "c", fun.aggregate = length)
c r u
1 a 0 2
2 b 2 1
out2 <- df %>% group_by(c) %>% summarise(a_m = mean(a))
# A tibble: 2 x 2
c a_m
<fctr> <dbl>
1 a 15.00000
2 b 23.33333
df2 <- merge(out1, out2, by = "c")
c r u a_m
1 a 0 2 15.00000
2 b 2 1 23.33333
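For completeness, the same wide count table plus the mean can be built with tidyverse tools; a sketch using count() and pivot_wider():
library(dplyr)
library(tidyr)
df %>%
  count(c, b) %>%
  pivot_wider(names_from = b, values_from = n, values_fill = 0) %>%
  left_join(df %>% group_by(c) %>% summarise(a_m = mean(a)), by = "c")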
I want to join two tibbles by a range or a virtual column, but it seems the by parameter only allows a chr or vector(chr) of existing column names.
In my example I have a tibble d with a column value, and a tibble r with a from and a to column.
d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])
> d
# A tibble: 26 x 1
value
<dbl>
1 1.0
2 1.2
3 1.4
4 1.6
5 1.8
6 2.0
7 2.2
8 2.4
9 2.6
10 2.8
# ... with 16 more rows
> r
# A tibble: 6 x 3
from to class
<int> <dbl> <chr>
1 1 2 A
2 2 3 B
3 3 4 C
4 4 5 D
5 5 6 E
6 6 Inf F
Now I want to join the value column in d to the rows of r whose from/to range contains it:
d %>% inner_join(r, by = "value between from and to") # >= and <
I can't find a way to do this, so I decided to join the floor of value in d with the from column in r:
d %>% inner_join(r, by = c("floor(value)" = "from"))
Of course I can create a helper column to solve that:
d %>%
mutate(join_value = floor(value)) %>%
inner_join(r, by = c("join_value" = "from")) %>%
select(value, class)
# A tibble: 26 x 2
value class
<dbl> <chr>
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 B
7 2.2 B
8 2.4 B
9 2.6 B
10 2.8 B
# ... with 16 more rows
But isn't there a more comfortable way?
Thanks!
I don't think inequality joins are implemented in dplyr yet, or ever will be (see this discussion on Join on inequality constraints), but this is a good situation to use an SQL join:
library(tibble)
library(sqldf)
as_tibble(sqldf("select d.value, r.class from d
                 join r on d.value >= r.'from' and
                           d.value < r.'to'"))
Alternatively, if you want to integrate the join into your dplyr chain, you can use fuzzyjoin::fuzzy_join:
library(dplyr)
library(fuzzyjoin)
d %>%
fuzzy_join(r, by = c("value" = "from", "value" = "to"),
match_fun = list(`>=`, `<`)) %>%
select(value, class)
Result:
# A tibble: 31 x 2
value class
<dbl> <chr>
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 A
7 2.0 B
8 2.2 B
9 2.4 B
10 2.6 B
# ... with 21 more rows
Notice I added single quotes around from and to since those are reserved words for the SQL language.
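For what it's worth, inequality joins did eventually land in dplyr itself: version 1.1.0 added join_by(), which accepts inequality conditions directly. A minimal sketch:
library(dplyr)  # requires dplyr >= 1.1.0
d %>%
  inner_join(r, join_by(value >= from, value < to)) %>%
  select(value, class)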
OK, thanks for the advice, this was pretty interesting. I finally wrote a range_join function (inspired by @ycw's code) and compared all the described solutions by runtime.
I like fuzzy_join, but with only 50k rows in d it needs more than 40 sec. That's too slow.
Here are the results with 5k rows in d:
library(dplyr)
library(fuzzyjoin)
library(sqldf)
#join by range by #WiWeber
range_join <- function(x, y, value, left, right){
  x_result <- tibble()
  for (y_ in split(y, 1:nrow(y)))
    x_result <- x_result %>% bind_rows(x[x[[value]] >= y_[[left]] & x[[value]] < y_[[right]], ] %>% cbind(y_))
  return(x_result)
}
#dynamic join by #ycw
dynamic_join <- function(d, r){
  d$type <- NA_character_
  for (r_ in split(r, r$type))
    d <- d %>% mutate(type = ifelse(value >= r_$from & value < r_$to, r_$type, type))
  return(d)
}
d <- tibble(value = seq(1,6, by = 0.001), join = TRUE)
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), type = LETTERS[seq(1,6)], join = TRUE)
# #useR sqldf - fast and intuitive but extra library with horrible code
start <- Sys.time()
d2 <- tbl_df(sqldf("select d.value, r.type from d
join r on d.value >= r.'from' and
d.value < r.'to'"))
Sys.time() - start
# #useR fuzzy_join .... very cool but veeeeeeeeeeeeeeeery slow
start <- Sys.time()
d2 <- d %>%
fuzzy_join(r, by = c("value" = "from", "value" = "to"), match_fun = list(`>=`, `<`)) %>%
select(value, type)
Sys.time() - start
# #jonathande4 cut pretty fast
start <- Sys.time()
d2 <- d
d2$type <- cut(d$value, unique(c(r$from, r$to)), r$type, right = FALSE)
Sys.time() - start
# #WiWeber floor
start <- Sys.time()
d2 <- d %>%
mutate(join_value = floor(value)) %>%
inner_join(r, by = c("join_value" = "from")) %>%
select(value, type)
Sys.time() - start
# #WiWeber cross join - filter
start <- Sys.time()
d2 <- d %>%
inner_join(r, by = "join") %>%
filter(value >= from, value < to) %>%
select(value, type)
Sys.time() - start
# #hardik-gupta sapply
start <- Sys.time()
d2 <- d %>%
mutate(
type = unlist(sapply(value, function (x) r[which(x >= r$from & x < r$to), "type"]))
) %>%
select(value, type)
Sys.time() - start
# #ycw re-dynamic join
start <- Sys.time()
d2 <- d %>% dynamic_join(r)
Sys.time() - start
# #WiWeber range_join
start <- Sys.time()
d2 <- d %>%
range_join(r, "value", "from", "to") %>%
select(value, type)
Sys.time() - start
Results:
# #useR sqldf - fast and intuitive but extra library with horrible code
Time difference of 0.06221986 secs
# #useR fuzzy_join .... very cool but veeeeeeeeeeeeeeeery slow
Time difference of 4.765595 secs
# #jonathande4 cut pretty fast
Time difference of 0.004637003 secs
# #WiWeber floor
Time difference of 0.02223396 secs
# #WiWeber cross join - filter
Time difference of 0.0201931 secs
# #hardik-gupta sapply
Time difference of 5.166633 secs
# #ycw dynamic join
Time difference of 0.03124094 secs
# #WiWeber range_join
Time difference of 0.02691698 secs
You can use the cut function to create a "class" column in d and then use a left join:
d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])
d[["class"]] <- cut(d[["value"]], c(0,2,3,4,5,6,Inf), c('A',"B", "C", "D", "E", "F"), right = FALSE)
d <- left_join(d, r)
To get the right buckets, you just need to work with the cut function to get what you want.
We can use sapply for this
library(tibble)
d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])
d <- cbind(d, data.frame(class = unlist(sapply(d$value, function(x) r[which(x >= r$from & x < r$to), "class"]))))
d
value class
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 B
7 2.2 B
8 2.4 B
9 2.6 B
10 2.8 B
11 3.0 C
12 3.2 C
13 3.4 C
14 3.6 C
15 3.8 C
16 4.0 D
17 4.2 D
18 4.4 D
19 4.6 D
20 4.8 D
21 5.0 E
22 5.2 E
23 5.4 E
24 5.6 E
25 5.8 E
26 6.0 F
We can use mutate and case_when from dplyr.
library(dplyr)
d2 <- d %>%
mutate(class = case_when(
value >= 1 & value < 2 ~ "A",
value >= 2 & value < 3 ~ "B",
value >= 3 & value < 4 ~ "C",
value >= 4 & value < 5 ~ "D",
value >= 5 & value < 6 ~ "E",
value >= 6 ~ "F"
))
d2
# A tibble: 26 x 2
value class
<dbl> <chr>
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 B
7 2.2 B
8 2.4 B
9 2.6 B
10 2.8 B
# ... with 16 more rows
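If you'd rather not hard-code each range, findInterval() can derive the class directly from r; a minimal sketch, assuming r is sorted by from and the intervals are right-open:
library(dplyr)
# findInterval() returns, for each value, the index of the last from
# that is <= value, which then indexes r$class
d2 <- d %>% mutate(class = r$class[findInterval(value, r$from)])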
Update
Here is a workaround by defining a function for this task.
d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])
library(dplyr)
# Define a function for dynamic join
dynamic_join <- function(d, r){
  if (!("class" %in% colnames(d))){
    d[["class"]] <- NA_character_
  }
  d <- d %>%
    mutate(class = ifelse(value >= r$from & value < r$to, r$class, class))
  return(d)
}
re_dynamic_join <- function(d, r){
  r_list <- split(r, r$class)
  for (i in seq_along(r_list)){
    d <- dynamic_join(d, r_list[[i]])
  }
  return(d)
}
# Apply the function
d2 <- d %>% re_dynamic_join(r)
d2
# A tibble: 26 x 2
value class
<dbl> <chr>
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 B
7 2.2 B
8 2.4 B
9 2.6 B
10 2.8 B
# ... with 16 more rows
I really liked @WiWeber's range_join function, but it gives an error if a record is not within any range. Here's a modification:
library(dplyr)
d <- tibble(value = c(seq(1,4, by = 0.2),9))
r <- tibble(from = seq(1,5), to = c(seq(2,5),8), class = LETTERS[seq(1,5)])
range_join <- function(x, y, value, left, right){
  all_matches <- tibble()
  x <- as.data.frame(x)
  y <- as.data.frame(y)
  x$index <- x[, value]
  for (i in 1:nrow(y)){
    matches <- x %>% filter(index >= y[i, left] & index <= y[i, right])
    if (nrow(matches) > 0){
      all_matches <- all_matches %>% bind_rows(matches %>% cbind(y[i, ]))
    }
  }
  all_matches <- all_matches %>% select(-index)
  return(all_matches)
}
data <- d %>%
range_join(r, "value", "from", "to")
data
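One caveat with this version: it filters with index <= y[i, right], so a value sitting exactly on a range boundary (e.g. 2.0 here) matches two ranges and appears twice in the output. Keep the original strict < on the right edge if you want half-open intervals.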