Spread R Data.Table

data1=data.frame("Grade"=c(1,2,3,1,2,3),
"Group"=c(A,A,A,B,B,B),
"Score"=c(5,7,10,7,7,8))
data2=data.frame("Grade"=c(1,2,3),
"Combine"=c(12,14,18),
"A"=c(5,7,10),
"B"=c(7,7,8))
I have 'data1' and want to get 'data2', where Group from 'data1' is spread into the columns 'A' and 'B', and finally a 'Combine' column is added that sums 'A' and 'B'.

You tagged this with data.table, so here's a data.table approach.
library(data.table)
data1 <- as.data.table(data1)
data2 <- dcast(data1, Grade ~ Group, value.var = "Score")
data2[, Combine := A + B]
data2
Grade A B Combine
1: 1 5 7 12
2: 2 7 7 14
3: 3 10 8 18

We can use pivot_wider from tidyr
library(dplyr)
library(tidyr)
data1 %>%
pivot_wider(names_from = Group, values_from = Score) %>%
mutate(Combine = A + B)
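For the data1 above this should return (shown approximately; pivot_wider gives a tibble):
# A tibble: 3 x 4
Grade A B Combine
1 1 5 7 12
2 2 7 7 14
3 3 10 8 18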

You can do
library(tidyverse)
data1 %>%
spread(Group, Score) %>%
mutate(Combine = A+B)
Grade A B Combine
1 1 5 7 12
2 2 7 7 14
3 3 10 8 18

In base R
data2 <- data.frame("Grade" = 1:3)
grade.locations <- lapply(1:3, grep, data1$Grade)  # row indices of each grade (group A rows come first, then B)
for(i in 1:3){
data2$Combine[i] <- sum(data1[grade.locations[[i]],3])  # total score for this grade
data2$A[i] <- data1[grade.locations[[i]][1],3]          # first match: group A
data2$B[i] <- data1[grade.locations[[i]][2],3]          # second match: group B
}
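A loop-free base R alternative (a sketch, assuming data1 is still a plain data.frame; the column order differs slightly) is stats::reshape:
data2 <- reshape(data1, idvar = "Grade", timevar = "Group", direction = "wide")
names(data2) <- sub("^Score\\.", "", names(data2))  # Score.A / Score.B -> A / B
data2$Combine <- data2$A + data2$B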

Related

How can I check if an individual in a column exists in multiple groups?

I have the data below:
group_id <- c(1,1,1,2,2,2,3,3,3,3,4,4)
member_id <- c(1,2,3,4,4,3,5,6,8,10,12,12)
df <- data.frame(group_id,member_id)
> df
group_id member_id
1 1 1
2 1 2
3 1 3
4 2 4
5 2 4
6 2 3
7 3 5
8 3 6
9 3 8
10 3 10
11 4 12
12 4 12
Each member is paired with a group_id. I would like to extract which member_id exists in multiple groups. For the above case, member_id 3 exists in both group_id 1 and 2. Is there a way I can use data.table or dplyr to figure out which member_ids exist in more than one group?
You can use n_distinct in dplyr or uniqueN in data.table to count the unique group_id values for each member_id, then keep only those member_ids where this count is greater than one.
library(dplyr)
df %>%
group_by(member_id) %>%
filter(n_distinct(group_id) > 1) %>%
pull(member_id) %>% unique
#[1] 3
data.table:
library(data.table)
unique(setDT(df)[, .SD[uniqueN(group_id) > 1], member_id]$member_id)
and for completion here is a base R approach -
unique(subset(df, ave(group_id, member_id, FUN = function(x)
length(unique(x))) > 1, select = member_id))
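Another base R option, sketched with a membership table, would be to tabulate members against groups and count the groups per member:
tab <- table(df$member_id, df$group_id) > 0   # member x group membership matrix
as.numeric(names(which(rowSums(tab) > 1)))    # member_ids present in more than one group
#[1] 3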
Using dplyr
library(dplyr)
df %>%
group_by(member_id) %>%
slice(which(n_distinct(group_id) > 1)) %>%
ungroup %>%
distinct(member_id) %>%
pull(member_id)

Subsetting a dataframe according to a list of vectors

I have a list of vectors of characters, called l. For example:
set.seed(42) ## for sake of reproducibility
genes <- paste("gene",1:20,sep="")
tot=data.frame(term=sample(genes,30, replace=T), num=sample(1:10, 30, replace=T), stringsAsFactors = FALSE)
s1<-sample(genes,2, replace=F)
s2<-sample(genes,4, replace=F)
s3<-sample(genes,3, replace=F)
s4<-sample(genes,2, replace=F)
s5<-sample(genes,2, replace=F)
s6<-sample(genes,3, replace=F)
l=list(s1,s2,s3,s4,s5,s6)
By considering tot[tot$term%in%l[[1]],], I obtain:
term num
1 gene17 4
3 gene1 6
7 gene17 2
26 gene1 6
and with
df=tot[tot$term%in%l[[1]],]
sum(df$num)
I obtain the total of the second column, i.e. 18. For the other elements of the list I obtain, respectively: 32 13 19 17 29. This can be achieved by a for loop:
v<-vector()
for (j in 1:length(l)) {
df=tot[tot$term%in%l[[j]],]
v<-c(v,sum(df$num))
}
I would like to know if there is a simpler way of doing this.
It can be simplified with sapply
v2 <- sapply(l, function(j) sum(tot$num[tot$term %in% j]))
Checking with the OP's loop output:
identical(v, v2)
#[1] TRUE
Or a more compact way with map
library(purrr)
map_dbl(l, ~ sum(tot$num[tot$term %in% .x]))
Or with tidyverse
library(dplyr)
stack(setNames(l, seq_along(l))) %>%
group_by(ind) %>%
summarise(Sum = tot %>%
filter(term %in% values) %>%
pull(num) %>%
sum) %>%
pull(Sum)
Here is one tidyverse way:
library(tidyverse)
enframe(l, value = 'term') %>%
unnest(term) %>%
left_join(tot, by = 'term') %>%
group_by(name) %>%
summarise(num = sum(num, na.rm = TRUE))
# name num
#* <int> <int>
#1 1 18
#2 2 32
#3 3 13
#4 4 19
#5 5 17
#6 6 29

Use numeric value of `colnames` to recalculate columns in a data.frame using `dplyr`'s `mutate_at` or an alternative

I have a df where I want to recalculate some columns based on the numeric value of those columns' names:
library(dplyr)
df <- data.frame(year = 1:3, "10" = 0:2, "20" = 3:5)
colnames(df)[2:3] <- c("10", "20")  # data.frame() converts these names to X10/X20, so restore them
df
year 10 20
1 1 0 3
2 2 1 4
3 3 2 5
The expected output is col_name - col_values. I can generate the expected output with:
df %>% mutate(`10` = 10 - `10`) %>% mutate(`20` = 20 - `20`)
year 10 20
1 1 10 17
2 2 9 16
3 3 8 15
How can I generate the same output without explicitly hard-coding each column name's value?
I tried the following code (which works):
df %>% mutate(`10` = as.numeric(colnames(.)[2]) - `10`) %>% mutate(`20` = as.numeric(colnames(.)[3]) - `20`)
So I tried to further reduce this but could only think of:
df %>% mutate_at(vars(-year), ~ as.numeric(colnames(.)[.]))
which obviously cannot work, since the . has two meanings.
How can I achieve my expected output using mutate_at or an alternative?
Reshape, do stuff, then reshape again:
gather(df, key = "k", value = "v", -year) %>%
mutate(v = as.numeric(k) - v) %>%
spread(key = "k", value = "v")
# year 10 20
# 1 1 10 17
# 2 2 9 16
# 3 3 8 15
In base R, we can use lapply
df[-1] <- lapply(names(df[-1]), function(x) as.numeric(x) - df[,x])
# year 10 20
#1 1 10 17
#2 2 9 16
#3 3 8 15
Or mapply
df[-1] <- mapply(`-`, as.numeric(names(df[-1])), df[-1])
Here is one option with mutate_at
library(rlang)
library(tidyverse)
df %>%
mutate_at(2:3, list(~ as.numeric(as_name(quo(.)))- .))
# year 10 20
#1 1 10 17
#2 2 9 16
#3 3 8 15
Or this can also be done with deparse(substitute()):
df %>%
mutate_at(2:3, list(~ as.numeric(deparse(substitute(.))) - .))
Or using an option with map
map_dfc(names(df)[2:3], ~
df %>%
select(.x) %>%
mutate(!! .x := as.numeric(.x) - !! sym(.x))) %>%
bind_cols(df %>%
select(year), .)
Or with imap
df[-1] <- imap(df[-1], ~ as.numeric(.y) - .x)
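With current dplyr (1.0 or later, where mutate_at is superseded), the same idea can be sketched with across() and cur_column(), assuming the numeric column names as above:
library(dplyr)
df %>%
mutate(across(-year, ~ as.numeric(cur_column()) - .x))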

Dynamic Cumsum with condition

I have the following problem I am trying to solve. Here is some example data:
library(tidyverse)
library(lubridate)
date <- data.frame(date=seq(ymd('2018-01-01'),ymd('2018-02-28'), by = '1 day'))
group <- data.frame(group=c("A","B"))
subgroup <- data.frame(subgroup=c("C","D"))
DF <- merge(merge(date,group,by=NULL),subgroup,by=NULL)
DF$group_value <- apply(DF, 1, function(x) sample(8:12,1))
DF$subgroup_value <- apply(DF, 1, function(x) sample(1:5,1))
DF <- DF %>%
arrange(date,group,subgroup)
I now want to calculate the following: for every given day t and for each group/subgroup combination, calculate the number of days (counting backwards from t) until the cumulative sum of subgroup_value is equal to or greater than the group_value of day t.
I know how to do that by using some for loops and some dplyr functionality, but this is just terribly slow:
for(i in seq(1,nrow(date),1)) {
for(j in seq(1,nrow(group),1)) {
for(k in seq(1,nrow(subgroup),1)) {
tmp <- DF %>%
filter(date<=date[i] & group == group[j] & subgroup == subgroup[k]) %>%
arrange(desc(date))
tmp$helper <- 1
tmp <- tmp %>%
mutate(
cs_helper = cumsum(helper),
cs_subgroup_value = cumsum(subgroup_value),
nr_days = case_when (
cs_subgroup_value >= group_value ~ cs_helper,
TRUE ~ NA_real_)
)
#this is the final result for date[i], group[j], subgroup[k]
value <- min(tmp[,"nr_days"], na.rm=T)
}
}
}
Example
head(DF,10)
date group subgroup group_value subgroup_value result
1 2018-01-01 A C 12 2 NA
2 2018-01-02 A C 11 4 NA
3 2018-01-03 A C 11 4 NA
4 2018-01-04 A C 9 5 2
5 2018-01-05 A C 12 5 3
6 2018-01-06 A C 10 3 3
7 2018-01-07 A C 12 5 3
8 2018-01-08 A C 8 1 3
9 2018-01-09 A C 12 4 4
10 2018-01-10 A C 9 1 4
So for row 10, I need to sum the last 4 values of subgroup_value for the total to be greater than or equal to 9 (the group_value of that day).
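To double check that arithmetic with the subgroup values from rows 7-10 (newest first):
cumsum(c(1, 4, 1, 5))  # subgroup_value of days 10, 9, 8, 7
#[1]  1  5  6 11
The 4th value is the first one greater than or equal to 9, so the result is 4.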
I am sure that this code can be highly optimized with some vectorized version, but I struggle to find a good starting point for that (as you can see from the code above, I am an R newbie).
My question is: How would you approach this problem in order to vectorize it for speed optimisation?
Thanks!
Stephan
Here's an attempt: take each group/subgroup's data frame and cross join it to itself, then filter so that each day only keeps the days at or before it. This lets us calculate, for every day, all of the backward cumulative sums.
DF %>%
group_by(group, subgroup) %>%
mutate(day = row_number(), J = TRUE) %>%
nest() %>%
arrange(group, subgroup) %>%
mutate(data = map(data, function(d) {
inner_join(d, transmute(d, x = day, v = subgroup_value, J), by = "J") %>%
filter(day >= x) %>%
mutate(x = day - x + 1) %>%
arrange(day, x) %>%
group_by(date, group_value, day, subgroup_value) %>%
mutate(vv = cumsum(v),
vv = ifelse(vv >= group_value, vv, NA),
xx = ifelse(!is.na(vv), x, NA)) %>%
group_by(date, group_value, day, subgroup_value) %>%
summarise(x = min(xx, na.rm = TRUE), v = min(vv, na.rm = TRUE))
})) %>%
unnest()
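For comparison, a more direct (though still row-wise) sketch of the same logic, assuming DF is sorted by date within each group/subgroup as in the question: for each day, take the backward cumulative sum of subgroup_value and count how many days it needs to reach that day's group_value.
library(dplyr)
DF %>%
group_by(group, subgroup) %>%
mutate(result = sapply(seq_along(subgroup_value), function(i) {
cs <- cumsum(rev(subgroup_value[seq_len(i)]))  # backward cumsum ending on day i
which(cs >= group_value[i])[1]                 # days needed; NA if the threshold is never reached
})) %>%
ungroup()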

How to match values of several variables to a variable in a lookup table?

I have two datasets:
loc <- c("a","b","c","d","e")
id1 <- c(NA,9,3,4,5)
id2 <- c(2,3,7,5,6)
id3 <- c(2,NA,5,NA,7)
cost1 <- c(10,20,30,40,50)
cost2 <- c(50,20,30,30,50)
cost3 <- c(40,20,30,10,20)
dt <- data.frame(loc,id1,id2,id3,cost1,cost2,cost3)
id <- c(1,2,3,4,5,6,7)
rate <- c(0.9,0.8,0.7,0.6,0.5,0.4,0.3)
lookupd_tb <- data.frame(id,rate)
What I want to do is match the values in dt against lookupd_tb for id1, id2 and id3, and if there is a match, multiply the rate for that id by its related cost.
This is my approach:
dt <- dt %>%
left_join(lookupd_tb , by=c("id1"="id")) %>%
dplyr::mutate(cost1 = ifelse(!is.na(rate), cost1*rate, cost1)) %>%
dplyr::select(-rate)
What I am doing now works fine, but I have to repeat it 3 times, once for each variable, and I was wondering if there is a more efficient way to do this (probably using the apply family?).
I tried to join all three variables with id in my lookup table, but when rate is joined to my dt, all the costs (cost1, cost2 and cost3) get multiplied by the same rate, which I don't want.
I appreciate your help!
A base R approach would be to loop over the 'id' columns with sapply/lapply, find the matching index of each value in the 'id' column of 'lookupd_tb', use that index to pick up the corresponding 'rate', replace the NA rates with 1, and multiply the 'cost' columns by the result to update them:
nmid <- grep("id", names(dt))
nmcost <- grep("cost", names(dt))
dt[nmcost] <- dt[nmcost]*sapply(dt[nmid], function(x) {
x1 <- lookupd_tb$rate[match(x, lookupd_tb$id)]
replace(x1, is.na(x1), 1) })
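A quick check on the first row (loc 'a'): id1 is NA so cost1 stays at 10, while id2 and id3 are both 2 (rate 0.8), so cost2 and cost3 should become 40 and 32.
dt[1, c("cost1", "cost2", "cost3")]
#  cost1 cost2 cost3
#1    10    40    32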
Or using the tidyverse, we can loop through the matching sets of columns, i.e. 'id' and 'cost', with purrr::map2 and then take the same approach as above. The only difference is that here we create new columns instead of updating the 'cost' columns:
library(tidyverse)
dt %>%
select(nmid) %>%
map2_df(., dt %>%
select(nmcost), ~
.x %>%
match(., lookupd_tb$id) %>%
lookupd_tb$rate[.] %>%
replace(., is.na(.),1) * .y ) %>%
rename_all(~ paste0("costnew", seq_along(.))) %>%
bind_cols(dt, .)
In the tidyverse, you can also try an alternative approach by transforming the data from wide to long:
library(tidyverse)
dt %>%
# data transformation to long
gather(k, v, -loc) %>%
mutate(ID=paste0("costnew", str_extract(k, "[:digit:]")),
k=str_remove(k, "[:digit:]")) %>%
spread(k, v) %>%
# left_join and calculations of new costs
left_join(lookupd_tb , by="id") %>%
mutate(cost_new=ifelse(is.na(rate), cost,rate*cost)) %>%
# clean up and expected output
select(loc, ID, cost_new) %>%
spread(ID, cost_new) %>%
left_join(dt,., by="loc") # or %>% bind_cols(dt, .)
loc id1 id2 id3 cost1 cost2 cost3 costnew1 costnew2 costnew3
1 a NA 2 2 10 50 40 10 40 32
2 b 9 3 NA 20 20 20 20 14 20
3 c 3 7 5 30 30 30 21 9 15
4 d 4 5 NA 40 30 10 24 15 10
5 e 5 6 7 50 50 20 25 20 6
The idea is to bring the data into a suitable long format for the left_join, using a gather & spread combination with new index columns k and ID. After the calculation we transform back to the expected output with a second spread and bind the result to dt.
