Spread one column in multiple columns - r

I have one column "m" that contains multiple values associated with one subject (ID). I need to spread the values in this column in 5 different columns to obtain the second table that I provided below. I also need to associate names to those columns.
f <- read.table(header = TRUE, text = "
Scale ID m
1 1 1 0.4089795
2 1 1 0.001041055
3 1 1 0.1843616
4 1 1 0.03398921
5 1 1 FALSE
6 3 1 0.1179424
7 3 1 0.3569155
8 3 1 0.2006204
9 3 1 0.04024855
10 3 1 FALSE
")
Here's what the output should look like
ID Scale x y z a b
1 1 1 0.4089795 0.001041055 0.1843616 0.03398921 FALSE
2 1 3 0.1179424 0.356915500 0.2006204 0.04024855 FALSE
Thanks for any help!

df <- read.table(header = TRUE, text = "
Scale ID m
1 1 1 0.4089795
2 1 1 0.001041055
3 1 1 0.1843616
4 1 1 0.03398921
5 1 1 FALSE
6 3 1 0.1179424
7 3 1 0.3569155
8 3 1 0.2006204
9 3 1 0.04024855
10 3 1 FALSE
")
library(tidyverse)
df %>%
group_by(Scale, ID) %>% # for each combination of Scale and ID
mutate(names = c("x","y","z","a","b")) %>% # add column names
ungroup() %>% # forget the grouping
spread(-Scale, -ID) %>% # reshape data
select(Scale, ID, x, y, z, a, b) # order columns
# # A tibble: 2 x 7
# Scale ID x y z a b
# <int> <int> <fct> <fct> <fct> <fct> <fct>
# 1 1 1 0.4089795 0.001041055 0.1843616 0.03398921 FALSE
# 2 3 1 0.1179424 0.3569155 0.2006204 0.04024855 FALSE

Related

How to change iteratively the values of a column in R without a loop?

I would like to change iteratively the values of a column (value2 in the example). value2 at time i is conditioned by value1 and updated value2 at time i and i-1.
Time values are stocked in ascending order.
Treatment is done separetely for each value of the group colum.
But as describe on my example, I can't succeed to update value2 with accumulate2 (purrr package).
Maybe someone could give me some advices to do this.
Thank you.
input <- data.frame(group=c(1,1,1,2,2,2,2),
time=c(1,2,3,1,2,3,4),
value1=c(4,2,2,3,3,3,3),
value2=c(4,2,1,3,3,1,1))
input<-arrange(input, group,time)
my_function <- function(df) {
df %>%
as_tibble() %>%
group_by(group) %>%
mutate(value2=purrr::accumulate2(.x = value2, .y = ((value1==lag(value1))
& (lag(value2)==value1)
& (value1!=value2))[-1],
.f = function(.i_1, .i, .y) {
if (.y) {.i_1} else {.i}
}) %>% unlist())
}
> input
group time value1 value2
1 1 1 4 4
2 1 2 2 2
3 1 3 2 1
4 2 1 3 3
5 2 2 3 3
6 2 3 3 1
7 2 4 3 1
output <- my_function(input)
> output
group time value1 value2
1 1 1 4 4
2 1 2 2 2
3 1 3 2 2 -> data change (OK)
4 2 1 3 3
5 2 2 3 3
6 2 3 3 3 -> data change (OK)
7 2 4 3 1 -> no data change / should be replaced by 3
It seems that your problem lies in your algorithm. Unfortunately, as you didn't explain it here, we cannot help you in that matter.
purrr::accumulate2 can be hard to use, so I advise you to split your code as much as possible. This will make your code much more readable, and will make debugging and finding errors much easier.
For instance, consider this:
library(tidyverse)
input <- data.frame(group=c(1,1,1,2,2,2,2),
time=c(1,2,3,1,2,3,4),
value1=c(4,2,2,3,3,3,3),
value2=c(4,2,1,3,3,1,1))
input <- arrange(input, group,time)
#document your functions when it
#' #param .i_1 this is ...
#' #param .i this is ...
#' #param .y this is ...
my_accu_function = function(.i_1, .i, .y) {
if(.y) {.i_1} else {.i}
}
my_function <- function(df) {
df %>%
as_tibble() %>%
group_by(group) %>%
mutate(
cond = value1==lag(value1) &
lag(value2)==value1 &
value1!=value2,
value2_update=purrr::accumulate2(.x = value2,
.y = cond[-1],
.f = my_accu_function) %>% unlist()
)
}
input
#> group time value1 value2
#> 1 1 1 4 4
#> 2 1 2 2 2
#> 3 1 3 2 1
#> 4 2 1 3 3
#> 5 2 2 3 3
#> 6 2 3 3 1
#> 7 2 4 3 1
output = my_function(input)
output
#> # A tibble: 7 x 6
#> # Groups: group [2]
#> group time value1 value2 cond value2_update
#> <dbl> <dbl> <dbl> <dbl> <lgl> <dbl>
#> 1 1 1 4 4 FALSE 4
#> 2 1 2 2 2 FALSE 2
#> 3 1 3 2 1 TRUE 2
#> 4 2 1 3 3 FALSE 3
#> 5 2 2 3 3 FALSE 3
#> 6 2 3 3 1 TRUE 3
#> 7 2 4 3 1 FALSE 1
stopifnot(output$value2_update[7]==3)
#> Error: output$value2_update[7] == 3 is not TRUE
Created on 2022-05-11 by the reprex package (v2.0.1)
You can see that cond is FALSE in the end, so accumulate2 did its job putting the current value 1 and not the previous value 3.
If you explain your algorithm to us, maybe we can help you with setting the proper condition cond so that you get the right output.

How should a function be applied by row on a dataframe to generate a new or expanded dataframe in r

I am trying to expand an existing dataset, which currently looks like this:
df <- tibble(
site = letters[1:3],
years = rep(4, 3),
tr = c(3, 6, 4)
)
tr is the total number of replicates for each site/year combination. I simply want to add in the replicates and later the response variable for each replicate. This was easy for a single site/year combination using the following function:
f <- function(site=NULL, years=NULL, t=NULL){
df <- tibble(
site = rep(site, each = t, times= years),
tr = rep(1:t, times = years),
year = rep(1:years, each = t)
)
df
}
# For one site:
f(site='a', years=4, t=3)
# Producing this:
# # A tibble: 12 x 3
# site tr year
# <chr> <int> <int>
# 1 a 1 1
# 2 a 2 1
# 3 a 3 1
# 4 a 1 2
# 5 a 2 2
# 6 a 3 2
# 7 a 1 3
# 8 a 2 3
# 9 a 3 3
# 10 a 1 4
# 11 a 2 4
# 12 a 3 4
How can the function be applied to each row of the input dataframe to produce the final dataframe? One of the apply functions in base r or the pmap_df() in the purrr package would seem ideal, but being unfamiliar with how these functions work, all my efforts have only produced errors.
If we want to apply the same function, use pmap
library(purrr)
pmap_dfr(df, ~ f(..1, ..2, ..3))
# A tibble: 52 x 3
# site tr year
# * <chr> <int> <int>
# 1 a 1 1
# 2 a 2 1
# 3 a 3 1
# 4 a 1 2
# 5 a 2 2
# 6 a 3 2
# 7 a 1 3
# 8 a 2 3
# 9 a 3 3
#10 a 1 4
# … with 42 more rows
another option is condense from the devel version of dplyr
library(tidyr)
df %>%
group_by(rn = row_number()) %>%
condense(out = f(site, years, tr)) %>%
unnest(c(out))
Or in base R, we can also use do.call with Map
do.call(rbind, do.call(Map, c(f, unname(as.data.frame(df)))))
well in base R, you could do:
do.call(rbind,do.call(Vectorize(f,SIMPLIFY = FALSE),unname(df)))
# A tibble: 52 x 3
site tr year
* <chr> <int> <int>
1 a 1 1
2 a 2 1
3 a 3 1
4 a 1 2
5 a 2 2
6 a 3 2
7 a 1 3
8 a 2 3
9 a 3 3
10 a 1 4
# ... with 42 more rows
do.call(rbind, lapply(split(df, df$site), function(x){
with(x, data.frame(site,
years = rep(sequence(years), each = tr),
tr = rep(sequence(tr), years)))
}))
We can use Map to apply f to every value of site, years and tr.
do.call(rbind, Map(f, df$site, df$years, df$tr))
# A tibble: 52 x 3
# site tr year
# * <chr> <int> <int>
# 1 a 1 1
# 2 a 2 1
# 3 a 3 1
# 4 a 1 2
# 5 a 2 2
# 6 a 3 2
# 7 a 1 3
# 8 a 2 3
# 9 a 3 3
#10 a 1 4
# … with 42 more rows
Akrun's answer worked well for me, so I modified it to make the function to be applied to each row of the dataframe a little more explicit:
df1 <- pmap_df(df, function(site, years, tr){
site = rep(site, each = tr, times=years)
year = rep(1:years, each = tr)
tr = rep(1:tr, times=years)
return(tibble(site, year, tr))
})

Add original values for columns after group by

For the dataframe below I want to add the original values for Var_x after a group_by on ID and event and a max() on quest, but I cannot get my code right. Any suggestions? By the way, in my original dataframe more than 1 column needs to be added.
df <- data.frame(ID = c(1,1,1,1,1,1,2,2,2,3,3,3),
quest = c(1,1,2,2,3,3,1,2,3,1,2,3),
event = c("A","B","A","B","A",NA,"C","D","C","D","D",NA),
VAR_X = c(2,4,3,6,3,NA,6,4,5,7,5,NA))
Code:
df %>%
group_by(ID,event) %>%
summarise(quest = max(quest))
Desired output:
ID quest event VAR_X
1 1 2 B 6
2 1 3 A 3
3 2 2 D 4
4 2 3 C 5
5 3 2 D 5
Start by omiting the na values and in the end do an inner_join with the original data set.
df %>%
na.omit() %>%
group_by(ID, event) %>%
summarise(quest = max(quest)) %>%
inner_join(df, by = c("ID", "event", "quest"))
## A tibble: 5 x 4
## Groups: ID [3]
# ID event quest VAR_X
# <dbl> <fct> <dbl> <dbl>
#1 1 A 3 3
#2 1 B 2 6
#3 2 C 3 5
#4 2 D 2 4
#5 3 D 2 5
df %>%
drop_na() %>% # remove if necessary ..
group_by(ID, event) %>%
filter(quest == max(quest)) %>%
ungroup()
# A tibble: 5 x 4
# ID quest event VAR_X
#<dbl> <dbl> <chr> <dbl>
# 1 1 2 B 6
# 2 1 3 A 3
# 3 2 2 D 4
# 4 2 3 C 5
# 5 3 2 D 5

bind_rows to each group of tibble

Consider the following two tibbles:
library(tidyverse)
a <- tibble(time = c(-1, 0), value = c(100, 200))
b <- tibble(id = rep(letters[1:2], each = 3), time = rep(1:3, 2), value = 1:6)
So a and b have the same columns and b has an additional column called id.
I want to do the following: group b by id and then add tibble a on top of each group.
So the output should look like this:
# A tibble: 10 x 3
id time value
<chr> <int> <int>
1 a -1 100
2 a 0 200
3 a 1 1
4 a 2 2
5 a 3 3
6 b -1 100
7 b 0 200
8 b 1 4
9 b 2 5
10 b 3 6
Of course there are multiple workarounds to achieve this (like loops for example). But in my case I have a large number of IDs and a very large number of columns.
I would be thankful if anyone could point me towards the direction of a solution within the tidyverse.
Thank you
We can expand the data frame a with id from b and then bind_rows them together.
library(tidyverse)
a2 <- expand(a, id = b$id, nesting(time, value))
b2 <- bind_rows(a2, b) %>% arrange(id, time)
b2
# # A tibble: 10 x 3
# id time value
# <chr> <dbl> <dbl>
# 1 a -1 100
# 2 a 0 200
# 3 a 1 1
# 4 a 2 2
# 5 a 3 3
# 6 b -1 100
# 7 b 0 200
# 8 b 1 4
# 9 b 2 5
# 10 b 3 6
split from base R will divide a data frame into a list of subsets based on an index.
b %>%
split(b[["id"]]) %>%
lapply(bind_rows, a) %>%
lapply(select, -"id") %>%
bind_rows(.id = "id")
# # A tibble: 10 x 3
# id time value
# <chr> <dbl> <dbl>
# 1 a 1 1
# 2 a 2 2
# 3 a 3 3
# 4 a -1 100
# 5 a 0 200
# 6 b 1 4
# 7 b 2 5
# 8 b 3 6
# 9 b -1 100
# 10 b 0 200
An idea (via base R) is to split your data frame and create a new one with id + the other data frame and rbind, i.e.
df = do.call(rbind, lapply(split(b, b$id), function(i)rbind(data.frame(id = i$id[1], a), i)))
which gives
id time value
a.1 a -1 100
a.2 a 0 200
a.3 a 1 1
a.4 a 2 2
a.5 a 3 3
b.1 b -1 100
b.2 b 0 200
b.3 b 1 4
b.4 b 2 5
b.5 b 3 6
NOTE: You can remove the rownames by simply calling rownames(df) <- NULL
We can nest and add the relevant rows to each nested item :
library(tidyverse)
b %>%
nest(-id) %>%
mutate(data= map(data,~bind_rows(a,.x))) %>%
unnest
# # A tibble: 10 x 3
# id time value
# <chr> <dbl> <dbl>
# 1 a -1 100
# 2 a 0 200
# 3 a 1 1
# 4 a 2 2
# 5 a 3 3
# 6 b -1 100
# 7 b 0 200
# 8 b 1 4
# 9 b 2 5
# 10 b 3 6
Maybe not the most efficient way, but easy to follow:
library(tidyverse)
a <- tibble(time = c(-1, 0), value = c(100, 200))
b <- tibble(id = rep(letters[1:2], each = 3), time = rep(1:3, 2), value =
1:6)
a.a <- a %>% add_column(id = rep("a",length(a)))
a.b <- a %>% add_column(id = rep("b",length(a)))
joint <- bind_rows(b,a.a,a.b)
(joint <- arrange(joint,id))

Add missing subtotals to each group using dplyr

I need to add a new row to each id group where the key= "n" and value is the total - a + b
x <- data_frame( id = c(1,1,1,2,2,2,2),
key = c("a","b","total","a","x","b","total"),
value = c(1,2,10,4,1,3,12) )
# A tibble: 7 × 3
id key value
<dbl> <chr> <dbl>
1 1 a 1
2 1 b 2
3 1 total 10
4 2 a 4
5 2 x 1
6 2 b 3
7 2 total 12
In this example, the new rows should be
1 n 7
2 n 5
I tried getting the a+b subtotal and joining that to the total count to get the difference, but after using nine dplyr verbs I seem to be going in the wrong direction. Thanks.
This isn't a join, it's just binding new rows on:
x %>% group_by(id) %>%
summarize(
value = sum(value[key == 'total']) - sum(value[key %in% c('a', 'b')]),
key = 'n'
) %>%
bind_rows(x) %>%
select(id, key, value) %>% # back to original column order
arrange(id, key) # and a start a row order
# # A tibble: 9 × 3
# id key value
# <dbl> <chr> <dbl>
# 1 1 a 1
# 2 1 b 2
# 3 1 n 7
# 4 1 total 10
# 5 2 a 4
# 6 2 b 3
# 7 2 n 5
# 8 2 total 12
# 9 2 x 1
Here's a way using data.table, binding rows as in Gregor's answer:
library(data.table)
setDT(x)
dcast(x, id ~ key)[, .(id, key = "n", value = total - a - b)][, rbind(.SD, x)][order(id)]
id key value
1: 1 n 7
2: 1 a 1
3: 1 b 2
4: 1 total 10
5: 2 n 5
6: 2 a 4
7: 2 x 1
8: 2 b 3
9: 2 total 12

Resources