Identifying missing observations in groups - R

I am having some difficulties with my code, and I hope some of you can help.
The dataset looks something like this:
df <- data.frame(group = c("A", "A", "A", "A_1", "A_1", "B", "B", "B_1"),
                 id    = c("id1", "id2", "id3", "id2", "id3", "id5", "id1", "id1"),
                 time  = c(1, 1, 1, 3, 3, 2, 2, 5),
                 Val   = c(10, 10, 10, 10, 10, 12, 12, 12))
"group" indicate the group the individual "id" is in. "A_1" indicate that a subject has left the group.
For instance, one subject "id1" leaves the "group A" that becomes group "A_1", where only "id2" and "id3" are members. Similarly "id5" leaves group B that becomes "B_1" with only id1 as a member.
What I would like in the final dataset is the opposite kind of group identification, which should look something like this:
final <- data.frame(group   = c("A", "A", "A", "A_1", "B", "B", "B_1"),
                    id      = c("id1", "id2", "id3", "id1", "id5", "id1", "id5"),
                    time    = c(1, 1, 1, 3, 2, 2, 5),
                    Val     = c(10, 10, 10, 10, 12, 12, 12),
                    groupid = c("A", "A", "A", "A", "B", "B", "B"))
Whereby "A_1" and "B_1" only indicate the subjects, "id1" and "id5" respectively, that have left the original group, rather than identifying remaining subjects.
Does anyone have suggestions on how I could systematically do this?
I thank you in advance for your help.
Follow-up:
My data is a little more complex than the example above: there are multiple "exits" from treatments, and the group identifiers can have different character lengths (here, for instance, "AAA" and "B"). The data looks more like the following:
df2 <- data.frame(group = c("AAA", "AAA", "AAA", "AAA", "AAA_1", "AAA_1", "AAA_1",
                            "AAA_2", "AAA_2", "B", "B", "B_1"),
                  id    = c("id1", "id2", "id3", "id4", "id2", "id3", "id4",
                            "id2", "id3", "id5", "id1", "id1"),
                  time  = c(1, 1, 1, 1, 3, 3, 3, 6, 6, 2, 2, 5),
                  Val   = c(10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 12, 12))
At time 3, "id1" leaves group "AAA", which becomes group "AAA_1"; at time 6, "id4" also leaves, and the group becomes "AAA_2". As discussed previously, I would like the groups with "_" to identify the ids that left the group rather than the ones remaining. Hence the final dataset should look something like this:
final2 <- data.frame(group = c("A", "A", "A", "A", "A_1", "A_2",
                               "B", "B", "B_1"),
                     id    = c("id1", "id2", "id3", "id4", "id1", "id4",
                               "id5", "id1", "id5"),
                     time  = c(1, 1, 1, 1, 3, 6, 2, 2, 5),
                     Val   = c(10, 10, 10, 10, 10, 10, 12, 12, 12))
Thanks for helping me with this.

OK, you can try it with dplyr in this way; maybe it's not elegant, but you get the result. The idea is to first fetch the ids that are in group ... but not in the corresponding ..._1, change their group, fetch the others, and rbind them together:
library(dplyr)
# first, find the ids that are missing from the ..._1 groups
# and change their group to ..._1
dups <-
  df %>%
  group_by(id, groupid = substr(group, 1, 1)) %>%
  filter(n() == 1) %>%
  mutate(group = paste0(group, '_1')) %>%
  left_join(df %>%
              select(group, time, Val) %>%
              distinct(), by = 'group') %>%
  select(group, id, time = time.y, Val = Val.y) %>%
  ungroup()
dups
# A tibble: 2 x 5
  groupid group id     time   Val
  <chr>   <chr> <fct> <dbl> <dbl>
1 A       A_1   id1       3    10
2 B       B_1   id5       5    12
# now you can select the rows of the original groups:
dups2 <-
  df %>%
  filter(nchar(as.character(group)) == 1) %>%
  mutate(groupid = substr(group, 1, 1))
dups2
  group  id time Val groupid
1     A id1    1  10       A
2     A id2    1  10       A
3     A id3    1  10       A
4     B id5    2  12       B
5     B id1    2  12       B
Last, rbind() them, arrange() them, and reorder the columns:
rbind(dups, dups2) %>%
  arrange(group) %>%
  select(group, id, time, Val, groupid)
# A tibble: 7 x 5
  group id     time   Val groupid
  <chr> <fct> <dbl> <dbl> <chr>
1 A     id1       1    10 A
2 A     id2       1    10 A
3 A     id3       1    10 A
4 A_1   id1       3    10 A
5 B     id5       2    12 B
6 B     id1       2    12 B
7 B_1   id5       5    12 B
Hope it helps!
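As an aside, the same "who left" logic can be written more directly with anti_join(); a sketch of my own (not part of the answer above), assuming the group columns are character (the R >= 4.0 data.frame default) and that every exited group is the base name plus "_1":

library(dplyr)

base  <- df %>% filter(!grepl("_", group))   # original groups (A, B)
after <- df %>% filter(grepl("_", group))    # post-exit groups (A_1, B_1)

# leavers: ids present in the base group but absent from its "_1" group
leavers <- base %>%
  anti_join(after %>% mutate(group = sub("_.*$", "", group)),
            by = c("group", "id")) %>%
  mutate(group = paste0(group, "_1")) %>%
  select(group, id) %>%
  left_join(distinct(after, group, time, Val), by = "group")

bind_rows(base, leavers) %>%
  mutate(groupid = sub("_.*$", "", group)) %>%
  arrange(group)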
EDIT:
You can generalize it with some work; here is my attempt, hope it helps:
library(dplyr)
df3 <- df2
# first set a couple of helper fields:
# recode group as "<first letter>_<exit number>", using "_0" for the original group
last_char <- substr(df2$group, nchar(as.character(df2$group)), nchar(as.character(df2$group)))
df3$group <- ifelse(last_char %in% c(0:9),
                    paste0(substr(df2$group, 1, 1), "_", last_char),
                    paste0(substr(df2$group, 1, 1), "_0"))
df3$util <- as.numeric(substr(df3$group, 3, 3)) + 1
# two empty lists to populate with a nested loop:
changed <- list()
final_changed <- list()
Now we first find who leaves, then the others; the idea is the same as in the previous part:
for (j in c("A","B")) {
df3_ <- df3[substr(df3$group,1,1)==j,]
for (i in unique(df3_$util)[1:length(unique(df3_$util))-1]) {
temp1 <- df3_[df3_$util == i,]
temp2 <- df3_[df3_$util == i+1,]
changes <- temp1[!temp1$id %in% temp2$id,]
changes$group <- paste0(j,'_',i )
changes <- changes %>% left_join(temp2, by = 'group') %>%
select(group , id = id.x, time = time.y, Val = Val.y)
changed[[i]] <- changes
}
final_changed[[j]] <- changed
}
change <- do.call(rbind, do.call(Map, c(f = rbind, final_changed))) %>% distinct()
change
  group  id time Val
1   A_1 id1    3  10
2   B_1 id5    5  12
3   A_2 id4    6  10
Then take the remaining rows and put everything together:
remain <-
  df3 %>%
  mutate(group = gsub("_0", "", .$group)) %>%
  filter(nchar(as.character(group)) == 1) %>%
  select(-util)

rbind(change, remain) %>%
  mutate(groupid = substr(group, 1, 1)) %>%
  arrange(group) %>%
  select(group, id, time, Val, groupid)
  group  id time Val groupid
1     A id1    1  10       A
2     A id2    1  10       A
3     A id3    1  10       A
4     A id4    1  10       A
5   A_1 id1    3  10       A
6   A_2 id4    6  10       A
7     B id5    2  12       B
8     B id1    2  12       B
9   B_1 id5    5  12       B
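As a further note, a regex-based variant (again a sketch of my own, not the answer above) avoids the hardcoded one-character prefix, so "AAA" stays "AAA" instead of being truncated to "A". It assumes group labels follow the pattern <prefix>_<exit number> and that at least one member remains after each exit:

library(dplyr)
library(purrr)

df2 %>%
  mutate(base = sub("_.*$", "", group),
         step = ifelse(grepl("_", group),
                       as.integer(sub(".*_", "", group)), 0L)) %>%
  group_split(base) %>%
  map_df(function(g) {
    steps <- sort(unique(g$step))
    leavers <- map_df(steps[-1], function(s) {
      prev <- filter(g, step == max(steps[steps < s]))  # members before this exit
      curr <- filter(g, step == s)                      # members after this exit
      anti_join(prev, curr, by = "id") %>%              # who disappeared
        mutate(group = paste0(base, "_", s),
               time = curr$time[1], Val = curr$Val[1])  # assumes curr is non-empty
    })
    bind_rows(filter(g, step == 0), leavers)
  }) %>%
  mutate(groupid = sub("_.*$", "", group)) %>%
  select(group, id, time, Val, groupid) %>%
  arrange(group)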

Related

Replace a value in a data frame from another data frame in R

Hi, I have two data frames. Based on matching ids, I want to replace table a's values with those of table b.
A sample dataset is here:
a <- tibble(id = c(1, 2, 3),
            type = c("a", "x", "y"))
b <- tibble(id = c(1, 3),
            type = c("d", "n"))
I'm expecting an output like the following:
c <- tibble(id = c(1, 2, 3),
            type = c("d", "x", "n"))
In dplyr v1.0.0, the rows_update() function was introduced for this purpose:
rows_update(a, b)
# Matching, by = "id"
# # A tibble: 3 x 2
#      id type
#   <dbl> <chr>
# 1     1 d
# 2     2 x
# 3     3 n
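One caveat worth knowing: rows_update() errors if b contains ids that are absent from a. If that can happen, rows_upsert() (also introduced in dplyr 1.0.0) updates the matching rows and inserts the non-matching ones instead:

# updates ids 1 and 3, and would insert any id from b not present in a
rows_upsert(a, b, by = "id")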
Here is an option using dplyr::left_join and dplyr::coalesce:
library(dplyr)
a %>%
  rename(old = type) %>%
  left_join(b, by = "id") %>%
  mutate(type = coalesce(type, old)) %>%
  select(-old)
# A tibble: 3 × 2
#      id type
#   <dbl> <chr>
# 1     1 d
# 2     2 x
# 3     3 n
The idea is to join a with b on column id, then replace the missing values in type (from b) with the values from a (column old is a's original type column, renamed to avoid duplicate column names).
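The same replace-by-id logic is also a short sketch in base R, assuming id is unique in b:

idx <- match(a$id, b$id)  # position of each a$id in b, NA if absent
a$type <- ifelse(is.na(idx), a$type, b$type[idx])
a
#   id type
# 1  1 d
# 2  2 x
# 3  3 n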

Find novel categories between groups

I am trying to identify which trees differ between two groups, a and b, across different forest types (type).
My dummy example:
dd1 <- data.frame(
  type = rep(1, 5),
  grp = c('a', 'a', 'a', 'b', 'b'),
  sp = c('oak', 'beech', 'spruce',
         'oak', 'yew')
)
dd2 <- data.frame(
  type = rep(2, 3),
  grp = c('a', 'b', 'b'),
  sp = c('oak', 'beech', 'spruce')
)
dd <- rbind(dd1, dd2)
I can find the unique species for each group (in reality, two grouping variables: type and grp) with distinct():
dd %>%
  group_by(type, grp) %>%
  distinct(sp)
But instead, I want to know which trees in group b are different from those in group a.
Expected output:
   type grp   sp
  <dbl> <chr> <chr>
1     1 b     yew    # here, only `yew` is new; `oak` was already listed in group `a`
2     2 b     beech  # both beech and spruce are new compared to group `a`
3     2 b     spruce
How can I do this? Thank you!
The condition to filter on is:
library(dplyr)
dd %>%
  group_by(type) %>%
  filter(grp == 'b' & !sp %in% sp[grp == 'a']) %>%
  ungroup()
# # A tibble: 3 × 3
#    type grp   sp
#   <dbl> <chr> <chr>
# 1     1 b     yew
# 2     2 b     beech
# 3     2 b     spruce
You could try an anti_join:
library(dplyr)
dd |>
  anti_join(dd |> filter(grp == "a"), by = c("sp", "type"))
Output:
  type grp     sp
1    1   b    yew
2    2   b  beech
3    2   b spruce
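For completeness, a base R sketch of the same anti-join logic (my own addition):

# rows of group b whose (type, sp) pair never occurs in group a
in_a <- with(dd, paste(type, sp) %in% paste(type[grp == "a"], sp[grp == "a"]))
dd[dd$grp == "b" & !in_a, ]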

Find unique entries in otherwise identical rows

I am currently trying to find a way to identify unique column values in otherwise duplicate rows in a dataset.
My dataset has the following properties:
The dataset's columns comprise an identifier variable (ID) and a large number of response variables (x1 - xn).
Each row should represent one individual, meaning the values in the ID column should all be unique (and not repeated).
Some rows are duplicated, with repeated entries in the ID column and seemingly identical response values (x1 - xn). However, the dataset is too large to get a full overview of all variables.
As demonstrated in the code below, if rows are truly identical across all variables, the duplicate rows can be removed with the dplyr::distinct() function. In my case, not all "duplicate" rows are removed by distinct(), which can only mean that not all entries are identical.
I want to find a way to identify which entries are unique in these otherwise duplicate rows.
Example:
library(dplyr)
library(janitor)

df <- data.frame(
  ID = rep(1:3, each = 2),
  x1 = rep(4:6, each = 2),
  x2 = c("a", "a", "b", "b", "c", "d"),
  x3 = c(7, 10, 8, 8, 9, 11),
  x4 = rep(letters[4:6], each = 2),
  x5 = c("x", "p", "y", "y", "z", "q"),
  x6 = rep(letters[7:9], each = 2)
)
# The data frame with all entries
df
A data.frame: 6 × 7
  ID x1 x2 x3 x4 x5 x6
1  1  4  a  7  d  x  g
2  1  4  a 10  d  p  g
3  2  5  b  8  e  y  h
4  2  5  b  8  e  y  h
5  3  6  c  9  f  z  i
6  3  6  d 11  f  q  i
# The data frame
df %>%
  # with duplicates removed
  distinct() %>%
  # filtered for rows whose ID column is duplicated
  janitor::get_dupes(ID)
  ID dupe_count x1 x2 x3 x4 x5 x6
1  1          2  4  a  7  d  x  g
2  1          2  4  a 10  d  p  g
3  3          2  6  c  9  f  z  i
4  3          2  6  d 11  f  q  i
In the example above I demonstrate how dplyr::distinct() removes fully duplicated rows (ID = 2), but not rows that differ in some columns (the rows where ID = 1 and 3, in columns x2, x3 and x5).
What I want is an overview of which columns are not duplicated for each ID:
df %>%
  distinct() %>%
  janitor::get_dupes(ID) %>%
  # Here I want a way to find the columns with unidentical entries:
  find_nomatch()
  ID x2 x3 x5
1  1     7  x
2  1    10  p
3  3  c  9  z
4  3  d 11  q
A data.table alternative. Coerce the data frame to a data.table (setDT), then melt the data to long format (melt(df, id.vars = "ID")).
Within each group defined by 'ID' and 'variable' (corresponding to the columns in the wide format) (by = .(ID, variable)), count number of unique values (uniqueN(value)) and check if it's equal to the number of rows in the subgroup (== .N). If so (if), select the entire subgroup (.SD).
Finally, reshape the data back to wide format (dcast).
library(data.table)
setDT(df)
d <- melt(df, id.vars = "ID")
dcast(d[, if (uniqueN(value) == .N) .SD, by = .(ID, variable)],
      ID + rowid(ID, variable) ~ variable)
#    ID ID_1   x2 x3 x5
# 1:  1    1 <NA>  7  x
# 2:  1    2 <NA> 10  p
# 3:  3    1    c  9  z
# 4:  3    2    d 11  q
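A side note on the last line: rowid(ID, variable) numbers repeated (ID, variable) combinations 1, 2, ..., which stops dcast from aggregating the two rows per ID into one. A quick illustration:

library(data.table)
rowid(c(1, 1, 3, 3), c("x3", "x3", "x3", "x3"))
# [1] 1 2 1 2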
A bit simpler than yours, I think:
library(dplyr)
library(janitor)

df <- data.frame(
  ID = rep(1:3, each = 2),
  x1 = rep(4:6, each = 2),
  x2 = c("a", "a", "b", "b", "c", "d"),
  x3 = c(7, 10, 8, 8, 9, 11),
  x4 = rep(letters[4:6], each = 2),
  x5 = c("x", "p", "y", "y", "z", "q"),
  x6 = rep(letters[7:9], each = 2)
)
d <- df %>%
  distinct() %>%
  janitor::get_dupes(ID)
d %>%
  group_by(ID) %>%
  # Check, for each ID, which row elements differ from those of the first row
  group_map(\(.x, .id) apply(.x, 1, \(.y) .x[1, ] != .y)) %>%
  do.call(what = cbind) %>%  # Bind the results for all IDs
  apply(1, any) %>%          # TRUE for columns that differ anywhere
  c(TRUE, .) %>%             # Keep the ID column
  `[`(d, .)
#>   ID x2 x3 x5
#> 1  1  a  7  x
#> 2  1  a 10  p
#> 3  3  c  9  z
#> 4  3  d 11  q
Created on 2022-01-18 by the reprex package (v2.0.1)
Edit
d %>%
  group_by(ID) %>%
  # Check, for each ID, which row elements differ from those of the first row
  group_map(\(.x, .id) apply(.x, 1, \(.y) !Vectorize(identical)(unlist(.x[1, ]), .y))) %>%
  do.call(what = cbind) %>%  # Bind the results for all IDs
  apply(1, any) %>%          # TRUE for columns that differ anywhere
  c(TRUE, .) %>%             # Keep the ID column
  `[`(d, .)
#>   ID x2 x3 x5
#> 1  1  a  7  x
#> 2  1  a 10  p
#> 3  3  c  9  z
#> 4  3  d 11  q
Created on 2022-01-19 by the reprex package (v2.0.1)
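As an aside, a shorter dplyr-only sketch of the same goal (my own, assuming d is the get_dupes() result from above): count the distinct values per column within each duplicated ID and keep the columns where some ID has more than one. Unlike the requested output, it leaves the matching values visible too:

library(dplyr)

n_per_id <- d %>%
  group_by(ID) %>%
  summarise(across(everything(), n_distinct), .groups = "drop")

# columns (other than ID) that vary within at least one duplicated ID
keep <- names(n_per_id)[-1][sapply(n_per_id[-1], max) > 1]
d[, c("ID", keep)]
#   ID x2 x3 x5
# 1  1  a  7  x
# 2  1  a 10  p
# 3  3  c  9  z
# 4  3  d 11  q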
I have been working on this issue for some time and I found a solution, though it took more steps than I would have thought necessary. I can only presume there's a more elegant solution out there. Anyway, this should work:
df <- df %>%
  distinct() %>%
  janitor::get_dupes(ID)

# Make a vector of the unique duplicated ID values
l <- distinct(df, ID) %>% unlist()

# lapply over each ID
df <- lapply(
  l,
  function(x) {
    # Filter rows for the duplicated ID
    dplyr::filter(df, ID == x) %>%
      # Transpose the data frame (converts it into a matrix)
      t() %>%
      # Convert back to a data frame
      as.data.frame() %>%
      # Keep rows (former columns) whose entries are not all identical
      dplyr::filter(!if_all(everything(), ~ . == V1)) %>%
      # Transpose back
      t() %>%
      # Convert back to a data frame
      as.data.frame()
  }
) %>%
  # Bind the data frames in the list together
  bind_rows() %>%
  # Finally, move the columns back into ascending order
  relocate(x2, .before = x3)

# Remove row names (not necessary)
row.names(df) <- NULL
df
A data.frame: 4 × 3
  x2 x3 x5
  NA  7  x
  NA 10  p
   c  9  z
   d 11  q
Feel free to comment
If you just want to keep the first instance of each identifier:
df <- data.frame(
  ID = rep(1:3, each = 2),
  x1 = rep(4:6, each = 2),
  x2 = rep(letters[1:3], each = 2),
  x3 = c(7, 10, 8, 8, 9, 11),
  x4 = rep(letters[4:6], each = 2)
)
df %>%
  distinct(ID, .keep_all = TRUE)
Output:
  ID x1 x2 x3 x4
1  1  4  a  7  d
2  2  5  b  8  e
3  3  6  c  9  f

How to choose the most common value in a group related to another group in R?

I have the following data frame in R:
ID <- c(rep(1, 5), rep(2, 3), rep(3, 2), rep(4, 6))
VAR <- c("A", "A", "A", "A", "B", "C", "C", "D",
         "E", "E", "F", "A", "B", "F", "C", "F")
CATEGORY <- c("ANE", "ANE", "ANA", "ANB", "ANE", "BOO", "BOA", "BOO",
              "CAT", "CAT", "DOG", "ANE", "ANE", "DOG", "FUT", "DOG")
DATA <- data.frame(ID, VAR, CATEGORY)
DATA
It looks like the table below:
ID   VAR   CATEGORY
1    A     ANE
1    A     ANE
1    A     ANA
1    A     ANB
1    B     ANE
2    C     BOO
2    C     BOA
2    D     BOO
3    E     CAT
3    E     CAT
4    F     DOG
4    A     ANE
4    B     ANE
4    F     DOG
4    C     FUT
4    F     DOG
Given the above data frame, the ideal output I want looks like this:
ID   TEXTS   category
1    A       ANE
2    C       BOO
3    E       CAT
4    F       DOG
More specifically: for, say, ID 1, I want to find the most common value in the column VAR (which is A), and then the most common value in the column CATEGORY among the rows with that VAR value (which is ANE), and so forth.
How can I do this in R?
Note that this is a toy example; my real data frame contains 850,000 rows and 14,000 unique IDs.
Another dplyr strategy using count and slice:
library(dplyr)
DATA %>%
  group_by(ID) %>%
  count(VAR, CATEGORY) %>%
  slice(which.max(n)) %>%
  select(-n)
     ID VAR   CATEGORY
  <dbl> <chr> <chr>
1     1 A     ANE
2     2 C     BOA
3     3 E     CAT
4     4 F     DOG
dplyr
library(dplyr)
DATA %>%
  group_by(ID) %>%
  filter(VAR == names(sort(table(VAR), decreasing = TRUE))[1]) %>%
  group_by(ID, VAR) %>%
  summarize(CATEGORY = names(sort(table(CATEGORY), decreasing = TRUE))[1]) %>%
  ungroup()
# # A tibble: 4 x 3
#      ID VAR   CATEGORY
#   <dbl> <chr> <chr>
# 1     1 A     ANE
# 2     2 C     BOA
# 3     3 E     CAT
# 4     4 F     DOG
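The names(sort(table(x), decreasing = TRUE))[1] idiom used above is the usual base R way of extracting the mode of a vector; a tiny illustration:

x <- c("C", "C", "D")
table(x)
# x
# C D
# 2 1
names(sort(table(x), decreasing = TRUE))[1]
# [1] "C"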
Data
DATA <- structure(list(ID = c(1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4), VAR = c("A", "A", "A", "A", "B", "C", "C", "D", "E", "E", "F", "A", "B", "F", "C", "F"), CATEGORY = c("ANE", "ANE", "ANA", "ANB", "ANE", "BOO", "BOA", "BOO", "CAT", "CAT", "DOG", "ANE", "ANE", "DOG", "FUT", "DOG")), class = "data.frame", row.names = c(NA, -16L))
We could modify the Mode function to return the index and use that in slice after grouping by 'ID':
Modeind <- function(x) {
  ux <- unique(x)
  which.max(tabulate(match(x, ux)))
}
library(dplyr)
DATA %>%
  group_by(ID) %>%
  slice(Modeind(VAR)) %>%
  ungroup()
Output:
# A tibble: 4 x 3
     ID VAR   CATEGORY
  <dbl> <chr> <chr>
1     1 A     ANE
2     2 C     BOO
3     3 E     CAT
4     4 F     DOG
A base R option with nested subset + ave:
subset(
  subset(
    DATA,
    !!ave(ave(ID, ID, VAR, FUN = length), ID, FUN = function(x) x == max(x))
  ),
  !!ave(ave(ID, ID, VAR, CATEGORY, FUN = length), ID, VAR,
        FUN = function(x) seq_along(x) == which.max(x))
)
gives
   ID VAR CATEGORY
1   1   A      ANE
6   2   C      BOO
9   3   E      CAT
11  4   F      DOG
Explanation:
The inner subset + ave keeps only the rows with the most common VAR value (grouped by ID).
Based on the trimmed data frame from the previous step, the outer subset + ave keeps only the rows with the most common CATEGORY value (grouped by ID + VAR).
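A data.table variant of the same two-step idea (a sketch of my own, assuming ties should resolve to the first value encountered, as in the expected output; data.table's order is a stable sort, so ties keep their original order):

library(data.table)
dt <- as.data.table(DATA)

# most common VAR per ID
top_var <- dt[, .N, by = .(ID, VAR)][order(ID, -N), .SD[1], by = ID]

# among the rows with that VAR, the most common CATEGORY per ID
dt[top_var, on = .(ID, VAR)][, .N, by = .(ID, VAR, CATEGORY)][
  order(ID, -N), .SD[1], by = ID][, !"N"]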

Tidying table with multiple groups of wide columns, using tidyverse

I often find myself in a situation where I have a table that contains multiple groups of wide columns, like so:
  replicate groupA        VA1         VA2 groupB         VB1        VB2
1         1      a  0.3429166 -2.30336406      f  0.05363582  1.6454078
2         2      b -1.3183732 -0.13516849      g -0.42586417  0.1541541
3         3      c -0.7908358 -0.10746447      h  1.05134242  1.4297350
4         4      d -0.9963677 -1.82557058      i -1.14532536  1.0815733
5         5      e -1.3634609  0.04385812      j -0.65643595 -0.1452877
And I'd like to turn the columns into one long table, like so:
   replicate group key       value
1          1     a  V1  0.34291665
2          2     b  V1 -1.31837322
3          3     c  V1 -0.79083580
4          4     d  V1 -0.99636772
5          5     e  V1 -1.36346088
6          1     a  V2 -2.30336406
7          2     b  V2 -0.13516849
8          3     c  V2 -0.10746447
9          4     d  V2 -1.82557058
10         5     e  V2  0.04385812
11         1     f  V1  0.05363582
12         2     g  V1 -0.42586417
13         3     h  V1  1.05134242
14         4     i  V1 -1.14532536
15         5     j  V1 -0.65643595
16         1     f  V2  1.64540784
17         2     g  V2  0.15415408
18         3     h  V2  1.42973499
19         4     i  V2  1.08157329
20         5     j  V2 -0.14528774
I can do this by selecting the two groups of columns individually, tidying, and then rbinding together (code below). However, this approach doesn't seem particularly elegant, and it becomes cumbersome if there are more than two groups of columns. I'm wondering whether there's a more elegant approach, using a single pipe chain of data transformations.
The fundamental question here is: How do we automate the process of breaking the table into groups of columns, tidying those, and then combining back together.
My current code:
library(dplyr)
library(tidyr)

# generate example data
df_wide <- data.frame(replicate = 1:5,
                      groupA = letters[1:5],
                      VA1 = rnorm(5),
                      VA2 = rnorm(5),
                      groupB = letters[6:10],
                      VB1 = rnorm(5),
                      VB2 = rnorm(5))

# tidy columns with A in the name
dfA <- select(df_wide, replicate, groupA, VA1, VA2) %>%
  gather(key, value, VA1, VA2) %>%
  mutate(key = case_when(key == "VA1" ~ "V1",
                         key == "VA2" ~ "V2")) %>%
  select(replicate, group = groupA, key, value)

# tidy columns with B in the name
dfB <- select(df_wide, replicate, groupB, VB1, VB2) %>%
  gather(key, value, VB1, VB2) %>%
  mutate(key = case_when(key == "VB1" ~ "V1",
                         key == "VB2" ~ "V2")) %>%
  select(replicate, group = groupB, key, value)

# combine
df_long <- rbind(dfA, dfB)
Note: Similar questions have been asked here and here, but I think the accepted answer shows that this here is a subtly different problem.
1. Although the question asked for a tidyverse solution, there is a convenient option with melt from data.table, which can also take multiple patterns in the measure argument.
library(data.table)
setnames(melt(melt(setDT(df1), measure = patterns('group', 'VA', 'VB')),
              id.var = 1:3)[, -4, with = FALSE],
         2:3, c('key', 'group'))[]
2.a
With the tidyverse, we can subset the dataset into a list, then loop through the list with map_df, converting each element to 'long' format with gather, to get a single data.frame:
library(tidyverse)
list(df1[1:4], df1[c(1, 5:7)]) %>%
  map_df(~gather(., key, value, 3:4) %>%
           {names(.)[2] <- 'group'; .}) %>%
  mutate(key = sub('(.).(.)', '\\1\\2', key))
#    replicate group key       value
# 1          1     a  V1  0.34291660
# 2          2     b  V1 -1.31837320
# 3          3     c  V1 -0.79083580
# 4          4     d  V1 -0.99636770
# 5          5     e  V1 -1.36346090
# 6          1     a  V2 -2.30336406
# 7          2     b  V2 -0.13516849
# 8          3     c  V2 -0.10746447
# 9          4     d  V2 -1.82557058
# 10         5     e  V2  0.04385812
# 11         1     f  V1  0.05363582
# 12         2     g  V1 -0.42586417
# 13         3     h  V1  1.05134242
# 14         4     i  V1 -1.14532536
# 15         5     j  V1 -0.65643595
# 16         1     f  V2  1.64540780
# 17         2     g  V2  0.15415410
# 18         3     h  V2  1.42973500
# 19         4     i  V2  1.08157330
# 20         5     j  V2 -0.14528770
2.b
If we need to split based on the occurrence of 'group':
split.default(df1[-1], cumsum(grepl('group', names(df1)[-1]))) %>%
  map(~bind_cols(df1[1], .)) %>%
  map_df(~gather(., key, value, 3:4) %>%
           {names(.)[2] <- 'group'; .}) %>%
  mutate(key = sub('(.).(.)', '\\1\\2', key))
2.c
This version uses rename_at instead of names<- assignment, in the spirit of tidyverse options:
df1[-1] %>%
  split.default(cumsum(grepl('group', names(df1)[-1]))) %>%
  map_df(~bind_cols(df1[1], .) %>%
           gather(., key, value, 3:4) %>%
           rename_at(2, funs(substring(., 1, 5))))
NOTE:
1) Options 2.a, 2.b, and 2.c all use tidyverse functions.
2) None of them depends on the substring 'A' or 'B' in the column names.
3) They assume the pattern in the OP's dataset is 'group' followed by its value columns.
1) This solution consists of:
a gather which generates the desired number of rows,
a mutate which combines the groupA and groupB columns and changes the key column to that requested, and
a select which picks out the columns wanted.
First gather the columns whose names start with V, and then create a new group column from groupA and groupB, choosing groupA if the key has an A in it and groupB if it has a B. (We used mapply(switch, ...) here for easy extension to the 3+ group case, but we could have used an ifelse, viz. ifelse(grepl("A", key), as.character(groupA), as.character(groupB)), given that we have only two groups.) The mutate also reduces the key names from VA1 to V1, etc., and finally select out the columns desired.
DF %>%
  gather(key, value, starts_with("V")) %>%
  mutate(group = mapply(switch, gsub("[^AB]", "", key), A = groupA, B = groupB),
         key = sub("[AB]", "", key)) %>%
  select(replicate, group, key, value)
giving:
   replicate group key       value
1          1     a  V1  0.34291660
2          2     b  V1 -1.31837320
3          3     c  V1 -0.79083580
4          4     d  V1 -0.99636770
5          5     e  V1 -1.36346090
6          1     a  V2 -2.30336406
7          2     b  V2 -0.13516849
8          3     c  V2 -0.10746447
9          4     d  V2 -1.82557058
10         5     e  V2  0.04385812
11         1     f  V1  0.05363582
12         2     g  V1 -0.42586417
13         3     h  V1  1.05134242
14         4     i  V1 -1.14532536
15         5     j  V1 -0.65643595
16         1     f  V2  1.64540780
17         2     g  V2  0.15415410
18         3     h  V2  1.42973500
19         4     i  V2  1.08157330
20         5     j  V2 -0.14528770
2) Another approach would be to split the columns into groups such that all columns in a group have the same name after removing A and B from their names. Perform unlist on each such group to reduce the list to a list of plain vectors, and convert that list to a data.frame. Finally, gather the V columns and rearrange. Note that rownames_to_column is from the tibble package.
DF %>%
  as.list %>%
  split(sub("[AB]", "", names(.))) %>%
  lapply(unlist) %>%
  as.data.frame %>%
  rownames_to_column %>%
  gather(key, value, starts_with("V")) %>%
  arrange(gsub("[^AB]", "", rowname), key) %>%
  select(replicate, group, key, value)
2a) If the row order is not important then the rownames_to_column, arrange and select lines could be omitted shortening it to this:
DF %>%
  as.list %>%
  split(sub("[AB]", "", names(.))) %>%
  lapply(unlist) %>%
  as.data.frame %>%
  gather(key, value, starts_with("V"))
Solutions (2) and (2a) could easily be converted to base-only solutions by replacing the gather with the appropriate reshape from base as in the second reshape, i.e. the one producing d2, in (3).
3) Although the question asked for a tidyverse solution, there is a fairly convenient base solution consisting of two reshape calls. The varying produced by the split is list(group = c("groupA", "groupB"), V1 = c("VA1", "VB1"), V2 = c("VA2", "VB2")) -- that is, it matches up the ith column in each set of columns.
varying <- split(names(DF)[-1], gsub("[AB]", "", names(DF))[-1])
d <- reshape(DF, dir = "long", varying = varying, v.names = names(varying))
d <- subset(d, select = -c(time, id))
d2 <- reshape(d, dir = "long", varying = list(grep("V", names(d))),
              v.names = "value", timevar = "key")
d2 <- subset(d2, select = c(replicate, group, key, value))
d2
Note: The input in reproducible form is:
DF <- structure(list(replicate = 1:5, groupA = structure(1:5, .Label = c("a",
"b", "c", "d", "e"), class = "factor"), VA1 = c(0.3429166, -1.3183732,
-0.7908358, -0.9963677, -1.3634609), VA2 = c(-2.30336406, -0.13516849,
-0.10746447, -1.82557058, 0.04385812), groupB = structure(1:5, .Label = c("f",
"g", "h", "i", "j"), class = "factor"), VB1 = c(0.05363582, -0.42586417,
1.05134242, -1.14532536, -0.65643595), VB2 = c(1.6454078, 0.1541541,
1.429735, 1.0815733, -0.1452877)), .Names = c("replicate", "groupA",
"VA1", "VA2", "groupB", "VB1", "VB2"), class = "data.frame", row.names = c("1",
"2", "3", "4", "5"))
