Create combination of all variables within a group - r

I have a dataframe as follows
group x y
a 1 2
a 3 1
b 1 3
c 1 1
c 2 3
I want to be able to generate all combinations of the x and y columns within a group, like so
group xy
a 1-2
a 1-1
a 3-2
a 3-1
b 1-3
c 1-1
c 1-3
c 2-1
c 2-3
I've tried using the following code, but it seems as though the group_by function is not working as expected
library(dplyr)
library(tidyr)
combn <- df %>%
group_by(group) %>%
expand(x, y)
My current results are instead giving me every combination of all three columns
head(combn)
group x y
a 1 1
a 1 2
a 1 3
a 2 1
a 2 2
a 2 3
Dput:
structure(list(group = structure(c(1L, 1L, 2L, 3L, 3L), .Label = c("a",
"b", "c"), class = "factor"), x = structure(c(1L, 3L, 1L, 1L,
2L), .Label = c("1", "2", "3"), class = "factor"), y = structure(c(2L,
1L, 3L, 1L, 3L), .Label = c("1", "2", "3"), class = "factor")), class = "data.frame", row.names = c(NA,
-5L))

You could use crossing from tidyr to create combinations within a group and then unnest to create them as separate rows.
# Build every x-y pairing inside each group, then flatten to one row per pair.
# crossing() is a tidyr function, so it is namespace-qualified here (like
# unnest() below); with only dplyr attached a bare crossing() is not found.
library(dplyr)
df1 <- df %>%
  group_by(group) %>%
  summarise(xy = list(tidyr::crossing(x, y))) %>%
  tidyr::unnest(xy)
df1
# group x y
# <fct> <int> <int>
#1 a 1 2
#2 a 3 2
#3 a 1 1
#4 a 3 1
#5 b 1 3
#6 c 1 1
#7 c 2 1
#8 c 1 3
#9 c 2 3
If you want to combine the two columns, you could use unite :
# Paste the two combination columns into a single "x-y" string column.
# The columns are named x and y because they come from crossing(x, y).
tidyr::unite(df1, xy, x, y, sep = "-")
# group xy
# <fct> <chr>
#1 a 1-2
#2 a 3-2
#3 a 1-1
#4 a 3-1
#5 b 1-3
#6 c 1-1
#7 c 2-1
#8 c 1-3
#9 c 2-3

Related

How to create multiple columns in r dataframe by implementing some query conditions

I have a dataset which is similar to the following:
Age Food_1_1 Food_1_2 Food_1_3 Amount_1_1 Amount_1_2 Amount_1_3
6-9 a b a 2 3 4
6-9 b b c 1 2 3
6-9 c a 4 1
9-10 c c b 1 3 1
9-10 c a b 1 2 1
Using R, I want to get the following data set which contains new set of columns a, b and c by adding the corresponding values:
Age Food_1_1 Food_1_2 Food_1_3 Amount_1_1 Amount_1_2 Amount_1_3 a b c
6-9 a b a 2 3 4 6 3 0
6-9 b b c 1 2 3 0 3 3
6-9 c a 4 1 1 0 4
9-10 c c b 1 3 1 0 1 4
9-10 c a b 1 2 1 2 1 1
Note: My data also contains missing values. The variables Monday:Wednesday are factors and the variables Value1:Value3 are numeric. For more clarity: the 1st row of column "a" contains the sum of all values from Value1 to Value3 that correspond to "a" (for example, 2 + 4 = 6).
One way using base R:
# Tag every row with a unique id so the totals can be merged back at the end.
# seq_len() avoids the c(1, 0) sequence that 1:nrow(data) yields for 0 rows.
data$id <- seq_len(nrow(data))
# Positions of the paired letter columns (names ending in "day") and the
# amount columns (names starting with "Value").
vlist <- list(grep("day$", names(data)), grep("^Value", names(data)))
# Long format: each (letter, amount) pair becomes its own row.
d1 <- reshape(data, direction = "long", varying = vlist,
              v.names = c("Day", "Value"))
# Sum the amounts per row id and letter.
d2 <- aggregate(Value ~ id + Day, FUN = sum, na.rm = TRUE, data = d1)
# Back to wide format: one Value.<letter> column per letter.
d3 <- reshape(d2, direction = "wide", v.names = "Value", timevar = "Day")
# A letter that does not occur for an id has no total; report it as 0.
d3[is.na(d3)] <- 0
merge(data, d3, by = "id", all.x = TRUE)
# id Age Monday Tuesday Wednesday Value1 Value2 Value3 Value.a Value.b Value.c
#1 1 6-9 a b a 2 3 4 6 3 0
#2 2 6-9 b b c 1 2 3 0 3 3
#3 3 6-9 <NA> c a NA 4 1 1 0 4
#4 4 9-10 c c b 1 3 1 0 1 4
#5 5 9-10 c a b 1 2 1 2 1 1
Data:
data <- structure(list(Age = structure(c(1L, 1L, 1L, 2L, 2L), .Label = c("6-9",
"9-10"), class = "factor"), Monday = structure(c(1L, 2L, NA,
3L, 3L), .Label = c("a", "b", "c"), class = "factor"), Tuesday = structure(c(2L,
2L, 3L, 3L, 1L), .Label = c("a", "b", "c"), class = "factor"),
Wednesday = structure(c(1L, 3L, 1L, 2L, 2L), .Label = c("a",
"b", "c"), class = "factor"), Value1 = c(2L, 1L, NA, 1L,
1L), Value2 = c(3L, 2L, 4L, 3L, 2L), Value3 = c(4L, 3L, 1L,
1L, 1L)), class = "data.frame", row.names = c(NA, -5L))
You can use below code:
# Requires dplyr, tidyr and reshape2.
# Work on character columns so the letter and value columns can be stacked.
data[] <- lapply(data, as.character)
data$rownumber <- rownames(data)
# Stack the letter columns and the value columns separately. row2 is the
# position within each stacked table and is used to pair a letter with its
# value in the join below. rownames(.) refers to the piped-in table; the
# original referenced x / y before they had been created.
x <- gather(data[, c(1:4, 8)], Day, Letter, Monday:Wednesday) %>%
  mutate(row2 = rownames(.))
y <- gather(data[, c(1, 5:7, 8)], Day, Value, Value1:Value3) %>%
  mutate(row2 = rownames(.))
# Total value per original row and letter.
z <- left_join(x, y, by = c("Age", "rownumber", "row2")) %>%
  group_by(Age, rownumber, Letter) %>%
  dplyr::summarise(suma = sum(as.numeric(Value), na.rm = TRUE)) %>%
  mutate(suma = replace_na(suma, 0))
# One column per letter, then re-attach the original columns. The original
# passed an extra `z` to left_join(), which was silently bound to `copy`.
z <- reshape2::dcast(z, rownumber ~ Letter, value.var = "suma") %>%
  left_join(data, by = "rownumber")
# Drop the column produced by blank letters (if present) and zero-fill gaps.
z$Var.2 <- NULL
z[is.na(z)] <- 0
Output:
rownumber a b c Age Monday Tuesday Wednesday Value1 Value2 Value3
1 1 6 3 0 6-9 a b a 2 3 4
2 2 0 3 3 6-9 b b c 1 2 3
3 3 1 0 4 6-9 c a 0 4 1
4 4 0 1 4 9-10 c c b 1 3 1
5 5 2 1 1 9-10 c a b 1 2 1

Aggregate multiple rows based on common values

I have a dataset like this below
W X Y Z
A 2 3 4
A 2 3 6
B 1 2 3
C 3 2 1
B 1 3 4
B 1 2 2
I want to combine/collapse the values in column Z only if the values in columns W, X and Y are the same.
The final dataset will be like this.
W X Y Z
A 2 3 4,6
B 1 2 3,2
C 3 2 1
B 1 3 4
Not sure how to do this, any suggestions is much appreciated.
We can group by 'W', 'X', 'Y' and paste the values of 'Z' (toString is paste(..., collapse=", "))
library(dplyr)
# For each (W, X, Y) combination, join its distinct Z values with ", ".
summarise(
  group_by(df1, W, X, Y),
  Z = paste(unique(Z), collapse = ", ")
)
# A tibble: 4 x 4
# Groups: W, X [3]
# W X Y Z
# <chr> <int> <int> <chr>
#1 A 2 3 4, 6
#2 B 1 2 3, 2
#3 B 1 3 4
#4 C 3 2 1
Or with aggregate from base R
# Drop fully duplicated rows, then collapse Z within every combination of the
# remaining columns.
aggregate(Z ~ ., data = df1[!duplicated(df1), , drop = FALSE], FUN = toString)
# W X Y Z
#1 B 1 2 3, 2
#2 C 3 2 1
#3 B 1 3 4
#4 A 2 3 4, 6
data
df1 <- structure(list(W = c("A", "A", "B", "C", "B", "B"), X = c(2L,
2L, 1L, 3L, 1L, 1L), Y = c(3L, 3L, 2L, 2L, 3L, 2L), Z = c(4L,
6L, 3L, 1L, 4L, 2L)), class = "data.frame", row.names = c(NA,
-6L))

Average (/2) of each row in ONE column in R

DF
ID B C D
1 A 1 1 3
2 B 2 3 1
3 C 1 1 1
4 D 3 1 1
5 E 1 0 0
Given a dataframe such as the one mentioned above, how can I quickly calculate the means for each row in one column and store them in another column of the dataframe? For example, the average of column B would be: 0.5, 1, 0.5, 1.5, 0.5.
And is it possible to have a function that does it automatically for several columns at once?
Option is to get the matching row element from 'ID' to divide the column with the value
#' Divide a column by the value it takes on the row whose ID matches its name
#'
#' Looks up the row of `dat` whose `ID` equals `colNm`, takes that row's value
#' in column `colNm`, and divides the whole column by it.
#'
#' Uses standard subsetting instead of `transform()`, whose non-standard
#' evaluation would let a data column named `dat`, `colNm` or `ID` shadow the
#' function's own arguments.
#'
#' @param dat A data frame with an `ID` column and a column named `colNm`.
#' @param colNm Name (string) of the column to divide.
#' @return `dat` with an added (or replaced) `newCol` column. `newCol` is all
#'   `NA` when no `ID` equals `colNm`.
f1 <- function(dat, colNm) {
  if (!"ID" %in% names(dat)) {
    stop("`dat` must have an `ID` column", call. = FALSE)
  }
  # match() gives NA when colNm is not an ID, which makes the divisor NA.
  key_row <- match(colNm, dat[["ID"]])
  dat[["newCol"]] <- dat[[colNm]] / dat[key_row, colNm]
  dat
}
f1(DF, 'B')
# ID B C D newCol
#1 A 1 1 3 0.5
#2 B 2 3 1 1.0
#3 C 1 1 1 0.5
#4 D 3 1 1 1.5
#5 E 1 0 0 0.5
If it is to divide by a constant value, then just do
# Halve every column except the first (the ID column).
DF[-1] <- lapply(DF[-1], function(column) column / 2)
data
DF <- structure(list(ID = c("A", "B", "C", "D", "E"), B = c(1L, 2L,
1L, 3L, 1L), C = c(1L, 3L, 1L, 1L, 0L), D = c(3L, 1L, 1L, 1L,
0L)), class = "data.frame", row.names = c("1", "2", "3", "4",
"5"))

Counting occurrence of a variable without taking account duplicates

I have a big data frame, called data with 1 004 490 obs, and I want to analyse the success of a treatment.
ID POSITIONS TREATMENT
1 0 A
1 1 A
1 2 B
2 0 C
2 1 D
3 0 B
3 1 B
3 2 C
3 3 A
3 4 A
3 5 B
So firstly, I want to count the number of times each treatment is applied to a patient (ID), but one treatment can be given several times to the same ID. So, do I need to first delete all the duplicates and then count, or is there a function that does not take the duplicates into account?
What I want to have :
A : 2
B : 2
C : 2
D : 1
Then, I want to know how many time the treatment was given at the last position, but the last position is always different according to the ID.
What I want to have :
A : 0
B : 2 (for ID = 1 and 3)
C : 0
D : 1 (for ID = 2)
Thanks for your help, I am a new user of R !
Using base R, we can do,
# Left table: number of distinct IDs per treatment.
# Right table: IDs whose last row (last occurrence of the ID) has that treatment.
# A full outer merge keeps treatments that are never last (they show NA).
merge(
  x = aggregate(ID ~ TREATMENT, data = df,
                FUN = function(ids) length(unique(ids))),
  y = aggregate(ID ~ TREATMENT,
                data = df[!duplicated(df$ID, fromLast = TRUE), ],
                FUN = toString),
  by = 'TREATMENT',
  all = TRUE
)
Which gives,
TREATMENT ID.x ID.y
1 A 2 <NA>
2 B 2 1, 3
3 C 2 <NA>
4 D 1 2
Here is a tidyverse approach, where we get the distinct rows based on 'ID', 'TREATMENT' and get the count of 'TREATMENT'
library(tidyverse)
# Keep one row per (ID, TREATMENT) pair, then count rows per treatment.
count(distinct(df1, ID, TREATMENT), TREATMENT)
# A tibble: 4 x 2
# TREATMENT n
# <chr> <int>
#1 A 2
#2 B 2
#3 C 2
#4 D 1
and for second output, after grouping by 'ID', slice the last row (n()), create a column 'ind' and fill that with 0 for all missing combinations of 'TREATMENT' with complete, then get the sum of 'ind' after grouping by 'TREATMENT'
# Count, per treatment, how many IDs have that treatment on their last row.
df1 %>%
# Work within each ID.
group_by(ID) %>%
# n() is the group size, so this keeps only the last row of each ID.
slice(n()) %>%
# Flag the kept row as "treatment was last for this ID".
mutate(ind = 1) %>%
# Add the treatments missing for each ID, with ind = 0 so they count as "not last".
complete(TREATMENT = unique(df1$TREATMENT), fill = list(ind=0)) %>%
# Regroup by treatment and add up the flags.
group_by(TREATMENT) %>%
summarise(n = sum(ind))
# A tibble: 4 x 2
# TREATMENT n
# <chr> <dbl>
#1 A 0
#2 B 2
#3 C 0
#4 D 1
data
df1 <- structure(list(ID = c(1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 3L,
3L), POSITIONS = c(0L, 1L, 2L, 0L, 1L, 0L, 1L, 2L, 3L, 4L, 5L
), TREATMENT = c("A", "A", "B", "C", "D", "B", "B", "C", "A",
"A", "B")), .Names = c("ID", "POSITIONS", "TREATMENT"),
class = "data.frame", row.names = c(NA, -11L))

R - sample and resample a person-period file

I am working with a gigantic person-period file and I thought that
a good way to deal with a large dataset is by using sampling and re-sampling technique.
My person-period file look like this
id code time
1 1 a 1
2 1 a 2
3 1 a 3
4 2 b 1
5 2 c 2
6 2 b 3
7 3 c 1
8 3 c 2
9 3 c 3
10 4 c 1
11 4 a 2
12 4 c 3
13 5 a 1
14 5 c 2
15 5 a 3
I have actually two distinct issues.
The first issue is that I am having trouble in simply sampling a person-period file.
For example, I would like to sample 2 id-sequences such as :
id code time
1 a 1
1 a 2
1 a 3
2 b 1
2 c 2
2 b 3
The following line of code is working for sampling a person-period file
dt[which(dt$id %in% sample(dt$id, 2)), ]
However, I would like to use a dplyr solution because I am interested in resampling and in particular I would like to use replicate.
I am interested in doing something like replicate(100, sample_n(dt, 2), simplify = FALSE)
I am struggling with the dplyr solution because I am not sure what should be the grouping variable.
library(dplyr)
dt %>% group_by(id) %>% sample_n(1)
gives me an incorrect result because it does not keep the full sequence of each id.
Any clue how I could both sample and re-sample person-period file ?
data
dt = structure(list(id = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L,
3L, 4L, 4L, 4L, 5L, 5L, 5L), .Label = c("1", "2", "3", "4", "5"
), class = "factor"), code = structure(c(1L, 1L, 1L, 2L, 3L,
2L, 3L, 3L, 3L, 3L, 1L, 3L, 1L, 3L, 1L), .Label = c("a", "b",
"c"), class = "factor"), time = structure(c(1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("1", "2",
"3"), class = "factor")), .Names = c("id", "code", "time"), row.names = c(NA,
-15L), class = "data.frame")
I think the idiomatic way would probably look like
set.seed(1)
# Draw 2 distinct ids, then pull the full sequence of rows for each of them.
# The data frame is `dt` (as in the question); a bare `df` is not defined here
# and would resolve to the stats::df() function.
samp <- dt %>%
  distinct(id) %>%
  sample_n(2)
# Join on the id explicitly rather than relying on the natural-join message.
left_join(samp, dt, by = "id")
id code time
1 2 b 1
2 2 c 2
3 2 b 3
4 5 a 1
5 5 c 2
6 5 a 3
This extends straightforwardly to more grouping variables and fancier sampling rules.
If you need to do this many times...
nrep <- 100  # number of resamples
ng <- 2      # ids drawn per resample
# Repeat the table of distinct ids nrep times, label each copy with its
# replicate number r, then draw ng ids inside every replicate.
# Uses `dt`, the question's data frame (`df` is not defined here).
samps <- dt %>%
  distinct(id) %>%
  slice(rep(seq_len(n()), nrep)) %>%
  mutate(r = rep(seq_len(nrep), each = n() / nrep)) %>%
  group_by(r) %>%
  sample_n(ng)
# Attach the full sequence of rows for every sampled id in every replicate.
repdat <- left_join(samps, dt, by = "id")
# then do stuff with it:
repdat %>% group_by(r) %>% do_stuff
I imagine you are doing some simulations and may want to do the subsetting many times. You probably also want to try this data.table method and utilize the fast binary search feature on the key column:
library(data.table)
# Convert in place and key on id so the subsets below are keyed lookups.
setDT(dt)
setkey(dt, id)
# Each replicate draws 2 ids and returns all of their rows.
# FALSE is spelled out because `F` is an ordinary, reassignable variable.
replicate(2, dt[list(sample(unique(id), 2))], simplify = FALSE)
#[[1]]
# id code time
#1: 3 c 1
#2: 3 c 2
#3: 3 c 3
#4: 5 a 1
#5: 5 c 2
#6: 5 a 3
#[[2]]
# id code time
#1: 3 c 1
#2: 3 c 2
#3: 3 c 3
#4: 4 c 1
#5: 4 a 2
#6: 4 c 3
We can use filter with sample
# Keep every row whose id is one of 2 ids drawn without replacement.
filter(dt, id %in% sample(unique(id), size = 2, replace = FALSE))
NOTE: The OP specified using a dplyr method, and this solution does use dplyr.
If we need to do replicate one option would be using map from purrr
# Resample ids twice and return the matching rows of dt for each draw.
library(purrr)
dt %>%
# One-column table of the distinct ids.
distinct(id) %>%
# Two copies of that table; the printed result below is a list with two
# elements, both named `id`.
replicate(2, .) %>%
# Draw 2 ids without replacement from each copy.
map(~sample(., 2, replace=FALSE)) %>%
# For each draw, keep all rows of dt that belong to the sampled ids.
map(~filter(dt, id %in% .))
#$id
# id code time
#1 1 a 1
#2 1 a 2
#3 1 a 3
#4 4 c 1
#5 4 a 2
#6 4 c 3
#$id
# id code time
#1 4 c 1
#2 4 a 2
#3 4 c 3
#4 5 a 1
#5 5 c 2
#6 5 a 3

Resources