Transformation of a dataframe applied to 3 variables - r

I wrote several commands to transform a dataframe, but I would like to simplify the code, which I wrote in four parts. Parts 1, 2, and 3 do the calculation for columns 1, 2, and 3 (count the number of times each value is repeated in each column, and fill in the missing numbers between 0 and the maximum value of the three columns). The fourth part joins the previous outputs.
I would like to simplify it so that the transformation of the 3 columns happens in one block of code instead of 4. Is it possible to do this without writing a function?
Thank you in advance.
set.seed(1234)
# Data
A=sample(0:10, 20, replace = TRUE)
B=sample(0:10, 20, replace = TRUE)
C=sample(0:10, 20, replace = TRUE)
df=data.frame(A,B,C)
A B C
1 9 2 0
2 5 3 5
3 4 9 7
4 8 4 2
5 4 1 5
6 5 7 0
7 3 10 0
8 1 3 8
9 6 2 7
10 5 6 9
11 9 8 0
12 5 2 10
13 3 5 7
14 7 3 9
15 3 7 5
16 3 9 2
17 4 10 8
18 7 1 2
19 3 4 5
20 7 5 8
# Count for A
library(dplyr)
library(tidyr)
df2 = data.frame(A = 0:max(max(df$A), max(df$B), max(df$C)))
df3_A = df %>%
  select(A) %>%
  group_by(A) %>%
  mutate(A_number = n()) %>%
  distinct(A_number, .keep_all = TRUE) %>%
  ungroup() %>%
  complete(df2)
df3_A$A_number[is.na(df3_A$A_number)] = 0
# Count for B
df2 = data.frame(B = 0:max(max(df$A), max(df$B), max(df$C)))
df3_B = df %>%
  select(B) %>%
  group_by(B) %>%
  mutate(B_number = n()) %>%
  distinct(B_number, .keep_all = TRUE) %>%
  ungroup() %>%
  complete(df2)
df3_B$B_number[is.na(df3_B$B_number)] = 0
# Count for C
df2 = data.frame(C = 0:max(max(df$A), max(df$B), max(df$C)))
df3_C = df %>%
  select(C) %>%
  group_by(C) %>%
  mutate(C_number = n()) %>%
  distinct(C_number, .keep_all = TRUE) %>%
  ungroup() %>%
  complete(df2)
df3_C$C_number[is.na(df3_C$C_number)] = 0
# Join
df3 = df3_A %>%
  left_join(df3_B, by = c("A" = "B")) %>%
  left_join(df3_C, by = c("A" = "C"))
A A_number B_number C_number
<int> <dbl> <dbl> <dbl>
1 0 0 0 4
2 1 1 2 0
3 2 0 3 3
4 3 5 3 0
5 4 3 2 0
6 5 4 2 4
7 6 1 1 0
8 7 3 2 3
9 8 1 1 3
10 9 2 2 2
11 10 0 2 1

Using base R's stack and table:
as.data.frame.matrix(table(stack(df)))
# A B C
# 0 0 0 4
# 1 1 2 0
# 2 0 3 3
# 3 5 3 0
# 4 3 2 0
# 5 4 2 4
# 6 1 1 0
# 7 3 2 3
# 8 1 1 3
# 9 2 2 2
# 10 0 2 1
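For context, stack() reshapes the data frame to long form (a values column plus an ind column naming the source column), and table() cross-tabulates the two. One caveat, offered as an aside: table() only lists values that actually occur in at least one column, so if the full 0:10 range had to be guaranteed even for values absent from every column, a factor conversion would force it. A sketch:
long <- stack(df)   # columns: values (the numbers) and ind (A, B, or C)
# force the complete 0:10 range of counts, even for values never observed
as.data.frame.matrix(table(factor(long$values, levels = 0:10), long$ind))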

You can reshape to long, count the values by variable, then reshape back to wide, filling missing entries with zero:
library(dplyr)
library(tidyr)
df %>%
  pivot_longer(cols = everything()) %>%
  count(name, value) %>%
  pivot_wider(values_from = n, values_fill = 0) %>%
  arrange(value)
# A tibble: 11 × 4
value A B C
<int> <int> <int> <int>
1 0 0 0 4
2 1 1 2 0
3 2 0 3 3
4 3 5 3 0
5 4 3 2 0
6 5 4 2 4
7 6 1 1 0
8 7 3 2 3
9 8 1 1 3
10 9 2 2 2
11 10 0 2 1
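One caveat, as an aside: with this approach a value that appeared in none of the columns would be missing from the result entirely (in this sample every value in 0:10 happens to occur somewhere). If the full range must be guaranteed, tidyr's complete() can pad the counts before widening. A sketch:
library(dplyr)
library(tidyr)
df %>%
  pivot_longer(cols = everything()) %>%
  count(name, value) %>%
  complete(name, value = 0:10, fill = list(n = 0)) %>%  # pad the full 0:10 range
  pivot_wider(values_from = n, values_fill = 0) %>%
  arrange(value)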

You can use vctrs::vec_count over the columns and then merge the data frames together:
library(dplyr)
library(purrr)
df %>%
  mutate(across(A:C, ~ factor(.x, levels = 0:10, ordered = TRUE))) %>%
  map(vctrs::vec_count) %>%
  imap(~ {
    name <- paste0("count", .y)
    rename_with(.x, ~ name, count)
  }) %>%
  reduce(full_join, by = "key") %>%
  replace(is.na(.), 0) %>%
  arrange(key)
-output
key countA countB countC
1 0 0 0 4
2 1 1 2 0
3 2 0 3 3
4 3 5 3 0
5 4 3 2 0
6 5 4 2 4
7 6 1 1 0
8 7 3 2 3
9 8 1 1 3
10 9 2 2 2
11 10 0 2 1
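As a side note on the reduce(full_join, by = "key") step: purrr's reduce() folds the list of per-column count tables into a single data frame by joining them pairwise. A minimal standalone illustration:
library(dplyr)
library(purrr)
dfs <- list(
  data.frame(key = 1:2, a = c(10, 20)),
  data.frame(key = 2:3, b = c(30, 40))
)
reduce(dfs, full_join, by = "key")
#   key  a  b
# 1   1 10 NA
# 2   2 20 30
# 3   3 NA 40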

Related

how to pipe all these commands dplyr aggregate groupby mixedorder in R

Assume this is my dataset:
library(gtools)
library(dplyr)
df <- data.frame(grp = c(0.5, 0.6, 1, 2, 2, 2, 4.5, 10, 22, "kids", "Parents", "Teachers"),
                 f1 = c(1, 0, 3, 2, 4, 0, 3, 0, 1, 6, 8, 4),
                 f2 = c(1, 0, 3, 1, 4, 0, 1, 0, 1, 5, 8, 4),
                 f3 = c(1, 0, 3, 2, 4, 6, 1, 2, 1, 6, 8, 4))
df
grp f1 f2 f3
1 0.5 1 1 1
2 0.6 0 0 0
3 1 3 3 3
4 2 2 1 2
5 2 4 4 4
6 2 0 0 6
7 4.5 3 1 1
8 10 0 0 2
9 22 1 1 1
10 kids 6 5 6
11 Parents 8 8 8
12 Teachers 4 4 4
and this is my desired output
df_final
grp f1 f2 f3
1 <=1 4 4 4
2 2-9 9 6 13
3 10-19 0 0 2
4 >20 1 1 1
5 kids 6 5 6
6 Parents 8 8 8
7 Teachers 4 4 4
This is what I did + commenting my questions:
############ how NOT to split the set into two subsets of data
df_1 <- df %>%
  filter(grepl('kids|Parents|Teachers', grp))
df_1
grp f1 f2 f3
1 kids 6 5 6
2 Parents 8 8 8
3 Teachers 4 4 4
df_2 <- df %>%
  filter(!grepl('kids|Parents|Teachers', grp)) %>%
  mutate(across(.cols = grp, .fns = as.numeric)) %>%
  mutate(grp = cut(grp, breaks = c(-999, 2, 10, 21, 999),
                   labels = c("<=1", "2-9", "10-19", ">20"), right = F))
df_2
grp f1 f2 f3
1 <=1 1 1 1
2 <=1 0 0 0
3 <=1 3 3 3
4 2-9 2 1 2
5 2-9 4 4 4
6 2-9 0 0 6
7 2-9 3 1 1
8 10-19 0 0 2
9 >20 1 1 1
### how to pipe both aggregate and mixedorder/sort instead of separate lines of code
df_2 <- aggregate(. ~ grp, data = df_2, FUN = sum)
df_2[mixedorder(df_2$grp, decreasing = T), ]
df_2
grp f1 f2 f3
1 <=1 4 4 4
2 2-9 9 6 13
3 10-19 0 0 2
4 >20 1 1 1
### how to make sure 10-19 does not come before 2-9 in the actual dataset
grp a b d
1 <=1 53 48 53
2 10-15 65 63 65
3 2-9 30 40 30
df_final <- rbind(df_2, df_1)
df_final
grp f1 f2 f3
1 <=1 4 4 4
2 2-9 9 6 13
3 10-19 0 0 2
4 >20 1 1 1
5 kids 6 5 6
6 Parents 8 8 8
7 Teachers 4 4 4
Is there any neat way to get from the original df to df_final all in dplyr, by just piping commands?
How do I avoid splitting the set into two subsets of data?
How do I pipe both aggregate and mixedorder/sort instead of writing separate lines of code?
How do I make sure 10-19 does not come before 2-9 in the actual dataset?
Here is one option: create a second column ('grp2') with the cut values for the numeric elements only, coalesce it with the original column while appending the non-numeric values to the factor levels, and then do a group_by/summarise with across. This way we don't need mixedsort, as cut already returns its groups in sorted level order (see the illustration after the output).
library(dplyr)
library(stringr)
df %>%
  mutate(grp2 = case_when(str_detect(grp, '^[0-9.]+$')
           ~ cut(as.numeric(grp), breaks = c(-999, 2, 10, 21, 999),
                 labels = c("<=1", "2-9", "10-19", ">20"), right = FALSE))) %>%
  mutate(grp = factor(coalesce(as.character(grp2), grp),
                      levels = c(levels(grp2), unique(grp[is.na(grp2)]))),
         .keep = "unused") %>%
  group_by(grp) %>%
  summarise(across(everything(), ~ sum(.x, na.rm = TRUE)), .groups = 'drop')
-output
# A tibble: 7 × 4
grp f1 f2 f3
<fct> <dbl> <dbl> <dbl>
1 <=1 4 4 4
2 2-9 9 6 13
3 10-19 0 0 2
4 >20 1 1 1
5 kids 6 5 6
6 Parents 8 8 8
7 Teachers 4 4 4
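To make the ordering point concrete: as plain character strings, "10-19" sorts before "2-9", but cut() returns a factor whose levels follow the break order, so sorting and grouping respect the intended order. A minimal illustration:
x <- c("<=1", "2-9", "10-19", ">20")
sort(x)   # lexicographic: "10-19" lands before "2-9"
f <- factor(x, levels = c("<=1", "2-9", "10-19", ">20"))
sort(f)   # follows the level order: <=1 2-9 10-19 >20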

Use R to find values for which a condition is first met

Consider the following sample dataset. id is an individual identifier.
rm(list = ls()); set.seed(1)
n <- 100
X <- rbinom(n, 1, 0.5)            # binary covariate
j <- rep(1:n)
dat <- data.frame(id = 1:n, X)
ntp <- rep(4, n)
mat <- matrix(ncol = 3, nrow = 1)
m <- 0; w <- mat
for (l in ntp) {
  m <- m + 1
  ft <- seq(from = 2, to = 8, length.out = l)
  # ft <- seq(from = 1, to = 9, length.out = l)
  ft <- sort(ft)
  seq <- rep(ft, each = 2)
  seq <- c(0, seq, 10)
  matid <- cbind(matrix(seq, ncol = 2, nrow = l + 1, byrow = TRUE), m)
  w <- rbind(w, matid)
}
d <- data.frame(w[-1, ])
colnames(d) <- c("time1", "time2", "id")
D <- round(merge(d, dat, by = "id"), 2)   # merge datasets
nr <- nrow(D)
D$Survival_time <- round(rexp(nr, 0.1) + 1, 3)
head(D, 15)
id time1 time2 X Survival_time
1 1 0 2 0 21.341
2 1 2 4 0 18.987
3 1 4 6 0 4.740
4 1 6 8 0 13.296
5 1 8 10 0 6.397
6 2 0 2 0 10.566
7 2 2 4 0 2.470
8 2 4 6 0 14.907
9 2 6 8 0 8.620
10 2 8 10 0 13.376
11 3 0 2 1 45.239
12 3 2 4 1 11.545
13 3 4 6 1 11.352
14 3 6 8 1 19.760
15 3 8 10 1 7.547
How can I obtain the value at which Survival_time is less than time2 for the very first time per individual? I should end up with the following values:
id Survival_time
1 4.740
2 2.470
3 7.547
Also, how can I subset the data so that it stops, per individual, when this condition first occurs? I.e., obtain:
id time1 time2 X Survival_time
1 1 0 2 0 21.341
2 1 2 4 0 18.987
3 1 4 6 0 4.740
6 2 0 2 0 10.566
7 2 2 4 0 2.470
11 3 0 2 1 45.239
12 3 2 4 1 11.545
13 3 4 6 1 11.352
14 3 6 8 1 19.760
15 3 8 10 1 7.547
Using data.table
library(data.table)
setDT(D)[, .SD[seq_len(.N) <= which(Survival_time < time2)[1]], id]
-output
id time1 time2 X Survival_time
1: 1 0 2 0 21.341
2: 1 2 4 0 18.987
3: 1 4 6 0 4.740
4: 2 0 2 0 10.566
5: 2 2 4 0 2.470
6: 3 0 2 1 45.239
7: 3 2 4 1 11.545
8: 3 4 6 1 11.352
9: 3 6 8 1 19.760
10: 3 8 10 1 7.547
Slight variation:
library(dplyr)
D %>%                                # Take D, and then
  group_by(id) %>%                   # group by id, and then
  filter(Survival_time < time2) %>%  # keep Survival times < time2, and then
  slice(1) %>%                       # keep the first row per id, and then
  ungroup()                          # ungroup
You can use -
library(dplyr)
D %>%
  group_by(id) %>%
  summarise(Survival_time = Survival_time[match(TRUE, Survival_time < time2)])
# Also using which.max:
# summarise(Survival_time = Survival_time[which.max(Survival_time < time2)])
# id Survival_time
# <int> <dbl>
#1 1 4.74
#2 2 2.47
#3 3 7.55
To select the rows up to that point you may use -
D %>%
  group_by(id) %>%
  filter(row_number() <= match(TRUE, Survival_time < time2)) %>%
  ungroup
# id time1 time2 X Survival_time
# <int> <int> <int> <int> <dbl>
# 1 1 0 2 0 21.3
# 2 1 2 4 0 19.0
# 3 1 4 6 0 4.74
# 4 2 0 2 0 10.6
# 5 2 2 4 0 2.47
# 6 3 0 2 1 45.2
# 7 3 2 4 1 11.5
# 8 3 4 6 1 11.4
# 9 3 6 8 1 19.8
#10 3 8 10 1 7.55
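A further variation, in case it is useful: cumany() flags every row from the first match onward, so lagging it keeps the rows up to and including that first match. A sketch (it assumes, as in this sample, that each id eventually meets the condition):
library(dplyr)
D %>%
  group_by(id) %>%
  # lag(cumany(cond)) is still FALSE on the first TRUE row, so that row is kept
  filter(!lag(cumany(Survival_time < time2), default = FALSE)) %>%
  ungroup()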

DPLYR - merging rows together using a column value as a conditional

I have a series of rows in a single dataframe. I'm trying to aggregate the first two rows for each ID, i.e., I want to combine events 1 and 2 for ID 1 into a single row, events 1 and 2 for ID 2 into a single row, etc., but leave event 3 completely untouched.
library(dplyr)
id <- c(1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5)
event <- c(1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3)
score <- c(3, NA, 1, 3, NA, 2, 6, NA, 1, 8, NA, 2, 4, NA, 1)
score2 <- c(NA, 4, 1, NA, 5, 2, NA, 0, 3, NA, 5, 6, NA, 8, 7)
df <- tibble(id, event, score, score2)
# A tibble: 15 x 4
id event score score2
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 NA
2 1 2 NA 4
3 1 3 1 1
4 2 1 3 NA
5 2 2 NA 5
6 2 3 2 2
7 3 1 6 NA
8 3 2 NA 0
9 3 3 1 3
10 4 1 8 NA
11 4 2 NA 5
12 4 3 2 6
13 5 1 4 NA
14 5 2 NA 8
15 5 3 1 7
I've tried:
df_merged <- df %>% group_by(id) %>% summarise_all(funs(min(as.character(.), na.rm = TRUE)))
which aggregates these nicely, but then I struggle to merge these back into the original dataframe/tibble (there are really about 300 different "score" columns in the full dataset, so a right_join is a headache with score.x, score.y, score2.x, score2.y all over the place...)
Ideally, the situation would need to be dplyr as the rest of my code runs on this!
EDIT:
Ideally, my expected output would be:
# A tibble: 10 x 4
id event score score2
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 4
3 1 3 1 1
4 2 1 3 5
6 2 3 2 2
7 3 1 6 0
9 3 3 1 3
10 4 1 8 5
12 4 3 2 6
13 5 1 4 8
15 5 3 1 7
We may change the order of the NA elements with replace, then drop the rows that are left entirely NA (see the illustration after the output):
library(dplyr)
df %>%
  group_by(id) %>%
  mutate(across(starts_with('score'),
                ~ replace(., 1:2, .[1:2][order(is.na(.[1:2]))]))) %>%
  ungroup %>%
  filter(if_all(starts_with('score'), Negate(is.na)))
-output
# A tibble: 10 x 4
id event score score2
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 4
2 1 3 1 1
3 2 1 3 5
4 2 3 2 2
5 3 1 6 0
6 3 3 1 3
7 4 1 8 5
8 4 3 2 6
9 5 1 4 8
10 5 3 1 7
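The key to the across() call is that order(is.na(x)) sorts FALSE before TRUE, i.e. it moves the non-NA entries to the front; within the first two rows of each group the single non-NA score therefore lands in row 1, and the now all-NA second row is dropped by the filter. A tiny illustration:
x <- c(NA, 4)
order(is.na(x))      # 2 1 -- the non-NA position comes first
x[order(is.na(x))]   # 4 NA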
Here is an alternative way to achieve your task with fill from the tidyr package:
library(dplyr)
library(tidyr)
df %>%
  group_by(id) %>%
  fill(everything(), .direction = "down") %>%
  fill(everything(), .direction = "up") %>%
  slice(1, 3)
id event score score2
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 4
2 1 3 1 1
3 2 1 3 5
4 2 3 2 2
5 3 1 6 0
6 3 3 1 3
7 4 1 8 5
8 4 3 2 6
9 5 1 4 8
10 5 3 1 7
How about this?
library(dplyr)
df_e12 <- df %>%
  filter(event %in% c(1, 2)) %>%
  group_by(id) %>%
  mutate(across(starts_with("score"), ~ min(.x, na.rm = TRUE))) %>%
  ungroup() %>%
  distinct(id, .keep_all = TRUE)
df_e3 <- df %>%
  filter(event == 3)
df <- bind_rows(df_e12, df_e3) %>%
  arrange(id, event)
df
# A tibble: 10 x 4
id event score score2
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 4
2 1 3 1 1
3 2 1 3 5
4 2 3 2 2
5 3 1 6 0
6 3 3 1 3
7 4 1 8 5
8 4 3 2 6
9 5 1 4 8
10 5 3 1 7
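For completeness, the same result can also be reached in a single pipeline, starting again from the original df, by grouping events 1 and 2 together and summarising. A sketch (it assumes, as in this sample, that across events 1 and 2 each score column has exactly one non-NA value per id):
library(dplyr)
df %>%
  group_by(id, block = if_else(event == 3, 2L, 1L)) %>%  # events 1-2 vs event 3
  summarise(event = first(event),
            across(starts_with("score"), ~ max(.x, na.rm = TRUE)),
            .groups = "drop") %>%
  select(-block)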

Count number of new and lost friends between two data frames in R

I have two data frames of the same respondents, one from Time 1 and the next from Time 2. In each wave they nominated their friends, and I want to know:
1) how many friends are nominated in Time 2 but not in Time 1 (new friends)
2) how many friends are nominated in Time 1 but not in Time 2 (lost friends)
Sample data:
Time 1 DF
ID friend_1 friend_2 friend_3
1 4 12 7
2 8 6 7
3 9 NA NA
4 15 7 2
5 2 20 7
6 19 13 9
7 12 20 8
8 3 17 10
9 1 15 19
10 2 16 11
Time 2 DF
ID friend_1 friend_2 friend_3
1 4 12 3
2 8 6 14
3 9 NA NA
4 15 7 2
5 1 17 9
6 9 19 NA
7 NA NA NA
8 7 1 16
9 NA 10 12
10 7 11 9
So the desired DF would include these columns (EDIT: filled in below):
ID num_newfriends num_lostfriends
1 1 1
2 1 1
3 0 0
4 0 0
5 3 3
6 0 1
7 0 3
8 3 3
9 2 3
10 2 1
EDIT 2:
I've tried doing an anti join:
df3 <- anti_join(df1, df2)
But this method doesn't take into account friend IDs that may appear in a different column in Time 2 (for example, respondent 6's friends 9 and 19 appear in both T1 and T2, but in different columns each time).
Another option:
library(tidyverse)
left_join(
  gather(df1, key, x, -ID),
  gather(df2, key, y, -ID),
  by = c("ID", "key")
) %>%
  group_by(ID) %>%
  summarise(
    num_newfriends = sum(!y[!is.na(y)] %in% x[!is.na(x)]),
    num_lostfriends = sum(!x[!is.na(x)] %in% y[!is.na(y)])
  )
Output:
# A tibble: 10 x 3
ID num_newfriends num_lostfriends
<int> <int> <int>
1 1 1 1
2 2 1 1
3 3 0 0
4 4 0 0
5 5 3 3
6 6 0 1
7 7 0 3
8 8 3 3
9 9 2 3
10 10 2 2
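The same logic can be written with the newer pivot_longer() (gather() is superseded in current tidyr). A sketch:
library(dplyr)
library(tidyr)
left_join(
  pivot_longer(df1, -ID, values_to = "x"),
  pivot_longer(df2, -ID, values_to = "y"),
  by = c("ID", "name")
) %>%
  group_by(ID) %>%
  summarise(
    num_newfriends = sum(!y[!is.na(y)] %in% x[!is.na(x)]),
    num_lostfriends = sum(!x[!is.na(x)] %in% y[!is.na(y)])
  )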
Simple comparisons would be an option
library(tidyverse)
na_sums_old <- rowSums(is.na(time1))
na_sums_new <- rowSums(is.na(time2))
kept_friends <- map_dbl(seq(nrow(time1)), ~ sum(time1[.x, -1] %in% time2[.x, -1]))
kept_friends <- kept_friends - na_sums_old * (na_sums_new >= 1)
new_friends <- 3 - na_sums_new - kept_friends
lost_friends <- 3 - na_sums_old - kept_friends
tibble(ID = time1$ID, new_friends = new_friends, lost_friends = lost_friends)
# A tibble: 10 x 3
ID new_friends lost_friends
<int> <dbl> <dbl>
1 1 1 1
2 2 1 1
3 3 0 0
4 4 0 0
5 5 3 3
6 6 0 1
7 7 0 3
8 8 3 3
9 9 2 3
10 10 2 2
You can make anti_join work by first pivoting to a "long" data frame.
df1 <- df1 %>%
  pivot_longer(starts_with("friend_"), values_to = "friend") %>%
  drop_na()
df2 <- df2 %>%
  pivot_longer(starts_with("friend_"), values_to = "friend") %>%
  drop_na()
head(df1)
#> # A tibble: 6 x 3
#> ID name friend
#> <int> <chr> <int>
#> 1 1 friend_1 4
#> 2 1 friend_2 12
#> 3 1 friend_3 7
#> 4 2 friend_1 8
#> 5 2 friend_2 6
#> 6 2 friend_3 7
lost_friends <- anti_join(df1, df2, by = c("ID", "friend"))
new_friends <- anti_join(df2, df1, by = c("ID", "friend"))
respondents <- distinct(df1, ID)
respondents %>%
  full_join(
    count(lost_friends, ID, name = "num_lost_friends")
  ) %>%
  full_join(
    count(new_friends, ID, name = "num_new_friends")
  ) %>%
  mutate_at(vars(starts_with("num_")), replace_na, 0)
#> Joining, by = "ID"
#> Joining, by = "ID"
#> # A tibble: 10 x 3
#> ID num_lost_friends num_new_friends
#> <int> <dbl> <dbl>
#> 1 1 1 1
#> 2 2 1 1
#> 3 3 0 0
#> 4 4 0 0
#> 5 5 3 3
#> 6 6 1 0
#> 7 7 3 0
#> 8 8 3 3
#> 9 9 3 2
#> 10 10 2 2
Created on 2019-11-01 by the reprex package (v0.3.0)

Get edge list that includes alter's alters

I need a data frame that includes three columns: i, j (alter), and k (j's alter). I have an adjacency matrix (sample below). From there I can get a graph object and extract the edge list. How can I manipulate the data to get an output like the WANT data frame below?
HAVE (matrix & edgelist):
1 2 3 4 5
1 0 0 0 1 0
2 0 0 1 1 1
3 0 0 0 0 0
4 1 1 0 0 1
5 1 1 0 1 0
g <- graph_from_adjacency_matrix(mat)
get.edgelist(g)
i j
1 4
2 3
2 4
2 5
4 1
4 2
4 5
5 1
5 2
5 4
WANT (ijk edge list):
i j k
1 4 2
1 4 5
2 4 1
2 4 5
4 2 3
4 5 1
4 5 2
5 1 4
5 2 3
5 2 4
5 4 1
5 4 2
The ijk edge list should show all possible triples with i-j, excluding self loops (e.g., 1 4 1).
Data:
as.matrix(read.table(text = "0 0 0 1 0
0 0 1 1 1
0 0 0 0 0
1 1 0 0 1
1 1 0 1 0",
header = F, stringsAsFactors = F)) -> m1
dimnames(m1) <- list(1:5, 1:5)
Libraries:
library(igraph)
library(dplyr)
library(tidyr)
library(magrittr)
Solution:
g1 <- graph_from_adjacency_matrix(m1)
e1 <- get.edgelist(g1) %>% as.data.frame %>% mutate_if(is.factor, as.character)
e1 %>%
  group_by(V1) %>%
  nest(V2) %>%   # note: older tidyr nest/unnest syntax; V21 below is the renamed duplicate of V2
  right_join(e1, ., by = c("V2" = "V1")) %>%
  unnest %>%
  filter(V1 != V21) %>%
  set_colnames(c("i", "j", "k"))
Output:
#> i j k
#> 1 1 4 2
#> 2 1 4 5
#> 3 2 4 1
#> 4 2 4 5
#> 5 2 5 1
#> 6 2 5 4
#> 7 4 2 3
#> 8 4 2 5
#> 9 4 5 1
#> 10 4 5 2
#> 11 5 1 4
#> 12 5 2 3
#> 13 5 2 4
#> 14 5 4 1
#> 15 5 4 2
I was actually able to find a way to do it using igraph and dplyr:
# make graph of matrix
g <- graph_from_adjacency_matrix(mat)
# put edgelist into two objects, one where columns are "i, j" and the other "j, k"
df1 <- get.edgelist(g) %>%
  as.data.frame() %>%
  select(i = V1, j = V2)
df2 <- get.edgelist(g) %>%
  as.data.frame() %>%
  select(j = V1, k = V2)
# combine the dataframes, filter out rows where i and k are the same observation
df_combn <- inner_join(df1, df2, by = c("j" = "j")) %>%
  mutate_all(as.character) %>%
  filter(i != k)
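The same self-join can also be written in base R with merge(), avoiding the two intermediate data frames. A sketch using the g defined above:
el <- as.data.frame(get.edgelist(g), stringsAsFactors = FALSE)
names(el) <- c("i", "j")
# join the edge list to itself on j, so each k is one of j's alters
ijk <- merge(el, setNames(el, c("j", "k")), by = "j")[, c("i", "j", "k")]
ijk <- ijk[ijk$i != ijk$k, ]   # drop i-j-i triples (self loops in k)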
