R Reshape Wide To Long Using Column Stub Strings - r

data1=data.frame("School"=c(1,1,2,2,3,3,4,4),
"Fund"=c(0,1,0,1,0,1,0,1),
"Total_A_Grade5"=c(22,20,21,24,24,26,25,22),
"Group1_A_Grade5"=c(10,6,6,10,9,9,9,10),
"Group2_A_Grade5"=c(5,9,9,8,10,8,8,6),
"Total_B_Grade5"=c(23,33,19,21,19,23,20,21),
"Group1_B_Grade5"=c(8,7,7,10,9,9,5,5),
"Group2_B_Grade5"=c(6,10,7,6,6,5,9,9),
"Total_A_Grade6"=c(18,24,16,24,26,25,16,19),
"Group1_A_Grade6"=c(7,7,5,9,10,9,5,7),
"Group2_A_Grade6"=c(5,8,6,7,10,8,8,9),
"Total_B_Grade6"=c(26,23,22,24,21,22,24,19),
"Group1_B_Grade6"=c(10,10,6,10,7,8,8,7),
"Group2_B_Grade6"=c(9,6,9,6,7,6,9,9),
"Total_A_Grade7"=c(20,19,18,25,16,21,19,26),
"Group1_A_Grade7"=c(9,7,7,9,7,7,5,8),
"Group2_A_Grade7"=c(8,5,7,9,6,5,5,9),
"Total_B_Grade7"=c(25,21,24,25,18,18,27,18),
"Group1_B_Grade7"=c(10,10,10,7,5,6,8,5),
"Group2_B_Grade7"=c(9,6,8,10,8,6,10,6))
data2=data.frame("School"=c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),
"Fund"=c(0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1),
"Type"=c('Total','Total','Group1','Group1','Group2','Group2','Total','Total','Group1','Group1','Group2','Group2','Total','Total','Group1','Group1','Group2','Group2','Total','Total','Group1','Group1','Group2','Group2'),
"Class"=c('A','A','A','A','A','A','B','B','B','B','B','B','A','A','A','A','A','A','B','B','B','B','B','B'),
"Grade"=c(5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6),
"Score"=c(22,20,10,6,5,9,23,33,8,7,6,10,18,24,7,7,5,8,26,23,10,10,9,6))
I have 'data1' and want to reshape to make 'data2' which just shows example for School 1 grade 5 and 6 but I want all of data1 reshaped.
The column names of 'data1' contain rich information. For example, Group2_B_Grade6 indicated 'Type' = Group2, 'Class' = B, 'Grade' = 6. I wish to reshape 'data1' and then use these stubs separated by "_" as colnames to prepare 'data2'
data3=data.frame("School"=c(1,1,2,2,3,3,4,4),
"Fund"=c(0,1,0,1,0,1,0,1),
"Grade_5"=c(22,20,21,24,24,26,25,22),
"Grade_6"=c(10,6,6,10,9,9,9,10),
"Grade_7"=c(5,9,9,8,10,8,8,6))

You can do this directly with pivot_longer with some regex in names_pattern.
tidyr::pivot_longer(data1,
cols = -c(School, Fund),
names_to = c('Type', 'Class', 'Grade'),
names_pattern = '(.*?)_([A-Z])_Grade(\\d+)',
values_to = 'Score')
# A tibble: 144 x 6
# School Fund Type Class Grade Score
# <dbl> <dbl> <chr> <chr> <chr> <dbl>
# 1 1 0 Total A 5 22
# 2 1 0 Group1 A 5 10
# 3 1 0 Group2 A 5 5
# 4 1 0 Total B 5 23
# 5 1 0 Group1 B 5 8
# 6 1 0 Group2 B 5 6
# 7 1 0 Total A 6 18
# 8 1 0 Group1 A 6 7
# 9 1 0 Group2 A 6 5
#10 1 0 Total B 6 26
# … with 134 more rows

Using dplyr (and tidyr):
library(dplyr)
library(tidyr)
data2 <- data1 %>%
pivot_longer(-c(School, Fund)) %>%
separate(name, into = c('Type', 'Class', 'Grade')) %>%
extract(Grade, 'Grade', "([0-9]+)")
data2
#> # A tibble: 144 x 6
#> School Fund Type Class Grade value
#> <dbl> <dbl> <chr> <chr> <chr> <dbl>
#> 1 1 0 Total A 5 22
#> 2 1 0 Group1 A 5 10
#> 3 1 0 Group2 A 5 5
#> 4 1 0 Total B 5 23
#> 5 1 0 Group1 B 5 8
#> 6 1 0 Group2 B 5 6
#> 7 1 0 Total A 6 18
#> 8 1 0 Group1 A 6 7
#> 9 1 0 Group2 A 6 5
#> 10 1 0 Total B 6 26
#> # … with 134 more rows
Created on 2020-04-06 by the reprex package (v0.3.0)

We can use melt from data.table
library(data.table)
melt(setDT(data1), id.var = c('School', 'Fund'))[,
c('Type', 'Class', 'Grade') := tstrsplit(variable, "_")][,
Grade := sub('Grade', '', Grade)][, variable := NULL][]
# School Fund value Type Class Grade
# 1: 1 0 22 Total A 5
# 2: 1 1 20 Total A 5
# 3: 2 0 21 Total A 5
# 4: 2 1 24 Total A 5
# 5: 3 0 24 Total A 5
# ---
#140: 2 1 10 Group2 B 7
#141: 3 0 8 Group2 B 7
#142: 3 1 6 Group2 B 7
#143: 4 0 10 Group2 B 7
#144: 4 1 6 Group2 B 7

Related

How to calculate cumulative sum for each group in time?

For each unique ID and rep, I want to calculate the cumulative number of babies at each age?
For instance, A1, the cumulative sum should look like 1,3,6
I tried the folowing method
id <- c("A","A","A","A","A","A","B","B","B","B","B","B","B","B","B")
rep <- c(1,1,1,2,2,2,1,1,1,1,2,2,2,2,2)
age <- c(0,1,2,0,1,2,0,1,2,3,0,1,2,3,4)
babies <- c(1,2,3,0,1,3,0,1,5,1,0,0,12,1,1)
df <- data.frame(id,rep,age,babies)
df$csum <- ave(df$babies, c(df$id,df$age, df$age), FUN=cumsum)
The result is cumulative sum is calculated over ID alone but not replicate or age. Any suggestions?
How about this:
library(dplyr)
id <- c("A","A","A","A","A","A","B","B","B","B","B","B","B","B","B")
rep <- c(1,1,1,2,2,2,1,1,1,1,2,2,2,2,2)
age <- c(0,1,2,0,1,2,0,1,2,3,0,1,2,3,4)
babies <- c(1,2,3,0,1,3,0,1,5,1,0,0,12,1,1)
df <- data.frame(id,rep,age,babies)
df %>%
group_by(id, rep) %>%
arrange(age, .by_group = TRUE) %>%
mutate(csum = cumsum(babies))
#> # A tibble: 15 × 5
#> # Groups: id, rep [4]
#> id rep age babies csum
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 A 1 0 1 1
#> 2 A 1 1 2 3
#> 3 A 1 2 3 6
#> 4 A 2 0 0 0
#> 5 A 2 1 1 1
#> 6 A 2 2 3 4
#> 7 B 1 0 0 0
#> 8 B 1 1 1 1
#> 9 B 1 2 5 6
#> 10 B 1 3 1 7
#> 11 B 2 0 0 0
#> 12 B 2 1 0 0
#> 13 B 2 2 12 12
#> 14 B 2 3 1 13
#> 15 B 2 4 1 14
Created on 2022-12-08 by the reprex package (v2.0.1)

reset a ranking when a variable exceeds a value using dplyr

Suppose I have the following data:
df <- tibble(ID=c(1,2,3,4,5,6,7,8,9,10),
ID2=c(1,1,1,1,2,2,2,3,4,4),
VAR=c(25,10,120,60,85,90,20,40,60,150))
I want to add a new column with a ranking that would be reset either when the ID2 changes or when VAR is greater than 100.
The desired result is:
# A tibble: 10 x 4
ID ID2 VAR RANK
<dbl> <dbl> <dbl> <dbl>
1 1 1 25 1
2 2 1 10 2
3 3 1 120 1
4 4 1 60 2
5 5 2 85 1
6 6 2 90 2
7 7 2 20 3
8 8 3 40 1
9 9 4 60 1
10 10 4 150 1
I know how to add a new column with a ranking that would be reset only when the ID2 changes:
df %>%
arrange(ID2) %>%
group_by(ID2) %>%
mutate(RANK = row_number())
... but treating both conditions at the same time is more difficult. How should I do using dplyr?
You can group_by ID2 and cumsum(VAR > 100), i.e.:
library(dplyr)
df %>%
group_by(ID2, cumVAR = cumsum(VAR > 100)) %>%
mutate(RANK = row_number())
output
# A tibble: 10 x 5
# Groups: ID2, cumVAR [6]
ID ID2 VAR cumVAR RANK
<dbl> <dbl> <dbl> <int> <int>
1 1 1 25 0 1
2 2 1 10 0 2
3 3 1 120 1 1
4 4 1 60 1 2
5 5 2 85 1 1
6 6 2 90 1 2
7 7 2 20 1 3
8 8 3 40 1 1
9 9 4 60 1 1
10 10 4 150 2 1
rowid from data.table would be useful as well
library(dplyr)
library(data.table)
df %>%
mutate(RANK = rowid(ID2, cumsum(VAR > 100)))
-output
# A tibble: 10 × 4
ID ID2 VAR RANK
<dbl> <dbl> <dbl> <int>
1 1 1 25 1
2 2 1 10 2
3 3 1 120 1
4 4 1 60 2
5 5 2 85 1
6 6 2 90 2
7 7 2 20 3
8 8 3 40 1
9 9 4 60 1
10 10 4 150 1

Parse one column of json and bind with other column to make dataframe

I have data that takes the format:
have <- structure(list(V1 = c(4L, 28L, 2L),
V2 = c("[{\"group\":1,\"topic\":\"A\"},{\"group\":1,\"topic\":\"B\"},{\"group\":2,\"topic\":\"C\"},{\"group\":2,\"topic\":\"T\"},{\"group\":2,\"topic\":\"U\"},{\"group\":3,\"topic\":\"V\"},{\"group\":3,\"topic\":\"D\"},{\"group\":3,\"topic\":\"R\"},{\"group\":4,\"topic\":\"A\"},{\"group\":4,\"topic\":\"Q\"},{\"group\":4,\"topic\":\"S\"},{\"group\":4,\"topic\":\"W\"},{\"group\":6,\"topic\":\"O\"},{\"group\":6,\"topic\":\"P\"},{\"group\":6,\"topic\":\"E\"},{\"group\":6,\"topic\":\"F\"},{\"group\":6,\"topic\":\"G\"},{\"group\":6,\"topic\":\"H\"},{\"group\":6,\"topic\":\"I\"},{\"group\":6,\"topic\":\"J\"},{\"group\":6,\"topic\":\"K\"},{\"group\":6,\"topic\":\"L\"},{\"group\":6,\"topic\":\"M\"},{\"group\":6,\"topic\":\"N\"}]",
"[]",
"[{\"group\":2,\"topic\":\"C\"},{\"group\":3,\"topic\":\"D\"},{\"group\":6,\"topic\":\"O\"},{\"group\":6,\"topic\":\"P\"},{\"group\":6,\"topic\":\"E\"},{\"group\":6,\"topic\":\"G\"},{\"group\":6,\"topic\":\"M\"}]")
),
row.names = c(NA, 3L),
class = "data.frame")
The contents of V2 are nested groupings for each row like [{"group":1,"topic":"A"},{"group":1,"topic":"B"}...]
I want to get a wide dataframe that has an indicator (1/0) for each combination of group+topic (see also_have) for each row. Something like this:
# A tibble: 3 x 4
id topic_id_1 topic_id_2 topic_id_3 topic_id_4 ...
<dbl> <dbl> <dbl> <dbl>
1 4 1 1 0
2 28 0 0 0
3 2 0 0 0
The first step is to parse the json.
I can use purrr::map(have$V2, jsonlite::fromJSON) to unnest into a list, but I'm not sure how to bind the V1 column (that we might rename to id) to each element of the resulting list (note that list element two is empty because V1==28 is empty). Here's a snippet of what the first element might look like with the id (V1) added.
[[1]]
group topic id
1 1 A 4
2 1 B 4
3 2 C 4
4 2 T 4
...
Alternatively, I think purrr::map_df(have$V2, jsonlite::fromJSON) would get me closer to what I ultimately need, but here too I'm not sure how to add the row id (V1).
df <- purrr::map_df(have$V2, jsonlite::fromJSON)
df
What I get:
group topic
1 1 A
2 1 B
3 2 C
4 2 T
...
What I want (notice `V1==28` does not appear):
group topic id
1 1 A 4
2 1 B 4
3 2 C 4
4 2 T 4
5 2 U 4
6 3 V 4
7 3 D 4
8 3 R 4
9 4 A 4
10 4 Q 4
11 4 S 4
12 4 W 4
13 6 O 4
14 6 P 4
15 6 E 4
16 6 F 4
17 6 G 4
18 6 H 4
19 6 I 4
20 6 J 4
21 6 K 4
22 6 L 4
23 6 M 4
24 6 N 4
25 2 C 2
26 3 D 2
27 6 O 2
28 6 P 2
29 6 E 2
30 6 G 2
31 6 M 2
STOP.
I think if I can get the above dataframe with id I can get the rest of the way. The ultimate goal is to join this info with also_have and then pivot wide.
# join
also_have <- expand_grid(c(1:6), c(LETTERS)) %>%
mutate(topic_id = 1:n()) %>%
magrittr::set_colnames(c("group", "topic", "topic_id")) %>%
select(topic_id, group, topic)
# pivot wide
# A tibble: 3 x 4
id topic_id_1 topic_id_2 topic_id_3 topic_id_4 ...
<dbl> <dbl> <dbl> <dbl>
1 4 1 1 0
2 28 0 0 0
3 2 0 0 0
Update:
Applying #akrun's solution:
purrr::map_dfr(setNames(have$V2, have$V1),
jsonlite::fromJSON,
.id = 'V1') %>%
rename(id = V1) %>%
left_join(also_have, by=c("group", "topic")) %>%
select(-group, -topic) %>%
mutate(value = 1) %>%
pivot_wider(id_cols = id,
names_from = topic_id,
names_prefix = "topic_id",
values_from = value,
values_fill = 0
) %>%
full_join(tibble(id = as.character(have$V1))) %>%
replace(is.na(.), 0)
# A tibble: 3 x 25
id topic_id1 topic_id2 topic_id29 topic_id46 topic_id47 topic_id74 topic_id56
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 4 1 1 1 1 1 1 1
2 2 0 0 1 0 0 0 1
3 28 0 0 0 0 0 0 0
# … with 17 more variables: topic_id70 <dbl>, topic_id79 <dbl>, topic_id95 <dbl>,
# topic_id97 <dbl>, topic_id101 <dbl>, topic_id145 <dbl>, topic_id146 <dbl>,
# topic_id135 <dbl>, topic_id136 <dbl>, topic_id137 <dbl>, topic_id138 <dbl>,
# topic_id139 <dbl>, topic_id140 <dbl>, topic_id141 <dbl>, topic_id142 <dbl>,
# topic_id143 <dbl>, topic_id144 <dbl>
We could pass a named vector and then use .id in map_dfr
purrr::map_dfr(setNames(have$V2, have$V1), jsonlite::fromJSON, .id = 'id')
-output
id group topic
1 4 1 A
2 4 1 B
3 4 2 C
4 4 2 T
5 4 2 U
6 4 3 V
7 4 3 D
8 4 3 R
9 4 4 A
10 4 4 Q
11 4 4 S
12 4 4 W
...
Or this can be done within in dplyr framework itself after using rowwise
library(tidyr)
have %>%
rowwise %>%
transmute(ID = V1, V2 = list(fromJSON(V2))) %>%
ungroup %>%
unnest(c(V2), keep_empty = TRUE) %>%
select(-V2)
# A tibble: 32 x 3
ID group topic
<int> <int> <chr>
1 4 1 A
2 4 1 B
3 4 2 C
4 4 2 T
5 4 2 U
6 4 3 V
7 4 3 D
8 4 3 R
9 4 4 A
10 4 4 Q
# … with 22 more rows
For the second step do a join
out <- have %>%
rowwise %>%
transmute(ID = V1, V2 = list(fromJSON(V2))) %>%
ungroup %>%
unnest(c(V2), keep_empty = TRUE) %>%
select(-V2) %>% right_join(also_have)
out
Joining, by = c("group", "topic")
# A tibble: 163 x 4
ID group topic topic_id
<int> <int> <chr> <int>
1 4 1 A 1
2 4 1 B 2
3 4 2 C 29
4 4 2 T 46
5 4 2 U 47
6 4 3 V 74
7 4 3 D 56
8 4 3 R 70
9 4 4 A 79
10 4 4 Q 95
# … with 153 more rows

Create a dummy variable indicating whether a value is observed before

I have a huge dataset and wanted to create a binary dummy variable indicating whether a value is observed before. Here is the sample data set.
data.frame(
id = c(rep("A",3),rep("B",3),rep("C",3)),
time = rep(seq(1:3),3),
item = c(11,12,13,11,11,13,22,11,22))
From the dataset, here is the desired column,
observed_b4 = c(NA,0,0,NA,1,0,NA,0,1)
For each group, I want to have information about whether item is observed before or not. I can do it with for-loop but the data size is too big to do.
Using duplicated:
base:
cbind(x, flag = as.integer(duplicated(paste(x$id, x$item))))
# id time item flag
# 1 A 1 11 0
# 2 A 2 12 0
# 3 A 3 13 0
# 4 B 1 11 0
# 5 B 2 11 1
# 6 B 3 13 0
# 7 C 1 22 0
# 8 C 2 11 0
# 9 C 3 22 1
or dplyr:
library(dplyr)
x %>%
group_by(id) %>%
mutate(flag = as.integer(duplicated(item)))
## A tibble: 9 x 4
## Groups: id [3]
# id time item flag
# <chr> <int> <dbl> <int>
#1 A 1 11 0
#2 A 2 12 0
#3 A 3 13 0
#4 B 1 11 0
#5 B 2 11 1
#6 B 3 13 0
#7 C 1 22 0
#8 C 2 11 0
#9 C 3 22 1
A solution with base R that uses: ave and duplicated.
ave allows you to apply a function over df$item for each group made by df$id. duplicated checks whether an item was already shown. ave returns automatically a numeric vector (the name class of the input vector).
df$observed_b4 <- ave(df$item, df$id, FUN = duplicated)
df
#> id time item observed_b4
#> 1 A 1 11 0
#> 2 A 2 12 0
#> 3 A 3 13 0
#> 4 B 1 11 0
#> 5 B 2 11 1
#> 6 B 3 13 0
#> 7 C 1 22 0
#> 8 C 2 11 0
#> 9 C 3 22 1
However, to get exactly what you're looking for, you can use this:
df$observed_b4 <- ave(df$item, df$id, FUN = function(x) replace(duplicated(x),1,NA))
df
#> id time item observed_b4
#> 1 A 1 11 NA
#> 2 A 2 12 0
#> 3 A 3 13 0
#> 4 B 1 11 NA
#> 5 B 2 11 1
#> 6 B 3 13 0
#> 7 C 1 22 NA
#> 8 C 2 11 0
#> 9 C 3 22 1
We could group by 'id', 'item', create a logical vector with row_number() and coerce it to binary (+)
library(dplyr)
df1 %>%
group_by(id, item) %>%
mutate(flag = +(row_number() != 1))
-output
# A tibble: 9 x 4
# Groups: id, item [7]
# id time item flag
# <chr> <int> <dbl> <int>
#1 A 1 11 0
#2 A 2 12 0
#3 A 3 13 0
#4 B 1 11 0
#5 B 2 11 1
#6 B 3 13 0
#7 C 1 22 0
#8 C 2 11 0
#9 C 3 22 1

Count number of new and lost friends between two data frames in R

I have two data frames of the same respondents, one from Time 1 and the next from Time 2. In each wave they nominated their friends, and I want to know:
1) how many friends are nominated in Time 2 but not in Time 1 (new friends)
2) how many friends are nominated in Time 1 but not in Time 2 (lost friends)
Sample data:
Time 1 DF
ID friend_1 friend_2 friend_3
1 4 12 7
2 8 6 7
3 9 NA NA
4 15 7 2
5 2 20 7
6 19 13 9
7 12 20 8
8 3 17 10
9 1 15 19
10 2 16 11
Time 2 DF
ID friend_1 friend_2 friend_3
1 4 12 3
2 8 6 14
3 9 NA NA
4 15 7 2
5 1 17 9
6 9 19 NA
7 NA NA NA
8 7 1 16
9 NA 10 12
10 7 11 9
So the desired DF would include these columns (EDIT filled in columns):
ID num_newfriends num_lostfriends
1 1 1
2 1 1
3 0 0
4 0 0
5 3 3
6 0 1
7 0 3
8 3 3
9 2 3
10 2 1
EDIT2:
I've tried doing an anti join
df3 <- anti_join(df1, df2)
But this method doesn't take into account friend id numbers that might appear in a different column in time 2 (For example respondent #6 friend 9 and 19 are in T1 and T2 but in different columns in each time)
Another option:
library(tidyverse)
left_join(
gather(df1, key, x, -ID),
gather(df2, key, y, -ID),
by = c("ID", "key")
) %>%
group_by(ID) %>%
summarise(
num_newfriends = sum(!y[!is.na(y)] %in% x[!is.na(x)]),
num_lostfriends = sum(!x[!is.na(x)] %in% y[!is.na(y)])
)
Output:
# A tibble: 10 x 3
ID num_newfriends num_lostfriends
<int> <int> <int>
1 1 1 1
2 2 1 1
3 3 0 0
4 4 0 0
5 5 3 3
6 6 0 1
7 7 0 3
8 8 3 3
9 9 2 3
10 10 2 2
Simple comparisons would be an option
library(tidyverse)
na_sums_old <- rowSums(is.na(time1))
na_sums_new <- rowSums(is.na(time2))
kept_friends <- map_dbl(seq(nrow(time1)), ~ sum(time1[.x, -1] %in% time2[.x, -1]))
kept_friends <- kept_friends - na_sums_old * (na_sums_new >= 1)
new_friends <- 3 - na_sums_new - kept_friends
lost_friends <- 3 - na_sums_old - kept_friends
tibble(ID = time1$ID, new_friends = new_friends, lost_friends = lost_friends)
# A tibble: 10 x 3
ID new_friends lost_friends
<int> <dbl> <dbl>
1 1 1 1
2 2 1 1
3 3 0 0
4 4 0 0
5 5 3 3
6 6 0 1
7 7 0 3
8 8 3 3
9 9 2 3
10 10 2 2
You can make anti_join work by first pivoting to a "long" data frame.
df1 <- df1 %>%
pivot_longer(starts_with("friend_"), values_to = "friend") %>%
drop_na()
df2 <- df2 %>%
pivot_longer(starts_with("friend_"), values_to = "friend") %>%
drop_na()
head(df1)
#> # A tibble: 6 x 3
#> ID name friend
#> <int> <chr> <int>
#> 1 1 friend_1 4
#> 2 1 friend_2 12
#> 3 1 friend_3 7
#> 4 2 friend_1 8
#> 5 2 friend_2 6
#> 6 2 friend_3 7
lost_friends <- anti_join(df1, df2, by = c("ID", "friend"))
new_fiends <- anti_join(df2, df1, by = c("ID", "friend"))
respondents <- distinct(df1, ID)
respondents %>%
full_join(
count(lost_friends, ID, name = "num_lost_friends")
) %>%
full_join(
count(new_fiends, ID, name = "num_new_friends")
) %>%
mutate_at(vars(starts_with("num_")), replace_na, 0)
#> Joining, by = "ID"
#> Joining, by = "ID"
#> # A tibble: 10 x 3
#> ID num_lost_friends num_new_friends
#> <int> <dbl> <dbl>
#> 1 1 1 1
#> 2 2 1 1
#> 3 3 0 0
#> 4 4 0 0
#> 5 5 3 3
#> 6 6 1 0
#> 7 7 3 0
#> 8 8 3 3
#> 9 9 3 2
#> 10 10 2 2
Created on 2019-11-01 by the reprex package (v0.3.0)

Resources