Reshaping data by appending rows from different groups to the same row - r

I have data as follows:
DT <- structure(list(Area = c("A", "A", "A", "A", "B", "B", "B", "B"
), Year = c(1, 1, 2, 2, 1, 1, 2, 2), Group = c(1, 2, 1, 2, 1,
2, 1, 2), Population_Count = c(10, 12, 10, 12, 10, 13, 10, 11
), Male_Count = c(5, 7, 5, 4, 5, 8, 5, 6), Female_Count = c(5,
5, 5, 8, 5, 5, 5, 5)), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
# A tibble: 8 x 6
Area Year Group Population_Count Male_Count Female_Count
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 1 10 5 5
2 A 1 2 12 7 5
3 A 2 1 10 5 5
4 A 2 2 12 4 8
5 B 1 1 10 5 5
6 B 1 2 13 8 5
7 B 2 1 10 5 5
8 B 2 2 11 6 5
I would like to keep one observations per Area-Year, without losing any information. I tried to do
DTcast <- dcast(DT, Area + Year ~ Group + Population_Count + Male_Count + Female_Count)
But that results in a lot of rubbish:
Area Year 1_10_5_5 2_11_6_5 2_12_4_8 2_12_7_5 2_13_8_5
1 A 1 5 NA NA 5 NA
2 A 2 5 NA 8 NA NA
3 B 1 5 NA NA NA 5
4 B 2 5 5 NA NA NA
In addition, when I apply it to the actual data, I get:
Using 'H_FEMALE' as value column. Use 'value.var' to override
Error in CJ(1:72284, 1:1333365) :
Cross product of elements provided to CJ() would result in 96380955660 rows which exceeds .Machine$integer.max == 2147483647
So I think I am doing something wrong. I think it maybe has to do with the value.var which I do not know how to select.
Desired result:
# A tibble: 4 x 9
Area Year Group `Population_Count_ Group_1` `Male_Count_ Group_1` `Female_Count_ Group_1` `Population_Count_ Group_2` `Male_Count_ Group_2` `Female_Count_ Group_2`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 1 10 5 5 12 7 5
2 A 2 1 10 5 5 12 4 8
3 B 1 1 10 5 5 13 8 5
4 B 2 1 10 5 5 11 6 5

library(tidyverse)
DT %>% pivot_wider(id_cols = c("Area", "Year"), names_from = "Group", values_from = 4:6)
> DT %>% pivot_wider(id_cols = c("Area", "Year"), names_from = "Group", values_from = 4:6)
# A tibble: 4 x 8
Area Year Population_Count_1 Population_Count_2 Male_Count_1 Male_Count_2 Female_Count_1 Female_Count_2
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 10 12 5 7 5 5
2 A 2 10 12 5 4 5 8
3 B 1 10 13 5 8 5 5
4 B 2 10 11 5 6 5 5
This will name your columns as desired
DT %>% pivot_wider(id_cols = c("Area", "Year"),
names_from = "Group",
values_from = 4:6,
names_sep = "_Group_")

use data.table
library(data.table)
dt <- structure(list(Area = c("A", "A", "A", "A", "B", "B", "B", "B"
), Year = c(1, 1, 2, 2, 1, 1, 2, 2), Group = c(1, 2, 1, 2, 1,
2, 1, 2), Population_Count = c(10, 12, 10, 12, 10, 13, 10, 11
), Male_Count = c(5, 7, 5, 4, 5, 8, 5, 6), Female_Count = c(5,
5, 5, 8, 5, 5, 5, 5)), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
setDT(dt)
dcast(
dt,
formula = Area + Year ~ Group,
value.var = grep("_Count", names(dt), value = T)
)
#> Area Year Population_Count_1 Population_Count_2 Male_Count_1 Male_Count_2
#> 1: A 1 10 12 5 7
#> 2: A 2 10 12 5 4
#> 3: B 1 10 13 5 8
#> 4: B 2 10 11 5 6
#> Female_Count_1 Female_Count_2
#> 1: 5 5
#> 2: 5 8
#> 3: 5 5
#> 4: 5 5
Created on 2020-12-18 by the reprex package (v0.3.0)

Related

R incrementing a variable in dplyr

I have the following grouped data frame:
library(dplyr)
# Create a sample dataframe
df <- data.frame(
student = c("A", "A", "A","B","B", "B", "C", "C","C"),
grade = c(1, 2, 3, 1, 2, 3, 1, 2, 3),
age= c(NA, 6, 6, 7, 7, 7, NA, NA, 9)
)
I want to update the age of each student so that it is one plus the age in the previous year, with their age in the first year they appear in the dataset remaining unchanged. For example, student A's age should be NA, 6, 7, student B's age should be 7,8,9, and student C's age should be NA, NA, 9.
How about this:
library(dplyr)
df <- data.frame(
student = c("A", "A", "A","B","B", "B", "C", "C","C"),
grade = c(1, 2, 3, 1, 2, 3, 1, 2, 3),
age= c(NA, 6, 6, 7, 7, 7, NA, NA, 9)
)
df %>%
group_by(student) %>%
mutate(age = age + cumsum(!is.na(age))-1)
#> # A tibble: 9 × 3
#> # Groups: student [3]
#> student grade age
#> <chr> <dbl> <dbl>
#> 1 A 1 NA
#> 2 A 2 6
#> 3 A 3 7
#> 4 B 1 7
#> 5 B 2 8
#> 6 B 3 9
#> 7 C 1 NA
#> 8 C 2 NA
#> 9 C 3 9
Created on 2022-12-30 by the reprex package (v2.0.1)
in data.table, assuming the order of the rows is the 'correct' order:
library(data.table)
setDT(df)[, new_age := age + rowid(age) - 1, by = .(student)]
# student grade age new_age
# 1: A 1 NA NA
# 2: A 2 6 6
# 3: A 3 6 7
# 4: B 1 7 7
# 5: B 2 7 8
# 6: B 3 7 9
# 7: C 1 NA NA
# 8: C 2 NA NA
# 9: C 3 9 9

How to subtract value of one group from other groups in R

I am trying to subtract the value of one group from another. I am hoping to use tidyverse
structure(list(A = c(1, 1, 1, 2, 2, 2, 3, 3, 3), group = c("a",
"b", "c", "a", "b", "c", "a", "b", "c"), value = c(10, 11, 12,
11, 40, 23, 71, 72, 91)), class = "data.frame", row.names = c(NA,
-9L))
That is my data, and I want to subtract all values of group A from B and C, and store the difference in one variable.
baseR solution
df$new <- df$value - ave(df$value, df$A, FUN = function(x) mean(x[df$group == 'a'], na.rm = T) )
> df
A group value new
1 1 a 10 0
2 1 b 11 1
3 1 c 12 2
4 2 a 11 0
5 2 b 40 29
6 2 c 23 12
7 3 a 71 0
8 3 b 72 1
9 3 c 91 20
dplyr method (assumption there is not more than one a value per group, else R will confuse which value to substract and result in error)
df %>% group_by(A) %>% mutate(new = ifelse(group != 'a', value - value[group == 'a'], value) )
# A tibble: 9 x 4
# Groups: A [3]
A group value new
<dbl> <chr> <dbl> <dbl>
1 1 a 10 10
2 1 b 11 1
3 1 c 12 2
4 2 a 11 11
5 2 b 40 29
6 2 c 23 12
7 3 a 71 71
8 3 b 72 1
9 3 c 91 20
or if you want to change all values
df %>% group_by(A) %>% mutate(new = value - value[group == 'a'] )
# A tibble: 9 x 4
# Groups: A [3]
A group value new
<dbl> <chr> <dbl> <dbl>
1 1 a 10 0
2 1 b 11 1
3 1 c 12 2
4 2 a 11 0
5 2 b 40 29
6 2 c 23 12
7 3 a 71 0
8 3 b 72 1
9 3 c 91 20
I only used data.table rather than data.frame because I'm more familiar.
library(data.table)
data <- setDT(structure(list(A = c(1, 1, 1, 2, 2, 2, 3, 3, 3), group = c("a",
"b", "c", "a", "b", "c", "a", "b", "c"), value = c(10, 11, 12,
11, 40, 23, 71, 72, 91)), class = "data.frame", row.names = c(NA,-9L)))
for (i in 1:length(unique(data$A))){
data[A == i, substraction := data[A == i, 'value'] - data[A == i & group == 'a', value]]
}

Assign value to a column where column name is a concatenation of other columns' values

Assume the following data:
dat <- structure(list(row = c("467", "537", "236", "257"), x_11 = c(5,
5, 5, 4), x_12 = c(5, 5, 6, 1), x_13 = c(4, 7, 6, 5), x_14 = c(4,
6, 4, 1), x_15 = c(4, 5, 4, 4), x_16 = c(2, 6, 5, 2), x_17 = c(3,
4, 3, 3), mode_1 = c(4, 5, 4, 1), mode_2 = c(NA, NA, 5, 4), mode_3 = c(NA,
NA, 6, NA), mean = c(3.85714285714286, 5.42857142857143, 4.71428571428571,
2.85714285714286), sd = c(1.0690449676497, 0.975900072948533,
1.11269728052837, 1.57359158493889), nearest = c(1L, 1L, 2L,
2L)), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"
))
which gives:
# A tibble: 4 x 14
row x_11 x_12 x_13 x_14 x_15 x_16 x_17 mode_1 mode_2 mode_3 mean sd nearest
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 467 5 5 4 4 4 2 3 4 NA NA 3.86 1.07 1
2 537 5 5 7 6 5 6 4 5 NA NA 5.43 0.976 1
3 236 5 6 6 4 4 5 3 4 5 6 4.71 1.11 2
4 257 4 1 5 1 4 2 3 1 4 NA 2.86 1.57 2
I now want to create a new column based on the following condition:
if mode_2 is NA, then take the value from mode_1
if mode_2 is NOT NA, then take the value from the column position that is specified in "nearest". Note: the column position in "nearest" refers to the column position of the mode_ columns, NOT the overall column positions of the data frame.
I tried the following, but always getting an error that object "take" is not found:
dat %>%
mutate(test = case_when(is.na(mode_2) ~ x_1,
TRUE ~ !!paste0("mode_", nearest))
Expected output:
# A tibble: 4 x 15
row [...] mode_1 mode_2 mode_3 mean sd nearest test
<chr> [...] <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int>
1 467 [...] 4 NA NA 3.86 1.07 1 4
2 537 [...] 5 NA NA 5.43 0.976 1 5
3 236 [...] 4 5 6 4.71 1.11 2 5
4 257 [...] 1 4 NA 2.86 1.57 2 4
Note in reality I have ~ 20-50 mode_ columns so I can't hard code all potential combinations.
You can create a new column which has corresponding value from take and use coalesce to select any one of the non-NA value.
library(dplyr)
dat %>%
mutate(take_value = as.numeric(.[cbind(1:n(),
match(paste0('x_', take), names(.)))]),
test = coalesce(take_value, x_1)) %>%
select(-take_value)
# x_1 x_2 take test
#1 1 NA 2 1
#2 9 2 1 9
#3 3 NA 2 3
#4 7 8 2 8
#5 5 NA 1 5
Using base R :
dat$take_value <- dat[cbind(1:nrow(dat), match(paste0('x_', dat$take), names(dat)))]
transform(dat, test = ifelse(is.na(take_value), x_1, take_value))

build a network edge table from a sparse table

I don't know exactly how to explain it but...
I have a sparse table where each group represents a level. The columns are ordered, it means, the downstream (left) column represents a child node and upstream (right) node represents a parent node.
I'd like a two columns table where the 1st column is the parent node and the 2nd is the child node. If possible, a 3rd columns with the length (sum of the number of final nodes) of the parents.
Follow the example:
>tt <- tibble(
ID = letters[1:8],
`1` = c( 1, 1, 1, 1, 2, 2, 2, 2),
`2` = c( 3, 3, 4, 4, 5, 5, 5, 6),
`3` = c( 7, 7, 8, 9,10,10,11,12)
)
> tt
# A tibble: 8 x 4
ID `1` `2` `3`
<chr> <dbl> <dbl> <dbl>
1 a 1 3 7
2 b 1 3 7
3 c 1 4 8
4 d 1 4 9
5 e 2 5 10
6 f 2 5 10
7 g 2 5 11
8 h 2 6 12
>dput(tt)
structure(list(ID = c("a", "b", "c", "d", "e", "f", "g", "h"),
`1` = c(1, 1, 1, 1, 2, 2, 2, 2), `2` = c(3, 3, 4, 4, 5, 5,
5, 6), `3` = c(7, 7, 8, 9, 10, 10, 11, 12)), row.names = c(NA,
-8L), class = c("tbl_df", "tbl", "data.frame"))
the result should be:
>ttt <- tibble(
parent = c(1,1,2,2,3,4,4, 5, 5, 6, 7,7,8,9,10,10,11,12),
child = c(3,4,5,6,7,8,9,10,11,12, letters[1:8] ),
length = c(4,4,4,4,2,2,2, 3, 3, 1, 2,2,1,1, 2, 2, 1, 1)
)
>ttt
# A tibble: 18 x 3
parent child length
<dbl> <chr> <dbl>
1 1 3 4
2 1 4 4
3 2 5 4
4 2 6 4
5 3 7 2
6 4 8 2
7 4 9 2
8 5 10 3
9 5 11 3
10 6 12 1
11 7 a 2
12 7 b 2
13 8 c 1
14 9 d 1
15 10 e 2
16 10 f 2
17 11 g 1
18 12 h 1
> dput(ttt)
structure(list(parent = c(1, 1, 2, 2, 3, 4, 4, 5, 5, 6, 7, 7,
8, 9, 10, 10, 11, 12), child = c("3", "4", "5", "6", "7", "8",
"9", "10", "11", "12", "a", "b", "c", "d", "e", "f", "g", "h"
), length = c(4, 4, 4, 4, 2, 2, 2, 3, 3, 1, 2, 2, 1, 1, 2, 2,
1, 1)), row.names = c(NA, -18L), class = c("tbl_df", "tbl", "data.frame"
))
Any help is appreciated.
Thanks in advance.
This gets you 90% of the way there:
tt_correct <- tt[, c(2,3,4,1)]
ttt <- do.call(
rbind,
lapply(seq_len(length(tt)-1),
function(i){
DF <- tt_correct[, c(i, i+1)]
names(DF) <- c('parent', 'child')
DF$length <- ave(DF$parent, DF$parent, FUN = length)
unique(DF)
}
)
)
ttt
# A tibble: 18 x 3
parent child length
<dbl> <chr> <dbl>
1 1 3 4
2 1 4 4
3 2 5 4
4 2 6 4
5 3 7 2
6 4 8 2
7 4 9 2
8 5 10 3
9 5 11 3
10 6 12 1
11 7 a 2
12 7 b 2
13 8 c 1
14 9 d 1
15 10 e 2
16 10 f 2
17 11 g 1
18 12 h 1
The first part is correcting the order. Your expected output indicates that the 1st column is a child of the 4th column. The lapply() statement largely walks along the data.frame and stacks the data.
This is 90% of the way because the answer doesn't agree with your expected output for lengths. I think this is correct but I could be wrong.
Finally, and I'm not that good with igraph, you could likely find additional information doing:
library(igraph)
plot(graph_from_data_frame(ttt[, 1:2]))

How to add a column with progressive number based on condition

I am trying to add a column to my existing data set.
The data set has three columns:
Student (which is the column with the participant ID),
Week (the number of the week of the year during which the data were collected),
and
Day (the number of the weekday during which the data were
collected).
Now, a new column Obs that I am trying to create would contain a progressive number (from 1 to n) referring to the week during which every student was tested.
I have tried to use group_by in combination with rep but it does not seem to produce the result I want:
Week <- c(1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 4)
Day <- c(1, 2, 3, 2, 3, 5, 1, 3, 2, 3, 4, 5)
Student <- c("A", "A", "A", "B", "B", "B", "B", "B", "C", "C", "C", "C")
fake.db <- data.frame(Student, Week, Day)
library(dplyr)
fake.db %>%
group_by(Student) %>%
mutate(Obs = rep(1:length(Student), each = Week))
# Student Week Day Obs
# <fct> <dbl> <dbl> <int>
# 1 A 1 1 1
# 2 A 1 2 2
# 3 A 1 3 3
# 4 B 2 2 1
# 5 B 2 3 2
# 6 B 2 5 3
# 7 B 3 1 4
# 8 B 3 3 5
# 9 C 4 2 1
#10 C 4 3 2
#11 C 4 4 3
#12 C 4 5 4
What I would like to obtain is different. For the first week of data collection, 1 should be reported, and for the students for whom data were collected during a second week, 2 should be reported, etc.:
# Student Week Day Obs
#1 A 1 1 1
#2 A 1 2 1
#3 A 1 3 1
#4 B 2 2 1
#5 B 2 3 1
#6 B 2 5 1
#7 B 3 1 2
#8 B 3 3 2
#9 C 4 2 1
#10 C 4 3 1
#11 C 4 4 1
#12 C 4 5 1
One dplyr possibility could be:
fake.db %>%
group_by(Student) %>%
mutate(Obs = cumsum(!duplicated(Week)))
Student Week Day Obs
<fct> <dbl> <dbl> <int>
1 A 1 1 1
2 A 1 2 1
3 A 1 3 1
4 B 2 2 1
5 B 2 3 1
6 B 2 5 1
7 B 3 1 2
8 B 3 3 2
9 C 4 2 1
10 C 4 3 1
11 C 4 4 1
12 C 4 5 1
It groups by "Student" column and calculates the cumulative sum of non-duplicate "Week" values.
Or:
fake.db %>%
group_by(Student) %>%
mutate(Obs = with(rle(Week), rep(seq_along(lengths), lengths)))
It groups by "Student" column and creates a run-length type group ID around "Week" column".
Or:
fake.db %>%
group_by(Student) %>%
mutate(Obs = dense_rank(Week))
It groups by "Student" column and ranks the values in "Week" column.
What I understand the issue to be is that you want to count the weeks since the first test week for each student. I.e. Week 2 is student B's first week of testing, so it gets Obs = 1. That means you can do a grouped mutate:
library(dplyr)
fake.db <- structure(list(Student = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"), Week = c(1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 4), Day = c(1, 2, 3, 2, 3, 5, 1, 3, 2, 3, 4, 5)), class = "data.frame", row.names = c(NA, -12L))
fake.db %>%
group_by(Student) %>%
mutate(Obs = Week - min(Week) + 1)
#> # A tibble: 12 x 4
#> # Groups: Student [3]
#> Student Week Day Obs
#> <fct> <dbl> <dbl> <dbl>
#> 1 A 1 1 1
#> 2 A 1 2 1
#> 3 A 1 3 1
#> 4 B 2 2 1
#> 5 B 2 3 1
#> 6 B 2 5 1
#> 7 B 3 1 2
#> 8 B 3 3 2
#> 9 C 4 2 1
#> 10 C 4 3 1
#> 11 C 4 4 1
#> 12 C 4 5 1
Created on 2019-05-10 by the reprex package (v0.2.1)
A brief method with by
unlist(by(fake.db, fake.db[, 1], function(x) as.numeric(factor(x[, 2]))))
# A1 A2 A3 B1 B2 B3 B4 B5 C1 C2 C3 C4
# 1 1 1 1 1 1 2 2 1 1 1 1
Data
fake.db <- structure(list(Student = structure(c(1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 3L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"),
Week = c(1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 4), Day = c(1,
2, 3, 2, 3, 5, 1, 3, 2, 3, 4, 5)), class = "data.frame", row.names = c(NA,
-12L))
You can see if there is a non-zero difference
fake.db %>%
group_by(Student) %>%
arrange(Week) %>%
mutate(Obs = cumsum(c(1, diff(Week)!=0)))
or if they values arne't numeric, you can compare to the lag value
fake.db %>%
group_by(Student) %>%
arrange(Week) %>%
mutate(Obs = cumsum(Week != lag(Week, default=first(Week))) + 1)

Resources