How to subtract value of one group from other groups in R - r

I am trying to subtract the value of one group from another. I am hoping to use tidyverse
structure(list(A = c(1, 1, 1, 2, 2, 2, 3, 3, 3), group = c("a",
"b", "c", "a", "b", "c", "a", "b", "c"), value = c(10, 11, 12,
11, 40, 23, 71, 72, 91)), class = "data.frame", row.names = c(NA,
-9L))
That is my data, and I want to subtract all values of group A from B and C, and store the difference in one variable.

baseR solution
# Subtract each A-group's "a" value from every row of that group.
# Non-"a" values are masked to NA before grouping, so the per-group mean only
# sees the "a" rows. The mask is built on the full data frame, which means the
# result does not depend on "a" being the first row within each group.
a_only <- replace(df$value, !(df$group %in% "a"), NA)
df$new <- df$value - ave(a_only, df$A, FUN = function(x) mean(x, na.rm = TRUE))
> df
A group value new
1 1 a 10 0
2 1 b 11 1
3 1 c 12 2
4 2 a 11 0
5 2 b 40 29
6 2 c 23 12
7 3 a 71 0
8 3 b 72 1
9 3 c 91 20
dplyr method (assuming there is no more than one 'a' value per group; otherwise R cannot tell which value to subtract and this will result in an error)
df %>% group_by(A) %>% mutate(new = ifelse(group != 'a', value - value[group == 'a'], value) )
# A tibble: 9 x 4
# Groups: A [3]
A group value new
<dbl> <chr> <dbl> <dbl>
1 1 a 10 10
2 1 b 11 1
3 1 c 12 2
4 2 a 11 11
5 2 b 40 29
6 2 c 23 12
7 3 a 71 71
8 3 b 72 1
9 3 c 91 20
or if you want to change all values
df %>% group_by(A) %>% mutate(new = value - value[group == 'a'] )
# A tibble: 9 x 4
# Groups: A [3]
A group value new
<dbl> <chr> <dbl> <dbl>
1 1 a 10 0
2 1 b 11 1
3 1 c 12 2
4 2 a 11 0
5 2 b 40 29
6 2 c 23 12
7 3 a 71 0
8 3 b 72 1
9 3 c 91 20

I used data.table rather than data.frame only because I'm more familiar with it.
library(data.table)
# Build the example data and convert it to a data.table by reference.
data <- setDT(structure(list(
  A = c(1, 1, 1, 2, 2, 2, 3, 3, 3),
  group = c("a", "b", "c", "a", "b", "c", "a", "b", "c"),
  value = c(10, 11, 12, 11, 40, 23, 71, 72, 91)
), class = "data.frame", row.names = c(NA, -9L)))
# Within each A, subtract the "a" row's value from every row of that A.
# Grouping with `by` replaces the index loop, which only worked when the
# values of A happened to be exactly 1, 2, ..., n.
# Assumes exactly one "a" row per A.
data[, substraction := value - value[group == "a"], by = A]

Related

R incrementing a variable in dplyr

I have the following grouped data frame:
library(dplyr)
# Create a sample dataframe
df <- data.frame(
student = c("A", "A", "A","B","B", "B", "C", "C","C"),
grade = c(1, 2, 3, 1, 2, 3, 1, 2, 3),
age= c(NA, 6, 6, 7, 7, 7, NA, NA, 9)
)
I want to update the age of each student so that it is one plus the age in the previous year, with their age in the first year they appear in the dataset remaining unchanged. For example, student A's age should be NA, 6, 7, student B's age should be 7,8,9, and student C's age should be NA, NA, 9.
How about this:
library(dplyr)
df <- data.frame(
student = c("A", "A", "A","B","B", "B", "C", "C","C"),
grade = c(1, 2, 3, 1, 2, 3, 1, 2, 3),
age= c(NA, 6, 6, 7, 7, 7, NA, NA, 9)
)
df %>%
group_by(student) %>%
mutate(age = age + cumsum(!is.na(age))-1)
#> # A tibble: 9 × 3
#> # Groups: student [3]
#> student grade age
#> <chr> <dbl> <dbl>
#> 1 A 1 NA
#> 2 A 2 6
#> 3 A 3 7
#> 4 B 1 7
#> 5 B 2 8
#> 6 B 3 9
#> 7 C 1 NA
#> 8 C 2 NA
#> 9 C 3 9
Created on 2022-12-30 by the reprex package (v2.0.1)
in data.table, assuming the order of the rows is the 'correct' order:
library(data.table)
setDT(df)[, new_age := age + rowid(age) - 1, by = .(student)]
# student grade age new_age
# 1: A 1 NA NA
# 2: A 2 6 6
# 3: A 3 6 7
# 4: B 1 7 7
# 5: B 2 7 8
# 6: B 3 7 9
# 7: C 1 NA NA
# 8: C 2 NA NA
# 9: C 3 9 9

Transform multiple columns of the same name and different suffixes into a panel structure

I need to put multiple variables of the same name but with different suffixes in a panel structure. For example, transforming:
In this structure:
I tried to combine the pivot_longer and pivot_wider functions from the tidyverse package, but I was not successful, as the variables are distributed between numerics, integers, characters, etc.
I appreciate any help.
Here's the reproducible example:
structure(list(class.x = c(4, 4, 4, 4, 4), class.y = c("a", "a",
"a", "a", "a"), class.x.x = structure(c(9.88131291682493e-324,
9.88131291682493e-324, 9.88131291682493e-324, 9.88131291682493e-324,
9.88131291682493e-324), class = "integer64"), var1.x = c(1, 1,
1, 1, 1), var1.y = c(0, 0, 0, 0, 0), var1.x.x = c("b", "b", "b",
"b", "b"), var2.x = c(9, 9, 9, 9, 9), var2.y = c(5, 5, 5, 5,
5), var2.x.x = c("c", "c", "c", "c", "c")), class = "data.frame", row.names = c(NA,
-5L))
df %>%
pivot_longer(everything(),
names_to = c('.value','Variable'),
names_pattern = '([^.]+)[.](.*)',
values_transform = as.character)
# A tibble: 15 x 4
Variable class var1 var2
<chr> <chr> <chr> <chr>
1 x 4 1 9
2 y a 0 5
3 x.x 0 b c
4 x 4 1 9
5 y a 0 5
6 x.x 0 b c
7 x 4 1 9
8 y a 0 5
9 x.x 0 b c
10 x 4 1 9
11 y a 0 5
12 x.x 0 b c
13 x 4 1 9
14 y a 0 5
15 x.x 0 b c
Note the provided dput varies from the picture you posted:
First we could create names that are all separated by one .
Then we have to transform all to character: I do it with mutate(across... KU99 did it more elegantly with values_transform!
Now we can apply pivot_longer with names_sep argument.
Finally bring data in shape.
library(tidyverse)
df %>%
  # Rename the ".x.x" suffix to ".z" so that every name contains exactly one
  # dot. The dots are escaped and the pattern is anchored at the end of the
  # name, so only a literal trailing ".x.x" is rewritten.
  rename_with(~ str_replace(.x, "\\.x\\.x$", ".z")) %>%
  # The columns have mixed types; convert everything to character so the
  # values can share columns after pivoting.
  mutate(across(everything(), as.character)) %>%
  pivot_longer(
    everything(),
    names_to = c(".value", "var1_2"),
    names_sep = "\\."
  ) %>%
  arrange(var1_2) %>%
  # Restore the original "x.x" label and drop the helper column.
  mutate(Variable = ifelse(var1_2 == "z", "x.x", var1_2), .keep = "unused")
class var1 var2 Variable
<chr> <chr> <chr> <chr>
1 4 1 9 x
2 4 1 9 x
3 4 1 9 x
4 4 1 9 x
5 4 1 9 x
6 a 0 5 y
7 a 0 5 y
8 a 0 5 y
9 a 0 5 y
10 a 0 5 y
11 9.88131291682493e-324 b c x.x
12 9.88131291682493e-324 b c x.x
13 9.88131291682493e-324 b c x.x
14 9.88131291682493e-324 b c x.x
15 9.88131291682493e-324 b c x.x

R: Create numbering within each group

The data that I have:
x = tibble(
study = c("A", "B", "C", "A", "B", "A", "B", "C", "A", "B"),
ID = c(001, 001, 001, 005, 005, 007, 007, 007, 012, 012)
)
The goal is to create the 'number' variable which shows the same number for each unique ID in sequence starting from 1.
goal = tibble(
study = c("A", "B", "C", "A", "B", "A", "B", "C", "A", "B"),
ID = c(001, 001, 001, 005, 005, 007, 007, 007, 012, 012),
number = c(1, 1, 1, 2, 2, 3, 3, 3, 4, 4)
)
And then if within each ID group, the studies are incomplete (e.g., for number = 2, the studies are only A and B, instead of A, B, C), then how to remove the obs associated with that ID (e.g., remove obs that have a number of '2')?
Thanks!
Updated follow-up question on part B:
Once we have the goal dataset, I would like to remove the obs grouped by ID, that meet the following requirements in terms of the study variable:
A and D are required, one of B and C is required (so either B or C), and sometimes each letter will appear more than once.
x = tibble(
study = c("A", "B", "C", "D", "A", "B", "A", "B", "C", "A", "B", "C", "D", "D", "A", "B", "D", "B", "C", "D"),
ID = c(001, 001, 001, 001, 005, 005, 007, 007, 007, 012, 012, 012, 012, 012, 013, 013, 013, 018, 018, 018),
number = c(1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 6, 6, 6)
)
So in the goal dataset above, I would like to remove:
(1) Obs #5 and 6 which share a group number of 2, because they don't have A, B or C, and D in the study variable.
(2) Obs #18, 19, 20 which share a group number of 6, for the same reason as (1).
I would like to keep the rest of the obs because within each number group, they have A, B or C, and D. I cannot use filter(n() > 3) here, because that would delete obs with the number 5.
We could use cur_group_id()
library(dplyr)
x %>%
group_by(ID) %>%
mutate(number = cur_group_id())
study ID number
<chr> <dbl> <int>
1 A 1 1
2 B 1 1
3 C 1 1
4 A 5 2
5 B 5 2
6 A 7 3
7 B 7 3
8 C 7 3
9 A 12 4
10 B 12 4
OR
library(dplyr)
x %>%
mutate(number = cumsum(ID != lag(ID, default = first(ID)))+1)
study ID number
<chr> <dbl> <dbl>
1 A 1 1
2 B 1 1
3 C 1 1
4 A 5 2
5 B 5 2
6 A 7 3
7 B 7 3
8 C 7 3
9 A 12 4
10 B 12 4
A) The dplyr package offers group_indices() for adding unique group identifiers:
library(dplyr)
# group_indices() returns one integer group id per row. Passing the grouping
# columns directly to it is deprecated since dplyr 1.0.0, so group first.
df$number <- df %>%
  group_by(ID) %>%
  group_indices()
df
# A tibble: 10 × 3
study ID number
<chr> <dbl> <int>
1 A 1 1
2 B 1 1
3 C 1 1
4 A 5 2
5 B 5 2
...
B) You can drop observations where the group size is less than 3 (i.e., "A", "B" and "C") with filter():
# Keep only the IDs that have exactly three rows. n() counts the rows per ID;
# it does not look at which study values are present.
df %>%
group_by(ID) %>%
filter(n() == 3)
# A tibble: 6 × 3
# Groups: ID [2]
study ID number
<chr> <dbl> <int>
1 A 1 1
2 B 1 1
3 C 1 1
4 A 7 3
5 B 7 3
6 C 7 3
A and D are required, one of B and C is required (so either B or C)
df %>%
  group_by(ID) %>%
  mutate(
    # TRUE for an ID when both "A" and "D" are present and at least one of
    # "B" or "C" is present among its studies.
    flag = all(c("A", "D") %in% study) && any(c("B", "C") %in% study)
  ) %>%
  filter(flag)
# A tibble: 12 × 4
# Groups: ID [3]
study ID number flag
<chr> <dbl> <dbl> <lgl>
1 A 1 1 TRUE
2 B 1 1 TRUE
3 C 1 1 TRUE
4 D 1 1 TRUE
5 A 12 4 TRUE
6 B 12 4 TRUE
7 C 12 4 TRUE
8 D 12 4 TRUE
9 D 12 4 TRUE
10 A 13 5 TRUE
11 B 13 5 TRUE
12 D 13 5 TRUE

Reshaping data by appending rows from different groups to the same row

I have data as follows:
DT <- structure(list(Area = c("A", "A", "A", "A", "B", "B", "B", "B"
), Year = c(1, 1, 2, 2, 1, 1, 2, 2), Group = c(1, 2, 1, 2, 1,
2, 1, 2), Population_Count = c(10, 12, 10, 12, 10, 13, 10, 11
), Male_Count = c(5, 7, 5, 4, 5, 8, 5, 6), Female_Count = c(5,
5, 5, 8, 5, 5, 5, 5)), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
# A tibble: 8 x 6
Area Year Group Population_Count Male_Count Female_Count
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 1 10 5 5
2 A 1 2 12 7 5
3 A 2 1 10 5 5
4 A 2 2 12 4 8
5 B 1 1 10 5 5
6 B 1 2 13 8 5
7 B 2 1 10 5 5
8 B 2 2 11 6 5
I would like to keep one observation per Area-Year, without losing any information. I tried to do
DTcast <- dcast(DT, Area + Year ~ Group + Population_Count + Male_Count + Female_Count)
But that results in a lot of rubbish:
Area Year 1_10_5_5 2_11_6_5 2_12_4_8 2_12_7_5 2_13_8_5
1 A 1 5 NA NA 5 NA
2 A 2 5 NA 8 NA NA
3 B 1 5 NA NA NA 5
4 B 2 5 5 NA NA NA
In addition, when I apply it to the actual data, I get:
Using 'H_FEMALE' as value column. Use 'value.var' to override
Error in CJ(1:72284, 1:1333365) :
Cross product of elements provided to CJ() would result in 96380955660 rows which exceeds .Machine$integer.max == 2147483647
So I think I am doing something wrong. I think it maybe has to do with the value.var which I do not know how to select.
Desired result:
# A tibble: 4 x 9
Area Year Group `Population_Count_ Group_1` `Male_Count_ Group_1` `Female_Count_ Group_1` `Population_Count_ Group_2` `Male_Count_ Group_2` `Female_Count_ Group_2`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 1 10 5 5 12 7 5
2 A 2 1 10 5 5 12 4 8
3 B 1 1 10 5 5 13 8 5
4 B 2 1 10 5 5 11 6 5
library(tidyverse)
DT %>% pivot_wider(id_cols = c("Area", "Year"), names_from = "Group", values_from = 4:6)
> DT %>% pivot_wider(id_cols = c("Area", "Year"), names_from = "Group", values_from = 4:6)
# A tibble: 4 x 8
Area Year Population_Count_1 Population_Count_2 Male_Count_1 Male_Count_2 Female_Count_1 Female_Count_2
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 10 12 5 7 5 5
2 A 2 10 12 5 4 5 8
3 B 1 10 13 5 8 5 5
4 B 2 10 11 5 6 5 5
This will name your columns as desired
DT %>% pivot_wider(id_cols = c("Area", "Year"),
names_from = "Group",
values_from = 4:6,
names_sep = "_Group_")
use data.table
library(data.table)
dt <- structure(list(
  Area = c("A", "A", "A", "A", "B", "B", "B", "B"),
  Year = c(1, 1, 2, 2, 1, 1, 2, 2),
  Group = c(1, 2, 1, 2, 1, 2, 1, 2),
  Population_Count = c(10, 12, 10, 12, 10, 13, 10, 11),
  Male_Count = c(5, 7, 5, 4, 5, 8, 5, 6),
  Female_Count = c(5, 5, 5, 8, 5, 5, 5, 5)
), row.names = c(NA, -8L), class = c("tbl_df", "tbl", "data.frame"))
setDT(dt)
# Cast every column whose name contains "_Count" across Group, leaving one
# row per Area-Year combination.
dcast(
  dt,
  formula = Area + Year ~ Group,
  value.var = grep("_Count", names(dt), value = TRUE)
)
#> Area Year Population_Count_1 Population_Count_2 Male_Count_1 Male_Count_2
#> 1: A 1 10 12 5 7
#> 2: A 2 10 12 5 4
#> 3: B 1 10 13 5 8
#> 4: B 2 10 11 5 6
#> Female_Count_1 Female_Count_2
#> 1: 5 5
#> 2: 5 8
#> 3: 5 5
#> 4: 5 5
Created on 2020-12-18 by the reprex package (v0.3.0)

Pivot from wide one time-repeated column to wide [duplicate]

This question already has an answer here:
Using Reshape from wide to long in R [closed]
(1 answer)
Closed 2 years ago.
Suppose I have the following data in that wide format:
data = tibble::tribble(
~ID, ~Time, ~Value, ~ValueX,
"A", 1, 11, 41,
"A", 2, 12, 42,
"A", 3, 13, 43,
"B", 1, 21, 41,
"B", 2, 22, 42,
"B", 3, 23, 43,
"C", 1, 31, 41,
"C", 2, 32, 42,
"C", 3, 33, 43
)
Since ValueX is a repeated variable that takes the same values in every ID group (it varies only with Time), I just want to add it as new rows, with "ValueX" as their ID. This will be the desired output:
# Desired result: the original ID/Time/Value rows, followed by one "ValueX"
# row per Time carrying that Time's ValueX (41, 42, 43 in the input data).
data.desired <- tibble::tribble(
  ~ID, ~Time, ~Value,
  "A", 1, 11,
  "A", 2, 12,
  "A", 3, 13,
  "B", 1, 21,
  "B", 2, 22,
  "B", 3, 23,
  "C", 1, 31,
  "C", 2, 32,
  "C", 3, 33,
  "ValueX", 1, 41,
  "ValueX", 2, 42,
  "ValueX", 3, 43
)
Here is a way via base R. You can aggregate ValueX per Time and get the first observation each. Then create a data frame with same names as your original data and simply rbind, i.e.
rbind(data[-ncol(data)],
setNames(data.frame('ValueX', aggregate(ValueX ~ Time, data, head, 1)),
names(data[-ncol(data)])))
# A tibble: 12 x 3
# ID Time Value
# <chr> <dbl> <dbl>
# 1 A 1 11
# 2 A 2 12
# 3 A 3 13
# 4 B 1 21
# 5 B 2 22
# 6 B 3 23
# 7 C 1 31
# 8 C 2 32
# 9 C 3 33
#10 ValueX 1 41
#11 ValueX 2 42
#12 ValueX 3 43
use tidyverse
addCase <- distinct(data, Time, ValueX) %>%
pivot_longer(-Time, names_to = "ID", values_to = "Value")
data %>%
select(-ValueX) %>%
add_case(addCase)
# A tibble: 12 x 3
ID Time Value
<chr> <dbl> <dbl>
1 A 1 11
2 A 2 12
3 A 3 13
4 B 1 21
5 B 2 22
6 B 3 23
7 C 1 31
8 C 2 32
9 C 3 33
10 ValueX 1 41
11 ValueX 2 42
12 ValueX 3 43

Resources