Create new columns based on 2 columns - r

So I have this kind of table df
Id
Type
QTY
unit
1
A
5
1
2
B
10
2
3
C
5
3
2
A
10
4
3
B
5
5
1
C
10
6
I want to create this data frame df2
Id
A_QTY
A_unit
B_QTY
B_unit
C_QTY
C_unit
1
5
1
0
0
10
6
2
10
4
10
2
0
0
3
0
0
5
5
5
3
This means that I want to create a new column for every "Type's" "QTY" and "unit" for each "Id". I was thinking to use a loop to first create a new column for each Type, to get something like this :
Id
Type
QTY
unit
A_QTY
A_unit
B_QTY
B_unit
C_QTY
C_unit
1
A
5
1
5
1
0
0
0
0
2
B
10
2
0
0
10
2
0
0
3
C
5
3
0
0
0
0
5
3
2
A
10
4
10
4
0
0
0
0
3
B
5
5
0
0
5
5
0
0
1
C
10
6
0
0
0
0
10
6
, and then use group_by() to aggregate them into df2. But I get stuck when it comes to creating the new columns. I have tried a for loop, but my R skills are not that great yet, and I can't manage to create the new columns from the existing ones...
I'll appreciate any suggestions you have for me!

You can use pivot_wider from the tidyr package:
library(dplyr)
library(tidyr)
# Spread to one row per Id: every Type contributes a <Type>_QTY and a
# <Type>_unit column.
df %>%
  pivot_wider(
    names_from = Type,               # new column names come from Type
    values_from = c(QTY, unit),      # cell values come from QTY and unit
    names_glue = "{Type}_{.value}",  # e.g. A_QTY, A_unit
    values_fill = 0,                 # Id/Type pairs with no row become 0
    names_vary = "slowest"           # keep each Type's QTY/unit pair adjacent
  )
output
# A tibble: 3 × 7
Id A_QTY A_unit B_QTY B_unit C_QTY C_unit
<int> <int> <int> <int> <int> <int> <int>
1 1 5 1 0 0 10 6
2 2 10 4 10 2 0 0
3 3 0 0 5 5 5 3

library(tidyverse)
# Go long first (one row per Id/Type/measure), build the target column name
# "<Type>_<measure>", then spread back to one row per Id.
df %>%
  pivot_longer(cols = -c(Id, Type)) %>%
  mutate(
    name = str_c(Type, name, sep = "_"),
    Type = NULL  # Type is now encoded in `name`, so drop it
  ) %>%
  pivot_wider(names_from = "name", values_from = "value", values_fill = 0)
# A tibble: 3 × 7
Id A_QTY A_unit B_QTY B_unit C_QTY C_unit
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 5 1 0 0 10 6
2 2 10 4 10 2 0 0
3 3 0 0 5 5 5 3

Related

Transformation of dataframe applied on 3 variables

I wrote several commands to transform a data frame, but I would like to simplify the code, which I wrote in four parts. Parts 1, 2 and 3 do the calculation for columns 1, 2 and 3 respectively (count the number of times each value occurs in the column, and complete the result with the missing numbers between 0 and the maximum value across the three columns). The fourth part joins the previous outputs.
I would like to simplify it so that the transformation of the three columns is done in one block of code instead of four. Is it possible to do this without writing a function?
Thank you in advance.
# Data: three columns of 20 draws (with replacement) from 0..10.
# The seed is fixed first so the draws are reproducible.
set.seed(1234)
A <- sample(0:10, 20, replace = TRUE)
B <- sample(0:10, 20, replace = TRUE)
C <- sample(0:10, 20, replace = TRUE)
df <- data.frame(A, B, C)
A B C
1 9 2 0
2 5 3 5
3 4 9 7
4 8 4 2
5 4 1 5
6 5 7 0
7 3 10 0
8 1 3 8
9 6 2 7
10 5 6 9
11 9 8 0
12 5 2 10
13 3 5 7
14 7 3 9
15 3 7 5
16 3 9 2
17 4 10 8
18 7 1 2
19 3 4 5
20 7 5 8
# Count for A
# df2 is the grid of every value from 0 up to the largest value in any column.
df2 <- data.frame(A = 0:max(df$A, df$B, df$C))
df3_A <- df %>%
  select(A) %>%
  group_by(A) %>%
  mutate(A_number = n()) %>%
  distinct(A_number, .keep_all = TRUE) %>%
  ungroup() %>%
  complete(df2)
# Values that never occur in A come back as NA after complete(); count as 0.
df3_A$A_number[is.na(df3_A$A_number)] <- 0

# Count for B
df2 <- data.frame(B = 0:max(df$A, df$B, df$C))
df3_B <- df %>%
  select(B) %>%
  group_by(B) %>%
  mutate(B_number = n()) %>%
  distinct(B_number, .keep_all = TRUE) %>%
  ungroup() %>%
  complete(df2)
df3_B$B_number[is.na(df3_B$B_number)] <- 0

# Count for C
df2 <- data.frame(C = 0:max(df$A, df$B, df$C))
df3_C <- df %>%
  select(C) %>%
  group_by(C) %>%
  mutate(C_number = n()) %>%
  distinct(C_number, .keep_all = TRUE) %>%
  ungroup() %>%
  complete(df2)
df3_C$C_number[is.na(df3_C$C_number)] <- 0

# Join
# Column A of df3_A serves as the shared key for all three count tables.
df3 <- df3_A %>%
  left_join(df3_B, by = c("A" = "B")) %>%
  left_join(df3_C, by = c("A" = "C"))
A A_number B_number C_number
<int> <dbl> <dbl> <dbl>
1 0 0 0 4
2 1 1 2 0
3 2 0 3 3
4 3 5 3 0
5 4 3 2 0
6 5 4 2 4
7 6 1 1 0
8 7 3 2 3
9 8 1 1 3
10 9 2 2 2
11 10 0 2 1
Using base: stack and table:
# stack() reshapes df to long form (columns `values` and `ind`), table()
# cross-tabulates each value against its source column, and
# as.data.frame.matrix() turns that contingency table into a data frame with
# one row per value and one column per original variable.
as.data.frame.matrix(table(stack(df)))
# A B C
# 0 0 0 4
# 1 1 2 0
# 2 0 3 3
# 3 5 3 0
# 4 3 2 0
# 5 4 2 4
# 6 1 1 0
# 7 3 2 3
# 8 1 1 3
# 9 2 2 2
# 10 0 2 1
You can reshape to long, count the values by variables, then reshape back to wide filling missings with zero:
library(dplyr)
library(tidyr)
# Long form: one row per (column name, value) observation.
df %>%
  pivot_longer(
    cols = everything(),
    names_to = "name",
    values_to = "value"
  ) %>%
  # Number of occurrences of each value within each original column.
  count(name, value) %>%
  # Back to wide: one count column per original column, 0 where a value
  # never occurs in that column.
  pivot_wider(names_from = name, values_from = n, values_fill = 0) %>%
  arrange(value)
# A tibble: 11 × 4
value A B C
<int> <int> <int> <int>
1 0 0 0 4
2 1 1 2 0
3 2 0 3 3
4 3 5 3 0
5 4 3 2 0
6 5 4 2 4
7 6 1 1 0
8 7 3 2 3
9 8 1 1 3
10 9 2 2 2
11 10 0 2 1
You can use vctrs::vec_count over the columns and then merge the data.frames altogether:
library(dplyr)
library(purrr)
# Count the values of each column with vctrs::vec_count(), rename every
# `count` column after its source column, then merge the per-column tables.
df %>%
  # Give all three columns the same ordered factor levels (0..10) so the
  # `key` columns of the per-column tables are of the same type. The extra
  # arguments go through an anonymous function because passing them via
  # across(...) is deprecated since dplyr 1.1.0.
  mutate(
    across(A:C, function(col) factor(col, levels = 0:10, ordered = TRUE))
  ) %>%
  map(vctrs::vec_count) %>%
  imap(~ {
    # These must be two separate statements: piping the string into
    # rename_with() would make the string its `.data` argument and fail.
    name <- paste0("count", .y)
    rename_with(.x, ~ name, count)
  }) %>%
  reduce(full_join, by = "key") %>%
  # A key missing from one column's table is NA after the join; count it as 0.
  replace(is.na(.), 0) %>%
  arrange(key)
output
key countA countB countC
1 0 0 0 4
2 1 1 2 0
3 2 0 3 3
4 3 5 3 0
5 4 3 2 0
6 5 4 2 4
7 6 1 1 0
8 7 3 2 3
9 8 1 1 3
10 9 2 2 2
11 10 0 2 1

Is there a good way to make the rest of value 1 within a id if the previous value is 1 in r

I am trying to set all the remaining Value entries within the same ID to 1 once a previous Value is 1, but I just can't find a good way to do it without a bunch of slow for loops.
Here is the example:
data.frame(ID = c(1,1,1,1,1,2,2,2,2,2,2,3,3,3),
Visit = c(1,2,3,4,5,1,2,3,4,5,6,1,2,3),
Value = c(0,0,1,0,0,0,0,0,0,1,0,0,1,0))
And here is what I want to get:
data.frame(ID = c(1,1,1,1,1,2,2,2,2,2,2,3,3,3),
Visit = c(1,2,3,4,5,1,2,3,4,5,6,1,2,3),
Value = c(0,0,1,1,1,0,0,0,0,1,1,0,1,1))
Thanks in advance!
We could use cummax to return 1 for rest of the values after grouping by 'ID'
library(dplyr)
# Within each ID, the running maximum of Value turns every entry after the
# first 1 into 1 (rows are taken in their existing order).
df1 %>%
  group_by(ID) %>%
  mutate(Value = cummax(Value)) %>%
  ungroup()
-output
# A tibble: 13 × 3
ID Visit Value
<dbl> <dbl> <int>
1 1 1 0
2 1 2 0
3 1 3 1
4 1 4 1
5 2 1 0
6 2 2 0
7 2 3 0
8 2 4 0
9 2 5 1
10 2 6 1
11 3 1 0
12 3 2 1
13 3 3 1

R - Grouping rows by matching value then adding rows to matching columns in another data frame

I am trying to add values from one data frame (ex2) to an existing data frame (ex1) based on two different columns. As you can see, there is an ID column in both data frames. But in ex2, each column of ex1 is represented by a different row instead of a column. For each matching ID, I want to add the result from ex2$result to the matching row in ex1 under the appropriate column heading (if ex2$alpha[i] = a then ex2$result[i] gets added to ex1$a[z] where ex2$id[i]=ex1$id[z]). Another complication is that not all of the columns from ex1 will have alpha value in ex2, so those should be set as 'NA'.
ex1 <- data.frame(
id = c(1:20),
a = c(rep(1,5),rep(0,5),rep(NA,10)),
b = c(rep(c(1,0),5),rep(NA,10)),
c = c(rep(c(0,1),5),rep(NA,10)),
d = c(rep(0,5),rep(1,5),rep(NA,10))
)
ex2 <- data.frame(
id = c(rep(11,3),rep(12,3),rep(13,3),
rep(14,2),rep(15,2),
rep(16,4),rep(17,4),rep(18,4),rep(19,4),rep(20,4)),
alpha = c(rep(c('a','b','d'),3),rep(c('a','b'),2),
rep(c('a','b','c','d'),5)),
result = c(rep(c(0,1,1),11))
)
Thanks for your help!
I believe the attached snippet does what you want it to do. But it is hard to know from your toy data if it is feasible to write out the columns a to d in the mutate statement. There surely is a more clever programmatic way to approach this problem.
ex1 <- data.frame(
id = c(1:20),
a = c(rep(1,5),rep(0,5),rep(NA,10)),
b = c(rep(c(1,0),5),rep(NA,10)),
c = c(rep(c(0,1),5),rep(NA,10)),
d = c(rep(0,5),rep(1,5),rep(NA,10))
)
ex2 <- data.frame(
id = c(rep(11,3),rep(12,3),rep(13,3),
rep(14,2),rep(15,2),
rep(16,4),rep(17,4),rep(18,4),rep(19,4),rep(20,4)),
alpha = c(rep(c('a','b','d'),3),rep(c('a','b'),2),
rep(c('a','b','c','d'),5)),
result = c(rep(c(0,1,1),11))
)
library(tidyverse)
# Spread ex2 to one row per id with one column per alpha value.
ex_2_wide <- pivot_wider(
  ex2,
  id_cols = id,
  names_from = alpha,
  values_from = result
)
# After the join, every column present in both tables exists twice: as
# <name>.x (from ex1) and <name>.y (from ex2). Keep the first non-missing one.
joined <- full_join(ex1, ex_2_wide, by = "id") %>%
  mutate(
    a = coalesce(a.x, a.y),
    b = coalesce(b.x, b.y),
    c = coalesce(c.x, c.y),
    d = coalesce(d.x, d.y)
  ) %>%
  # Drop the suffixed helper columns by name, not by position: the range
  # a.x:c.y only covers all of them when c.y happens to be the last one,
  # which depends on the order in which alpha values first appear in ex2.
  select(-ends_with(".x"), -ends_with(".y"))
joined
#> id a b c d
#> 1 1 1 1 0 0
#> 2 2 1 0 1 0
#> 3 3 1 1 0 0
#> 4 4 1 0 1 0
#> 5 5 1 1 0 0
#> 6 6 0 0 1 1
#> 7 7 0 1 0 1
#> 8 8 0 0 1 1
#> 9 9 0 1 0 1
#> 10 10 0 0 1 1
#> 11 11 0 1 NA 1
#> 12 12 0 1 NA 1
#> 13 13 0 1 NA 1
#> 14 14 0 1 NA NA
#> 15 15 1 0 NA NA
#> 16 16 1 1 0 1
#> 17 17 1 0 1 1
#> 18 18 0 1 1 0
#> 19 19 1 1 0 1
#> 20 20 1 0 1 1
Created on 2021-01-07 by the reprex package (v0.3.0)
EDIT:
If we turn the problem around (we first make long tables, followed by join and merge, then pivot back wide), there is only a single step for merger, no matter how many columns you have.
library(tidyverse)
# Long form of ex1: one row per (id, alpha) with the current cell in `value`.
ex1_long <- pivot_longer(ex1, cols = a:d, names_to = "alpha")
joined <- ex1_long %>%
  full_join(ex2, by = c("id", "alpha")) %>%
  # Keep the ex1 value where present, otherwise take the result from ex2.
  mutate(value = coalesce(value, result)) %>%
  select(-result) %>%
  # Back to the original wide layout: one column per alpha.
  pivot_wider(id_cols = id, names_from = alpha, values_from = value)
joined
#> # A tibble: 20 x 5
#> id a b c d
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 1 0 0
#> 2 2 1 0 1 0
#> 3 3 1 1 0 0
#> 4 4 1 0 1 0
#> 5 5 1 1 0 0
#> 6 6 0 0 1 1
#> 7 7 0 1 0 1
#> 8 8 0 0 1 1
#> 9 9 0 1 0 1
#> 10 10 0 0 1 1
#> 11 11 0 1 NA 1
#> 12 12 0 1 NA 1
#> 13 13 0 1 NA 1
#> 14 14 0 1 NA NA
#> 15 15 1 0 NA NA
#> 16 16 1 1 0 1
#> 17 17 1 0 1 1
#> 18 18 0 1 1 0
#> 19 19 1 1 0 1
#> 20 20 1 0 1 1
Created on 2021-01-07 by the reprex package (v0.3.0)

Identifying duplicate within groups by latest date

I currently have a data frame that looks like this:
ID Value Date
1 1 A 1/1/2018
2 1 B 2/3/1988
3 1 B 6/3/1994
4 2 A 12/6/1999
5 2 B 24/12/1957
6 3 A 9/8/1968
7 3 B 20/9/2016
8 3 C 15/4/1993
9 3 C 9/8/1994
10 4 A 8/8/1988
11 4 C 6/4/2001
Within each ID I would like to identify a row where there is a duplicate Value. The Value that I would like to identify is the duplicate with the most recent Date.
The resulting data frame should look like this:
ID Value Date mostRecentDuplicate
1 1 A 1/1/2018 0
2 1 B 2/3/1988 0
3 1 B 6/3/1994 1
4 2 A 12/6/1999 0
5 2 B 24/12/1957 0
6 3 A 9/8/1968 0
7 3 B 20/9/2016 0
8 3 C 15/4/1993 0
9 3 C 9/8/1994 1
10 4 A 8/8/1988 0
11 4 C 6/4/2001 0
How do I go about doing this?
Using dplyr we can first convert Date to actual date value, then group_by ID and Value and assign value 1 in the group where there is more than 1 row and the row_number is same as row number of maximum Date.
library(dplyr)
df %>%
  # Parse the day/month/year strings so dates compare chronologically.
  mutate(Date = as.Date(Date, format = "%d/%m/%Y")) %>%
  group_by(ID, Value) %>%
  mutate(
    # 1 only when the (ID, Value) group has more than one row and this row
    # holds the latest Date in the group; 0 otherwise.
    mostRecentDuplicate = as.integer(
      n() > 1 & row_number() == which.max(Date)
    )
  ) %>%
  ungroup()
# A tibble: 11 x 4
# ID Value Date mostRecentDuplicate
# <int> <fct> <date> <int>
# 1 1 A 2018-01-01 0
# 2 1 B 1988-03-02 0
# 3 1 B 1994-03-06 1
# 4 2 A 1999-06-12 0
# 5 2 B 1957-12-24 0
# 6 3 A 1968-08-09 0
# 7 3 B 2016-09-20 0
# 8 3 C 1993-04-15 0
# 9 3 C 1994-08-09 1
#10 4 A 1988-08-08 0
#11 4 C 2001-04-06 0

Removing rows with a rule from another column

I have data similar to this:
# Example data: one row per visit; target == 1 marks the incident visit.
PatientID <- c(1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3)
VisitId <- c(1, 5, 6, 9, 2, 3, 12, 4, 7, 8, 10, 11)
target <- c(0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0)
as.data.frame(cbind(PatientID, VisitId, target))
PatientID VisitId target
1 1 1 0
2 1 5 0
3 1 6 0
4 1 9 1
5 2 2 0
6 2 3 0
7 2 12 0
8 3 4 0
9 3 7 0
10 3 8 0
11 3 10 1
12 3 11 0
For each PatientID, I need to delete the rows whose VisitId is equal to or larger than the VisitId of the row where target is 1.
I.e., in the example, rows 4, 11 and 12 should be eliminated, because those rows occurred for that patient at the same time as or after the target incident, which is what I wish to predict.
Here is an idea using dplyr. It assumes that each PatientID has at most one row where target is 1.
library(dplyr)
df %>%
  group_by(PatientID) %>%
  mutate(
    # VisitId of the target row, 0 on every other row ...
    new = ifelse(target == 1, VisitId, 0),
    # ... then copy the group's maximum onto the 0 rows; a patient with no
    # target row keeps 0 everywhere.
    new = replace(new, new == 0, max(new))
  ) %>%
  # Keep non-target visits that come before the target visit, and keep all
  # rows of patients that have no target visit (new == 0).
  filter((target != 1 & VisitId < new) | new == 0) %>%
  select(-new)
which gives,
# A tibble: 9 x 3
# Groups: PatientID [3]
PatientID VisitId target
<dbl> <dbl> <dbl>
1 1 1 0
2 1 5 0
3 1 6 0
4 2 2 0
5 2 3 0
6 2 12 0
7 3 4 0
8 3 7 0
9 3 8 0

Resources