How do I cast data into non-equi columns? - r

I have a dataset of events, grouped by let like so:
set.seed(3)
events <- data.frame(
let = rep(LETTERS[1:2], each=3),
age = c(0,sample(1:20, size=2),
0,sample(1:20, size=2)),
value = sample(1:100, size=6))
let age value
1 A 0 61
2 A 4 60
3 A 16 13
4 B 0 29
5 B 8 56
6 B 7 99
How can I cast the data frame so that age is multiple columns grouped into weeks? So for each column, take the value of the largest age that is less than or equal to 0, 7, 14, 21 days.
events.cast <- data.frame(
let = LETTERS[1:2],
T0_value = c(61,29),
T1_value = c(60,99),
T2_value = c(60,56),
T3_value = c(13,56))
let T0_value T1_value T2_value T3_value
1 A 61 60 60 13
2 B 29 99 56 56

One option is to cut the 'age' into buckets, get the max row by that group and 'let', then reshape into 'wide' format
library(dplyr)
library(tidyr)
library(stringr)
# Bucket `age` into weekly intervals, keep the row with the largest age in
# each (bucket, let) pair, add any missing buckets per `let` and fill them
# from the previous bucket, then spread the buckets into columns.
events %>%
  group_by(grp = cut(age, breaks = c(-Inf, 0, 7, 14, 21),
                     labels = str_c("T", 0:3, "_value")), let) %>%
  # The question asks for the value at the largest age within each bucket,
  # so pick the row by `age`, not by `value`.
  slice(which.max(age)) %>%
  ungroup() %>%
  select(-age) %>%
  group_by(let) %>%
  complete(grp = unique(.$grp)) %>%
  fill(value) %>%
  pivot_wider(names_from = grp, values_from = value)
# A tibble: 2 x 5
# Groups: let [2]
# let T0_value T1_value T2_value T3_value
# <chr> <int> <int> <int> <int>
#1 A 61 60 60 13
#2 B 29 99 56 56
data
events <- structure(list(let = c("A", "A", "A", "B", "B", "B"), age = c(0L,
4L, 16L, 0L, 8L, 7L), value = c(61L, 60L, 13L, 29L, 56L, 99L)),
class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6"))

Related

Add a new column with sum of count to a dataframe according to informations from another in R

I need help adding a count column to a table called tab1, based on the contents of another table, tab2.
Here is the first tab :
tab1
Event_Groups Other_column
1 1_G1,2_G2 A
2 2_G1 B
3 4_G4 C
4 7_G5,8_G5,9_G5 D
As you can see, the Event_Groups column holds two pieces of information (the Event and Group numbers, separated by a "_"). This information is also found in tab2$Group and tab2$Event. The idea is, for each element within a row of tab1 (elements are separated by commas), to count the number of rows in tab2 where VALUE1 < 10 AND VALUE2 > 30, and then add this count to tab1 in a new column called Sum_count.
Here is the second table:
tab2
Group Event VALUE1 VALUE2
1 G1 1 5 50 <- VALUE1 < 10 & VALUE2 > 30 : count 1
2 G1 2 6 20 <- VALUE2 < 30 : count 0
3 G2 2 50 50 <- VALUE1 > 10 : count 0
4 G3 3 0 0
5 G4 1 0 0
6 G4 4 2 40 <- VALUE1 < 10 & VALUE2 > 30 : count 1
7 G5 7 1 70 <- VALUE1 < 10 & VALUE2 > 30 : count 1
8 G5 8 4 67 <- VALUE1 < 10 & VALUE2 > 30 : count 1
9 G5 9 3 60 <- VALUE1 < 10 & VALUE2 > 30 : count 1
Example :
For instance for the first element of row1 in tab1: 1_G1
we see in tab2 (row1) that VALUE1 < 10 & VALUE2 > 30, so I count 1.
For the second element (row1): 2_G2, we see in tab2 (row3) that VALUE1 > 10, so I count 0.
And here is the expected result tab1 dataframe;
Event_Groups Other_column Sum_count
1_G1,2_G2 A 1
2_G1 B 0
4_G4 C 1
7_G5,8_G5,9_G5 D 3
I do not know if I am clear enough; do not hesitate to ask questions.
Here are the two tables in dput format, in case it helps:
tab1
structure(list(Event_Groups = structure(1:4, .Label = c("1_G1,2_G2",
"2_G1", "4_G4", "7_G5,8_G5,9_G5"), class = "factor"), Other_column =
structure(1:4, .Label = c("A", "B", "C", "D"), class = "factor")),
class = "data.frame", row.names = c(NA,
-4L))
tab2
structure(list(Group = structure(c(1L, 1L, 2L, 3L, 4L, 4L, 5L,
5L, 5L), .Label = c("G1", "G2", "G3", "G4", "G5"), class = "factor"),
Event = c(1L, 2L, 2L, 3L, 1L, 4L, 7L, 8L, 9L), VALUE1 = c(5L,
6L, 50L, 0L, 0L, 2L, 1L, 4L, 3L), VALUE2 = c(50, 20, 50,
0, 0, 40, 70, 67, 60)), class = "data.frame", row.names = c(NA,
-9L))
Here is one way to do it:
library(dplyr)
library(tidyr)
tab1 %>%
mutate(Event_Groups = as.character(Event_Groups)) %>%
separate_rows(Event_Groups, sep = ",") %>%
left_join(.,
tab2 %>%
unite(col = "Event_Groups", Event, Group) %>%
mutate(count = if_else(VALUE1 < 10 & VALUE2 > 30,1L, 0L))) %>%
group_by(Other_column) %>%
summarise(Event_Groups = paste(unique(Event_Groups), collapse = ","),
Sum_count = sum(count)) %>%
select(Event_Groups, everything())
#> Joining, by = "Event_Groups"
#> `summarise()` ungrouping output (override with `.groups` argument)
#> # A tibble: 4 x 3
#> Event_Groups Other_column Sum_count
#> <chr> <fct> <int>
#> 1 1_G1,2_G2 A 1
#> 2 2_G1 B 0
#> 3 4_G4 C 1
#> 4 7_G5,8_G5,9_G5 D 3
Created on 2021-07-29 by the reprex package (v0.3.0)
You can try a tidyverse approach:
library(tidyverse)
# Split each comma-separated Event_Groups entry into one row per element,
# join to tab2 on (Event, Group), and sum the per-element flags for each
# original tab1 row.
tab1 %>%
  rownames_to_column() %>%
  separate_rows(Event_Groups, sep = ",") %>%
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  separate(Event_Groups, into = c("Event", "Group"), sep = "_",
           convert = TRUE) %>%
  left_join(tab2 %>%
              mutate(count = as.numeric(VALUE1 < 10 & VALUE2 > 30)),
            by = c("Event", "Group")) %>%
  unite(Event_Groups, Event, Group) %>%
  group_by(rowname) %>%
  summarise(Event_Groups = toString(Event_Groups),
            Other_column = unique(Other_column),
            count = sum(count))
# A tibble: 4 x 4
rowname Event_Groups Other_column count
<chr> <chr> <chr> <dbl>
1 1 1_G1, 2_G2 A 1
2 2 2_G1 B 0
3 3 4_G4 C 1
4 4 7_G5, 8_G5, 9_G5 D 3

Find index based on the minimum number for every group

I want to extract the row index of the minimum number for every group.
# Group and Number must have the same length (11) for data.frame() to succeed;
# the values match the printed table below.
Group <- c("A","A","A","A","A","B","B","C","C","C","C")
Number <- c(12, 45, 15, 65, 54, 21, 23, 12, 3, 5, 6)
data.frame(Group, Number)
Group Number
1 A 12
2 A 45
3 A 15
4 A 65
5 A 54
6 B 21
7 B 23
8 C 12
9 C 3
10 C 5
11 C 6
The result should be a vector that contains the indices:
Answer
vector <- c(1, 6, 9)
Create a sequence column, grouped by 'Group', summarise by returning the corresponding row number based on the index of min value of 'Number' (which.min) and pull the column as a vector
library(dplyr)
df1 %>%
mutate(rn = row_number()) %>%
group_by(Group) %>%
summarise(n = rn[which.min(Number)]) %>%
pull(n)
#[1] 1 6 9
data
df1 <- structure(list(Group = c("A", "A", "A", "A", "A", "B", "B", "C",
"C", "C", "C"), Number = c(12L, 45L, 15L, 65L, 54L, 21L, 23L,
12L, 3L, 5L, 6L)), class = "data.frame", row.names = c("1", "2",
"3", "4", "5", "6", "7", "8", "9", "10", "11"))
Does this work for you?
library(dplyr)
df %>%
mutate(row_n = row_number()) %>%
group_by(Group) %>%
slice_min(Number)
# A tibble: 3 x 3
# Groups: Group [3]
Group Number row_n
<chr> <dbl> <int>
1 A 12 1
2 B 12 7
3 C 3 8
The row numbers are in column row_n. If you want only the row numbers as output, add %>% ungroup() %>% select(-c(1:2)) like so:
df %>%
mutate(row_n = row_number()) %>%
group_by(Group) %>%
slice_min(Number) %>%
ungroup() %>%
select(-c(1:2))
# A tibble: 3 x 1
row_n
<int>
1 1
2 7
3 8
Data:
Group <- c("A","A","A","A","A","B","B","C","C","C","C")
Number <- c(12,45,65,54,21,23,12,3,5,6,34)
df <- data.frame(Group,Number)
This function returns the index i of the smallest value in v
# Return the element of `i` at the position where `v` is smallest.
FUN <- function(v, i) {
  i[which.min(v)]
}
Here are the values by group
v = split(df$Number, df$Group)
and the index into the original data.frame by group
i = split(seq_along(df$Number), df$Group)
Apply our function to each group
mapply(FUN, v, i)
In one go:
FUN = function(v, i) i[which.min(v)]
v = split(df$Number, df$Group)
i = split(seq_along(df$Number), df$Group)
mapply(FUN, v, i)

How to reconcile two different IDs as one, then apply to a df with both IDs but count the subject only once in R?

I have two different IDs for the same subject(patient).
In this other vector of IDs, the two IDs are both in there that indicate the same patient. How do I only count the patient once(by ID1), instead of two different patients with different IDs?
ID1 ID2
11 12
13 14
15 16
vector
11,12,13,13,14,16
I want to count only the unique patients by ID1, such that I would get
x=11,13,15
Thank you!
I think probably you need this
df %>% filter((ID1 %in% vector) | (ID2 %in% vector)) %>%
select(ID1)
ID1
1 11
2 13
3 15
Check it on a better sample
# Larger sample: six patients, each with a pair of IDs.
df <- structure(list(ID1 = c(11L, 13L, 15L, 17L, 19L, 21L), ID2 = c(12L,
14L, 16L, 18L, 20L, 22L)), class = "data.frame", row.names = c(NA,
-6L))
> df
ID1 ID2
1 11 12
2 13 14
3 15 16
4 17 18
5 19 20
6 21 22
vector <- c(11, 12, 13, 13, 14, 16, 18, 18)
> df %>% filter((ID1 %in% vector) | (ID2 %in% vector)) %>% select(ID1)
ID1
1 11
2 13
3 15
4 17
By slightly modifying Ronak's code, you can get same results
# Tag each patient row with an ID, stack ID1/ID2 into one column, keep the
# patients that have at least one ID present in `vector`, then map back to
# the original rows to report ID1.
df %>%
  mutate(ID = row_number()) %>%
  tidyr::pivot_longer(cols = c(ID1, ID2)) %>%
  inner_join(tibble::enframe(vector), by = 'value') %>%
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  distinct(ID, .keep_all = TRUE) %>%
  select(ID, value) %>%
  inner_join(df %>% mutate(ID = row_number()), by = 'ID') %>%
  select(ID1)
Create a unique ID number for each patient, reshape the data to long format so both IDs are in the same column, join it with the vector, and keep one matching value per distinct ID.
library(dplyr)
df %>%
mutate(ID = row_number()) %>%
tidyr::pivot_longer(cols = c(ID1, ID2)) %>%
inner_join(tibble::enframe(vector), by = 'value') %>%
distinct(ID, .keep_all = TRUE) %>%
select(value)
# value
# <dbl>
#1 11
#2 13
#3 16
data
df <- structure(list(ID1 = c(11L, 13L, 15L), ID2 = c(12L, 14L, 16L)),
class = "data.frame", row.names = c(NA, -3L))
vector <- c(11, 12, 13, 13, 14, 16)
You can use any with %in% by selecting the rows with apply to subset ID1.
ID$ID1[apply(ID, 1, function(z) any(v %in% z))]
#[1] 11 13 15
or use rowSums.
ID$ID1[rowSums(sapply(ID, "%in%", v)) > 0]
#[1] 11 13 15
Data:
ID <- read.table(header=TRUE, text="ID1 ID2
11 12
13 14
15 16")
v <- c(11,12,13,13,14,16)

reorganizing dataframe drastically in R using tidyr

I have a dataframe that consists of vegetation data. Columns are species names and rows are their relative abundances per site. Site, plotcode and year are also variables. Data looks like this:
Site Code Year speca specb specc
A A1 2001 0 1 10
A A2 2001 5 5 15
B B1 2001 0 5 20
B B1 2004 15 75 0
C C1 2006 50 0 15
I want the datatable to look like this:
species A1_2001 A2_2001 B1_2001 B1_2004 C1_2006
speca 0 5 0 15 50
specb 1 5 5 75 0
specc 10 15 20 0 15
I tried using the tidyr::pivot_longer function, but this does not give the result I want.
tidyr::pivot_longer(df, 4:length(df), names_to = "species", values_to = "abundance")
Is there a way to achieve this in a codefriendly way, preferably using tidyr (tidyverse)?
We reshape it to 'long' format and then do the 'wide' format with pivot_wider
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = starts_with('spec'), names_to = 'species') %>%
unite(CodeYear, Code, Year) %>%
select(-Site) %>%
pivot_wider(names_from = CodeYear, values_from = value)
# A tibble: 3 x 6
# species A1_2001 A2_2001 B1_2001 B1_2004 C1_2006
# <chr> <int> <int> <int> <int> <int>
#1 speca 0 5 0 15 50
#2 specb 1 5 5 75 0
#3 specc 10 15 20 0 15
data
df <- structure(list(Site = c("A", "A", "B", "B", "C"), Code = c("A1",
"A2", "B1", "B1", "C1"), Year = c(2001L, 2001L, 2001L, 2004L,
2006L), speca = c(0L, 5L, 0L, 15L, 50L), specb = c(1L, 5L, 5L,
75L, 0L), specc = c(10L, 15L, 20L, 0L, 15L)), class = "data.frame",
row.names = c(NA,
-5L))
In data.table:
library(data.table)
DT <- data.table(Site = c('A1','A2','B1','B1','C1'),
Year = c(2001, 2001, 2001, 2004, 2006),
speca = c(0,5,0,15,50),
specb = c(1,5,5,75,0),
specc = c(10,15,20,0,15))
DT <- melt(DT, id.vars = c('Site', 'Year'),
measure.vars = c('speca', 'specb', 'specc') , variable.name = 'species')
DT <- dcast(DT, species ~ Site + Year, value.var = c('value'))
> DT
species A1_2001 A2_2001 B1_2001 B1_2004 C1_2006
1: speca 0 5 0 15 50
2: specb 1 5 5 75 0
3: specc 10 15 20 0 15
You mainly need a pivot_wider() to follow your pivot_longer():
library(tidyverse)
df <- tribble(~Site, ~Code, ~Year, ~speca, ~specb, ~specc,
"A", "A1", 2001, 0, 1, 10,
"A", "A2", 2001, 5, 5, 15,
"B", "B1", 2001, 0, 5, 20,
"B", "B1", 2004, 15, 75, 0,
"C", "C1", 2006, 50, 0, 15)
df %>%
mutate(Code = paste(Code, Year, sep = "_")) %>%
select(-Site, -Year) %>%
pivot_longer(starts_with("spec"), names_to = "species", values_to = "abundance") %>%
pivot_wider(names_from = Code, values_from = abundance)
The result is
# A tibble: 3 x 6
species A1_2001 A2_2001 B1_2001 B1_2004 C1_2006
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 speca 0 5 0 15 50
2 specb 1 5 5 75 0
3 specc 10 15 20 0 15

How to add additional columns using tidyr group_by function in R?

This question is a follow up to my post from this answer.
Data
df1 <- structure(list(Date = c("6/24/2020", "6/24/2020", "6/24/2020",
"6/24/2020", "6/25/2020", "6/25/2020"), Market = c("A", "A",
"A", "A", "A", "A"), Salesman = c("MF", "RP", "RP", "FR", "MF",
"MF"), Product = c("Apple", "Apple", "Banana", "Orange", "Apple",
"Banana"), Quantity = c(20L, 15L, 20L, 20L, 10L, 15L), Price = c(1L,
1L, 2L, 3L, 1L, 1L), Cost = c(0.5, 0.5, 0.5, 0.5, 0.6, 0.6)),
class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6"))
Solution
library(dplyr) # 1.0.0
library(tidyr)
df1 %>%
group_by(Date, Market) %>%
group_by(Revenue = c(Quantity %*% Price),
TotalCost = c(Quantity %*% Cost),
Product, .add = TRUE) %>%
summarise(Sold = sum(Quantity)) %>%
pivot_wider(names_from = Product, values_from = Sold)
# A tibble: 2 x 7
# Groups: Date, Market, Revenue, TotalCost [2]
# Date Market Revenue TotalCost Apple Banana Orange
# <chr> <chr> <dbl> <dbl> <int> <int> <int>
#1 6/24/2020 A 135 37.5 35 20 20
#2 6/25/2020 A 25 15 10 15 NA
#akrun's solution works well. Now I'd like to know how to add three more columns for quantity sold by salesmen to the existing results so the final output will look like this:
Date Market Revenue Total Cost Apples Sold Bananas Sold Oranges Sold MF RP FR
6/24/2020 A 135 37.5 35 20 20 20 35 20
6/25/2020 A 25 15 15 25 NA 25 NA NA
One option would be to do the group by operations separately as these are done on separate columns and then do a join by the common columns i.e. 'Date', 'Market'
library(dplyr)
library(tidyr)
# Per (Date, Market): revenue, total cost and quantity sold per product,
# one column per product.
out1 <- df1 %>%
  group_by(Date, Market) %>%
  group_by(Revenue = c(Quantity %*% Price),
           TotalCost = c(Quantity %*% Cost),
           Product, .add = TRUE) %>%
  summarise(Sold = sum(Quantity)) %>%
  pivot_wider(names_from = Product, values_from = Sold)

# Per (Date, Market): quantity sold per salesman, one column per salesman.
out2 <- df1 %>%
  group_by(Date, Market, Salesman) %>%
  summarise(SalesSold = sum(Quantity)) %>%
  pivot_wider(names_from = Salesman, values_from = SalesSold)

# Name the join keys explicitly instead of relying on the implicit natural
# join, so an accidental shared column name cannot silently change the keys.
left_join(out1, out2, by = c("Date", "Market"))
# A tibble: 2 x 10
# Groups: Date, Market, Revenue, TotalCost [2]
# Date Market Revenue TotalCost Apple Banana Orange FR MF RP
# <chr> <chr> <dbl> <dbl> <int> <int> <int> <int> <int> <int>
#1 6/24/2020 A 135 37.5 35 20 20 20 20 35
#2 6/25/2020 A 25 15 10 15 NA NA 25 NA

Resources