sum every 3 values by group - r

I am trying to sum 3 values within a data frame by group.
For example:
Name Data
A 3
A 5
A 9
A 4
A 3
A 5
A 9
A 4
A 3
A 5
A 9
A 4
B 1
B 5
B 1
B 9
Here is what I want:
Name Data
A 17
A 18
A 16
A 12
A 17
A 18
A 17
A 12
A 17
A 18
A N/A
A N/A
B 7
B 15
B N/A
B N/A
I am trying to do this with dplyr; is there a better way?

We could use rollsum (from zoo) after grouping by 'Name'
library(dplyr)
library(zoo)
# Forward-looking rolling sum of Data over 3 rows within each Name; windows
# that would run past the end of a group are filled with NA.
df1 %>%
  group_by(Name) %>%
  mutate(new = rollsum(Data, k = 3, align = "left", fill = NA)) %>%
  ungroup()
-output
# A tibble: 16 x 3
Name Data new
<chr> <int> <int>
1 A 3 17
2 A 5 18
3 A 9 16
4 A 4 12
5 A 3 17
6 A 5 18
7 A 9 16
8 A 4 12
9 A 3 17
10 A 5 18
11 A 9 NA
12 A 4 NA
13 B 1 7
14 B 5 15
15 B 1 NA
16 B 9 NA
Or using frollsum from data.table
library(data.table)
# Turn df1 into a data.table, then add `new`: for every row, the sum of Data
# over that row and the next two rows of the same Name.
setDT(df1)
df1[, new := frollsum(Data, n = 3, align = "left"), by = Name]
df1
Name Data new
1: A 3 17
2: A 5 18
3: A 9 16
4: A 4 12
5: A 3 17
6: A 5 18
7: A 9 16
8: A 4 12
9: A 3 17
10: A 5 18
11: A 9 NA
12: A 4 NA
13: B 1 7
14: B 5 15
15: B 1 NA
16: B 9 NA
data
# Example input: 12 rows for Name "A" followed by 4 rows for Name "B".
df1 <- data.frame(
  Name = rep(c("A", "B"), times = c(12L, 4L)),
  Data = c(rep(c(3L, 5L, 9L, 4L), times = 3L), 1L, 5L, 1L, 9L),
  stringsAsFactors = FALSE
)

Related

R Create multiple rows from 1 row based on presence of values in certain columns

I have a data frame that looks like the following:
ID Date Participant_1 Participant_2 Participant_3 Covariate 1 Covariate 2 Covariate 3
1 9/1 A B 16 2 1
2 5/4 B 4 2 2
3 6/3 C A B 8 3 6
4 2/8 A 7 8 4
5 9/3 C A 7 1 3
I need to expand this data frame so that a row is present for all of the participants present at each event "ID", with the date and all other variables in all the created rows. The multiple participant columns would now only be one column for participant. The output would therefore be:
ID Date Participant Covariate 1 Covariate 2 Covariate 3
1 9/1 A 16 2 1
1 9/1 B 16 2 1
2 5/4 B 4 2 2
3 6/3 C 8 3 6
3 6/3 A 8 3 6
3 6/3 B 8 3 6
4 2/8 A 7 8 4
5 9/3 C 7 1 3
5 9/3 A 7 1 3
Is there a way to do this efficiently? Perhaps with a pivot function?
We can use pivot_longer and then some formatting
library(dplyr)  # select() and relocate() come from dplyr, not tidyr
library(tidyr)

# Stack the Participant_* columns into a single Participant column, one row
# per (event, participant slot).
df %>%
  pivot_longer(starts_with("Participant"), values_to = "Participant") %>%
  # The default names column ("name") only records which slot a value came from.
  select(-name) %>%
  relocate(Participant, .before = Covariate_1) %>%
  # Drop only the rows created for empty participant slots; rows with a
  # missing covariate are kept.
  drop_na(Participant)
# A tibble: 9 × 6
ID Date Participant Covariate_1 Covariate_2 Covariate_3
<int> <chr> <chr> <int> <int> <int>
1 1 9/1 A 16 2 1
2 1 9/1 B 16 2 1
3 2 5/4 B 4 2 2
4 3 6/3 C 8 3 6
5 3 6/3 A 8 3 6
6 3 6/3 B 8 3 6
7 4 2/8 A 7 8 4
8 5 9/3 C 7 1 3
9 5 9/3 A 7 1 3
Here's the example data used:
# Example events: up to three participants per event (NA marks an unused
# slot) plus three covariates.
df <- data.frame(
  ID = 1:5,
  Date = c("9/1", "5/4", "6/3", "2/8", "9/3"),
  Participant_1 = c("A", "B", "C", "A", "C"),
  Participant_2 = c("B", NA, "A", NA, "A"),
  Participant_3 = c(NA, NA, "B", NA, NA),
  Covariate_1 = c(16L, 4L, 8L, 7L, 7L),
  Covariate_2 = c(2L, 2L, 3L, 8L, 1L),
  Covariate_3 = c(1L, 2L, 6L, 4L, 3L),
  stringsAsFactors = FALSE
)

Remove columns that have one zero value

I have data frame like this
class col2 col3 col4 col5 col6
A AA 0 5 4 2 15
B AA 4 10 14 12 25
C AA 19 2 8 5 3
D SS 17 5 5 32 12
E AA 14 2 12 14 55
F II 12 17 1 9 0
G SS 10 37 8 2 17
H II 17 7 5 7 14
I want to remove all columns that have zero values
class col3 col4 col5
A AA 5 4 2
B AA 10 14 12
C AA 2 8 5
D SS 5 5 32
E AA 2 12 14
F II 17 1 9
G SS 37 8 2
H II 7 5 7
So the result I want is just want those columns which do not contain any zeros
Thank you
Based on your description I assume you want to remove rows with zero values, not columns. Here's how you can do it with dplyr:
library(dplyr)
# Keep only the rows in which every column is non-zero. if_all() is the
# filter() helper for "the condition holds in all selected columns"; passing
# across() to filter() is deprecated.
filter(df, if_all(everything(), ~ .x != 0))
#> # A tibble: 4 x 6
#> class col2 col3 col4 col5 col6
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 AA 4 10 14 12 25
#> 2 AA 19 2 8 5 3
#> 3 AA 14 2 12 14 55
#> 4 SS 10 37 8 2 17
A possible solution:
# Count the zeros in each column and keep the columns whose count is 0.
df[colSums(df == 0) == 0]
#> class col3 col4 col5
#> A AA 5 4 2
#> B AA 10 14 12
#> C AA 2 8 5
#> D SS 5 5 32
#> E AA 2 12 14
#> F II 17 1 9
#> G SS 37 8 2
#> H II 7 5 7
With the new dataset:
base R:
In base R we can use Filter and negate any:
# Filter() keeps the columns for which the predicate is TRUE, i.e. the
# columns that do not contain 0.
Filter(function(col) !(0 %in% col), df)
class col3 col4 col5
A AA 5 4 2
B AA 10 14 12
C AA 2 8 5
D SS 5 5 32
E AA 2 12 14
F II 17 1 9
G SS 37 8 2
H II 7 5 7
One base R option could be:
# Keep the columns of df_so that contain no zero.
# - vapply() always returns a logical vector (sapply()'s type depends on input).
# - na.rm = TRUE stops an NA in a column from producing an NA subscript.
# - drop = FALSE keeps a data frame even when a single column survives.
df_so[, !vapply(df_so, function(col) any(col == 0, na.rm = TRUE), logical(1)),
      drop = FALSE]
# class col3 col4 col5
#A AA 5 4 2
#B AA 10 14 12
#C AA 2 8 5
#D SS 5 5 32
#E AA 2 12 14
#F II 17 1 9
#G SS 37 8 2
#H II 7 5 7
Not my answer, but @user2974951 provided a very fast and straightforward answer as a comment on the original post:
# colSums(df == 0) counts the zeros in each column; keep the columns with none.
# na.rm = TRUE ignores missing values when counting, and drop = FALSE keeps a
# data frame even if only one column remains.
df[, colSums(df == 0, na.rm = TRUE) == 0, drop = FALSE]
Here is another option using a combination of select and where:
library(tidyverse)
# where() applies the predicate to each column; a column is kept when none
# of its values equals 0.
select(df, where(function(col) !any(col == 0)))
Output
class col3 col4 col5
A AA 5 4 2
B AA 10 14 12
C AA 2 8 5
D SS 5 5 32
E AA 2 12 14
F II 17 1 9
G SS 37 8 2
H II 7 5 7
Before select_if was deprecated, we could have written it like:
# Older scoped-verb spelling of the same selection: a column is kept only
# when none of its values equals 0.
df %>%
select_if( ~ !any(. == 0))
Data Table
Here is a possible data.table solution:
library(data.table)
# Convert df to a data.table.
dt <- as.data.table(df)
# colSums(dt == 0) counts the zeros per column. The names of the columns with
# a positive count are negated in .SDcols, so .SD holds the remaining columns.
dt[, .SD, .SDcols = !names(dt)[(colSums(dt == 0) > 0)]]
Data
# Example data: rows are labelled A-H; col2 and col6 each contain one zero.
df <- data.frame(
  class = c("AA", "AA", "AA", "SS", "AA", "II", "SS", "II"),
  col2 = c(0L, 4L, 19L, 17L, 14L, 12L, 10L, 17L),
  col3 = c(5L, 10L, 2L, 5L, 2L, 17L, 37L, 7L),
  col4 = c(4L, 14L, 8L, 5L, 12L, 1L, 8L, 5L),
  col5 = c(2L, 12L, 5L, 32L, 14L, 9L, 2L, 7L),
  col6 = c(15L, 25L, 3L, 12L, 55L, 0L, 17L, 14L),
  row.names = c("A", "B", "C", "D", "E", "F", "G", "H"),
  stringsAsFactors = FALSE
)

Subset groups in a data.table using conditions on two columns

I have a data.table with a high number of groups. I would like to subset whole groups (not just rows) based on the conditions on multiple columns. Consider the following data.table:
# Example data: 12 rows in three groups (A, B, C) of four rows each, with the
# two numeric columns y and x that the group condition is tested on.
DT <- structure(list(id = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L),
group = c("A", "A", "A", "A", "B", "B", "B", "B", "C", "C", "C", "C"),
y = c(14, 19, 16, 10, 6, 8, 14, 19, 10, 9, 6, 8),
x = c(3, 3, 2, 3, 3, 3, 3, 2, 2, 3, 3, 3)),
row.names = c(NA, -12L),
class = c("data.table", "data.frame"))
>DT
id group y x
1: 1 A 14 3
2: 2 A 19 3
3: 3 A 16 2
4: 4 A 10 3
5: 5 B 6 3
6: 6 B 8 3
7: 7 B 14 3
8: 8 B 19 2
9: 9 C 10 2
10: 10 C 9 3
11: 11 C 6 3
12: 12 C 8 3
I would like to keep groups that have y=6 and x=3 in the same row. So that I would have only class B and C (preferably using data.table package in R):
id group y x
1: 5 B 6 3
2: 6 B 8 3
3: 7 B 14 3
4: 8 B 19 2
5: 9 C 10 2
6: 10 C 9 3
7: 11 C 6 3
8: 12 C 8 3
All my attempts gave me only those rows containing y=6 and x=3, which I do not want:
id group y x
1: 5 B 6 3
2: 11 C 6 3
With data.table:
# Per group: one TRUE/FALSE flag says whether any row has y == 6 together
# with x == 3; indexing .SD with that flag returns all rows or none.
DT[, {
  has_match <- any(y == 6 & x == 3)
  .SD[has_match]
}, by = group]
group id y x
<char> <int> <num> <num>
1: B 5 6 3
2: B 6 8 3
3: B 7 14 3
4: B 8 19 2
5: C 9 10 2
6: C 10 9 3
7: C 11 6 3
8: C 12 8 3
Another possibly faster option:
# `if` without `else` yields NULL for a group with no (x == 3, y == 6) row, so
# that group contributes nothing; a matching group returns all its rows (.SD).
DT[, if (any(x == 3 & y == 6)) .SD, by=group]
Try dplyr package
# Groups that contain at least one row with y == 6 and x == 3, as a plain
# character vector.
groups <- DT %>%
  filter(y == 6, x == 3) %>%
  distinct(group) %>%
  pull(group)
# Keep every row that belongs to one of those groups.
DT %>%
  filter(group %in% groups)
A data.table option
# Inner lookup: join on (x, y) = (3, 6) and return the matching groups.
# nomatch = NULL returns nothing (rather than an NA group) when no row
# matches, so rows with a missing group are never selected by accident.
DT[group %in% DT[.(3, 6), group, on = .(x, y), nomatch = NULL]]
id group y x
1: 5 B 6 3
2: 6 B 8 3
3: 7 B 14 3
4: 8 B 19 2
5: 9 C 10 2
6: 10 C 9 3
7: 11 C 6 3
8: 12 C 8 3

Select i-th element if a condition occurs with for loop

I have a dataframe (df) like this:
Rif dd A A A A A B B B B B C C C C C
a1 10 5 8 10 2 6 9 6 5 7 9 1 5 6 4 5
b1 20 12 7 1 5 9 10 5 3 8 7 3 6 1 9 8
c1 100 11 6 8 1 14 1 11 9 3 6 10 8 13 8 4
d1 70 4 3 7 8 11 19 2 6 7 1 20 18 7 10 7
I have a vector
rif <- c(0, 15, 50, 90, 110)
I would like to add to the df a column V1 such that if dd >= rif(i-1) & dd < rif(i), V1 takes the value of the i-th A column in that row:
Rif dd A A A A A B B B B B C C C C C V1
a1 10 5 8 10 2 6 9 6 5 7 9 1 5 6 4 5 8
b1 20 12 7 1 5 9 10 5 3 8 7 3 6 1 9 8 1
c1 100 1 6 8 1 14 1 11 9 3 6 10 8 13 8 4 14
d1 70 4 3 7 8 11 19 2 6 7 1 20 18 7 10 7 8
The same should be done for V2 and V3 with respect to Bs and Cs columns.
# Break points; consecutive pairs (ref[i-1], ref[i]) delimit the intervals.
ref <- c(0, 15, 50, 90, 110)
# Outer loop: one pass per interval. Inner loop: one pass per row of df.
for (i in 2:length(ref)) {
for (j in 1:nrow(df)) {
# NOTE(review): df$dd is not indexed by j, so the condition is built from the
# whole dd column rather than from row j's value.
if (df$dd >= ref[i-1] && df$dd< ref[i]) {
# NOTE(review): this writes the single cell df[j, i] into every row of V1.
df[,"V1"] <- df[j,i]
}
}
}
I get the following error:
Error in if (..) :
missing value where TRUE/FALSE needed
Probably the if command is not the correct one.
could you help me?
I think you just need to better specify the rows and columns:
# Example data: a row label, the lookup value dd, then three groups (A, B, C)
# of five value columns each. The real column names are set just below.
df <- data.frame(
  c("a1", "b1", "c1", "d1"),
  c(10, 20, 100, 70), c(5, 12, 11, 4), c(8, 7, 6, 3), c(10, 1, 8, 7),
  c(2, 5, 1, 8), c(6, 9, 14, 11),
  c(9, 10, 1, 19), c(6, 5, 11, 2), c(5, 3, 9, 6), c(7, 8, 3, 7), c(9, 7, 6, 1),
  c(1, 3, 10, 20), c(5, 6, 8, 18), c(6, 1, 13, 7), c(4, 9, 8, 10),
  c(5, 8, 4, 7)
)
colnames(df) <- c("Rif", "dd", "A", "A", "A", "A", "A", "B", "B", "B", "B", "B",
                  "C", "C", "C", "C", "C")
ref <- c(0, 15, 50, 90, 110)

# Create the result columns up front. Assigning df$V1[j] into a column that
# does not exist yet only works when the first matching row is row 1, and a
# row whose dd lies outside every interval now stays NA.
df$V1 <- NA_real_
df$V2 <- NA_real_
df$V3 <- NA_real_

# i indexes the upper break point of each interval [ref[i - 1], ref[i]).
# seq_along()/seq_len() avoid the backwards 2:1 / 1:0 sequences that
# 2:length(ref) and 1:nrow(df) produce for short inputs.
for (i in seq_along(ref)[-1]) {
  for (j in seq_len(nrow(df))) {
    if (df$dd[j] >= ref[i - 1] && df$dd[j] < ref[i]) {
      # Columns 3-7 are the A block, 8-12 the B block and 13-17 the C block;
      # i + 2 is the i-th A column and +5 / +10 shift into the B / C blocks.
      df$V1[j] <- df[j, i + 2]
      df$V2[j] <- df[j, i + 2 + 5]
      df$V3[j] <- df[j, i + 2 + 10]
    }
  }
}
which gives:
Rif dd A A A A A B B B B B C C C C C V1 V2 V3
1 a1 10 5 8 10 2 6 9 6 5 7 9 1 5 6 4 5 8 6 5
2 b1 20 12 7 1 5 9 10 5 3 8 7 3 6 1 9 8 1 3 1
3 c1 100 11 6 8 1 14 1 11 9 3 6 10 8 13 8 4 14 6 4
4 d1 70 4 3 7 8 11 19 2 6 7 1 20 18 7 10 7 8 7 10
Another option in base R:
# Column-name prefixes of the three value blocks.
lters <- c(A = "A", B = "B", C = "C")
# Position of the first column of each block, kept as a list named by letter.
firstcol <- setNames(as.list(match(lters, colnames(DF))), names(lters))
# Interval of rif that each dd value falls into.
idx <- findInterval(DF$dd, rif)
row_ids <- seq_len(nrow(DF))
for (l in lters) {
  # One (row, column) pair per row; the column is offset from the block start.
  cells <- cbind(row_ids, idx + firstcol[[l]])
  # Matrix-style indexing goes through as.matrix(DF), hence the conversion
  # back to integer.
  DF[, paste0("V_", l)] <- as.integer(DF[cells])
}
DF
output:
Rif dd A A.1 A.2 A.3 A.4 B B.1 B.2 B.3 B.4 C C.1 C.2 C.3 C.4 V_A V_B V_C
1 a1 10 5 8 10 2 6 9 6 5 7 9 1 5 6 4 5 8 6 5
2 b1 20 12 7 1 5 9 10 5 3 8 7 3 6 1 9 8 1 3 1
3 c1 100 11 6 8 1 14 1 11 9 3 6 10 8 13 8 4 14 6 4
4 d1 70 4 3 7 8 11 19 2 6 7 1 20 18 7 10 7 8 7 10
data:
# Example data: Rif (row label), dd (the value that is looked up) and fifteen
# value columns whose names A, B and C each occur five times.
DF <- structure(list(Rif = c("a1", "b1", "c1", "d1"), dd = c(10L, 20L,
100L, 70L), A = c(5L, 12L, 11L, 4L), A = c(8L, 7L, 6L, 3L), A = c(10L,
1L, 8L, 7L), A = c(2L, 5L, 1L, 8L), A = c(6L, 9L, 14L, 11L),
B = c(9L, 10L, 1L, 19L), B = c(6L, 5L, 11L, 2L), B = c(5L,
3L, 9L, 6L), B = c(7L, 8L, 3L, 7L), B = c(9L, 7L, 6L, 1L),
C = c(1L, 3L, 10L, 20L), C = c(5L, 6L, 8L, 18L), C = c(6L,
1L, 13L, 7L), C = c(4L, 9L, 8L, 10L), C = c(5L, 8L, 4L, 7L
)), class = "data.frame", row.names = c(NA, -4L))
# Break points that dd is compared against.
rif <- c(0, 15, 50, 90, 110)
Another way is to reorganize the data by separating the lookup values into another table and then perform an update join using data.table:
library(data.table)
# Convert DF to a data.table in place.
setDT(DF)
# Result skeleton: row number (rn), label and lookup value of every row.
out <- DF[, .(rn=.I, Rif, dd)]
# Reorganise the data into a long lookup table.
# lc flags the columns whose name contains "A", "B" or "C" (the value columns).
lc <- grepl("A|B|C", names(DF))
# One lookup row per value column: its letter (COL) plus the transposed
# values. NOTE(review): the transposed columns are presumably named V1, V2, ...
# (one per original row), which is what patterns("V") below relies on - confirm.
lutbl <- data.table(COL=names(DF)[lc], transpose(DF[, ..lc]))
# Melt to long form, turn "V<k>" into the integer row number k, and attach a
# rif value to every row by recycling rif.
# NOTE(review): the recycling assumes each letter has exactly length(rif)
# columns - confirm before reusing with other data.
lutbl <- melt(lutbl, measure.vars=patterns("V"), variable.name="rn")[,
c("rn", "rif") := .(as.integer(gsub("V", "", rn)), rep(rif, sum(lc)*nrow(DF)/length(rif)))]
# Lookup and update: for each letter, join the lookup rows to `out` on row
# number and on rif ~ dd as a rolling join, and store the matched value as
# NEW<letter>. NOTE(review): roll=-Inf presumably rolls the next larger rif
# back onto dd - confirm against the data.table documentation.
for (l in lters)
out[, paste0("NEW", l) := lutbl[COL==l][out, on=c("rn", "rif"="dd"), roll=-Inf, value]]
out:
rn Rif dd NEWA NEWB NEWC
1: 1 a1 10 8 6 5
2: 2 b1 20 1 3 1
3: 3 c1 100 14 6 4
4: 4 d1 70 8 7 10

R- Specific merging of rows in a dataframe within unique groups

I have a huge data frame in R like the following:
# Example data: one row per (item, holder), ordered by date within each item.
# The 11th row (item 4, holder H) is included so that the data matches the
# printed table below.
df <- data.frame("ITEM" = c(1,1,1,2,2,3,3,3,3,4,4),
"ID" = c("A","B","C","D","E","F","G","A","B","C","H"),
"Score" = c(7,8,7,3,5,4,6,9,10,5,8),
"Date" = c("1/1/2018","1/3/2018","1/6/2018","1/7/2017","1/10/2017","1/1/2003","1/3/2004","1/5/2008","1/7/2010","1/8/2010","1/3/2011"),
stringsAsFactors = FALSE)
ITEM ID Score Date
1 1 A 7 1/1/2018
2 1 B 8 1/3/2018
3 1 C 7 1/6/2018
4 2 D 3 1/7/2017
5 2 E 5 1/10/2017
6 3 F 4 1/1/2003
7 3 G 6 1/3/2004
8 3 A 9 1/5/2008
9 3 B 10 1/7/2010
10 4 C 5 1/8/2010
11 4 H 8 1/3/2011
The data is already grouped by unique items and in ascending date order. I would like to transpose the data into the following:
ITEM ID Score Date ID_2 Score_2 Date_2
1 1 A 7 1/1/2018 B 8 1/3/2018
2 1 B 8 1/3/2018 C 7 1/6/2018
4 2 D 3 1/7/2017 E 5 1/10/2017
6 3 F 4 1/1/2003 G 6 1/3/2004
7 3 G 6 1/3/2004 A 9 1/5/2008
8 3 A 9 1/5/2008 B 10 1/7/2010
10 4 C 5 1/8/2010 H 8 1/3/2011
Each item has an owner and is transferred to another person and given a score. E.g. Item 1 is held by A who gets a score of 7, then it moves to B who scores 8, then C who scores 7.
I would like to get it in the above format...to merge each row with the above row (but within the item groups) - I tried reshaping the data using dcast from what I know, but you would get ID_3, ID_4 columns as well for some items whereas I only want the columns for ID_2, Score_2 and Date_2.
Any ideas? Thanks.
Based on the expected output, we could split by 'ITEM', cbind the rows with the lag of rows and then convert the list of data.frame to a single data.frame with rbind
# One data frame per ITEM.
by_item <- split(df, df$ITEM)
# Within each item, put every row except the last next to the row that
# follows it; the following row's first column (ITEM) is left out.
paired <- lapply(by_item, function(g) cbind(g[-nrow(g), ], g[-1, -1]))
# Stack the per-item pieces and discard the composite row names.
out <- do.call(rbind, paired)
rownames(out) <- NULL
out
# ITEM ID Score Date ID Score Date
#1 1 A 7 1/1/2018 B 8 1/3/2018
#2 1 B 8 1/3/2018 C 7 1/6/2018
#3 2 D 3 1/7/2017 E 5 1/10/2017
#4 3 F 4 1/1/2003 G 6 1/3/2004
#5 3 G 6 1/3/2004 A 9 1/5/2008
#6 3 A 9 1/5/2008 B 10 1/7/2010
#7 4 C 5 1/8/2010 H 8 1/3/2011
Or using tidyverse
library(tidyverse)
# Pair every row with the row that follows it inside the same ITEM.
df %>%
  # One list-column entry per ITEM holding that item's remaining columns.
  nest(data = -ITEM) %>%
  mutate(data = map(data, function(g) {
    # Suffix the following row's columns explicitly so the result has the
    # names ID1/Score1/Date1 instead of whatever bind_cols() name repair
    # would generate for duplicated names.
    following <- rename_with(g[-1, ], function(nm) paste0(nm, "1"))
    bind_cols(g[-nrow(g), ], following)
  })) %>%
  # Calling unnest() without `cols` is deprecated.
  unnest(cols = data)
# A tibble: 7 x 7
# ITEM ID Score Date ID1 Score1 Date1
# <int> <chr> <int> <chr> <chr> <int> <chr>
#1 1 A 7 1/1/2018 B 8 1/3/2018
#2 1 B 8 1/3/2018 C 7 1/6/2018
#3 2 D 3 1/7/2017 E 5 1/10/2017
#4 3 F 4 1/1/2003 G 6 1/3/2004
#5 3 G 6 1/3/2004 A 9 1/5/2008
#6 3 A 9 1/5/2008 B 10 1/7/2010
#7 4 C 5 1/8/2010 H 8 1/3/2011
data
# Example data: 11 rows, one per (item, holder). The row names are stored as
# the character strings "1" to "11".
df <- structure(list(ITEM = c(1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 4L,
4L), ID = c("A", "B", "C", "D", "E", "F", "G", "A", "B", "C",
"H"), Score = c(7L, 8L, 7L, 3L, 5L, 4L, 6L, 9L, 10L, 5L, 8L),
Date = c("1/1/2018", "1/3/2018", "1/6/2018", "1/7/2017",
"1/10/2017", "1/1/2003", "1/3/2004", "1/5/2008", "1/7/2010",
"1/8/2010", "1/3/2011")), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11"))

Resources